In [4]:
#Importing the libraries
import sqlite3
import pandas as pd
#from surprise import Reader
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from scipy.sparse import csr_matrix
from implicit.bpr import BayesianPersonalizedRanking
from implicit.evaluation import train_test_split, precision_at_k
import time

In [5]:
#Fetching the data and establishing connection
sql_connection = sqlite3.connect('yelpHotelData.db')
cursorObj = sql_connection.cursor()
cursorObj.execute("SELECT name FROM sqlite_master WHERE type = 'table';")
print(cursorObj.fetchall())

[('review',), ('sqlite_stat1',), ('sqlite_stat2',), ('reviewer',), ('hotel',)]


In [6]:
review_df = pd.read_sql_query("Select * from review",sql_connection)
print(review_df.head())

        date                reviewID              reviewerID  \
0   6/8/2011            MyNjnxzZVTPq  IFTr6_6NI4CgCVavIL9k5g   
1  8/30/2011  BdD7fsPqHQL73hwENEDT-Q  c_-hF15XgNhlyy_TqzmdaA   
2  6/26/2009                BfhqiyfC  CiwZ6S5ZizAFL5gypf8tLA   
3  9/16/2010                      Ol  nf3q2h-kSQoZK2jBY92FOg   
4   2/5/2010  i4HIAcNTjabdpG1K4F5Q2g  Sb3DJGdZ4Rq__CqxPbae-g   

                                       reviewContent  rating  usefulCount  \
0  Let me begin by saying that there are two kind...       5           18   
1  The only place inside the Loop that you can st...       3            0   
2  I have walked by the Tokyo Hotel countless tim...       5           12   
3  If you are considering staying here, watch thi...       1            8   
4  This place is disgusting, absolutely horrible,...       3           11   

   coolCount  funnyCount flagged                 hotelID  
0         11          28       N  tQfLGoolUMu2J0igcWcoZg  
1          3           4       N  

In [7]:
sqlite_stat1_df = pd.read_sql_query("Select * from sqlite_stat1",sql_connection)
print(sqlite_stat1_df.head())

        tbl                          idx      stat
0  reviewer  sqlite_autoindex_reviewer_1    5126 1
1    review    sqlite_autoindex_review_1  426430 1


In [8]:
sqlite_stat2_df = pd.read_sql_query("Select * from sqlite_stat2",sql_connection)
print(sqlite_stat2_df.head())

     tbl                       idx sampleno                  sample
0  hotel  sqlite_autoindex_hotel_1        0  28eUMim7qhJD_Ly5dSRpgQ
1  hotel  sqlite_autoindex_hotel_1        1  8W76AO7Bbn_IodoZrCoLag
2  hotel  sqlite_autoindex_hotel_1        2  ExkYx-EPG20eap-BuLeuBQ
3  hotel  sqlite_autoindex_hotel_1        3  LIEqsMIHMoxeVg-vT-LyHQ
4  hotel  sqlite_autoindex_hotel_1        4  RjF1clwgexRkXYMtnwvl-g


In [9]:
reviewer_df = pd.read_sql_query("Select * from reviewer",sql_connection)
print(reviewer_df.head())

               reviewerID                         name               location  \
0  yevHGEUQQmnVlBXIrJ885A                     Kevin T.         Oconomowoc, WI   
1  yoB_PYQHjnPjh78ATA0Jgw                  Veronica B.         Saint Paul, MN   
2  XrFCag4AMW5qta9QXokWPA        Paul The Commander M.        Saint Louis, MO   
3  y5ptsWmvGEAftOQaiFhBcg          Stella BraveTart J.  Lexington-Fayette, KY   
4  uUVZJm9yxNl5FBsXbt4WBg  Ginger 'where's my meds' v.      San Francisco, CA   

     yelpJoinDate  friendCount  reviewCount  firstCount  usefulCount  \
0        May 2011            4           86           3          129   
1    January 2010            5           49           5           63   
2     August 2008           15          135          29          235   
3  September 2009           49          104          36          282   
4     August 2008           22           34           2           81   

   coolCount  funnyCount  complimentCount  tipCount  fanCount  
0         47    

In [10]:
hotel_df = pd.read_sql_query("Select * from hotel",sql_connection)
print(hotel_df.head())

                  hotelID                                           name  \
0  pSLh_XyV_3QS1hNsBOGHiQ                                Old Chicago Inn   
1  tQfLGoolUMu2J0igcWcoZg                                    Tokyo Hotel   
2  33Xc1Bk_gkSY5xb2doQ7Ng  The Tremont Chicago Hotel at Magnificent Mile   
3  2nnXespKBBNtDQTtrumNFg                            Inn At Lincoln Park   
4  SNuJYJewLhunxlhEezo15w                                 Carleton Hotel   

                                            location  reviewCount  rating  \
0           Old Chicago Inn - Lakeview - Chicago, IL            1     3.0   
1        Tokyo Hotel - Near North Side - Chicago, IL            6     3.0   
2  The Tremont Chicago Hotel at Magnificent Mile ...           44     3.0   
3   Inn At Lincoln Park - Lincoln Park - Chicago, IL           20     2.0   
4                      Carleton Hotel - Oak Park, IL           31     4.0   

                                          categories  \
0  Event Planning & Serv

In [11]:
#Data Analysis
review_df.duplicated(["reviewID","reviewerID","hotelID","rating","date"]).sum()

0

In [14]:
#Merging the two tables
temp = pd.merge(review_df,hotel_df, on=['hotelID'], how='left')
hotel_review = temp[['reviewerID', 'hotelID', 'rating_x', 'categories']]
hotel_review = hotel_review.rename(columns={'rating_x':'rating'})

In [15]:
#Drop the na values
hotel_review = hotel_review.dropna()
hotel_review

Unnamed: 0,reviewerID,hotelID,rating,categories
0,IFTr6_6NI4CgCVavIL9k5g,tQfLGoolUMu2J0igcWcoZg,5,"Event Planning & Services, Hotels, Hotels & Tr..."
1,c_-hF15XgNhlyy_TqzmdaA,tQfLGoolUMu2J0igcWcoZg,3,"Event Planning & Services, Hotels, Hotels & Tr..."
2,CiwZ6S5ZizAFL5gypf8tLA,tQfLGoolUMu2J0igcWcoZg,5,"Event Planning & Services, Hotels, Hotels & Tr..."
3,nf3q2h-kSQoZK2jBY92FOg,tQfLGoolUMu2J0igcWcoZg,1,"Event Planning & Services, Hotels, Hotels & Tr..."
4,Sb3DJGdZ4Rq__CqxPbae-g,tQfLGoolUMu2J0igcWcoZg,3,"Event Planning & Services, Hotels, Hotels & Tr..."
...,...,...,...,...
688324,e7B7IsZlRT8LbFj8FcY78w,9xny0IlJqTInobC6W-UxbA,5,"Event Planning & Services, Hotels, Hotels & Tr..."
688325,e7B7IsZlRT8LbFj8FcY78w,PmmTXis1gCL34mg2bZ9gtw,2,"Event Planning & Services, Hotels, Hotels & Tr..."
688326,e7B7IsZlRT8LbFj8FcY78w,Mr6zu_hWk2CodBdqqMWQjg,5,"Nightlife, Bars, Dance Clubs, METADATA"
688327,e7B7IsZlRT8LbFj8FcY78w,-zetzVfO4X0dpiiTmjdeKg,5,"Restaurants, Sushi Bars, Japanese, METADATA"


In [16]:
#Splitting the data
index = int(len(hotel_review)*0.80)
training_data = hotel_review[:index]
testing_data = hotel_review[index:]

In [17]:
#Ratings per user
s = review_df.groupby('reviewerID')['rating'].mean()
review_df['rating'].fillna(review_df['reviewerID'].map(s), inplace=True)
ratings_per_user = training_data.groupby(by = "reviewerID")["rating"].count().sort_values(ascending = False)
ratings_per_user.head()

reviewerID
P5bUL3Engv-2z6kKohB6qQ    5434
XYSDrIef7g4Gmp3lNFVO6A    2602
om5ZiponkpRqUNa3pVPiRg    2073
EiwxlbR8fb68lMgEXhcWKA    2031
6s-g2vFu12OemhiK3FJuOQ    2018
Name: rating, dtype: int64

In [18]:
#Ratings per hotel
t = review_df.groupby('hotelID')['rating'].mean()
review_df['rating'].fillna(review_df['hotelID'].map(t), inplace=True)
ratings_per_hotel = training_data.groupby(by = "hotelID")["rating"].count().sort_values(ascending = False)
ratings_per_hotel.head()

hotelID
2m_KBLL9Bk_dwZ5n5b11nw    464
NkOir65b_YAAQVlJR_zmJA    365
YfzsjiJl3m-Yza64aIL7SQ    261
mLdvQGAFadIJkOlEoHsI7w    246
31yYZQjEIyYE0N3p_vmoOA    242
Name: rating, dtype: int64

In [19]:
class Item:
    def __init__(self,id,category,score=None):
        self.id = id
        self.category = category
        self.score = score
    def __repr__(self):
        return self.id

In [20]:
#Creating mapping
mapping = {}
for i in range(len(training_data)):
    item = training_data.iloc[i,1]
    cat = training_data.iloc[i,3]
    cat_split = cat.split(',')
    prob = 1. / len(cat_split)
    cat_ratios = {j: prob for j in cat_split}
    no_str = Item(item,cat_ratios)
    mapping[item] = no_str

In [21]:
#converting into categories
training_data['reviewerID'] = training_data['reviewerID'].astype('category')
training_data['hotelID'] = training_data['hotelID'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['reviewerID'] = training_data['reviewerID'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  training_data['hotelID'] = training_data['hotelID'].astype('category')


In [22]:
#Converting the ID's for sparse matrix
user_index = training_data['reviewerID'].cat.categories
item_index = training_data['hotelID'].cat.categories

In [23]:
#Creating the sparse matrix
rows = training_data['reviewerID'].cat.codes
cols = training_data['hotelID'].cat.codes
values = training_data['rating'].astype(np.float32)
sparse_matrix = csr_matrix((values, (rows, cols)))
sparse_matrix

<5107x244976 sparse matrix of type '<class 'numpy.float32'>'
	with 549282 stored elements in Compressed Sparse Row format>

In [24]:
#BPR Model generation
np.random.seed(123)
bpr = BayesianPersonalizedRanking(iterations=50)
bpr.fit(sparse_matrix.T.tocsr())

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [25]:
#Collaborative recommendation 
user_id = int(input('Enter user id for collaborative recommendations '))
visit_id = sparse_matrix[user_id].nonzero()[1]
visit_hotels = [mapping[training_data['hotelID'][index]] for index in visit_id]
visit_hotels

Enter user id for collaborative recommendations 0


[zf62D_W7iGzgdV6U_CeApQ,
 QyP2x7Jd7byqeqM3RtPEWQ,
 dgePOeVxgz4oUamFJL15-g,
 ElCbMJRjgu9LDj4MxOKXew,
 3GBS2-N2c7NumwB-ZuxhTg]

In [26]:
#Top recommended hotels
top = int(input('Enter top number of recommendations '))
recommended_id = bpr.recommend(user_id, sparse_matrix, N=top)
collaborative_recommendations = [mapping[training_data['hotelID'][index]] for index, _ in recommended_id]
collaborative_recommendations

Enter top number of recommendations 10


[mnGRyIurwNLqPlQzJY1waQ,
 5JywOYVclW1qdTdxjDSSOg,
 eUFIUizut46TWSaQstFPCA,
 SOwI03p00DU4WHRn5KWvvw,
 _IoOnWFUjAZDH_UCy8xmNA,
 VaBEvR1fyFY9zy9cQKmEcQ,
 wUopsQnUYkRX3J4EDCOqWg,
 41zKVRChsGjrNQh4nqooFg,
 sDYjKyI_67eUhFAqVneYjA,
 nEVME0CyJZ2jbXTClx_aFw]

In [27]:
#Catergory distribution
def distribution(hotels):
    prob = {}
    for i in hotels:
        for cat, dist in i.category.items():
            cat_prob = prob.get(cat, 0.)
            prob[cat] = cat_prob + dist
    total = sum(prob.values())
    for i, cat_prob in prob.items():
        norm = round(cat_prob / total, 3)
        prob[i] = norm
    return prob

In [28]:
#Distribution of the visited and recommended hotels
visited_hotels = distribution(visit_hotels)
recommended_hotel = distribution(collaborative_recommendations)

In [29]:
recommended_hotel

{'Nightlife': 0.02,
 ' Bars': 0.037,
 ' Arts & Entertainment': 0.02,
 ' Jazz & Blues': 0.02,
 ' METADATA': 0.303,
 'Local Flavor': 0.05,
 'Food': 0.025,
 ' Beer': 0.025,
 ' Wine & Spirits': 0.025,
 'Restaurants': 0.183,
 ' Seafood': 0.033,
 'Active Life': 0.025,
 ' Fitness & Instruction': 0.025,
 ' Gyms': 0.025,
 ' American (Traditional)': 0.033,
 ' American (New)': 0.05,
 ' Italian': 0.033,
 ' Nightlife': 0.017,
 ' Lounges': 0.017,
 ' Sushi Bars': 0.033}

In [30]:
visited_hotels

{'Restaurants': 0.233,
 ' Indian': 0.079,
 ' Pakistani': 0.05,
 ' METADATA': 0.262,
 ' Mexican': 0.067,
 ' Barbeque': 0.067,
 ' Southern': 0.05,
 ' Breakfast & Brunch': 0.05,
 'Food': 0.029,
 ' Grocery': 0.029,
 ' Restaurants': 0.029,
 ' Buffets': 0.029,
 ' Desserts': 0.029}

In [32]:
#Calculation of the KL_Divergence
def calc_KL(visited_hotels, recommended_hotels, alpha):
    kl = 0.
    for cat, prob in visited_hotels.items():
        recommendation_score = recommended_hotels.get(cat, 0.)
        recommendation_score = (1 - alpha) * recommendation_score + alpha * prob
        kl = kl + prob * np.log2(prob / recommendation_score)
    return kl

In [33]:
#KL_Divergence values using the collaborative filtering
calc_KL(visited_hotels, recommended_hotel, 0.03)

2.4542553475772766

In [34]:
#removing the the visited hotels form the recomemnded hotels
list_of_hotels = sparse_matrix.shape[1]

user_factor = bpr.user_factors[user_id]
scores = bpr.item_factors.dot(user_factor)

visited_id = set()
visited_id = set(sparse_matrix[user_id].indices)

hotel_id = set(np.arange(list_of_hotels))
hotel_id = hotel_id - visited_id

recommended_hotels = []
for i in hotel_id:
    hotel = mapping[item_index[i]]
    hotel.score = scores[i]
    recommended_hotels.append(hotel)

In [35]:
#Calculation of the utitilty 
def utility_calculation(hotels_recommended, visited, lmbda, alpha):
    recommendations = distribution(hotels_recommended)
    kl_diverge = calc_KL(visited, recommendations,alpha)
    
    tot_score = 0.0
    for hotel in hotels_recommended:
        tot_score = tot_score + hotel.score
    
    utility = (1-lmbda) * tot_score - lmbda * kl_diverge
    return utility

In [36]:
#Caliberated recommendations
n_top = 10
def caliberated_recommendations(hotels, visited, n_top, lmbda, alpha):
    cal_recommend = []
    for i in range (n_top):
        max_utility = -np.inf
        for hotel in hotels:
            if hotel in cal_recommend:
                continue
            utility = utility_calculation(cal_recommend + [hotel],visited,lmbda, alpha)
            if utility  > max_utility:
                max_utility = utility
                liked_hotel = hotel
        cal_recommend.append(liked_hotel)
    return cal_recommend   

In [37]:
# Recommendations got using the caliberated recommendations
cal_recommend_hotels = caliberated_recommendations(recommended_hotels, visited_hotels, n_top, lmbda = 0.99, alpha = 0.01)
cal_recommend_hotels

[jYl6b2TidOPPZbN6fJcheQ,
 WBWZ_65VNXKJFuH0RiIhZw,
 m149-afaa2ssPY6WBnd_wA,
 057iUDZ5sXMVdXUtKmCCUA,
 41b2SLmjLcxTGLVRxASiDA,
 eBxc95u_Gcs9bLST0UntOQ,
 Yq7j28Ws9TbM551SjkML8A,
 VOtKVEDHPozjZHMbZ2GsYw,
 7-2fthHiX4XTlkPkdBrATw,
 3GBS2-N2c7NumwB-ZuxhTg]

In [38]:
#KL_Divergence using caliberated recommendations
x = distribution(cal_recommend_hotels)
calc_KL(visited_hotels, x, 0.01)

0.06343693676317233

In [48]:
#Range for the lambda and alpha values
lamb = np.round(np.arange(0.15,0.98,0.1), 2).tolist()
alpha = np.round(np.arange(0.01, 0.3, 0.02), 2).tolist()

In [49]:
#Lambda list
lamb

[0.15, 0.25, 0.35, 0.45, 0.55, 0.65, 0.75, 0.85, 0.95]

In [50]:
#Alpha list
alpha

[0.01,
 0.03,
 0.05,
 0.07,
 0.09,
 0.11,
 0.13,
 0.15,
 0.17,
 0.19,
 0.21,
 0.23,
 0.25,
 0.27,
 0.29]

In [None]:
#Hyperparameter tuning
calibrated_recommendation = []
kl_divergence = []
lam = []
alp = []
for i in lamb:
    for j in alpha:
        lam.append(i)
        alp.append(j)
        x = caliberated_recommendations(recommended_hotels, visited_hotels, n_top,lmbda = i, alpha = j)
        calibrated_recommendation.append(x)
        distributions = distribution(x)
        kl = calc_KL(visited_hotels,distributions, alpha = j)
        kl_divergence.append(kl)

In [None]:
#Dataframe of the combination of the lambda and alpha list for the kl_divergence
df = pd.DataFrame(list(zip(lam,alp,kl_divergence,calibrated_recommendation)), columns=['lambda','alpha','kl_divergence','calibrated_recommendation'])

In [None]:
df