In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Load the three precomputed artefacts used by the rest of the notebook, in the
# order: similarity matrix, per-review ratings, cleaned business table.
# NOTE(review): pickle files can execute arbitrary code when loaded -- only
# read files that were produced by a trusted pipeline.
csmatrix, ratings, business_df = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        "./csmatrix.pkl",
        "./ratings_df.pkl",
        "./business_df_clean.pkl",
    )
)

In [None]:
# csmatrix --> 10863 business_id x 10863 business_id

# restaurants_df --> 10863 business_ids x 291 food features

# final_df --> 856484 reviews x 11 columns (probably unnecessary)

# business_df_clean --> 10863 business_id x 9 columns

# ratings_df --> 856484 reviews × 3 columns (USE)

# Generate Recommendations for User

In [None]:
def generate_recommendations(user,csmatrix,ratings,business_df,n_recommendations=10):
    """Recommend the businesses most similar to the one `user` rated highest.

    Parameters
    ----------
    user
        Value matched against ``ratings['user_id']``.
    csmatrix : pandas.DataFrame
        Similarity table indexed by business id on both axes; the column
        ``csmatrix[b]`` is read as the similarity of every business to ``b``.
    ratings : pandas.DataFrame
        Must contain the columns 'user_id', 'business_id' and 'review_stars'.
    business_df : pandas.DataFrame
        Must contain the columns 'business_id', 'name', 'stars' and
        'review_count'.
    n_recommendations : int, default 10
        Maximum number of similar businesses to return.

    Returns
    -------
    tuple
        ``(toprated_name, recommendations_dict)`` where ``toprated_name`` is the
        name of the user's highest rated business and ``recommendations_dict``
        maps each recommended business name to
        ``["<stars> stars", "<review_count> total reviews"]``, most similar
        first. Businesses sharing a name collapse into a single entry.

    Raises
    ------
    IndexError
        If `user` has no rows in `ratings`, or a business id cannot be found
        in `business_df`.
    KeyError
        If the top rated business is not a column of `csmatrix`.
    """
    # Get the business this user rated highest
    user_ratings = ratings.loc[ratings['user_id']==user]
    if user_ratings.empty:
        raise IndexError("no ratings found for user {!r}".format(user))
    # Stable sort so that tied ratings are resolved deterministically
    user_ratings = user_ratings.sort_values(by='review_stars',axis=0,ascending=False,kind='mergesort')
    toprated = user_ratings['business_id'].iloc[0]

    # Rank every other business by similarity to the top rated one.
    # The top rated business is removed by label rather than by position:
    # another business can tie with it and sort ahead of it, in which case
    # slicing off the first entry would recommend the top rated business itself.
    sims = csmatrix[toprated].drop(labels=toprated, errors='ignore')
    mostsimilar = sims.sort_values(ascending=False,kind='mergesort').index.values[:n_recommendations]

    # Get the display name of the top rated business
    toprated_name = business_df.loc[business_df["business_id"]==toprated, "name"].values[0]

    recommendations_dict = {}
    for business_id in mostsimilar:
        # Filter business_df once per recommendation and reuse the match
        match = business_df.loc[business_df["business_id"]==business_id]
        name = match["name"].values[0]
        recommendations_dict[name] = [
            str(match["stars"].values[0]) + " stars",
            str(match["review_count"].values[0]) + " total reviews",
        ]

    return toprated_name, recommendations_dict

In [None]:
# Worked example: recommend restaurants for a single user id and report the
# restaurant that user rated highest.
user = "4wMvgdEVpFLCIhFANNBvGA"
toprated, recs_dict = generate_recommendations(
    user,
    csmatrix,
    ratings,
    business_df,
)
print("User's highest rated place was: {}".format(toprated))

User's highest rated place was: Seikaku Sushi & Steak


In [None]:
# Display the recommendations: business name -> [star rating, review count]
recs_dict

{"Sam's Sushi": ['4.5 stars', '43 total reviews'],
 'Osaka Sushi & Thai': ['3.5 stars', '162 total reviews'],
 'Seikaku Sushi & Steak': ['4.5 stars', '38 total reviews'],
 'Samurai Blue Sushi and Sake Bar': ['3.5 stars', '99 total reviews'],
 'Sushi Fune': ['4.0 stars', '6 total reviews'],
 'Bonsai Sushi': ['4.0 stars', '233 total reviews'],
 'Sushi Alive': ['4.5 stars', '138 total reviews'],
 'Saki Endless Sushi & Hibachi Eatery': ['3.0 stars', '105 total reviews'],
 'Fuji Sushi': ['3.5 stars', '5 total reviews'],
 'Snookers Grill': ['3.5 stars', '6 total reviews']}

# Calculate RMSE

In [None]:
# Hold out 20% of the reviews for validation. The target is the star rating;
# the features are every remaining column of `ratings`. The seed is fixed so
# the split is reproducible.
y = ratings['review_stars']
X = ratings.drop(columns=['review_stars'])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
def predict_rating(user_item_pair,simtable=csmatrix,X_train=X_train, y_train=y_train, default_rating=3):
    """Predict a user's rating for a business from their most similar past review.

    The prediction is the rating the user gave, in the training data, to the
    already-reviewed business that is most similar to the one being rated.

    Parameters
    ----------
    user_item_pair : mapping or pandas.Series
        Must provide the keys 'user_id' and 'business_id'.
    simtable : pandas.DataFrame
        Similarity table looked up as ``simtable.loc[business, reviewed_ids]``.
    X_train : pandas.DataFrame
        Training reviews with the columns 'user_id' and 'business_id'.
    y_train : pandas.Series
        Training ratings, looked up by the index labels of `X_train`.
    default_rating : default 3
        Value returned when no prediction can be made: the user has no
        training reviews, the business (or every business the user reviewed)
        is missing from `simtable`, or all candidate similarities are NaN.

    Returns
    -------
    The user's training rating for the most similar reviewed business, or
    `default_rating`.
    """
    business_to_rate = user_item_pair['business_id']
    user = user_item_pair['user_id']

    # Without a similarity row for this business there is nothing to compare
    if business_to_rate not in simtable.index:
        return default_rating

    # Businesses this user already reviewed in the training data, restricted
    # to the ones the similarity table knows about (scan X_train only once)
    user_rows = X_train.loc[X_train['user_id']==user]
    user_rows = user_rows.loc[user_rows['business_id'].isin(simtable.columns)]
    if user_rows.empty:
        # Cold-start user: fall back explicitly instead of catching every
        # exception with a bare except
        return default_rating

    # Similarity of the target business to each business already reviewed
    simtable_filtered = simtable.loc[business_to_rate, user_rows['business_id'].tolist()].dropna()
    if simtable_filtered.empty:
        return default_rating

    # Label of the most similar already-reviewed business (first one on ties)
    most_similar_reviewed = simtable_filtered.idxmax()

    # Get the user's rating for that business
    idx = user_rows.loc[user_rows['business_id']==most_similar_reviewed].index.values[0]
    most_similar_rating = y_train.loc[idx]
    return most_similar_rating

In [None]:
# Predict a rating for 1000 sampled validation reviews and score them with RMSE.
# NOTE(review): X_val and y_val are sampled separately with the same
# random_state; this relies on both samples selecting the same rows in the
# same order -- confirm before changing either call.
ratings_valset = X_val.sample(1000, random_state=0).apply(predict_rating, axis=1)
val_rmse = np.sqrt(
    mean_squared_error(
        y_true=y_val.sample(1000, random_state=0),
        y_pred=ratings_valset,
    )
)
print('RMSE of predicted ratings is {:.3f}'.format(val_rmse))

RMSE of predicted ratings is 1.706


# Content Filtering RMSE is: 1.706