In [189]:
import surprise
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.accuracy import mae, rmse, mse

In [190]:
import random
# set random seed
my_seed = 0
random.seed(my_seed)

# Baseline SVD model

In [191]:
df = pd.read_csv("../data/reviews_stratified_sampled.csv")
col = ["date", "text", "look", "smell", "taste", "feel", "overall", "has_smell", "has_taste", "has_look", "has_feel", "mentioned_aspects", "sentiment"]
df = df.drop(col, axis=1)

In [192]:
data = Dataset.load_from_df(df[['username','beer_id','score']], Reader(rating_scale=(0, 5)))
X_train, X_test = surprise.model_selection.train_test_split(data, test_size=0.2, random_state=my_seed)


In [193]:
algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
algo.fit(X_train)
predictions = algo.test(X_test)
mae(predictions, verbose=True)
mse(predictions, verbose=True)
rmse(predictions, verbose=True)

MAE:  0.8178
MSE: 0.8802
RMSE: 0.9382


0.9381952164007703

In [212]:
raw_ratings = data.raw_ratings

# shuffle ratings if you want
random.shuffle(raw_ratings)

# A = 80% of the data, B = 20% of the data
threshold = int(0.8 * len(raw_ratings))
A_raw_ratings = raw_ratings[:threshold]
B_raw_ratings = raw_ratings[threshold:]

data.raw_ratings = A_raw_ratings  # data is now the set A

In [214]:
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [10, 20], 'lr_all': [0.005, 0.01], 'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae', 'mse'], cv=5)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_score['mae'])
print(gs.best_score['mse'])
print(gs.best_params['rmse'])

0.8891970884889793
0.7526488553678874
0.7906914975251021
{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.4}


In [215]:
algo = SVD(n_epochs=20, lr_all=0.01, reg_all=0.4)
algo.fit(X_train)
predictions = algo.test(X_test)
mae(predictions, verbose=True)
mse(predictions, verbose=True)
rmse(predictions, verbose=True)

MAE:  0.7346
MSE: 0.7570
RMSE: 0.8700


0.8700293716854168

# SVD while applying sentiment analysis results

In [194]:
columns = ["feel_true_rating", "look_true_rating", "smell_true_rating", "taste_true_rating", "overall", "score", "aspect", "date", "text", "true_rating"]
columns2 = ["feel_true_rating", "look_true_rating", "smell_true_rating", "taste_true_rating", "overall", "aspect", "date", "text", "true_rating"]

In [195]:
train_df = pd.read_csv("../data/train_wide.csv")
test_df = pd.read_csv("../data/test_wide.csv")

In [196]:
model_evaluation_df = pd.read_csv("../data/model_evaluation_results.csv")
model_evaluation_df = model_evaluation_df.drop_duplicates(subset=['beer_id', 'username'])

In [197]:
# Perform a left join of train_df with model_evaluation_df on 'beer_id' and 'username'
train_merged = pd.merge(train_df, model_evaluation_df, on=['beer_id', 'username'], how='left')

# Perform a left join of test_df with model_evaluation_df on 'beer_id' and 'username'
test_merged = pd.merge(test_df, model_evaluation_df, on=['beer_id', 'username'], how='left')

In [198]:
# create new dataframe dropping columns from columns
train_merged1 = train_merged.drop(columns=columns)
test_merged1 = test_merged.drop(columns=columns)

In [199]:
# compute score for train_merged1 by computing the average of the predicted ratings
train_merged1['score'] = train_merged1[['feel_predicted_rating', 'look_predicted_rating', 'smell_predicted_rating', 'taste_predicted_rating', 'predicted_rating']].mean(axis=1)

In [200]:
test_merged1['score'] = test_merged1[['feel_predicted_rating', 'look_predicted_rating', 'smell_predicted_rating', 'taste_predicted_rating', 'predicted_rating']].mean(axis=1)

In [201]:
X_train1 = Dataset.load_from_df(train_merged1[["username", "beer_id", "score"]], reader=Reader(rating_scale=(0, 5)))
X_test1 = Dataset.load_from_df(test_merged1[["username", "beer_id", "score"]], reader=Reader(rating_scale=(0, 5)))
trainset1 = X_train1.build_full_trainset()
testset1 = X_test1.build_full_trainset().build_testset()

In [202]:
# why are metrics fluctuating?
# Perform SVD algorithm
algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
algo.fit(trainset1)
predictions1 = algo.test(testset1)
mae(predictions1, verbose=True)
mse(predictions1, verbose=True)
rmse(predictions1, verbose=True)

MAE:  0.4967
MSE: 0.3508
RMSE: 0.5922


0.5922449004932442

In [203]:
# Perform GridSearch on the SVD algorithm
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [10, 20], 'lr_all': [0.005, 0.01], 'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae', 'mse'], cv=5)
gs.fit(X_train1)
print(gs.best_score['rmse'])
print(gs.best_score['mae'])
print(gs.best_score['mse'])
print(gs.best_params['rmse'])

0.5390181478192837
0.4395770743682167
0.29056630444390463
{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.4}


In [204]:
algo = SVD(n_epochs=20, lr_all=0.01, reg_all=0.4)
algo.fit(trainset1)
predictions1 = algo.test(testset1)
mae(predictions1, verbose=True)
mse(predictions1, verbose=True)
rmse(predictions1, verbose=True)

MAE:  0.4247
MSE: 0.2783
RMSE: 0.5276


0.5275729470256693

# Model where average takes original score

In [205]:
train_merged2 = train_merged.drop(columns=columns2)
test_merged2 = test_merged.drop(columns=columns2)

In [206]:
# set score to averge of the predicted ratings and the score itself
train_merged2['score'] = train_merged2[['feel_predicted_rating', 'look_predicted_rating', 'smell_predicted_rating', 'taste_predicted_rating', 'predicted_rating', 'score']].mean(axis=1)
test_merged2['score'] = test_merged2[['feel_predicted_rating', 'look_predicted_rating', 'smell_predicted_rating', 'taste_predicted_rating', 'predicted_rating', 'score']].mean(axis=1)

In [207]:
X_train2 = Dataset.load_from_df(train_merged1[["username", "beer_id", "score"]], reader=Reader(rating_scale=(0, 5)))
X_test2 = Dataset.load_from_df(test_merged1[["username", "beer_id", "score"]], reader=Reader(rating_scale=(0, 5)))
trainset2 = X_train2.build_full_trainset()
testset2 = X_test2.build_full_trainset().build_testset()

In [208]:
# why are metrics fluctuating?
# Perform SVD algorithm
algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
algo.fit(trainset2)
predictions2 = algo.test(testset2)
mae(predictions2, verbose=True)
mse(predictions2, verbose=True)
rmse(predictions2, verbose=True)

MAE:  0.4970
MSE: 0.3516
RMSE: 0.5929


0.5929178576988832

In [209]:
# Perform GridSearch on the SVD algorithm
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [10, 20], 'lr_all': [0.005, 0.01], 'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae', 'mse'], cv=5)
gs.fit(X_train2)
print(gs.best_score['rmse'])
print(gs.best_score['mae'])
print(gs.best_score['mse'])
print(gs.best_params['rmse'])

0.5390246978462327
0.4394882958713289
0.2905716897654559
{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.4}


In [210]:
algo = SVD(n_epochs=20, lr_all=0.01, reg_all=0.4)
algo.fit(trainset2)
predictions2 = algo.test(testset2)
mae(predictions2, verbose=True)
mse(predictions2, verbose=True)
rmse(predictions2, verbose=True)

MAE:  0.4251
MSE: 0.2784
RMSE: 0.5277


0.5276769065166959