In [271]:
import surprise
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.accuracy import mae, rmse, mse
from surprise.model_selection import GridSearchCV
from scipy.stats import pearsonr
import numpy

In [272]:
import random
# set random seed
my_seed = 0
random.seed(my_seed)

In [273]:
from collections import defaultdict

def get_top_k(predictions, k=10):
    '''Return the top-K recommended items for each user from predictions.'''
    top_k = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_k[uid].append((iid, est))

    # Sort and pick top k
    for uid, user_ratings in top_k.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_k[uid] = [iid for (iid, _) in user_ratings[:k]]

    return top_k

In [274]:
def get_true_positives(testset, threshold=4.0):
    '''Return items considered relevant per user from the testset.'''
    relevant = defaultdict(set)
    for uid, iid, true_r in testset:
        if true_r >= threshold:
            relevant[uid].add(iid)
    return relevant

In [275]:
def precision_at_k(top_k_preds, relevant_items, k):
    precisions = []
    for uid in top_k_preds:
        if uid in relevant_items:
            hits = len(set(top_k_preds[uid]) & relevant_items[uid])
            precisions.append(hits / k)

    if len(precisions) == 0:
        print("Warning: No overlap between predicted users and relevant users.")
        return 0.0

    return sum(precisions) / len(precisions)

def hit_rate_at_k(top_k_preds, relevant_items):
    hits = 0
    total = 0
    for uid in relevant_items:
        total += 1
        if set(top_k_preds[uid]) & relevant_items[uid]:
            hits += 1

    if hits == 0:
        print("Warning: No overlap between predicted users and relevant users.")
        return 0.0
    return hits / total

# Baseline SVD model

In [276]:
df_train = pd.read_csv("../data/rec_train_preprocessed_binned.csv")
df_test = pd.read_csv("../data/rec_test_preprocessed_binned.csv")
col = ["date", "text", "look", "smell", "taste", "feel", "overall", "has_smell", "has_taste", "has_look", "has_feel", "mentioned_aspects", "sentiment", "score_bin"]
df_train = df_train.drop(col, axis=1)
df_test = df_test.drop(col, axis=1)
df_train

Unnamed: 0,beer_id,username,score
0,62040,GregSVT,4.32
1,292393,NeroFiddled,4.14
2,55291,matjack85,1.90
3,223330,BlurryVisi0n,4.11
4,222637,Brutaltruth,3.34
...,...,...,...
15995,612,number1bum,3.68
15996,5,damndirtyape,3.78
15997,25608,CloudStrife,3.47
15998,30517,Thorpe429,4.37


In [277]:
X_train = Dataset.load_from_df(df_train, reader=Reader(rating_scale=(0, 5)))
trainset = X_train.build_full_trainset()
X_test = Dataset.load_from_df(df_test, reader=Reader(rating_scale=(0, 5)))
testset = X_test.build_full_trainset().build_testset()

In [278]:
algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
algo.fit(trainset)
predictions = algo.test(testset)
mae(predictions, verbose=True)
mse(predictions, verbose=True)
rmse(predictions, verbose=True)

MAE:  0.6974
MSE: 0.6957
RMSE: 0.8341


0.8340850307794914

In [279]:
K = 10
top_k_preds = get_top_k(predictions, k=K)
relevant_items = get_true_positives(testset, threshold=4.0)

prec = precision_at_k(top_k_preds, relevant_items, k=K)
hit = hit_rate_at_k(top_k_preds, relevant_items)

print(f'Precision@{K}: {prec:.4f}')
print(f'Hit Rate@{K}: {hit:.4f}')

Precision@10: 0.1301
Hit Rate@10: 1.0000


* For every user only 13% were actually relevant
* Every user got at least one relevant recommendation

# SVD while applying sentiment analysis results

In [280]:
columns = ["feel_true_rating", "look_true_rating", "smell_true_rating", "taste_true_rating", "overall", "score", "date", "text"]

In [281]:
train_df = pd.read_csv("../data/RecSys_AspectSentiment_train.csv")
test_df = pd.read_csv("../data/RecSys_AspectSentiment_test.csv")

In [282]:
# create new dataframe dropping columns from columns
train_df1 = train_df.drop(columns=columns)
test_df1 = test_df.drop(columns=columns)

In [283]:
# compute score for train_merged1 by computing the average of the predicted ratings
train_df1['score'] = train_df1[['feel_predicted_rating', 'look_predicted_rating', 'smell_predicted_rating', 'taste_predicted_rating']].mean(axis=1)

In [284]:
test_df1['score'] = test_df1[['feel_predicted_rating', 'look_predicted_rating', 'smell_predicted_rating', 'taste_predicted_rating']].mean(axis=1)

In [285]:
X_train1 = Dataset.load_from_df(train_df1[["username", "beer_id", "score"]], reader=Reader(rating_scale=(0, 5)))
X_test1 = Dataset.load_from_df(test_df1[["username", "beer_id", "score"]], reader=Reader(rating_scale=(0, 5)))
trainset1 = X_train1.build_full_trainset()
testset1 = X_test1.build_full_trainset().build_testset()

In [286]:
# why are metrics fluctuating?
# Perform SVD algorithm
algo = SVD(n_epochs=10, lr_all=0.005, reg_all=0.4)
algo.fit(trainset1)
predictions1 = algo.test(testset1)
mae(predictions1, verbose=True)
mse(predictions1, verbose=True)
rmse(predictions1, verbose=True)

MAE:  0.4443
MSE: 0.2818
RMSE: 0.5308


0.5308068825457658

In [287]:
K = 10
top_k_preds = get_top_k(predictions1, k=K)
relevant_items = get_true_positives(testset1, threshold=4.0)

prec = precision_at_k(top_k_preds, relevant_items, k=K)
hit = hit_rate_at_k(top_k_preds, relevant_items)

print(f'Precision@{K}: {prec:.4f}')
print(f'Hit Rate@{K}: {hit:.4f}')

Precision@10: 0.1348
Hit Rate@10: 1.0000


In [288]:
# Perform GridSearch on the SVD algorithm
from surprise.model_selection import GridSearchCV
param_grid = {'n_epochs': [10, 20], 'lr_all': [0.005, 0.01], 'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae', 'mse'], cv=5)
gs.fit(X_train1)
print(gs.best_params['rmse'])

{'n_epochs': 20, 'lr_all': 0.01, 'reg_all': 0.4}


In [289]:
algo = SVD(n_epochs=20, lr_all=0.01, reg_all=0.4)
algo.fit(trainset1)
predictions1 = algo.test(testset1)
mae(predictions1, verbose=True)
mse(predictions1, verbose=True)
rmse(predictions1, verbose=True)

MAE:  0.3821
MSE: 0.2265
RMSE: 0.4759


0.47591290445964357

In [290]:
K = 10
top_k_preds = get_top_k(predictions1, k=K)
relevant_items = get_true_positives(testset1, threshold=4.0)

prec = precision_at_k(top_k_preds, relevant_items, k=K)
hit = hit_rate_at_k(top_k_preds, relevant_items)

print(f'Precision@{K}: {prec:.4f}')
print(f'Hit Rate@{K}: {hit:.4f}')

Precision@10: 0.1352
Hit Rate@10: 1.0000
