In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from surprise import Dataset, Reader, accuracy, SVD


In [2]:
ratings = pd.read_csv("ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
movies = pd.read_csv("movies.csv")
movies = movies.drop("genres", axis=1)
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [4]:
df = ratings.drop("timestamp", axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [5]:
# using S_train = 80%, S_dev = 10%, S_test = 10%
trainset, test_val = train_test_split(df, train_size=0.8)

print("Training set size: ", trainset.shape)

valset, testset = train_test_split(test_val, test_size=0.5)

print("Validation set size : ", valset.shape)
print("Test set size: ", testset.shape)

Training set size:  (80668, 3)
Validation set size :  (10084, 3)
Test set size:  (10084, 3)


In [6]:
reader = Reader(rating_scale = (1, 5))

train_data = Dataset.load_from_df(trainset, reader)
val_data = Dataset.load_from_df(valset, reader)
test_data = Dataset.load_from_df(testset, reader)

In [7]:
train_data = train_data.build_full_trainset()
val_data = val_data.build_full_trainset()
test_data = test_data.build_full_trainset()

In [8]:
val_data = val_data.build_testset()
test_data = test_data.build_testset()

In [9]:
list1 = []

# Training using 1 to 100 factors on S_train and checking RMSE values on S_dev to get optimal number of factors
for factors in range(1, 101):
    algo = SVD(n_factors=factors)
    algo.fit(train_data)
    predictions = algo.test(val_data)
    acc_values = accuracy.rmse(predictions, verbose=False)
    list1.append(acc_values)

print(list1)


[0.8533826273110863, 0.8522645895155568, 0.8541860568182594, 0.852673511614505, 0.8529242694294602, 0.853839694807917, 0.8529917230002703, 0.8541107167248102, 0.8550068062623731, 0.8544582997940177, 0.8544859107764579, 0.8556536124425389, 0.8523365646068605, 0.8541441395151004, 0.8522869557890147, 0.8532932762558627, 0.8530960465733397, 0.8543381684553676, 0.8541206132802449, 0.8541618199912884, 0.8550719924959068, 0.8540075063379995, 0.8518583442263608, 0.8541984677539193, 0.8560450954502263, 0.8511811657803823, 0.8540050035964792, 0.8514141595289232, 0.8553531679294425, 0.8539589962756501, 0.8540521424991019, 0.8531747255667345, 0.8537858513839326, 0.8551331892608574, 0.8541832486151045, 0.8543145804703612, 0.854595208089001, 0.8531147272253298, 0.8550175366992496, 0.8538465087548012, 0.8570628075320701, 0.8535057585564456, 0.8506587420109343, 0.856644740626378, 0.853011839102515, 0.8553938922134642, 0.8559036878173352, 0.8552697147631352, 0.852707668084492, 0.8527200443764593, 0.853

In [10]:
# Number of factors [optimal] (gives lowest RMSE on validation set S_dev)
opt_factors = list1.index(min(list1)) + 1
print(opt_factors)

43


In [11]:
# Now training on train_data using optimal number of factors
final_algo = SVD(n_factors=opt_factors)
final_algo.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e27e0b1be0>

In [12]:
# Finally, predicting and calculating accuracy on test_data (S_test)
pred_test = final_algo.test(test_data)
accuracy.rmse(pred_test)

RMSE: 0.8754


0.875350389884773

In [16]:
# All data
data = Dataset.load_from_df(df, reader)
data = data.build_full_trainset()

final_algo.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1e27e0b1be0>

In [17]:
from collections import defaultdict


def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [21]:
anti_testset = data.build_anti_testset()
predictions1 = algo.test(anti_testset)

top_n = get_top_n(predictions1, n=10)

# Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])


# print(top_n)

movie_recs = {}
for uid, user_ratings in top_n.items():
    movie_recs[uid] = [iid for (iid, _) in user_ratings]

print(movie_recs)

{1: [904, 930, 1250, 750, 4993, 4995, 1148, 1201, 1267, 1276], 2: [858, 912, 750, 898, 7153, 1213, 1193, 2571, 2959, 3681], 3: [1283, 260, 858, 318, 1228, 34, 1097, 750, 898, 1198], 4: [1673, 112552, 3328, 928, 2160, 4306, 318, 1387, 89904, 3037], 5: [7153, 1266, 58559, 60069, 904, 750, 3275, 1272, 1136, 2571], 6: [1283, 1250, 899, 246, 910, 1104, 912, 3328, 1204, 858], 7: [1197, 1080, 527, 1250, 1136, 318, 2542, 933, 1276, 1204], 8: [750, 1204, 2160, 1198, 2542, 5618, 1261, 260, 1208, 904], 9: [1197, 2571, 58559, 1196, 260, 318, 1221, 4226, 924, 111], 10: [1283, 899, 1222, 1233, 1225, 720, 1218, 1203, 6296, 4011], 11: [1198, 1258, 1204, 1197, 858, 260, 904, 1291, 1104, 1214], 12: [50, 260, 527, 1196, 1197, 1198, 1210, 1222, 1258, 1270], 13: [1252, 58559, 1197, 1225, 4878, 2329, 1089, 50, 1203, 2959], 14: [858, 750, 246, 260, 48516, 1197, 1252, 904, 1208, 1213], 15: [1221, 6711, 1204, 1276, 741, 1617, 1258, 1213, 928, 2160], 16: [1258, 1104, 1266, 3451, 1204, 1200, 2160, 246, 3147, 933