In [1]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the ratings table from `ratings_small.csv`.
# NOTE(review): the original comment called this "movielens-1M", but the 20%
# test split further down has 20001 rows (~100k ratings in total) -- confirm
# which dataset this file actually is.
import os

# The path can be overridden with the RATINGS_CSV environment variable so the
# notebook also runs on machines other than the author's; the default is the
# original hard-coded location.
RATINGS_CSV = os.environ.get(
    'RATINGS_CSV',
    'C:/Users/prtyagi/Desktop/New folder/archive1/ratings_small.csv',
)
data = pd.read_csv(RATINGS_CSV)

In [4]:
# Randomly split the ratings: 80% for training, 20% held out for testing.
# NOTE: this import rebinds `train_test_split`, shadowing the function of the
# same name imported from surprise.model_selection at the top of the notebook.
from sklearn.model_selection import train_test_split

# A fixed seed keeps the split -- and every metric computed from it --
# reproducible across kernel restarts.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe, rating_scale=(0, 5)):
    """Turn pandas train/test rating frames into Surprise train/test sets.

    Parameters
    ----------
    training_dataframe, testing_dataframe : pandas.DataFrame
        Frames containing at least the columns 'userId', 'movieId' and
        'rating'; only these three columns are handed to Surprise.
    rating_scale : tuple, optional
        (min, max) rating bounds passed to ``Reader``. Defaults to ``(0, 5)``,
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(trainset, testset)``: the result of ``construct_trainset`` on the
        training ratings and of ``construct_testset`` on the testing ratings,
        ready to be passed to ``algo.fit`` and ``algo.test`` respectively.

    Raises
    ------
    KeyError
        If either frame lacks one of the three required columns.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=rating_scale)

    # Keep the intermediate Dataset objects under their own names instead of
    # rebinding `trainset`/`testset` to two different kinds of object.
    train_dataset = Dataset.load_from_df(training_dataframe[rating_columns], reader)
    test_dataset = Dataset.load_from_df(testing_dataframe[rating_columns], reader)

    trainset = train_dataset.construct_trainset(train_dataset.raw_ratings)
    testset = test_dataset.construct_testset(test_dataset.raw_ratings)
    return trainset, testset

In [6]:
# Convert the pandas split into the objects Surprise trains and tests on.
trainset, testset = convert_traintest_dataframe_forsurprise(
    training_dataframe=train_data,
    testing_dataframe=test_data,
)

In [7]:
def recommendation(algo, trainset, testset):
  """Fit `algo` on `trainset` and score its predictions on `testset`.

  The algorithm is fitted in place, then asked to predict every entry of
  `testset`. RMSE and MAE are computed with `accuracy.rmse` / `accuracy.mae`,
  which also print the metric (see the recorded cell outputs).

  Returns a tuple `(rmse, mae, predictions)` where `predictions` is whatever
  `algo.test(testset)` returned.
  """
  algo.fit(trainset)
  predictions = algo.test(testset)

  # RMSE is evaluated before MAE, so the printed lines keep their order.
  return accuracy.rmse(predictions), accuracy.mae(predictions), predictions

#### Experimenting with BaselineOnly bias estimation (ALS vs. SGD)

In [8]:
print('Using ALS')
# Baseline-only model whose biases are estimated with ALS; the settings are
# forwarded to the algorithm through `bsl_options`.
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
algo = BaselineOnly(bsl_options=bsl_options)
test_rmse, test_mae, test_pred = recommendation(
    algo=algo, trainset=trainset, testset=testset
)

Using ALS
Estimating biases using als...
RMSE: 0.8820
MAE:  0.6803


In [9]:
print('Using SGD')
# Same baseline-only model, this time with biases estimated by SGD using a
# very small learning rate.
bsl_options = dict(method='sgd', learning_rate=0.00005)
algo = BaselineOnly(bsl_options=bsl_options)
test_rmse, test_mae, test_pred = recommendation(
    algo=algo, trainset=trainset, testset=testset
)

Using SGD
Estimating biases using sgd...
RMSE: 1.0067
MAE:  0.8082


##### Calculating predictions for the top methods:

In [10]:
# KNNBaseline, constructed with its default settings
algo = KNNBaseline()
test_knn_rmse, test_knn_mae, test_knn_pred = recommendation(
    algo=algo, trainset=trainset, testset=testset
)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8920
MAE:  0.6820


In [11]:
# SlopeOne, constructed with its default settings
algo = SlopeOne()
test_slopeone_rmse, test_slopeone_mae, test_slopeone_pred = recommendation(
    algo=algo, trainset=trainset, testset=testset
)

RMSE: 0.9259
MAE:  0.7080


In [12]:
# SVD
# SVD training is stochastic; pinning `random_state` makes the reported
# RMSE/MAE and the stored predictions repeatable from run to run.
algo = SVD(random_state=42)
test_svd_rmse, test_svd_mae, test_svd_pred = recommendation(algo, trainset, testset)

RMSE: 0.8892
MAE:  0.6853


In [13]:
# SVDpp
# SVD++ training is stochastic; pinning `random_state` makes the reported
# RMSE/MAE and the stored predictions repeatable from run to run.
algo = SVDpp(random_state=42)
test_svdpp_rmse, test_svdpp_mae, test_svdpp_pred = recommendation(algo, trainset, testset)

RMSE: 0.8794
MAE:  0.6737


In [14]:
# BaselineOnly, constructed with its default settings
algo = BaselineOnly()
test_base_rmse, test_base_mae, test_base_pred = recommendation(
    algo=algo, trainset=trainset, testset=testset
)

Estimating biases using als...
RMSE: 0.8880
MAE:  0.6862


In [15]:
# Empty frame with one column per model estimate, next to the user id, item id
# and the original rating.
pred_columns = [
    'uid',
    'iid',
    'og_rating',
    'svd_rating',
    'knn_rating',
    'svdpp_rating',
    'slopeone_rating',
    'baseline_rating',
]
test_pred_df = pd.DataFrame(columns=pred_columns)

In [16]:
# Number of test-set predictions, taken from the BaselineOnly run; it is used
# as the row count when the per-model predictions are tabulated.
num_test = len(test_base_pred)
print(num_test)

20001


##### Storing testing set predictions:

In [17]:
# Gather every model's estimate for each test rating into one table.
# Identity (uid, iid) and the true rating are read from the SVD predictions;
# the i-th prediction of every model is taken to describe the same test row
# (all prediction lists were produced from the same `testset`).
#
# The frame is built in a single call rather than by concatenating one-row
# frames inside a loop: that was quadratic in the number of predictions,
# appended a second copy of every row whenever the cell was re-run, and relied
# on concatenating with an empty frame to settle the column dtypes.
#
# Rows are emitted from the last prediction to the first because the original
# loop prepended each new row; this keeps the row order unchanged.
pred_rows = [
  (
    test_svd_pred[i].uid,
    test_svd_pred[i].iid,
    test_svd_pred[i].r_ui,
    test_svd_pred[i].est,
    test_knn_pred[i].est,
    test_svdpp_pred[i].est,
    test_slopeone_pred[i].est,
    test_base_pred[i].est,
  )
  for i in reversed(range(num_test))
]
test_pred_df = pd.DataFrame(
  pred_rows,
  columns=['uid', 'iid', 'og_rating', 'svd_rating', 'knn_rating', 'svdpp_rating', 'slopeone_rating', 'baseline_rating'],
)

In [18]:
# Show the combined table: one row per test rating, one column per model estimate.
test_pred_df

Unnamed: 0,uid,iid,og_rating,svd_rating,knn_rating,svdpp_rating,slopeone_rating,baseline_rating
0,527,1704,4.0,3.640624,3.416435,3.391777,3.574057,3.604491
1,452,2120,1.0,2.895874,3.122418,3.184163,2.911402,3.113861
2,282,5445,3.0,3.502742,3.510027,3.751624,3.610374,3.675542
3,187,7153,1.0,3.689533,3.913941,2.728789,4.323326,4.157547
4,547,26784,3.0,3.293416,3.288193,3.190122,3.544142,3.288193
...,...,...,...,...,...,...,...,...
19996,652,36527,4.0,4.202399,4.176398,4.450678,4.205742,4.268476
19997,78,2011,4.5,4.088458,4.291281,4.423929,4.033263,4.187499
19998,306,804,3.0,3.005642,2.758387,3.018154,3.075277,3.170685
19999,603,1073,3.0,4.010676,3.714640,4.034260,4.159339,3.994523
