In [1]:
# install surprise for this notebook

!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/f5/da/b5700d96495fb4f092be497f02492768a3d96a3f4fa2ae7dea46d4081cfa/scikit-surprise-1.1.0.tar.gz (6.4MB)
[K     |████████████████████████████████| 6.5MB 2.5MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.0-cp36-cp36m-linux_x86_64.whl size=1678569 sha256=86e0a2c85888acb7a9e501b3daed51c71a3c4d4f4e0ab0ddc94d2a651052b81b
  Stored in directory: /root/.cache/pip/wheels/cc/fa/8c/16c93fccce688ae1bde7d979ff102f7bee980d9cfeb8641bcf
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.0 surprise-0.1


In [0]:
# imports

from surprise.model_selection import cross_validate
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
import pandas as pd
import json

from surprise import \
SVD, \
SVDpp, \
NMF, \
SlopeOne, \
KNNBasic, \
KNNBaseline, \
KNNWithMeans, \
KNNWithZScore, \
CoClustering

In [0]:
# size of the raw dataset

raw_data_size = '1M'
# raw_data_size = '27M'

In [4]:
# load training dataset

PATH_DIR = '/content/drive/My Drive/'

df_train = pd.read_csv(PATH_DIR + 'train_' + raw_data_size + '.csv')[['userId', 'movieId', 'rating']]
df_train

Unnamed: 0,userId,movieId,rating
0,1,2692,4
1,1,1566,4
2,1,2762,4
3,1,588,4
4,1,938,4
...,...,...,...
800188,6040,1966,4
800189,6040,2973,4
800190,6040,17,3
800191,6040,3388,1


In [0]:
reader = Reader(rating_scale=(1, 5))

In [0]:
trainset = Dataset.load_from_df(df_train, reader).build_full_trainset()

In [0]:
# choose algorithm for training

training_algo = 'SVD'
# training_algo = 'NMF'
# training_algo = 'SlopeOne'
# training_algo = 'KNNBasic'
# training_algo = 'KNNBaseline'
# training_algo = 'KNNWithMeans'
# training_algo = 'KNNWithZScore'
# training_algo = 'CoClustering'

In [8]:
# train the model with the chosen algorithm

algo = SVD()
# algo = NMF()
# algo = SlopeOne()
# algo = KNNBasic()
# algo = KNNBaseline()
# algo = KNNWithMeans()
# algo = KNNWithZScore()
# algo = CoClustering()

algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fdec19983c8>

In [9]:
# load testing dataset

df_test = pd.read_csv((PATH_DIR + 'test_' + raw_data_size + '.csv'))[['userId', 'movieId', 'rating']]
df_test

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,3408,4
2,1,919,4
3,1,2797,4
4,1,720,3
...,...,...,...
200011,6040,2791,4
200012,6040,3751,4
200013,6040,541,4
200014,6040,1077,5


In [10]:
# predict ratings for testing dataset using the trained model 

predicted_attributes = []

def get_predicted_ratings(x):
  prediction_list = []

  prediction = algo.predict(x[0], x[1])

  prediction_list.append(int(prediction[0]))
  prediction_list.append(int(prediction[1]))
  prediction_list.append(prediction[3])
  predicted_attributes.append(prediction_list)

df_test.apply(get_predicted_ratings, axis=1)

0         None
1         None
2         None
3         None
4         None
          ... 
200011    None
200012    None
200013    None
200014    None
200015    None
Length: 200016, dtype: object

In [11]:
# save predicted output

df_result = pd.DataFrame.from_records(predicted_attributes, columns=['userId', 'movieId', 'predicted_rating'])
df_result['true_rating'] = df_test['rating']
df_result

Unnamed: 0,userId,movieId,predicted_rating,true_rating
0,1,1193,4.410473,5
1,1,3408,4.285585,4
2,1,919,4.528286,4
3,1,2797,4.229386,4
4,1,720,4.238047,3
...,...,...,...,...
200011,6040,2791,2.894329,4
200012,6040,3751,3.179212,4
200013,6040,541,4.473313,4
200014,6040,1077,3.846907,5


In [0]:
df_result.to_csv(PATH_DIR + training_algo + '_' + raw_data_size + '.csv', index=False)