In [None]:
pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3095437 sha256=60bee322c1062156e84347e766cdb1141f3095a5f6b0a6b80e5feb01f534ad53
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [None]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
from surprise import Reader
from surprise import Dataset, accuracy
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, BaselineOnly
from surprise import SVDpp, SlopeOne, NMF, CoClustering
from surprise.accuracy import rmse
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [None]:
path_movies = '/content/gdrive/MyDrive/DataScience/Homeworks/7th/ml-latest-small/movies.csv'
path_ratings = '/content/gdrive/MyDrive/DataScience/Homeworks/7th/ml-latest-small/ratings.csv'
path_tags = '/content/gdrive/MyDrive/DataScience/Homeworks/7th/ml-latest-small/tags.csv'

### data for ratings.csv

In [None]:
data = pd.read_csv(path_ratings)
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
data.drop(['timestamp'], axis=1, inplace=True)
data.columns=['userID', 'movie', 'rating']
data.head()

Unnamed: 0,userID,movie,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
data.shape

(100836, 3)

In [None]:
reader = Reader(rating_scale=(1,5))
rating_data = Dataset.load_from_df(data[["userID", "movie", "rating"]], reader)

trainset = rating_data.build_full_trainset()
testset = trainset.build_anti_testset()

algo = SVD()
predictions = algo.fit(trainset).test(testset)

accuracy.rmse(predictions)

RMSE: 0.4865


0.4864659308265093

In [38]:
benchmark = []

algorithms = [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), 
              KNNBaseline(), KNNBasic(), KNNWithMeans(), BaselineOnly(), CoClustering()]

for algorithm in algorithms:
  results = cross_validate(algorithm, rating_data, measures=['RMSE'], cv=3, verbose=False)

  tmp = pd.DataFrame.from_dict(results).mean(axis=0)
  algorithm_name = str(algorithm).split(' ')[0].split('.')[-1]
  tmp = pd.concat([tmp, pd.Series([algorithm_name], index=['Algorithm'])])

  benchmark.append(tmp)

surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.869475,61.402436,17.222875
BaselineOnly,0.87703,0.307254,0.130154
SVD,0.881136,1.777138,0.224325
KNNBaseline,0.882845,0.391657,2.612969
KNNWithMeans,0.904341,0.115519,2.016331
SlopeOne,0.909204,3.637396,10.563504
NMF,0.936449,2.184707,0.200948
CoClustering,0.950917,2.685209,0.199202
KNNBasic,0.95665,0.100146,2.449051
NormalPredictor,1.420285,0.147858,0.302836


In [None]:
#SVDpp test

param_grid = {'n_epochs': [5, 10],
              'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(rating_data)

training_parameters = gs.best_params["rmse"]

print("BEST RMSE: \t", gs.best_score["rmse"])
print("BEST MAE: \t", gs.best_score["mae"])
print("BEST params: \t", gs.best_params["rmse"])

BEST RMSE: 	 0.8940150076474125
BEST MAE: 	 0.6920174583332498
BEST params: 	 {'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [34]:
svd = SVD(n_factors=10, reg_all=0.05)
svd.fit(trainset)
predictions = svd.test(testset)
predictions[0:3]

[Prediction(uid=1, iid=318, r_ui=3.501556983616962, est=5, details={'was_impossible': False}),
 Prediction(uid=1, iid=1704, r_ui=3.501556983616962, est=4.818864025515979, details={'was_impossible': False}),
 Prediction(uid=1, iid=6874, r_ui=3.501556983616962, est=4.8081923579866865, details={'was_impossible': False})]