In [40]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import io, pickle, zipfile
from zipfile import ZipFile

from sklearn.metrics import mean_squared_error

from surprise import KNNWithMeans, KNNBasic, BaselineOnly
from surprise import Dataset, SVD
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV


import warnings
warnings.filterwarnings("ignore")

In [2]:
# посмотрим какие файлы есть в архиве 
with ZipFile('ml-latest-small.zip', 'r') as myzip:
    for item in myzip.infolist():
        print(f'File Name: {item.filename} ')

File Name: ml-latest-small/ 
File Name: ml-latest-small/links.csv 
File Name: ml-latest-small/tags.csv 
File Name: ml-latest-small/ratings.csv 
File Name: ml-latest-small/README.txt 
File Name: ml-latest-small/movies.csv 


In [3]:
# извлечение из архива
with ZipFile('ml-latest-small.zip', 'r') as myzip:
    myzip.extractall()

movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [4]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [6]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
movies_with_ratings = movies.merge(ratings, on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483


In [8]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [9]:
dataset

Unnamed: 0,uid,iid,rating
0,1,Toy Story (1995),4.0
1,5,Toy Story (1995),4.0
2,7,Toy Story (1995),4.5
3,15,Toy Story (1995),2.5
4,17,Toy Story (1995),4.5
...,...,...,...
100831,184,Black Butler: Book of the Atlantic (2017),4.0
100832,184,No Game No Life: Zero (2017),3.5
100833,184,Flint (2017),3.5
100834,184,Bungo Stray Dogs: Dead Apple (2018),3.5


In [10]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)
data

<surprise.dataset.DatasetAutoFolds at 0x14c2836e040>

In [11]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=1)

In [41]:
sim_options = {
    "name": "cosine",
    "user_based": True,  # compute  similarities between items
}
algo_KNNB = KNNBasic(sim_options=sim_options)

cross_validate(algo_KNNB, data, measures=['RMSE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9727  0.9786  0.9704  0.9767  0.9692  0.9735  0.0036  
Fit time          0.66    0.67    0.66    0.66    0.66    0.66    0.00    
Test time         2.06    2.04    2.04    2.04    2.04    2.04    0.01    


{'test_rmse': array([0.97271634, 0.9786495 , 0.97043392, 0.97668311, 0.96922285]),
 'fit_time': (0.6587879657745361,
  0.6677863597869873,
  0.6647875308990479,
  0.6637890338897705,
  0.6617879867553711),
 'test_time': (2.0610361099243164,
  2.03654146194458,
  2.0434000492095947,
  2.0416970252990723,
  2.038269519805908)}

In [42]:
bsl_options = {
    "method": "sgd",
    "learning_rate": 0.00005,
}
algo_BLO = BaselineOnly(bsl_options=bsl_options)
cross_validate(algo_BLO, data, measures=['RMSE'], cv=5, verbose=True)

Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9866  0.9838  0.9846  0.9903  0.9911  0.9873  0.0030  
Fit time          0.67    0.69    0.68    0.66    0.69    0.68    0.01    
Test time         0.15    0.34    0.15    0.15    0.34    0.23    0.10    


{'test_rmse': array([0.98664362, 0.98383974, 0.98455511, 0.99034015, 0.99112438]),
 'fit_time': (0.6703591346740723,
  0.6872422695159912,
  0.67917799949646,
  0.6588068008422852,
  0.6938068866729736),
 'test_time': (0.14695262908935547,
  0.34289002418518066,
  0.14695286750793457,
  0.1471874713897705,
  0.3449263572692871)}

In [43]:
algo_KNNWM = KNNWithMeans(k=50, sim_options={
    'name': 'cosine',
    'user_based': True  # вычисление сходства между пользователями
})
algo_KNNWM.fit(trainset)

test_pred = algo_KNNWM.test(testset)
accuracy.rmse(test_pred, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8980


0.8980170879371258

In [44]:
# попробуем подобрать лучшие параметры и затем применить их
param_grid = {
    'name': ['cosine'], # способы оценки похожести (в GridSearch)    
    'user_based': [True]
}

gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)
gs.fit(data)
     
# результат
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
0.8960924599205426
{'name': 'cosine', 'user_based': True}


In [45]:
# обучим с лучшими параметрами
algo_KNNWM_GS = gs.best_estimator['rmse']
algo_KNNWM_GS.fit(trainset)

# получим предикт и посмотрим метрику
predictions = algo_KNNWM_GS.test(testset)
accuracy.rmse(predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8940


0.8940113758272057

In [46]:
# получаем предикт
uid = 184  # id пользователя 
iid = 'Flint (2017)'  # название фильма

# получим предсказание на основе обученных данных
# r_ui = 3.5 - это фактический рейтинг. Сравним с предсказанием
pred = algo_KNNWM_GS.predict(uid, iid, r_ui=3.5, verbose=True)

user: 184        item: Flint (2017) r_ui = 3.50   est = 3.50   {'actual_k': 1, 'was_impossible': False}


In [47]:
# Применим алгоритм SVD, который часто называют лучшим
algo_SVD = SVD()

In [48]:
cross_validate(algo_SVD, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8792  0.8815  0.8644  0.8726  0.8759  0.8747  0.0060  
Fit time          1.49    1.51    1.51    1.54    1.62    1.53    0.04    
Test time         0.46    0.27    0.27    0.30    0.27    0.31    0.07    


{'test_rmse': array([0.87916336, 0.88147293, 0.86441341, 0.87259932, 0.87587964]),
 'fit_time': (1.494697093963623,
  1.5135149955749512,
  1.5075159072875977,
  1.5355093479156494,
  1.6154823303222656),
 'test_time': (0.4598526954650879,
  0.27091288566589355,
  0.269913911819458,
  0.29990434646606445,
  0.2719109058380127)}

##### На третьем фолде достигли метрики ниже 0,87, что и требовалось по заданию. Попробуем получить лучшую метрику

In [49]:
# используем процедуру К-кратной перекрестной проверки
# с 5 разделениями

kf = KFold(n_splits=5)
algo_SVD_KF = SVD()

for trainset, testset in kf.split(data):    
    algo_SVD_KF.fit(trainset)
    predictions = algo_SVD_KF.test(testset)    
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.8730
RMSE: 0.8753
RMSE: 0.8777
RMSE: 0.8730
RMSE: 0.8714


##### Лучше не стало, попробуем подобрать параметры с помощью GridSearchCV

In [50]:
param_grid = {'n_epochs': [1, 20], 'lr_all': [0.001, 0.007], 'reg_all': [0.1, 0.8]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# комбинация параметров, обеспечивающих лучшее RMSE score
print(gs.best_params['rmse'])

0.8662354760471775
{'n_epochs': 20, 'lr_all': 0.007, 'reg_all': 0.1}


In [51]:
# применим лучшие параметры

algo_best = gs.best_estimator['rmse']
algo_best.fit(data.build_full_trainset())

predictions = algo_best.test(testset)    
accuracy.rmse(predictions, verbose=True)

RMSE: 0.7738


0.7738092108641198

##### Получили метрику 0,77. Это самое низкое значение