In [47]:
from surprise import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split, GridSearchCV 
from surprise.model_selection.validation import cross_validate

import pandas as pd
import numpy as np

In [48]:
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')

In [49]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [50]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [51]:
movies_with_ratings.head(2)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982703.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847434962.0


In [52]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [53]:
dataset.head(2)

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),4.0
1,5.0,Toy Story (1995),4.0


In [54]:
ratings.rating.min()

0.5

In [55]:
ratings.rating.max()

5.0

In [56]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [None]:
benchmark = []

# выберем с помощью cross_validate алгоримтм с наименьшей ошибкой
for alg in [KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore()]:
    results = cross_validate(alg, data, measures=['RMSE'], cv=5, verbose=False)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(alg).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

In [58]:
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
KNNBaseline,0.873837,0.706557,3.974364
KNNWithZScore,0.89511,0.421098,3.495849
KNNWithMeans,0.896113,0.31249,3.121951
KNNBasic,0.948902,0.25125,2.91041


In [None]:
trainset, testset = train_test_split(data, test_size=.15)

In [None]:
alg = KNNBaseline()
predictions = alg.fit(trainset).test(testset)

In [59]:
# RMSE для KNNBaseline на тестовой выборке
accuracy.rmse(predictions)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8640


0.863977756092938

In [None]:
# подберем оптимальные гиперпараметры с помощью GridSearchCV
param_grid = {
            'k': np.arange(10, 65, 5),
            'sim_options': {'name': ['pearson_baseline', 'cosine'], 'user_based': [True, False]},  
            }
grid_search = GridSearchCV(KNNBaseline, param_grid, measures=['rmse'], cv=5, joblib_verbose=5)
grid_search.fit(data)

In [61]:
# Оптимальные гиперпараметры (среди рассмотренных)
grid_search.best_params['rmse']

{'k': 55, 'sim_options': {'name': 'pearson_baseline', 'user_based': False}}

In [62]:
best_alg = KNNBaseline(**grid_search.best_params['rmse'])
best_alg.fit(trainset)
predictions_test = best_alg.test(testset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


In [63]:
# RMSE для KNNBaseline с оптимальными гиперпараметрами на тестовой выборке
accuracy.rmse(predictions_test)

RMSE: 0.8436


0.8436227984544176

In [87]:
def rate_predictions(uid, df, alg):
    """
    Функция, возвращающая DataFrame с оценками 
    для фильмов, которые еще не смотрел пользователь uid.
    """
    predictions = []
    film_ids = set(df['iid'].unique())
    watched_film_ids = set(df[df['uid']==uid]['iid'].unique())
    not_watched_film_ids = film_ids - watched_film_ids
    for film_id in not_watched_film_ids:
        predictions.append(alg.predict(uid, film_id))
    return pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])[['iid', 'est']]

In [94]:
result = rate_predictions(100.0, dataset, best_alg)

In [95]:
result.head()

Unnamed: 0,iid,est
0,Hot Shots! Part Deux (1993),3.096404
1,My Science Project (1985),3.986646
2,"Fury, The (1978)",3.434263
3,Steve Jobs (2015),3.662897
4,Peeping Tom (1960),3.208198


In [96]:
# top-10 фильмов с наивысшими предсказанными оценками.
result.nlargest(10, 'est')

Unnamed: 0,iid,est
3397,Children of the Corn IV: The Gathering (1996),4.945497
7039,"Good Year, A (2006)",4.736215
9065,She's So Lovely (1997),4.679356
7526,Destiny Turns on the Radio (1995),4.597138
8324,"Dark Knight, The (2008)",4.521695
2106,"Shawshank Redemption, The (1994)",4.512345
4332,It (2017),4.508193
5342,Enough (2002),4.500053
4214,Kinky Boots (2005),4.429253
8918,Raiders of the Lost Ark: The Adaptation (1989),4.427526
