## Получаем данные

In [1]:
import pandas as pd

In [2]:
import numpy as np
from tqdm import tqdm_notebook

In [3]:
# Read the two CSV tables used throughout the notebook from the working
# directory: the movie catalogue and the user ratings.
df_movies = pd.read_csv('movies.csv')
df_ratings = pd.read_csv('ratings.csv')

In [4]:
# Attach the movie columns to every rating row by joining on the shared
# 'movieId' key (default inner join).
df = df_ratings.merge(df_movies, on='movieId')

In [5]:
# Drop the columns that are not needed further on.
# `del df[...]` mutated the frame in place and raised KeyError when the cell
# was executed a second time; drop() with errors='ignore' produces the same
# frame and keeps the cell safely re-runnable.
df = df.drop(columns=['timestamp', 'genres'], errors='ignore')

In [6]:
# Preview the first rows of the merged ratings/movies frame after the
# 'timestamp' and 'genres' columns were removed.
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


## User-based рекомендация

In [16]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

In [17]:
# Summary statistics of the raw ratings table. The min/max of the 'rating'
# column shown here are the bounds passed to Reader(rating_scale=...) below.
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [18]:
# Keep only the user id, item id and rating columns, in that order, as the
# input frame for Surprise.
df_for_surpise = df_ratings.loc[:, ['userId', 'movieId', 'rating']]

In [19]:
# Rename the three columns to the short user-id / item-id / rating names.
# NOTE(review): Surprise's Dataset.load_from_df is documented to read the
# columns by position (user, item, rating), so this rename is presumably
# cosmetic -- confirm against the Surprise docs.
df_for_surpise.columns = ['uid', 'iid', 'rating']

In [20]:
# Sanity check: the frame should now expose exactly uid, iid, rating.
df_for_surpise.head()

Unnamed: 0,uid,iid,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [21]:
# Rating bounds for Surprise; 0.5 and 5 match the min and max of the
# 'rating' column reported by df_ratings.describe() above.
reader = Reader(rating_scale=(0.5, 5))

In [22]:
# Wrap the (uid, iid, rating) frame into a Surprise Dataset, using the
# reader that carries the rating scale.
dataset = Dataset.load_from_df(df=df_for_surpise, reader=reader)

In [23]:
# Single 80/20 hold-out split. The split is seeded so that re-running the
# notebook yields the same partition.
# NOTE: the cross-validation loops further down rebind `trainset` and
# `testset` on every fold, so this hold-out pair is overwritten there.
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=42)

In [37]:
# User-based KNN with up to 40 neighbours and cosine similarity.
# The user/item switch is read from sim_options['user_based']; the extra
# top-level `user_based=True` keyword that used to be passed here was
# redundant with it and has been removed to avoid suggesting otherwise.
algo = KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True})

In [35]:
from surprise.model_selection import KFold

In [36]:
# 5-fold cross-validation iterator shared by all experiments below.
# A fixed random_state makes every kfold.split(dataset) call produce the same
# folds, so the algorithms are compared on identical splits and the reported
# RMSE values can be reproduced on re-run.
kfold = KFold(n_splits=5, random_state=42, shuffle=True)

In [38]:
# Cross-validate the user-based KNNBasic model created above and collect the
# RMSE of every fold in `scores`.
# kfold.split() is a generator, so tqdm cannot infer its length; passing
# total= gives a real progress bar instead of an indeterminate one.
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset), total=kfold.n_splits):
    algo.fit(trainset)                         # train on this fold's training part
    predictions = algo.test(testset)           # predict the held-out ratings
    scores.append(accuracy.rmse(predictions))  # rmse() prints and returns the fold RMSE

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9743
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9846
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9752
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9733
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9591



In [42]:
# Average RMSE over the folds of the user-based KNNBasic run.
np.mean(scores)

0.9733082765092804

### RMSE=0.97 хуже требуемого

## Item-based рекомендация

In [43]:
from surprise import KNNWithMeans

In [54]:
# Cross-validate item-based KNNWithMeans (k=40, pearson_baseline similarity)
# and collect the RMSE of every fold in `scores`.
# kfold.split() is a generator, so tqdm cannot infer its length; passing
# total= gives a real progress bar instead of an indeterminate one.
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset), total=kfold.n_splits):
    # A fresh model per fold; user_based=False switches to item-item similarity.
    algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.rmse(predictions))  # rmse() prints and returns the fold RMSE

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8828
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8826
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8957
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8725
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8746


In [55]:
# Average RMSE over the folds of the item-based pearson_baseline (k=40) run.
np.mean(scores)

0.8816260988954913

## RMSE=0.88
## Другие алгоритмы дали большую ошибку
Ниже для сравнения приведены другие варианты: k=50, а также меры сходства MSD и cosine.

In [60]:
# Same item-based KNNWithMeans / pearson_baseline experiment, but with k=50
# neighbours instead of 40; fold RMSEs are collected in `scores`.
# kfold.split() is a generator, so tqdm cannot infer its length; passing
# total= gives a real progress bar instead of an indeterminate one.
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset), total=kfold.n_splits):
    # A fresh model per fold; user_based=False switches to item-item similarity.
    algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.rmse(predictions))  # rmse() prints and returns the fold RMSE

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8833
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8889
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8909
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8772
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8714


In [57]:
# Average RMSE over the folds of the item-based pearson_baseline (k=50) run.
# NOTE(review): this cell's execution count (In [57]) is older than the loop
# above it (In [60]), and the stored value does not equal the mean of the
# five fold RMSEs printed by that loop (about 0.8823) -- the saved output is
# stale and the cell should be re-run.
np.mean(scores)

0.8831659236796392

In [47]:
# MSD: cross-validate item-based KNNWithMeans (k=40) with the 'msd'
# similarity and collect the RMSE of every fold in `scores`.
# kfold.split() is a generator, so tqdm cannot infer its length; passing
# total= gives a real progress bar instead of an indeterminate one.
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset), total=kfold.n_splits):
    # A fresh model per fold; user_based=False switches to item-item similarity.
    algo = KNNWithMeans(k=40, sim_options={'name': 'msd', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.rmse(predictions))  # rmse() prints and returns the fold RMSE

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8883
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8930
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8865
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9012
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8947


In [48]:
# Average RMSE over the folds of the item-based MSD run.
np.mean(scores)

0.8927424642237025

In [49]:
# Cosine: cross-validate item-based KNNWithMeans (k=40) with the 'cosine'
# similarity and collect the RMSE of every fold in `scores`.
# kfold.split() is a generator, so tqdm cannot infer its length; passing
# total= gives a real progress bar instead of an indeterminate one.
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset), total=kfold.n_splits):
    # A fresh model per fold; user_based=False switches to item-item similarity.
    algo = KNNWithMeans(k=40, sim_options={'name': 'cosine', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.rmse(predictions))  # rmse() prints and returns the fold RMSE

HBox(children=(IntProgress(value=1, bar_style=u'info', max=1), HTML(value=u'')))

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8991
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9089
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9080
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9017
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9020


In [50]:
# Average RMSE over the folds of the item-based cosine run.
np.mean(scores)

0.9039497960448648