## Получаем данные

In [1]:
import pandas as pd

In [2]:
import numpy as np
from tqdm import tqdm_notebook

In [3]:
df_ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('../data/ml-latest-small/movies.csv')

In [4]:
df = pd.merge(df_ratings, df_movies, on='movieId')

In [5]:
del df['timestamp']
del df['genres']

In [6]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


## Формируем векторное описание для фильма

In [7]:
df['userId'].describe()

count    100836.000000
mean        326.127564
std         182.618491
min           1.000000
25%         177.000000
50%         325.000000
75%         477.000000
max         610.000000
Name: userId, dtype: float64

In [8]:
MAX_USER_ID = 610
MIN_USER_ID = 1

In [9]:
movie_names = df['title'].unique()

In [10]:
movie_names = movie_names.tolist()

In [11]:
movie_to_vector = {}

In [12]:
for movie in tqdm_notebook(movie_names):
    movie_to_vector[movie] = np.zeros((MAX_USER_ID,))
    for r in df[df['title'] == movie].iterrows():
        movie_to_vector[movie][r[1]['userId'] - MIN_USER_ID] = r[1]['rating']

HBox(children=(IntProgress(value=0, max=9719), HTML(value='')))




## Ищем похожие

In [14]:
def find_similar(movie, dist_func, top=10):
    distances = {}
    target_movie = movie_to_vector[movie]
    for m in movie_names:
        distances[m] = dist_func(target_movie, movie_to_vector[m])
        
    distances_with_idx = [(i, distances[m]) for i, m in enumerate(movie_names)]
    distances_with_idx = sorted(distances_with_idx, key=lambda t: t[1], reverse=False)
    distances_with_idx = distances_with_idx[:top]
    
    return [(movie_names[i], d) for i, d in distances_with_idx]

In [15]:
from scipy.spatial.distance import cosine, euclidean, cityblock

In [16]:
find_similar('Toy Story (1995)', cityblock)

[('Toy Story (1995)', 0.0),
 ('Toy Story 2 (1999)', 608.5),
 ("Bug's Life, A (1998)", 698.5),
 ('Groundhog Day (1993)', 714.0),
 ('Nutty Professor, The (1996)', 714.0),
 ('Willy Wonka & the Chocolate Factory (1971)', 718.0),
 ('Mission: Impossible (1996)', 722.0),
 ('Babe (1995)', 722.5),
 ('Monsters, Inc. (2001)', 725.0),
 ('Toy Story 3 (2010)', 728.0)]

## User 2 Item

In [17]:
!pip install surprise

Collecting surprise
  Downloading https://files.pythonhosted.org/packages/61/de/e5cba8682201fcf9c3719a6fdda95693468ed061945493dea2dd37c5618b/surprise-0.1-py2.py3-none-any.whl
Collecting scikit-surprise (from surprise)
[?25l  Downloading https://files.pythonhosted.org/packages/4d/fc/cd4210b247d1dca421c25994740cbbf03c5e980e31881f10eaddf45fdab0/scikit-surprise-1.0.6.tar.gz (3.3MB)
[K    100% |████████████████████████████████| 3.3MB 3.9MB/s ta 0:00:011
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/truename/.cache/pip/wheels/ec/c0/55/3a28eab06b53c220015063ebbdb81213cd3dcbb72c088251ec
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.0.6 surprise-0.1
[33mYou are using pip version 19.0.2, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m

In [18]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

In [19]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [20]:
df_for_surpise = df_ratings[['userId', 'movieId', 'rating']]

In [21]:
df_for_surpise.columns = ['uid', 'iid', 'rating']

In [22]:
df_for_surpise.head()

Unnamed: 0,uid,iid,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [23]:
reader = Reader(rating_scale=(0.5, 5))

In [24]:
dataset = Dataset.load_from_df(df_for_surpise, reader)

In [25]:
trainset, testset = train_test_split(dataset, test_size=0.2)

In [26]:
algo = KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True}, user_based=True)

In [27]:
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f1595c9fac8>

In [28]:
predictions = algo.test(testset)

In [29]:
accuracy.mae(predictions)

MAE:  0.7500


0.7500243846275398

In [30]:
testset

[(137, 2064, 5.0),
 (132, 3977, 2.5),
 (63, 102217, 5.0),
 (599, 207, 2.5),
 (414, 3388, 2.0),
 (291, 106489, 4.0),
 (606, 781, 4.5),
 (387, 6731, 3.5),
 (122, 108932, 4.5),
 (377, 5049, 4.0),
 (474, 1875, 3.5),
 (141, 6365, 3.0),
 (217, 170, 3.0),
 (591, 368, 5.0),
 (217, 3362, 4.0),
 (124, 1091, 3.0),
 (596, 1265, 3.0),
 (290, 1617, 4.0),
 (177, 2160, 4.0),
 (414, 4995, 4.0),
 (1, 1024, 5.0),
 (34, 8636, 5.0),
 (357, 4979, 3.5),
 (160, 1385, 1.0),
 (274, 74075, 1.0),
 (115, 593, 5.0),
 (119, 8622, 3.5),
 (23, 4027, 3.0),
 (111, 183197, 3.5),
 (610, 45722, 3.5),
 (274, 76175, 3.0),
 (109, 348, 3.0),
 (357, 1333, 3.5),
 (182, 3623, 2.5),
 (599, 2915, 3.0),
 (82, 2378, 3.5),
 (274, 3671, 4.0),
 (351, 89087, 3.0),
 (217, 1359, 1.0),
 (45, 3534, 4.0),
 (432, 5418, 4.5),
 (89, 7381, 4.0),
 (414, 7137, 3.5),
 (606, 1913, 4.0),
 (277, 780, 4.0),
 (249, 1911, 3.0),
 (434, 54272, 3.5),
 (380, 3052, 5.0),
 (109, 733, 3.0),
 (448, 3752, 4.0),
 (480, 6873, 2.0),
 (580, 587, 1.5),
 (153, 140174, 4

In [31]:
algo.predict(353, 904)

Prediction(uid=353, iid=904, r_ui=None, est=4.300275050990251, details={'actual_k': 40, 'was_impossible': False})

In [32]:
algo.get_neighbors(353, 40)

[11,
 27,
 35,
 44,
 50,
 55,
 60,
 64,
 73,
 86,
 98,
 104,
 108,
 109,
 117,
 120,
 130,
 141,
 144,
 157,
 158,
 159,
 165,
 171,
 172,
 173,
 175,
 178,
 180,
 187,
 190,
 191,
 196,
 201,
 217,
 227,
 230,
 231,
 233,
 245]

In [101]:
algo.predict(algo.trainset.to_inner_uid(292), algo.trainset.to_inner_iid(164909))

Prediction(uid=58, iid=3282, r_ui=None, est=3.5022871522784746, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

In [108]:
algo.get_neighbors(iid=algo.trainset.to_inner_uid(353), k=5)

[15, 43, 84, 87, 94]

In [112]:
pd.merge(df[df['userId']==353], df[df['userId']==algo.trainset.to_raw_uid(87)], on='movieId')

Unnamed: 0,userId_x,movieId,rating_x,title_x,userId_y,rating_y,title_y
0,353,1,5.0,Toy Story (1995),44,3.0,Toy Story (1995)
1,353,6,4.0,Heat (1995),44,3.0,Heat (1995)
2,353,112,5.0,Rumble in the Bronx (Hont faan kui) (1995),44,5.0,Rumble in the Bronx (Hont faan kui) (1995)


In [123]:
def get_votes_for_film_user(uid, iid):
    neighbors = algo.get_neighbors(iid=algo.trainset.to_inner_uid(uid), k=40)
    for n in neighbors:
        res = df_ratings[(df_ratings['userId'] == n) & (df_ratings['movieId'] == iid)]
        print (n, res['rating'].tolist()[0] if len(res['rating'].tolist()) > 0 else "None")

In [124]:
get_votes_for_film_user(353, 1)

15 2.5
43 5.0
84 None
87 None
94 None
96 5.0
100 None
101 None
107 4.0
122 None
139 None
141 4.0
159 4.5
208 None
219 3.5
221 None
231 None
233 3.0
239 4.0
241 None
243 None
251 None
273 5.0
283 3.0
296 None
307 4.0
311 None
313 None
329 None
334 3.5
339 4.0
340 None
343 None
348 None
361 None
364 5.0
369 None
379 None
382 4.5
394 None


In [125]:
algo.predict(353, 1)

Prediction(uid=353, iid=1, r_ui=None, est=4.149814663807331, details={'actual_k': 40, 'was_impossible': False})

## Item-based рекомендация

In [33]:
algo = KNNBasic(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [34]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f1595c34630>

In [35]:
predictions = algo.test(testset)

In [36]:
accuracy.mae(predictions)

MAE:  0.6936


0.6936091191258036

In [37]:
from surprise import KNNWithMeans

In [38]:
algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [39]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f1595c342b0>

In [40]:
predictions = algo.test(testset)

In [41]:
accuracy.mae(predictions)

MAE:  0.6694


0.6693700626606007

In [42]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [43]:
similar_films = algo.get_neighbors(algo.trainset.to_inner_iid(1), k=10)

In [44]:
similar_films = [algo.trainset.to_raw_iid(i) for i in similar_films]

In [45]:
similar_films

[588, 3114, 4306, 8961, 4886, 551, 2716, 1073, 1148, 73017]

In [46]:
df_movies[df_movies['movieId'].isin(similar_films)]

Unnamed: 0,movieId,title,genres
483,551,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical
506,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
815,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
868,1148,Wallace & Gromit: The Wrong Trousers (1993),Animation|Children|Comedy|Crime
2038,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
3194,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...
3568,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
5374,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy
7214,73017,Sherlock Holmes (2009),Action|Crime|Mystery|Thriller


In [47]:
from surprise.model_selection import KFold

In [48]:
kfold = KFold(5)

In [49]:
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset)):
    algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.mae(predictions))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6689
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6725
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6673
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6684
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6709



In [50]:
scores

[0.6688794184828853,
 0.6725420217269258,
 0.6673299189712403,
 0.6683900054729893,
 0.670899248158954]