## Получаем данные

In [1]:
import pandas as pd

In [2]:
import numpy as np
from tqdm import tqdm_notebook

In [3]:
df_ratings = pd.read_csv('../data/ml-latest-small/ratings.csv')
df_movies = pd.read_csv('../data/ml-latest-small/movies.csv')

In [4]:
df = pd.merge(df_ratings, df_movies, on='movieId')

In [5]:
del df['timestamp']
del df['genres']

In [6]:
df.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


## Формируем векторное описание для фильма

In [7]:
df['userId'].describe()

count    100836.000000
mean        326.127564
std         182.618491
min           1.000000
25%         177.000000
50%         325.000000
75%         477.000000
max         610.000000
Name: userId, dtype: float64

In [8]:
MAX_USER_ID = 610
MIN_USER_ID = 1

In [9]:
movie_names = df['title'].unique()

In [10]:
movie_names = movie_names.tolist()

In [11]:
movie_to_vector = {}

In [12]:
for movie in tqdm_notebook(movie_names):
    movie_to_vector[movie] = np.zeros((MAX_USER_ID,))
    for r in df[df['title'] == movie].iterrows():
        movie_to_vector[movie][r[1]['userId'] - MIN_USER_ID] = r[1]['rating']

HBox(children=(IntProgress(value=0, max=9719), HTML(value='')))




## Ищем похожие

In [13]:
def find_similar(movie, dist_func, top=10):
    distances = {}
    target_movie = movie_to_vector[movie]
    for m in movie_names:
        distances[m] = dist_func(target_movie, movie_to_vector[m])
        
    distances_with_idx = [(i, distances[m]) for i, m in enumerate(movie_names)]
    distances_with_idx = sorted(distances_with_idx, key=lambda t: t[1], reverse=False)
    distances_with_idx = distances_with_idx[:top]
    
    return [(movie_names[i], d) for i, d in distances_with_idx]

In [14]:
from scipy.spatial.distance import cosine, euclidean, cityblock

In [15]:
find_similar('Toy Story (1995)', cityblock)

[('Toy Story (1995)', 0.0),
 ('Toy Story 2 (1999)', 608.5),
 ("Bug's Life, A (1998)", 698.5),
 ('Groundhog Day (1993)', 714.0),
 ('Nutty Professor, The (1996)', 714.0),
 ('Willy Wonka & the Chocolate Factory (1971)', 718.0),
 ('Mission: Impossible (1996)', 722.0),
 ('Babe (1995)', 722.5),
 ('Monsters, Inc. (2001)', 725.0),
 ('Toy Story 3 (2010)', 728.0)]

## User 2 Item

In [16]:
!pip install surprise

[33mYou are using pip version 19.0.2, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [17]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

In [18]:
df_ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [19]:
df_for_surpise = df_ratings[['userId', 'movieId', 'rating']]

In [20]:
df_for_surpise.columns = ['uid', 'iid', 'rating']

In [21]:
df_for_surpise.head()

Unnamed: 0,uid,iid,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [22]:
reader = Reader(rating_scale=(0.5, 5))

In [23]:
dataset = Dataset.load_from_df(df_for_surpise, reader)

In [24]:
trainset, testset = train_test_split(dataset, test_size=0.2)

In [25]:
algo = KNNBasic(k=40, sim_options={'name': 'cosine', 'user_based': True}, user_based=True)

In [26]:
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7ffa632c92b0>

In [27]:
predictions = algo.test(testset)

In [28]:
accuracy.mae(predictions)

MAE:  0.7488


0.7487718446404766

In [29]:
testset

[(64, 25, 3.5),
 (43, 165, 5.0),
 (495, 1059, 5.0),
 (424, 288, 4.0),
 (292, 80489, 3.5),
 (298, 112552, 3.5),
 (312, 2628, 3.0),
 (381, 2100, 3.5),
 (600, 480, 3.0),
 (135, 3624, 4.0),
 (44, 1391, 2.0),
 (414, 7137, 3.5),
 (462, 337, 2.5),
 (276, 364, 5.0),
 (336, 2762, 4.5),
 (249, 1199, 3.5),
 (241, 318, 3.5),
 (438, 4011, 3.5),
 (385, 541, 5.0),
 (608, 3264, 2.5),
 (364, 1210, 3.0),
 (4, 2843, 5.0),
 (495, 2918, 4.5),
 (19, 47, 3.0),
 (89, 146244, 4.0),
 (555, 2402, 3.0),
 (288, 3699, 3.0),
 (263, 8796, 2.0),
 (453, 2447, 3.0),
 (429, 468, 3.0),
 (80, 85414, 4.5),
 (453, 1405, 5.0),
 (249, 2949, 4.0),
 (460, 6539, 4.0),
 (113, 1043, 2.0),
 (477, 2, 4.0),
 (532, 1200, 5.0),
 (380, 5476, 3.0),
 (216, 3067, 3.0),
 (232, 39446, 4.0),
 (586, 2058, 4.0),
 (476, 594, 4.0),
 (63, 2997, 3.0),
 (448, 148888, 2.0),
 (18, 68159, 4.0),
 (105, 2762, 4.5),
 (599, 60069, 4.0),
 (18, 63131, 3.5),
 (572, 1958, 4.0),
 (600, 26662, 4.5),
 (233, 1206, 4.0),
 (307, 2716, 3.0),
 (606, 6934, 2.0),
 (469, 

In [30]:
algo.predict(353, 904)

Prediction(uid=353, iid=904, r_ui=None, est=4.3997351048638, details={'actual_k': 40, 'was_impossible': False})

In [31]:
algo.get_neighbors(353, 40)

[11,
 19,
 30,
 37,
 41,
 49,
 52,
 54,
 57,
 61,
 62,
 65,
 67,
 72,
 74,
 75,
 87,
 92,
 97,
 104,
 115,
 116,
 124,
 126,
 133,
 134,
 138,
 142,
 149,
 152,
 158,
 164,
 165,
 167,
 170,
 172,
 176,
 177,
 185,
 196]

In [32]:
algo.predict(algo.trainset.to_inner_uid(292), algo.trainset.to_inner_iid(164909))

Prediction(uid=169, iid=3542, r_ui=None, est=3.500216938562999, details={'was_impossible': True, 'reason': 'User and/or item is unkown.'})

In [33]:
algo.get_neighbors(iid=algo.trainset.to_inner_uid(353), k=5)

[10, 37, 52, 66, 74]

In [34]:
pd.merge(df[df['userId']==353], df[df['userId']==algo.trainset.to_raw_uid(87)], on='movieId')

Unnamed: 0,userId_x,movieId,rating_x,title_x,userId_y,rating_y,title_y
0,353,1,5.0,Toy Story (1995),103,4.0,Toy Story (1995)
1,353,70,4.0,From Dusk Till Dawn (1996),103,3.5,From Dusk Till Dawn (1996)
2,353,110,3.0,Braveheart (1995),103,4.5,Braveheart (1995)
3,353,216,3.0,Billy Madison (1995),103,4.0,Billy Madison (1995)
4,353,296,5.0,Pulp Fiction (1994),103,5.0,Pulp Fiction (1994)
5,353,367,3.0,"Mask, The (1994)",103,3.5,"Mask, The (1994)"
6,353,590,4.0,Dances with Wolves (1990),103,4.0,Dances with Wolves (1990)
7,353,593,5.0,"Silence of the Lambs, The (1991)",103,5.0,"Silence of the Lambs, The (1991)"
8,353,318,5.0,"Shawshank Redemption, The (1994)",103,5.0,"Shawshank Redemption, The (1994)"
9,353,595,4.0,Beauty and the Beast (1991),103,3.0,Beauty and the Beast (1991)


In [35]:
def get_votes_for_film_user(uid, iid):
    neighbors = algo.get_neighbors(iid=algo.trainset.to_inner_uid(uid), k=40)
    for n in neighbors:
        res = df_ratings[(df_ratings['userId'] == n) & (df_ratings['movieId'] == iid)]
        print (n, res['rating'].tolist()[0] if len(res['rating'].tolist()) > 0 else "None")

In [36]:
get_votes_for_film_user(353, 1)

10 None
37 None
52 None
66 4.0
74 None
86 4.0
96 5.0
99 None
104 None
116 None
120 None
136 None
137 4.0
157 None
165 None
167 3.5
182 4.0
185 4.0
203 None
210 None
212 None
226 3.5
242 None
250 None
267 None
276 4.0
277 4.0
279 3.0
287 None
297 None
305 None
306 None
314 3.0
327 None
337 4.0
339 4.0
347 5.0
348 None
351 None
353 5.0


In [37]:
algo.predict(353, 1)

Prediction(uid=353, iid=1, r_ui=None, est=4.112634892631316, details={'actual_k': 40, 'was_impossible': False})

## Item-based рекомендация

In [38]:
algo = KNNBasic(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [39]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7ffa6305d438>

In [40]:
predictions = algo.test(testset)

In [41]:
accuracy.mae(predictions)

MAE:  0.6925


0.6925000175547114

In [42]:
from surprise import KNNWithMeans

In [43]:
algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})

In [44]:
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7ffa6305dfd0>

In [45]:
predictions = algo.test(testset)

In [46]:
accuracy.mae(predictions)

MAE:  0.6674


0.6673617427252511

In [47]:
df_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [48]:
similar_films = algo.get_neighbors(algo.trainset.to_inner_iid(1), k=10)

In [49]:
similar_films = [algo.trainset.to_raw_iid(i) for i in similar_films]

In [50]:
similar_films

[588, 3114, 8961, 6377, 1270, 2716, 2115, 34, 8636, 1097]

In [51]:
df_movies[df_movies['movieId'].isin(similar_films)]

Unnamed: 0,movieId,title,genres
32,34,Babe (1995),Children|Drama
506,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
836,1097,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi
969,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
1576,2115,Indiana Jones and the Temple of Doom (1984),Action|Adventure|Fantasy
2038,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi
2355,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
4360,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy
5260,8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX
5374,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy


In [52]:
from surprise.model_selection import KFold

In [53]:
kfold = KFold(5)

In [54]:
scores = []
for trainset, testset in tqdm_notebook(kfold.split(dataset)):
    algo = KNNWithMeans(k=40, sim_options={'name': 'pearson_baseline', 'user_based': False})
    algo.fit(trainset)
    predictions = algo.test(testset)
    scores.append(accuracy.mae(predictions))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6703
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6723
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6694
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6713
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
MAE:  0.6663



In [55]:
scores

[0.6702720954252053,
 0.6723209317479646,
 0.669376474555652,
 0.6713177935709984,
 0.6663283086935524]