In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')

In [3]:
ratings = pd.read_csv('ratings.csv')

In [4]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
ratings.describe()['timestamp']['75%']

1435994144.5

In [6]:
tags = pd.read_csv('tags.csv')

In [7]:
movies = pd.read_csv('movies.csv')

In [8]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


# User Genre Matrix

In [9]:
genres = []

In [10]:
for i in movies.index:
    genres.extend(movies['genres'][i].split('|'))

In [11]:
genres = list(set(genres))

In [12]:
genres.remove('(no genres listed)')

In [13]:
len(genres)

19

In [14]:
users = list(ratings['userId'].unique())

In [15]:
usergenre_matrix = pd.DataFrame(columns = genres, index = users)

In [16]:
temp = ratings[ratings['userId'] == 1].loc[:,['movieId', 'rating']]
temp

Unnamed: 0,movieId,rating
0,1,4.0
1,3,4.0
2,6,4.0
3,47,5.0
4,50,5.0
...,...,...
227,3744,4.0
228,3793,5.0
229,3809,4.0
230,4006,4.0


In [17]:
movies_temp = movies.copy()

In [18]:
for i in range(9742):
    movies_temp['genres'].iloc[i] = (movies_temp['genres'].iloc[i]).split('|')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [19]:
movies_temp = movies_temp.explode('genres')
movies_temp

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
...,...,...,...
9738,193583,No Game No Life: Zero (2017),Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Animation


In [20]:
result = pd.merge(movies_temp, ratings, left_on = 'movieId', right_on = 'movieId')
result

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure,1,4.0,964982703
1,1,Toy Story (1995),Adventure,5,4.0,847434962
2,1,Toy Story (1995),Adventure,7,4.5,1106635946
3,1,Toy Story (1995),Adventure,15,2.5,1510577970
4,1,Toy Story (1995),Adventure,17,4.5,1305696483
...,...,...,...,...,...,...
274475,193583,No Game No Life: Zero (2017),Fantasy,184,3.5,1537109545
274476,193585,Flint (2017),Drama,184,3.5,1537109805
274477,193587,Bungo Stray Dogs: Dead Apple (2018),Action,184,3.5,1537110021
274478,193587,Bungo Stray Dogs: Dead Apple (2018),Animation,184,3.5,1537110021


In [21]:
x = result.groupby(['userId', 'genres']).mean()

In [22]:
x = x.drop(columns = ['movieId', 'timestamp'])

In [23]:
x

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,genres,Unnamed: 2_level_1
1,Action,4.322222
1,Adventure,4.388235
1,Animation,4.689655
1,Children,4.547619
1,Comedy,4.277108
...,...,...
610,Romance,3.731092
610,Sci-Fi,3.659363
610,Thriller,3.573529
610,War,3.776596


In [24]:
x.loc[1, 'Action'][0]

4.322222222222222

In [25]:
for i in genres:
    for j in users:
        try:
            usergenre_matrix[i][j] = x.loc[j, i][0]
        except:
            usergenre_matrix[i][j] = 0.0

In [26]:
usergenre_matrix

Unnamed: 0,Crime,War,Horror,Fantasy,Thriller,Animation,Sci-Fi,IMAX,Comedy,Mystery,Drama,Action,Romance,Documentary,Musical,Film-Noir,Children,Western,Adventure
1,4.35556,4.5,3.47059,4.29787,4.14545,4.68966,4.225,0,4.27711,4.16667,4.52941,4.32222,4.30769,0,4.68182,5,4.54762,4.28571,4.38824
2,3.8,4.5,3,0,3.7,0,3.875,3.75,4,4,3.88235,3.95455,4.5,4.33333,0,0,0,3.5,4.16667
3,0.5,0.5,4.6875,3.375,4.14286,0.5,4.2,0,1,5,0.75,3.57143,0.5,0,0.5,0,0.5,0,2.72727
4,3.81481,3.57143,4.25,3.68421,3.55263,4,2.83333,3,3.50962,3.47826,3.48333,3.32,3.37931,4,4,4,3.8,3.8,3.65517
5,3.83333,3.33333,3,4.14286,3.55556,4.33333,2.5,3.66667,3.46667,4,3.8,3.11111,3.09091,0,4.4,0,4.11111,3,3.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.65414,3.79231,3.34615,3.59794,3.52513,3.71429,3.55696,3.0625,3.56532,3.79121,3.78797,3.17881,3.74085,3.8,3.72727,3.8125,3.44898,3.41176,3.5034
607,3.81481,4.16667,4.11429,3.57143,4.11475,3.33333,3.25,5,3.32727,4.64706,4.0122,3.72222,3.51724,0,3.6,0,3.42105,4,3.46667
608,3.61301,3.57895,3.31959,3,3.53668,3.11818,3.29641,4,2.73662,3.55072,3.4375,3.33032,2.88679,3,2.75758,3.75,2.46023,2.63636,3.22099
609,3.5,3.5,3.5,3,3.28571,3,3,3,3.28571,0,3.36842,3.09091,3.2,3,0,0,3,4,3.2


In [27]:
usergenre_matrix=(usergenre_matrix-usergenre_matrix.min())/(usergenre_matrix.max()-usergenre_matrix.min())

In [28]:
usergenre_matrix

Unnamed: 0,Crime,War,Horror,Fantasy,Thriller,Animation,Sci-Fi,IMAX,Comedy,Mystery,Drama,Action,Romance,Documentary,Musical,Film-Noir,Children,Western,Adventure
1,0.871111,0.9,0.694118,0.859574,0.829091,0.937931,0.845,0,0.855422,0.833333,0.889273,0.864444,0.861538,0,0.936364,1,0.909524,0.857143,0.877647
2,0.76,0.9,0.6,0,0.74,0,0.775,0.75,0.8,0.8,0.737024,0.790909,0.9,0.866667,0,0,0,0.7,0.833333
3,0.1,0.1,0.9375,0.675,0.828571,0.1,0.84,0,0.2,1,0,0.714286,0.1,0,0.1,0,0.1,0,0.545455
4,0.762963,0.714286,0.85,0.736842,0.710526,0.8,0.566667,0.6,0.701923,0.695652,0.643137,0.664,0.675862,0.8,0.8,0.8,0.76,0.76,0.731034
5,0.766667,0.666667,0.6,0.828571,0.711111,0.866667,0.5,0.733333,0.693333,0.8,0.717647,0.622222,0.618182,0,0.88,0,0.822222,0.6,0.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.730827,0.758462,0.669231,0.719588,0.705025,0.742857,0.711392,0.6125,0.713064,0.758242,0.714815,0.635762,0.748169,0.76,0.745455,0.7625,0.689796,0.682353,0.70068
607,0.762963,0.833333,0.822857,0.714286,0.822951,0.666667,0.65,1,0.665455,0.929412,0.767575,0.744444,0.703448,0,0.72,0,0.684211,0.8,0.693333
608,0.722603,0.715789,0.663918,0.6,0.707336,0.623636,0.659281,0.8,0.547324,0.710145,0.632353,0.666065,0.577358,0.6,0.551515,0.75,0.492045,0.527273,0.644199
609,0.7,0.7,0.7,0.6,0.657143,0.6,0.6,0.6,0.657143,0,0.616099,0.618182,0.64,0.6,0,0,0.6,0.8,0.64


# Movie Co-occurance Matrix

In [None]:
movie_matrix = pd.DataFrame(columns = movies.movieId, index = movies.movieId)
movie_matrix

In [None]:
for i in movie_matrix.index:
    for j in movie_matrix.index:
        print(i, j)
        a = set(ratings[ratings['movieId'] == i]['userId'])
        b = set(ratings[ratings['movieId'] == j]['userId'])
        c = a.intersection(b)
        d = a.union(b)
        if len(d) == 0:
            movie_matrix[str(i)][j] = 0.0
        else:
            movie_matrix[str(i)][j] = round(len(c)/len(d), 2)
        movie_matrix[i][j] = len(c)

5582

In [None]:
(ratings[ratings['movieId'] == 1]['userId'])

In [None]:
a = set(ratings[ratings['movieId'] == 1]['userId'])
b = set(ratings[ratings['movieId'] == 1]['userId'])
c = a.intersection(b)
d = a.union(b)
round(len(c)/len(d), 2)

In [None]:
movie_matrix.to_csv('/content/drive/My Drive/DS Project/movie_matrix4.csv', index = False)

In [51]:
movie_matrix = pd.read_csv('movie_matrix_final.csv')

In [52]:
movies.movieId

0            1
1            2
2            3
3            4
4            5
         ...  
9737    193581
9738    193583
9739    193585
9740    193587
9741    193609
Name: movieId, Length: 9742, dtype: int64

In [53]:
movie_matrix.set_index(movies.movieId, inplace = True)

In [None]:
movie_matrix['5720']

In [None]:
ID = list(movies.movieId)

In [None]:
ID = [i for i in ID if i > 5720 ]

In [None]:
ID[0]

In [None]:
dff = ratings.copy()
dff =  ratings.groupby(['movieId'])['userId'].unique()
dff

In [None]:
len(set(dff[1]).union(set(dff[1])))

In [None]:
for movie, frame in ratings.groupby(['movieId']):
    print(frame)

In [None]:
for i in movie_matrix.index:
    for j in movie_matrix.index:
        try:
            print(i, j)
            movie_matrix[i][j] = round(len(set(dff[i]).intersection(set(dff[j])))/len(set(dff[i]).union(set(dff[j]))), 2)
        except KeyError:
            movie_matrix[i][j] = 0.0

In [None]:
len(ratings.groupby('movieId')['userId'])

# Movie Genre Matrix

In [29]:
movies_temp = movies.copy()

In [30]:
movies_temp = movies_temp.set_index('movieId')

In [31]:
movies_temp

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
...,...,...
193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
193585,Flint (2017),Drama
193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [32]:
movie_genre = pd.DataFrame(columns = genres, index = movies.movieId)

In [33]:
movie_genre[genres] = 0

In [34]:
for i in movies.movieId:
    for j in movies_temp['genres'][i].split('|'):
        try:
            movie_genre[j][i] = 1
        except KeyError:
            pass

In [35]:
movie_genre

Unnamed: 0_level_0,Crime,War,Horror,Fantasy,Thriller,Animation,Sci-Fi,IMAX,Comedy,Mystery,Drama,Action,Romance,Documentary,Musical,Film-Noir,Children,Western,Adventure
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1
2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0
193583,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
193587,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0


# Predicting 10 Movies for each user

In [36]:
array = (cosine_similarity(np.array(usergenre_matrix.loc[1]).reshape(1, 19), movie_genre))[0].argsort()[-10:][::-1]

In [37]:
list(movies.ix[array.tolist()].movieId)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


[81132, 26701, 1907, 56152, 2987, 4719, 52462, 71999, 6902, 43932]

In [38]:
user_movie_pred = {}
for i in users:
    print(i)
    array = (cosine_similarity(np.array(usergenre_matrix.loc[i]).reshape(1, 19), movie_genre))[0].argsort()[-10:][::-1]
    user_movie_pred[i] = list(movies.ix[array.tolist()].movieId)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentat

In [39]:
user_movie_pred

{1: [81132, 26701, 1907, 56152, 2987, 4719, 52462, 71999, 6902, 43932],
 2: [81132, 26236, 79132, 31367, 4956, 49530, 459, 91542, 8481, 31921],
 3: [31804, 36509, 43932, 3113, 90345, 7001, 2232, 184253, 32213, 3837],
 4: [81132, 26701, 56152, 2987, 1907, 81847, 4719, 32031, 52462, 43932],
 5: [81847, 2987, 56152, 1907, 81132, 595, 32031, 1064, 364, 2092],
 6: [81132, 81847, 85261, 32031, 56152, 1907, 26701, 52462, 364, 79132],
 7: [81132, 26701, 43932, 2987, 52462, 56152, 1907, 6902, 4719, 36509],
 8: [81847, 1907, 364, 56152, 595, 81132, 2987, 85261, 32031, 631],
 9: [81132, 2987, 56152, 52462, 32031, 36397, 81847, 84637, 4306, 92348],
 10: [56152, 81847, 1907, 36397, 85261, 32031, 2987, 4719, 51939, 108932],
 11: [81132, 79132, 43932, 60684, 36509, 7235, 27317, 27683, 26236, 8481],
 12: [71999, 4719, 26236, 4956, 164226, 117646, 587, 47404, 26764, 161594],
 13: [81132, 43932, 36509, 71999, 6902, 31804, 91542, 198, 27683, 8481],
 14: [81132, 79132, 60684, 36509, 43932, 8481, 27683, 19

# Cartesian Product

In [54]:
dff = ratings.copy()
dff =  ratings.groupby(['movieId'])['userId'].unique()
dff

movieId
1         [1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...
2         [6, 8, 18, 19, 20, 21, 27, 51, 62, 68, 82, 91,...
3         [1, 6, 19, 32, 42, 43, 44, 51, 58, 64, 68, 91,...
4                           [6, 14, 84, 162, 262, 411, 600]
5         [6, 31, 43, 45, 58, 66, 68, 84, 103, 107, 111,...
                                ...                        
193581                                                [184]
193583                                                [184]
193585                                                [184]
193587                                                [184]
193609                                                [331]
Name: userId, Length: 9724, dtype: object

In [55]:
temp1 = pd.DataFrame(dff)

In [56]:
temp2 = temp1.copy()

In [57]:
temp1 = temp1.reset_index()

In [58]:
temp2 = temp2.reset_index()

In [59]:
temp1 = temp1.rename(columns={'userId': 'userId1', 'movieId': 'movieId1'})

In [60]:
temp2 = temp2.rename(columns={'userId': 'userId2', 'movieId': 'movieId2'})

In [61]:
temp1 

Unnamed: 0,movieId1,userId1
0,1,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ..."
1,2,"[6, 8, 18, 19, 20, 21, 27, 51, 62, 68, 82, 91,..."
2,3,"[1, 6, 19, 32, 42, 43, 44, 51, 58, 64, 68, 91,..."
3,4,"[6, 14, 84, 162, 262, 411, 600]"
4,5,"[6, 31, 43, 45, 58, 66, 68, 84, 103, 107, 111,..."
...,...,...
9719,193581,[184]
9720,193583,[184]
9721,193585,[184]
9722,193587,[184]


In [62]:
temp2

Unnamed: 0,movieId2,userId2
0,1,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ..."
1,2,"[6, 8, 18, 19, 20, 21, 27, 51, 62, 68, 82, 91,..."
2,3,"[1, 6, 19, 32, 42, 43, 44, 51, 58, 64, 68, 91,..."
3,4,"[6, 14, 84, 162, 262, 411, 600]"
4,5,"[6, 31, 43, 45, 58, 66, 68, 84, 103, 107, 111,..."
...,...,...
9719,193581,[184]
9720,193583,[184]
9721,193585,[184]
9722,193587,[184]


In [63]:
def cartesian_product_basic(left, right):
    return (left.assign(key=1).merge(right.assign(key=1), on='key').drop('key', 1))

result = cartesian_product_basic(temp1, temp2)

In [64]:
result = result[['movieId1', 'movieId2', 'userId1', 'userId2']]

In [65]:
result = result.set_index(['movieId1', 'movieId2'])

In [66]:
result

Unnamed: 0_level_0,Unnamed: 1_level_0,userId1,userId2
movieId1,movieId2,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...","[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ..."
1,2,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...","[6, 8, 18, 19, 20, 21, 27, 51, 62, 68, 82, 91,..."
1,3,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...","[1, 6, 19, 32, 42, 43, 44, 51, 58, 64, 68, 91,..."
1,4,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...","[6, 14, 84, 162, 262, 411, 600]"
1,5,"[1, 5, 7, 15, 17, 18, 19, 21, 27, 31, 32, 33, ...","[6, 31, 43, 45, 58, 66, 68, 84, 103, 107, 111,..."
...,...,...,...
193609,193581,[331],[184]
193609,193583,[331],[184]
193609,193585,[331],[184]
193609,193587,[331],[184]


In [None]:
result['jaccard'] = result.apply(lambda row: round(len(set(row.userId1).intersection(set(row.userId2)))/len(set(row.userId1).union(set(row.userId2))), 2), axis = 1) 

# Timestamp

In [10]:
ratings_75 = ratings[ratings['timestamp'] <= ratings.describe()['timestamp']['75%']]

In [11]:
ratings_25 = ratings[ratings['timestamp'] > ratings.describe()['timestamp']['75%']]

In [12]:
ratings_75['userId'].unique()

array([  1,   3,   4,   5,   6,   7,   8,   9,  11,  12,  13,  14,  15,
        16,  17,  19,  20,  21,  22,  23,  26,  27,  28,  29,  31,  32,
        33,  34,  35,  36,  37,  38,  39,  40,  42,  43,  44,  45,  46,
        48,  51,  53,  54,  55,  56,  57,  58,  59,  60,  61,  64,  66,
        68,  69,  70,  71,  72,  74,  75,  78,  79,  80,  81,  82,  83,
        84,  85,  86,  87,  88,  90,  91,  92,  93,  94,  95,  96,  97,
        99, 100, 101, 102, 103, 104, 107, 108, 109, 110, 113, 115, 116,
       117, 118, 119, 120, 121, 124, 126, 127, 128, 129, 130, 131, 132,
       133, 134, 135, 136, 137, 138, 140, 142, 144, 145, 146, 147, 149,
       150, 151, 155, 156, 157, 158, 160, 161, 162, 163, 164, 165, 166,
       167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
       180, 181, 182, 183, 185, 186, 187, 188, 191, 192, 193, 194, 195,
       197, 198, 199, 200, 201, 202, 203, 204, 206, 207, 208, 211, 213,
       214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 22

In [13]:
ratings_25['userId'].unique()

array([  2,  10,  15,  18,  21,  24,  25,  30,  41,  47,  49,  50,  52,
        62,  63,  65,  67,  68,  73,  76,  77,  89,  98, 103, 105, 106,
       111, 112, 114, 119, 122, 123, 125, 139, 141, 143, 148, 152, 153,
       154, 159, 177, 184, 189, 190, 193, 196, 205, 209, 210, 212, 227,
       233, 241, 247, 248, 249, 251, 252, 256, 258, 272, 279, 282, 285,
       291, 292, 296, 298, 305, 306, 318, 319, 326, 328, 329, 331, 338,
       339, 341, 352, 362, 363, 365, 366, 378, 380, 382, 400, 401, 408,
       413, 414, 417, 418, 424, 426, 433, 439, 441, 443, 445, 448, 459,
       461, 462, 466, 471, 475, 491, 495, 509, 511, 514, 515, 517, 519,
       522, 523, 525, 526, 534, 537, 548, 549, 550, 551, 556, 557, 560,
       561, 563, 564, 567, 581, 582, 583, 586, 596, 599, 601, 610],
      dtype=int64)

In [14]:
common = list(set(ratings_75['userId'].unique()).intersection(set(ratings_25['userId'].unique())))

In [15]:
common

[522,
 15,
 21,
 537,
 282,
 414,
 292,
 177,
 443,
 318,
 448,
 193,
 68,
 326,
 462,
 103,
 119,
 249,
 509]

In [16]:
rating75_groupby = pd.DataFrame(ratings_75.groupby(['userId', 'timestamp'])['movieId'].unique())

In [17]:
set(ratings_75[ratings_75['userId'] == 15].movieId.tolist())

{44,
 158,
 172,
 256,
 355,
 849,
 1347,
 1909,
 2001,
 2150,
 2278,
 3510,
 3555,
 3617,
 4018,
 69757}

In [18]:
(rating75_groupby.loc[15])[::-1].iloc[5]['movieId']

array([3510], dtype=int64)

In [19]:
movies_temp = movies.copy()
movie_temp = movies_temp.set_index('movieId')

In [20]:
user_movie_pred2 = {}
for i in common:
    user_movie_pred2[i] = []

In [21]:
num = 5
for user in common:
    #print(user)
    count = 0
    while len(user_movie_pred2[user]) != 15:
        print(user, count)
        movie_id = (rating75_groupby.loc[user])[::-1].iloc[count]['movieId']
        for i in movie_id:
            array = (cosine_similarity(np.array(movie_genre.loc[i]).reshape(1, 19), movie_genre))[0].argsort()[-num:][::-1]
            user_movie_pred2[user].extend(list(movies.ix[array.tolist()].movieId))
            if num > 1:
                num -= 1
        user_movie_pred2[user] = list(set(user_movie_pred2[user]).difference(set(ratings_75[ratings_75['userId'] == user].movieId.tolist())))
        count += 1

522 0


NameError: name 'movie_genre' is not defined

In [None]:
user_movie_pred2

In [None]:
movie_temp.ix[user_movie_pred2[21]]

In [54]:
ratings_75[ratings_75['userId'] == 15]

Unnamed: 0,userId,movieId,rating,timestamp
1435,15,44,1.0,1299424916
1437,15,158,1.0,1299424840
1438,15,172,1.0,1299424762
1439,15,256,3.0,1299425021
1444,15,355,1.0,1299425002
1452,15,849,2.0,1299425064
1462,15,1347,3.0,1299425144
1465,15,1909,2.5,1299424890
1466,15,2001,5.0,1299424826
1472,15,2150,5.0,1299425040


In [55]:
for user in user_movie_pred2:
    match = set(user_movie_pred2[user]).intersection(set(ratings_25[ratings_25['userId'] == 15].movieId.tolist()))
    print(user)
    print(list(match), len(match))

522
[] 0
15
[] 0
21
[] 0
537
[] 0
282
[89745] 1
414
[] 0
292
[89745] 1
177
[] 0
443
[89745] 1
318
[] 0
448
[] 0
193
[48780] 1
68
[] 0
326
[] 0
462
[89745] 1
103
[] 0
119
[] 0
249
[] 0
509
[] 0


In [56]:
ratings.describe()['timestamp']['min']

828124615.0

In [57]:
ratings.describe()['timestamp']['max']

1537799250.0

In [58]:
pd.to_datetime(ratings.describe()['timestamp']['min'], unit = 's')

Timestamp('1996-03-29 18:36:55')

In [59]:
pd.to_datetime(ratings.describe()['timestamp']['max'], unit = 's')

Timestamp('2018-09-24 14:27:30')

In [60]:
pd.to_datetime(ratings.describe()['timestamp']['75%'], unit = 's')

Timestamp('2015-07-04 07:15:44.500000')

# Predicting ratings given by Users

In [22]:
user_rating = pd.DataFrame(ratings_25.groupby(['userId', 'movieId'])['rating'].unique())

In [23]:
user_rating = pd.DataFrame(user_rating['rating'].apply(lambda x: user_rating['rating'][0][0]))

In [24]:
user_rating

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,movieId,Unnamed: 2_level_1
2,318,3.0
2,333,3.0
2,1704,3.0
2,3578,3.0
2,6874,3.0
...,...,...
610,166534,3.0
610,168248,3.0
610,168250,3.0
610,168252,3.0


In [25]:
movies_temp = movies.copy()

In [26]:
for i in range(9742):
    movies_temp['genres'].iloc[i] = (movies_temp['genres'].iloc[i]).split('|')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [27]:
movies_temp = movies_temp.explode('genres')
movies_temp

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy
...,...,...,...
9738,193583,No Game No Life: Zero (2017),Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Animation


In [28]:
result = pd.merge(movies_temp, ratings_75, left_on = 'movieId', right_on = 'movieId')
result

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure,1,4.0,964982703
1,1,Toy Story (1995),Adventure,5,4.0,847434962
2,1,Toy Story (1995),Adventure,7,4.5,1106635946
3,1,Toy Story (1995),Adventure,17,4.5,1305696483
4,1,Toy Story (1995),Adventure,19,4.0,965705637
...,...,...,...,...,...,...
202586,130842,Power/Rangers (2015),Sci-Fi,393,4.0,1430506958
202587,131739,Batman vs. Robin (2015),Action,497,3.0,1429127171
202588,131739,Batman vs. Robin (2015),Adventure,497,3.0,1429127171
202589,131739,Batman vs. Robin (2015),Animation,497,3.0,1429127171


In [29]:
usergenre_count = pd.DataFrame(result.groupby(['userId', 'genres'])['genres'].count())

In [30]:
usergenre_count = usergenre_count.rename(columns={'genres': 'count'})

In [31]:
usergenre_count

Unnamed: 0_level_0,Unnamed: 1_level_0,count
userId,genres,Unnamed: 2_level_1
1,Action,90
1,Adventure,85
1,Animation,29
1,Children,42
1,Comedy,83
...,...,...
609,Romance,5
609,Sci-Fi,5
609,Thriller,14
609,War,4


In [32]:
result[result['userId'] == 609][result['genres'] == 'Thriller']['movieId'].nunique()

  """Entry point for launching an IPython kernel.


14

In [33]:
genres = []
for i in movies.index:
    genres.extend(movies['genres'][i].split('|'))
genres = list(set(genres))
genres.remove('(no genres listed)')
users = list(ratings_75['userId'].unique())

In [34]:
usergenre_matrix_75 = pd.DataFrame(columns = genres, index = users)

In [35]:
x = result.groupby(['userId', 'genres']).mean()

In [36]:
x = x.drop(columns = ['movieId', 'timestamp'])

In [37]:
x

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
userId,genres,Unnamed: 2_level_1
1,Action,4.322222
1,Adventure,4.388235
1,Animation,4.689655
1,Children,4.547619
1,Comedy,4.277108
...,...,...
609,Romance,3.200000
609,Sci-Fi,3.000000
609,Thriller,3.285714
609,War,3.500000


In [38]:
for i in genres:
    for j in users:
        try:
            usergenre_matrix_75[i][j] = x.loc[j, i][0]
        except:
            usergenre_matrix_75[i][j] = 0.0

In [39]:
usergenre_matrix_75

Unnamed: 0,Drama,Musical,Mystery,War,Action,IMAX,Romance,Adventure,Fantasy,Film-Noir,Horror,Children,Comedy,Animation,Western,Thriller,Sci-Fi,Documentary,Crime
1,4.52941,4.68182,4.16667,4.5,4.32222,0,4.30769,4.38824,4.29787,5,3.47059,4.54762,4.27711,4.68966,4.28571,4.14545,4.225,0,4.35556
3,0.75,0.5,5,0.5,3.57143,0,0.5,2.72727,3.375,0,4.6875,0.5,1,0.5,0,4.14286,4.2,0,0.5
4,3.48333,4,3.47826,3.57143,3.32,3,3.37931,3.65517,3.68421,4,4.25,3.8,3.50962,4,3.8,3.55263,2.83333,4,3.81481
5,3.8,4.4,4,3.33333,3.11111,3.66667,3.09091,3.25,4.14286,0,3,4.11111,3.46667,4.33333,3,3.55556,2.5,0,3.83333
6,3.61429,4.16667,3.73333,3.58333,3.60938,4.66667,3.61429,3.89362,3.53846,2.5,3.26316,3.61702,3.37008,4.07143,3.81818,3.54412,3.47619,0,3.28571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,3.11864,3.07692,3.4375,2.77778,3.22464,3.67857,3.33051,3.26415,3.15217,0,2.86667,3.20755,3.25843,2.94231,3.33333,2.92,3.28846,3,3.04545
606,3.78797,3.72727,3.79121,3.79231,3.17881,3.0625,3.74085,3.5034,3.59794,3.8125,3.34615,3.44898,3.56532,3.71429,3.41176,3.52513,3.55696,3.8,3.65414
607,4.0122,3.6,4.64706,4.16667,3.72222,5,3.51724,3.46667,3.57143,0,4.11429,3.42105,3.32727,3.33333,4,4.11475,3.25,0,3.81481
608,3.4375,2.75758,3.55072,3.57895,3.33032,4,2.88679,3.22099,3,3.75,3.31959,2.46023,2.73662,3.11818,2.63636,3.53668,3.29641,3,3.61301


In [40]:
ratings_25

Unnamed: 0,userId,movieId,rating,timestamp
232,2,318,3.0,1445714835
233,2,333,4.0,1445715029
234,2,1704,4.5,1445715228
235,2,3578,4.0,1445714885
236,2,6874,4.0,1445714952
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [41]:
movies_temp = movies.copy()

In [42]:
for i in range(9742):
    movies_temp['genres'].iloc[i] = (movies_temp['genres'].iloc[i]).split('|')

In [43]:
result25 = pd.merge(movies_temp, ratings_25, left_on = 'movieId', right_on = 'movieId')

In [44]:
result25['predicted_rating'] = 0

In [45]:
result25 = result25.set_index('userId')

In [46]:
result25 = result25.ix[common]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [47]:
result25 = result25.reset_index()

In [56]:
result25

Unnamed: 0,userId,movieId,title,genres,rating,timestamp,predicted_rating
0,522,1200,Aliens (1986),"[Action, Adventure, Horror, Sci-Fi]",4.5,1449731613,3.875000
1,522,1214,Alien (1979),"[Horror, Sci-Fi]",5.0,1449731619,3.980392
2,522,55820,No Country for Old Men (2007),"[Crime, Drama]",4.5,1449724924,3.900000
3,522,97304,Argo (2012),"[Drama, Thriller]",2.5,1449724955,3.893382
4,522,106100,Dallas Buyers Club (2013),[Drama],0.5,1449724936,3.833333
...,...,...,...,...,...,...,...
1838,509,129229,Northmen - A Viking Saga (2014),"[Action, Adventure]",2.0,1435997905,3.300000
1839,509,130073,Cinderella (2015),"[Children, Drama, Fantasy, Romance]",4.0,1435997996,3.375776
1840,509,133419,Pitch Perfect 2 (2015),[Comedy],5.0,1435997941,3.159091
1841,509,136838,Kiss me Kismet (2006),"[Comedy, Romance]",2.5,1435998776,3.199029


In [107]:
result25[result25['userId'] == 15].movieId

0             1
209          47
649         260
740         293
773         296
          ...  
24086    152081
24257    158872
24372    160980
24573    166528
24609    166635
Name: movieId, Length: 119, dtype: int64

In [163]:
for i in result25[result25['userId'] == 15][result25['movieId'] == 1].genres:
    print(i)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']


  """Entry point for launching an IPython kernel.


In [93]:
result25[(result25['userId'] == 331)&(result25['movieId'] == 193609)].index[0]

25208

In [113]:
result25.set_value(result25[(result25['userId'] == 331)&(result25['movieId'] == 193609)].index[0], 'predicted_rating', 120)

  """Entry point for launching an IPython kernel.


Unnamed: 0,movieId,title,genres,userId,rating,timestamp,predicted_rating
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.5,1510577970,0
1,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",18,3.5,1455209816,0
2,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",50,3.0,1514238116,0
3,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",63,5.0,1443199669,0
4,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",73,4.5,1464196374,0
...,...,...,...,...,...,...,...
25204,193581,Black Butler: Book of the Atlantic (2017),"[Action, Animation, Comedy, Fantasy]",184,4.0,1537109082,0
25205,193583,No Game No Life: Zero (2017),"[Animation, Comedy, Fantasy]",184,3.5,1537109545,0
25206,193585,Flint (2017),[Drama],184,3.5,1537109805,0
25207,193587,Bungo Stray Dogs: Dead Apple (2018),"[Action, Animation]",184,3.5,1537110021,0


In [179]:
z = result25[result25['userId'] == 522][result25['movieId'] == 1214].genres
for i in list(z)[0]:
    print(i)

Horror
Sci-Fi


  """Entry point for launching an IPython kernel.


In [55]:
for user in result25['userId'].unique():
    for movie in result25[result25['userId'] == user].movieId:
        #print(user, movie)
        temp = 0
        value = 0
        for genre in list(result25[result25['userId'] == user][result25['movieId'] == movie].genres)[0]:
            try:
                value += usergenre_matrix_75[genre][user]*usergenre_count.loc[user, genre]['count']
                temp += usergenre_count.loc[user, genre]['count']
            except:
                pass
        print(value, temp)
        try:
            value /= temp
        except ZeroDivisionError:
            value = 0
        print(value)
        result25.loc[(result25['userId'] == user) & (result25['movieId'] == movie), 'predicted_rating'] = value
        #result25.set_value(result25[(result25['userId'] == user)&(result25['movieId'] == movie)].index[0], 'predicted_rating', float(value))

  


697.5 180
3.875
203.0 51
3.980392156862745
507.0 130
3.9
529.5 136
3.8933823529411766
322.0 84
3.8333333333333335
689.5 178
3.8735955056179776
256.5 70
3.664285714285714
36.5 15
2.433333333333333
22.5 8
2.8125
35.5 15
2.3666666666666667
62.0 20
3.1
67.5 20
3.375
24.0 6
4.0
49.5 13
3.8076923076923075
25.0 9
2.7777777777777777
18.0 4
4.5
34.5 13
2.6538461538461537
26.5 11
2.409090909090909
4.0 4
1.0
55.5 22
2.522727272727273
24.0 6
4.0
35.5 15
2.3666666666666667
27.0 11
2.4545454545454546
38.5 16
2.40625
35.5 15
2.3666666666666667
11.5 5
2.3
46.5 18
2.5833333333333335
33.5 11
3.0454545454545454
41.0 15
2.7333333333333334
59.0 22
2.6818181818181817
42.5 14
3.0357142857142856
41.0 15
2.7333333333333334
41.0 15
2.7333333333333334
36.0 11
3.272727272727273
33.5 11
3.0454545454545454
11.0 6
1.8333333333333333
24.0 6
4.0
46.5 18
2.5833333333333335
19.5 5
3.9
22.0 5
4.4
55.5 22
2.522727272727273
62.0 20
3.1
24.0 6
4.0
30.5 9
3.388888888888889
37.0 11
3.3636363636363638
35.5 12
2.958333333333333

9023.0 2660
3.392105263157895
4994.0 1523
3.279054497701904
3810.5 1177
3.2374681393372984
8676.0 2627
3.302626570232204
4315.5 1224
3.525735294117647
8126.0 2401
3.3844231570179093
8061.5 2339
3.446558358272766
9484.0 2793
3.3956319369853203
3347.5 1032
3.243701550387597
4334.0 1277
3.39389193422083
4315.5 1224
3.525735294117647
3558.5 1090
3.264678899082569
6224.0 1893
3.287902799788695
5705.0 1706
3.344079718640094
5269.5 1627
3.2387830362630607
7322.5 2221
3.296938316073841
4110.0 1240
3.314516129032258
4315.5 1224
3.525735294117647
4315.5 1224
3.525735294117647
2614.0 775
3.3729032258064517
6204.0 1806
3.435215946843854
4335.5 1311
3.307017543859649
7800.0 2361
3.3036848792884372
4449.0 1348
3.300445103857567
4638.5 1418
3.2711565585331455
5670.0 1722
3.292682926829268
8425.5 2464
3.419439935064935
5682.5 1727
3.2903879559930513
6183.5 1902
3.2510515247108307
4315.5 1224
3.525735294117647
4196.0 1274
3.293563579277865
8925.0 2605
3.4261036468330133
6518.5 1970
3.3088832487309645
9

1984.0 552
3.5942028985507246
257.5 65
3.9615384615384617
257.5 65
3.9615384615384617
1984.0 552
3.5942028985507246
1450.5 393
3.6908396946564888
1798.0 475
3.785263157894737
2202.5 591
3.72673434856176
835.5 229
3.648471615720524
2057.0 554
3.7129963898916967
2131.5 561
3.799465240641711
1164.0 326
3.5705521472392636
1247.0 327
3.8134556574923546
2697.5 720
3.7465277777777777
1560.0 423
3.6879432624113475
2748.5 731
3.7599179206566347
393.0 101
3.891089108910891
257.5 65
3.9615384615384617
257.5 65
3.9615384615384617
2347.0 655
3.5832061068702292
2254.5 631
3.5729001584786055
1730.0 485
3.5670103092783507
1247.0 327
3.8134556574923546
884.5 234
3.77991452991453
2131.5 561
3.799465240641711
33.0 8
4.125
1700.0 461
3.6876355748373104
1325.5 367
3.611716621253406
280.0 75
3.7333333333333334
2370.0 650
3.646153846153846
1247.0 327
3.8134556574923546
257.5 65
3.9615384615384617
3525.0 947
3.7222808870116157
1247.0 327
3.8134556574923546
321.5 90
3.5722222222222224
1416.5 386
3.669689119170

802.5 262
3.062977099236641
1981.0 695
2.850359712230216
3931.5 1297
3.031225905936777
2540.0 839
3.0274135876042907
0 0
0
2699.0 956
2.823221757322176
2448.0 822
2.978102189781022
3033.5 1027
2.9537487828627067
3677.0 1241
2.9629331184528604
2540.0 839
3.0274135876042907
2448.0 822
2.978102189781022
4988.0 1661
3.0030102347983143
1483.5 475
3.123157894736842
1710.5 584
2.9289383561643834
4042.0 1370
2.9503649635036497
2828.5 974
2.904004106776181
1483.5 475
3.123157894736842
2448.0 822
2.978102189781022
1056.5 364
2.9024725274725274
2448.0 822
2.978102189781022
3033.5 1027
2.9537487828627067
2833.5 942
3.0079617834394905
2860.0 969
2.951496388028896
1056.5 364
2.9024725274725274
4042.0 1370
2.9503649635036497
2304.5 793
2.9060529634300125
2448.0 822
2.978102189781022
3950.0 1353
2.919438285291944
2448.0 822
2.978102189781022
1483.5 475
3.123157894736842
1197.0 425
2.816470588235294
4344.5 1438
3.0212100139082056
824.0 313
2.63258785942492
2540.0 839
3.0274135876042907
1056.5 364
2.902

340.5 82
4.152439024390244
155.0 38
4.078947368421052
182.5 43
4.244186046511628
285.0 69
4.130434782608695
340.5 82
4.152439024390244
248.0 60
4.133333333333334
296.5 70
4.235714285714286
119.0 28
4.25
0 0
0
336.0 81
4.148148148148148
276.0 67
4.119402985074627
70.5 16
4.40625
92.5 21
4.404761904761905
75.5 18
4.194444444444445
281.0 66
4.257575757575758
155.0 38
4.078947368421052
97.5 22
4.431818181818182
187.0 44
4.25
75.5 18
4.194444444444445
213.5 50
4.27
155.0 38
4.078947368421052
72.0 18
4.0
72.0 18
4.0
157.0 36
4.361111111111111
70.5 16
4.40625
218.5 50
4.37
406.5 97
4.190721649484536
94.0 22
4.2727272727272725
240.0 57
4.2105263157894735
180.0 43
4.186046511627907
340.5 82
4.152439024390244
64.5 16
4.03125
274.5 65
4.223076923076923
269.0 65
4.138461538461539
257.5 60
4.291666666666667
339.5 79
4.2974683544303796
197.5 48
4.114583333333333
179.0 44
4.068181818181818
207.5 49
4.23469387755102
346.5 83
4.174698795180723
180.5 42
4.2976190476190474
159.0 36
4.416666666666667
290.

390.5 121
3.227272727272727
518.5 158
3.2816455696202533
458.0 143
3.202797202797203
571.5 176
3.247159090909091
646.0 195
3.312820512820513
303.0 94
3.223404255319149
208.5 66
3.159090909090909
460.5 139
3.3129496402877696
303.0 94
3.223404255319149
253.0 80
3.1625
937.5 283
3.312720848056537
459.0 142
3.232394366197183
369.0 109
3.385321100917431
329.5 101
3.262376237623762
571.5 167
3.4221556886227544
477.0 144
3.3125
778.0 234
3.324786324786325
424.0 131
3.236641221374046
430.5 131
3.286259541984733
528.0 161
3.279503105590062
551.5 168
3.2827380952380953
414.0 122
3.3934426229508197
379.5 110
3.45
629.0 187
3.3636363636363638
460.5 139
3.3129496402877696
742.0 218
3.403669724770642
467.5 137
3.4124087591240877
366.0 111
3.2972972972972974
500.0 154
3.2467532467532467
449.0 133
3.3759398496240602
253.0 80
3.1625
170.5 51
3.343137254901961
350.5 109
3.2155963302752295
571.5 176
3.247159090909091
584.5 173
3.378612716763006
336.0 103
3.262135922330097
462.0 144
3.2083333333333335
686

In [58]:
result25['predicted_rating'].nunique()

989