In [3]:
import numpy as np
import pandas as pd

# Rating tables used by every later cell; the paths are relative to the
# notebook's working directory.
train_ratings = pd.read_csv('./data/ratings_train.csv')
test_ratings = pd.read_csv('./data/ratings_test.csv')

In [91]:
def get_movie_sim(train_ratings, fillMean=False):
    """Build a movie-movie cosine-similarity matrix from training ratings.

    Args:
        train_ratings: DataFrame with ``movieId``, ``userId`` and ``rating``
            columns, holding at most one rating per (movieId, userId) pair
            (``DataFrame.pivot`` raises on duplicated pairs).
        fillMean: how to fill movie/user pairs that have no rating.
            True fills each gap with that movie's mean rating;
            False (default) fills it with 0.

    Returns:
        Square DataFrame with ``movieId`` on both axes whose (i, j) entry is
        the cosine similarity between the rating rows of movies i and j.
        A movie whose filled row is all zeros has norm 0 and therefore
        yields NaN entries.
    """
    # Keyword arguments are required: pandas >= 2.0 no longer accepts the
    # arguments of pivot positionally.
    movie_user = train_ratings.pivot(index='movieId', columns='userId', values='rating')
    if fillMean:
        movie_means = movie_user.mean(axis=1)
        # Every column is indexed by movieId, so fillna aligns the means
        # with the movies and each gap receives its own movie's mean.
        movie_user = movie_user.apply(lambda user_col: user_col.fillna(movie_means))
    else:
        movie_user = movie_user.fillna(value=0)
    # L2 norm of each movie's rating row.
    norm = np.linalg.norm(movie_user, axis=1)
    # Scale every row to unit length; the dot products of unit rows are the
    # cosine similarities.
    unit_rows = movie_user.div(norm, axis=0)
    return unit_rows.dot(unit_rows.T)

def get_movie_sim_v2(train_ratings, fillMean=False):
    """Build a row-normalized movie-movie similarity matrix from training ratings.

    This is NOT a true cosine similarity: the dot product of movies i and j
    is divided only by the norm of movie i (the row), not by the norm of
    movie j as well. The result is therefore generally asymmetric and its
    diagonal holds each movie's norm rather than 1. ``get_movie_sim``
    computes the fully normalized variant.

    Args:
        train_ratings: DataFrame with ``movieId``, ``userId`` and ``rating``
            columns, holding at most one rating per (movieId, userId) pair
            (``DataFrame.pivot`` raises on duplicated pairs).
        fillMean: how to fill movie/user pairs that have no rating.
            True fills each gap with that movie's mean rating;
            False (default) fills it with 0.

    Returns:
        Square DataFrame with ``movieId`` on both axes whose (i, j) entry is
        ``dot(row_i, row_j) / norm(row_i)``.
    """
    # Keyword arguments are required: pandas >= 2.0 no longer accepts the
    # arguments of pivot positionally.
    movie_user = train_ratings.pivot(index='movieId', columns='userId', values='rating')
    if fillMean:
        movie_means = movie_user.mean(axis=1)
        # Every column is indexed by movieId, so fillna aligns the means
        # with the movies and each gap receives its own movie's mean.
        movie_user = movie_user.apply(lambda user_col: user_col.fillna(movie_means))
    else:
        movie_user = movie_user.fillna(value=0)
    # L2 norm of each movie's rating row.
    norm = np.linalg.norm(movie_user, axis=1)
    dot_products = movie_user.dot(movie_user.T)
    # Divide row i by norm(row_i) only (see the docstring).
    return dot_products.div(norm, axis=0)

def get_prediction(train_ratings, movie_sim, userId):
    """Predict one user's rating for every movie in the similarity matrix.

    Each prediction is the similarity-weighted sum of the user's training
    ratings divided by (sum of the similarities + 1).

    Args:
        train_ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns.
        movie_sim: square movie-movie similarity DataFrame labelled by
            movieId on both axes.
        userId: id of the user to predict for.

    Returns:
        DataFrame with a single ``rating`` column, indexed by the column
        labels of ``movie_sim``. An empty DataFrame is returned when the
        user has no training ratings or none of the rated movies has a
        usable row in ``movie_sim``.
    """
    user_ratings = train_ratings[train_ratings.userId == userId]
    # Unknown user: hand back the empty selection so callers can test `.empty`.
    if user_ratings.empty:
        return user_ratings
    # Keep only movies the similarity matrix knows about; `.loc` with a
    # missing label raises KeyError on current pandas.
    known = user_ratings[user_ratings.movieId.isin(movie_sim.index)]
    # Rows: movies the user rated; columns: all movies. Rows containing NaN
    # are discarded, as before.
    user_sim = movie_sim.loc[known.movieId.to_numpy(), :].dropna(how='any')
    if user_sim.empty:
        return user_ratings.iloc[0:0]
    # Ratings keyed by movieId, restricted to the rows that survived so the
    # dot product below is aligned.
    rated = pd.Series(known.rating.to_numpy(), index=known.movieId.to_numpy())
    rated = rated.loc[user_sim.index]
    weighted = user_sim.T.dot(rated)
    # The +1 keeps the denominator away from zero; note that it also changes
    # the scale of the predictions compared with a plain weighted average.
    sim_sum = user_sim.sum() + 1
    return (weighted / sim_sum).to_frame('rating')

def calc_rmse(train_ratings, test_ratings, movie_sim, userId):
    """Compute the RMSE of the predicted ratings for a single user.

    Args:
        train_ratings: training ratings passed on to ``get_prediction``.
        test_ratings: DataFrame with ``userId``, ``movieId`` and ``rating``
            columns holding the held-out ratings.
        movie_sim: movie-movie similarity matrix passed on to
            ``get_prediction``.
        userId: id of the user to evaluate.

    Returns:
        The RMSE as a float, or None when ``get_prediction`` returns nothing
        for the user or none of the user's test movies has a prediction.
    """
    predicted = get_prediction(train_ratings, movie_sim, userId)
    if predicted.empty:
        return None
    # Held-out ratings of this user, limited to movies that have a prediction.
    actual = test_ratings.loc[test_ratings.userId == userId, ['movieId', 'rating']]
    actual = actual[actual.movieId.isin(predicted.index)]
    if actual.empty:
        # Nothing to compare; avoids a 0/0 division.
        return None
    # Look predictions up by movieId so each error pairs the prediction and
    # the actual rating of the SAME movie. The predictions follow the order
    # of the similarity matrix while the test rows follow file order, so a
    # positional subtraction would mix movies up.
    predicted_for_actual = predicted.rating.reindex(actual.movieId.to_numpy()).to_numpy()
    errors = predicted_for_actual - actual.rating.to_numpy()
    return float(np.sqrt(np.mean(errors ** 2)))

def predict_movie_rate(train_ratings, movie_sim, userId, movieId):
    """Return the predicted rating of one movie for one user as text.

    ``userId`` and ``movieId`` may be ints or numeric strings; both are
    converted with ``int``. The result is the prediction formatted with four
    decimals, or the string 'unknown' (after printing a message) when
    ``get_prediction`` returns nothing for the user or has no entry for the
    movie.
    """
    predictions = get_prediction(train_ratings, movie_sim, int(userId))
    if predictions.empty:
        print("Cannon predict for userId=" + str(userId))
        return 'unknown'
    movie_key = int(movieId)
    if movie_key not in predictions.index:
        print("Cannot predict for movieId="+ str(movieId))
        return 'unknown'
    return '{:.4f}'.format(predictions.loc[movie_key, 'rating'])
# RMSE for user 73 with unrated entries filled by 0.
movie_sim = get_movie_sim(train_ratings, fillMean=False)
calc_rmse(train_ratings=train_ratings, test_ratings=test_ratings,
          movie_sim=movie_sim, userId=73)

0.7322648432687336

In [68]:
# RMSE for user 73 with unrated entries filled by each movie's mean rating.
movie_sim = get_movie_sim(train_ratings, fillMean=True)
calc_rmse(train_ratings=train_ratings, test_ratings=test_ratings,
          movie_sim=movie_sim, userId=73)

0.8103013004986394

In [65]:
# Back to the zero-filled similarity matrix; RMSE for user 73 again.
movie_sim = get_movie_sim(train_ratings, fillMean=False)
calc_rmse(train_ratings=train_ratings, test_ratings=test_ratings,
          movie_sim=movie_sim, userId=73)

0.7322648432687336

In [94]:
# (userId, movieId) pairs to score. They stay strings because
# predict_movie_rate converts them with int() itself.
ids = [['73', '7099'], ['15', '216'], ['15', '3265'], ['11', '785']]


def print_predictions(sim, pairs):
    """Print one `userId,movieId,rating` line per pair, using similarity matrix `sim`."""
    for user_id, movie_id in pairs:
        rate = predict_movie_rate(train_ratings, sim, user_id, movie_id)
        print('{},{},{}'.format(user_id, movie_id, rate))


# Same pairs scored with both similarity variants, one after the other.
movie_sim = get_movie_sim(train_ratings)
print_predictions(movie_sim, ids)

movie_sim2 = get_movie_sim_v2(train_ratings)
print_predictions(movie_sim2, ids)

73,7099,3.4139
15,216,2.7173
15,3265,2.9864
Cannon predict for userId=11
11,785,unknown
73,7099,3.4521
15,216,2.7349
15,3265,3.0117
Cannon predict for userId=11
11,785,unknown


In [None]:
def get_prediction(test_ratings, train_ratings, movie_sim, userId):
    """Predict a user's ratings from the test ratings of movies unseen in training.

    The user's test ratings for movies that the user did NOT rate in the
    training set (and that exist in ``movie_sim``) are used as weights: each
    prediction is the similarity-weighted sum of those ratings divided by
    (sum of the similarities + 1).

    NOTE(review): this redefines ``get_prediction(train_ratings, movie_sim,
    userId)`` from an earlier cell with a different parameter list;
    ``calc_rmse`` and ``predict_movie_rate`` call the three-argument form and
    will fail once this cell has been run.

    Args:
        test_ratings: DataFrame with ``userId``, ``movieId``, ``rating``.
        train_ratings: DataFrame with ``userId``, ``movieId``, ``rating``.
        movie_sim: square movie-movie similarity DataFrame labelled by
            movieId on both axes.
        userId: id of the user to predict for.

    Returns:
        DataFrame with a single ``rating`` column indexed by the column
        labels of ``movie_sim``; an empty DataFrame with that column when no
        usable test rating remains.
    """
    test_user_ratings = test_ratings[test_ratings.userId == userId]
    train_user_ratings = train_ratings[train_ratings.userId == userId]
    # `value in series` tests the Series' index labels, not its values, so
    # the comparison against the training movies goes through isin().
    unseen = test_user_ratings[~test_user_ratings.movieId.isin(train_user_ratings.movieId)]
    # Movies that never appeared while building movie_sim cannot be looked
    # up; `.loc` with a missing label raises KeyError on current pandas.
    unseen = unseen[unseen.movieId.isin(movie_sim.index)]
    # Rows: the remaining test movies; columns: all movies.
    user_sim = movie_sim.loc[unseen.movieId.to_numpy(), :].dropna(how='any')
    if user_sim.empty:
        return pd.DataFrame(columns=['rating'])
    # Ratings keyed by movieId and restricted to the surviving rows so the
    # dot product below is aligned.
    scores = pd.Series(unseen.rating.to_numpy(), index=unseen.movieId.to_numpy())
    scores = scores.loc[user_sim.index]
    weighted = user_sim.T.dot(scores)
    # The +1 keeps the denominator away from zero.
    sim_sum = user_sim.sum() + 1
    return (weighted / sim_sum).to_frame('rating')
# Try the four-argument get_prediction defined above on user 1.
movie_sim = get_movie_sim(train_ratings)
user_all_movie_ratings = get_prediction(test_ratings, train_ratings, movie_sim, userId=1)
# Unfinished manual RMSE check, kept for reference (not executed):
# test_user_ratings = test_ratings[test_ratings.userId == 73]
# error_predict = user_all_movie_ratings[user_all_movie_ratings.index.isin(test_user_ratings.index)]
# error_predict.shape
#error_square = (error_predict - test_user_ratings)**2
#error_square.div(197).sum()**(1/2)
#error_square.sum().div(197)**(1/2)

In [539]:
# Step-by-step prediction for user 547. The intermediate names are kept at
# module level because other cells read them.
user_ratings = train_ratings[train_ratings.userId == 547]
# One 'rating' column indexed by movieId.
user_rating = user_ratings.set_index('movieId')[['rating']]
# Similarity rows of the movies this user rated (rows) against all movies.
user_sim = movie_sim.loc[user_ratings.movieId, :]
# Per-movie similarity sum, +1 to keep the denominator away from zero.
sim_sum = (user_sim.sum() + 1).to_frame('rating')
unnorm_ratings = user_sim.T.dot(user_rating)
user_all_movie_ratings = unnorm_ratings.div(sim_sum)

# Earlier attempts, kept for reference (not executed):
# temp = user_sim.T 
# temp.dot(user_rating.T)

#recommendation = np.divide(np.matmul(user_sim.T, user_rating), np.add(sim_sum, 1).T)

In [2]:
def quadratic(a,b,c):
    """Return a one-argument function computing a*x**2 + b*x + c."""
    def evaluate(x):
        return a*(x**2) + b*x + c
    return evaluate

# 3*2**2 + 3*2 + 2
f = quadratic(3,3,2)
f(2)

20

In [84]:
# User ids from the training set, ordered by how many ratings each has
# (value_counts sorts by count, largest first).
train_ratings['userId'].value_counts().index

Int64Index([547, 564, 452, 468, 624,  15,  73, 311,  30, 294,
            ...
            296,  76, 221, 668, 249, 604, 540, 458, 637, 652],
           dtype='int64', length=547)


In [85]:
# Print the training users that also appear in the test set.
# `value in series` tests the Series' index labels (row numbers), not the
# user ids it holds, so membership is checked against a set of the values.
userIds = train_ratings.userId.value_counts().index
test_user_ids = set(test_ratings.userId)
for user_id in userIds:
    if user_id in test_user_ids:
        print(user_id)

547
564
452
468
624
15
73
311
30
294
380
509
580
212
472
388
23
518
461
232
102
306
119
654
358
575
105
353
529
587
596
165
195
384
463
605
665
607
19
285
150
405
268
242
514
130
17
199
574
111
48
346
187
128
407
77
598
355
243
537
430
534
313
585
95
561
608
239
220
247
460
312
577
387
292
427
431
439
88
426
562
500
648
480
295
240
442
157
236
177
408
22
412
373
152
367
471
466
558
303
214
345
125
57
394
253
4
185
501
597
363
41
94
627
584
434
86
283
118
99
34
342
265
344
189
78
328
646
418
350
61
428
595
533
21
83
510
81
93
235
520
390
487
647
626
516
159
255
75
659
254
396
641
609
417
582
297
120
548
33
219
525
527
550
592
531
148
182
559
602
309
656
282
528
496
523
245
175
92
201
68
386
441
110
362
168
8
671
234
603
263
169
391
502
493
381
43
536
416
85
497
36
217
623
67
160
370
188
5
599
614
49
248
385
196
420
20
63
621
330
132
103
545
161
649
590
7
224
222
291
447
453
551
124
422
569
70
281
164
216
414
137
121
293
59
530
143
96
202
2
198
555
288
617
594
321
492
146
58
307
401
271


In [89]:
# Training ratings of user 73.
train_ratings.loc[train_ratings['userId'] == 73]

Unnamed: 0,userId,movieId,rating
8143,73,1,5.0
8144,73,2,2.5
8145,73,6,4.5
8146,73,10,3.0
8147,73,15,2.5
...,...,...,...
9188,73,91500,4.0
9189,73,91630,4.0
9190,73,91658,4.0
9191,73,92309,3.0


In [517]:
# The 15 highest ratings in user_ratings (set by the cell that selects user 547).
user_ratings.sort_values(by='rating', ascending=False).head(15)

Unnamed: 0,userId,movieId,rating
62739,547,1950,5.0
62620,547,1299,5.0
62524,547,1104,5.0
62402,547,593,5.0
62403,547,594,5.0
62523,547,1103,5.0
63011,547,3060,5.0
62609,547,1282,5.0
62407,547,608,5.0
63096,547,3362,5.0


In [291]:
# Toy input: a user who gave 5 to movies 1, 2 and 3.
user_rating = pd.DataFrame({'rating': [5, 5, 5]},
                           index=pd.Index([1, 2, 3], name='movieId'))
# NOTE(review): `temp` is not defined in any visible cell; presumably it holds
# the movie_sim rows for movies 1-3. Confirm before re-running. `sim_sum`
# comes from the manual prediction cell.
temp.T.dot(user_rating).div(sim_sum)

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.175519
2,3.114411
3,3.052558
4,1.681405
5,2.410935
...,...
94018,0.435891
94478,0.435891
94677,0.435891
94777,1.083915
