In [71]:
import pandas as pd
import numpy as np
from math import sqrt

In [72]:
# Read the two input tables from CSV files; the paths are relative to the
# current working directory.
user = pd.read_csv("selected_user_ratings.csv")
movie = pd.read_csv("movie_title.csv")
# Leave the ratings frame as the cell's last expression so the notebook renders it.
user

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,296,3.0,964982967
2,1,2000,4.0,964982211
3,4,296,1.0,945173350
4,4,1968,4.0,986934786
...,...,...,...,...
872,610,1,5.0,1479542900
873,610,296,5.0,1479545817
874,610,1274,5.0,1493846688
875,610,1968,4.0,1493850238


In [73]:
# Titles rated by the active user, paired position-by-position with the
# rating given to each one.
ratedTitles = ['Breakfast Club, The', 'Toy Story', 'Jumanji', 'Pulp Fiction', 'Akira']
givenRatings = [5, 1, 1, 5, 4.5]

# One {'title', 'rating'} record per movie, then a DataFrame with one row per record.
userInput = [{'title': title, 'rating': rating}
             for title, rating in zip(ratedTitles, givenRatings)]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                 title  rating
0  Breakfast Club, The     5.0
1            Toy Story     1.0
2              Jumanji     1.0
3         Pulp Fiction     5.0
4                Akira     4.5


In [74]:
# Catalogue rows whose title matches one of the titles the active user rated.
inputId = movie[movie['title'].isin(inputMovies['title'].tolist())]

# Attach the catalogue information to the user's ratings. No explicit key is
# given, so pandas joins on every column the two frames share.
inputMovies = pd.merge(inputId, inputMovies)

# The release year is not needed for the recommendation. `columns=` is used
# because passing the axis positionally (`drop('year', 1)`) is deprecated and
# rejected by pandas 2.x.
inputMovies = inputMovies.drop(columns='year')

# Fix the column order for display and for the cells that follow.
inputMovies = inputMovies[['movieId', 'title', 'rating']]
print(inputMovies)

   movieId                title  rating
0        1            Toy Story     1.0
1        2              Jumanji     1.0
2      296         Pulp Fiction     5.0
3     1274                Akira     4.5
4     1968  Breakfast Club, The     5.0


  inputMovies = inputMovies.drop('year', 1)


In [75]:
# Keep only the ratings that concern a movie the active user has also rated.
inputMovieIds = inputMovies['movieId'].tolist()
ratedSameMovie = user['movieId'].isin(inputMovieIds)
userSubset = user[ratedSameMovie]

# Per movie, count the non-null entries in each remaining column.
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1           215     215        215
2           110     110        110
296         307     307        307
1274         39      39         39
1968        113     113        113


In [76]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter.
#The column is passed as a plain string, not a one-element list: with a list, pandas 2.x yields
#1-tuple keys such as (91,) when the groups are iterated, and those tuples would then be used as
#user ids by the cells below.
userSubsetGroup = userSubset.groupby('userId')

def take_5_elem(x):
    """Return the number of rows in a ``(key, DataFrame)`` groupby pair.

    Despite its name, this does not take five elements; it is the sort key
    used below to rank users by how many of the input movies they rated.
    """
    return len(x[1])


#Sorting it so users with movie most in common with the input will have priority.
#sorted() is stable, so users with the same count keep the order in which groupby produced them.
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

#Only the first 100 users are kept as candidates for the similarity computation
userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(91,      userId  movieId  rating   timestamp
119      91        1     4.0  1112713037
120      91        2     3.0  1112713392
121      91      296     4.5  1112711264
122      91     1274     5.0  1112713057
123      91     1968     3.0  1112713409), (177,      userId  movieId  rating   timestamp
242     177        1     5.0  1435533535
243     177        2     3.5  1435534109
244     177      296     5.0  1435530409
245     177     1274     2.0  1435535036
246     177     1968     3.5  1435534080), (219,      userId  movieId  rating   timestamp
306     219        1     3.5  1194681084
307     219        2     2.5  1194740185
308     219      296     4.0  1198522553
309     219     1274     2.5  1194686351
310     219     1968     3.0  1194931899), (274,      userId  movieId  rating   timestamp
377     274        1     4.0  1171410158
378     274        2     3.5  1171934785
379     274      296     5.0  1171493995
380     274     1274     4.0  1205707621
381     274     1968     4.

  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [77]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#Sort the input once, before the loop: it is the same for every user, so re-sorting it on each iteration is wasted work
inputMovies = inputMovies.sort_values(by='movieId')

#For every user group in our subset
for name, group in userSubsetGroup:

    #Pair the two users' ratings movie by movie. Joining on movieId (instead of zipping two
    #separately built lists) guarantees that every input rating is compared with the rating the
    #other user gave to the same movie, even when one side has extra or repeated rows.
    #The group is sorted first so the pairs come out in ascending movieId order.
    pairedRatings = group.sort_values(by='movieId').merge(
        inputMovies[['movieId', 'rating']],
        on='movieId',
        suffixes=('_group', '_input'),
    )

    #Get the N for the formula
    nRatings = len(pairedRatings)

    #Without a movie in common there is nothing to correlate, and N = 0 would divide by zero below
    if nRatings == 0:
        pearsonCorrelationDict[name] = 0
        continue

    #Review scores of the input user for the movies that they both have in common
    tempRatingList = pairedRatings['rating_input'].tolist()

    #Review scores of the current user group for the same movies, in the same order
    tempGroupList = pairedRatings['rating_group'].tolist()

    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #Only divide when both sums of squares are strictly positive, else, 0 correlation.
    #Testing "> 0" rather than "!= 0" also keeps a tiny negative value caused by floating-point
    #rounding from reaching sqrt(), which would raise a ValueError.
    if Sxx > 0 and Syy > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    


In [78]:
# Turn the {userId: coefficient} dictionary into a two-column table:
# one row per user, the dictionary keys first landing in the index.
similarityFrame = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
similarityFrame.columns = ['similarityIndex']

# Copy the user ids out of the index into a regular column, then replace the
# index with a plain 0..n-1 numbering.
similarityFrame['userId'] = similarityFrame.index
pearsonDF = similarityFrame.reset_index(drop=True)
print(pearsonDF.head())

   similarityIndex  userId
0         0.351124      91
1        -0.254967     177
2         0.199967     219
3         0.616658     274
4         0.916619     298


In [79]:
# Rank the candidate users from most to least similar and keep the first 50 rows.
rankedUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = rankedUsers.iloc[:50]
print(topUsers.head())

    similarityIndex  userId
49              1.0     160
43              1.0     132
70              1.0     373
82              1.0     489
63              1.0     305


In [80]:
# Attach every rating made by the selected users to their similarity score.
# The join key carries the same name on both sides, so the result holds a
# single userId column.
topUsersRating = pd.merge(topUsers, user, how='inner', on='userId')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0          1.000000     160        1     4.0   971115026
1          1.000000     160        2     4.0   971619578
2          1.000000     160      296     5.0   971113194
3          1.000000     160     2000     4.0   971116347
4          1.000000     132        1     2.0  1157921785
..              ...     ...      ...     ...         ...
95         0.807395     414        2     3.0   961594981
96         0.807395     414      296     5.0   961516693
97         0.807395     414     1274     4.0   997200022
98         0.807395     414     1968     5.0   961517252
99         0.807395     414     2000     3.0   961437472

[100 rows x 5 columns]


In [81]:
# Weight each rating by the similarity of the user who gave it.
similarityWeights = topUsersRating['similarityIndex']
rawRatings = topUsersRating['rating']
topUsersRating['weightedRating'] = similarityWeights.mul(rawRatings)
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     160        1     4.0   971115026             4.0
1              1.0     160        2     4.0   971619578             4.0
2              1.0     160      296     5.0   971113194             5.0
3              1.0     160     2000     4.0   971116347             4.0
4              1.0     132        1     2.0  1157921785             2.0


In [82]:
# Per movie, add up the similarity scores and the similarity-weighted ratings
# of all selected users who rated it.
perMovieTotals = topUsersRating.groupby('movieId').sum()
tempTopUsersRating = perMovieTotals[['similarityIndex', 'weightedRating']]

# Prefix both columns with "sum_" to make clear they are totals.
tempTopUsersRating.columns = ['sum_' + column for column in ('similarityIndex', 'weightedRating')]
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  33.529084          114.442383
2                  27.842790           87.478362
296                35.450198          159.672967
1274               10.413839           44.111899
1968               25.705969          108.005467


In [83]:
#Weighted average per movie: total weighted rating divided by total similarity
weightedAverage = tempTopUsersRating['sum_weightedRating'].div(tempTopUsersRating['sum_similarityIndex'])

#Build the recommendation table from that score, indexed by movieId like the totals it came from
recommendation_df = weightedAverage.to_frame(name='weighted average recommendation score')

#Expose the movieId as a regular column as well
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.413227        1
2                                     3.141868        2
296                                   4.504149      296
1274                                  4.235892     1274
1968                                  4.201571     1968
2000                                  3.681548     2000
2004                                  3.221759     2004
2007                                  3.147320     2007


In [84]:
# Order the movies from the highest to the lowest recommendation score.
scoreColumn = 'weighted average recommendation score'
recommendation_df = recommendation_df.sort_values(by=scoreColumn, ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
296                                   4.504149      296
1274                                  4.235892     1274
1968                                  4.201571     1968
2000                                  3.681548     2000
1                                     3.413227        1
2004                                  3.221759     2004
2007                                  3.147320     2007
2                                     3.141868        2


In [85]:
# A catalogue row is recommended when the movie received a score...
hasScore = movie['movieId'].isin(recommendation_df['movieId'])

# ...and is not one of the movies the active user rated already
# (we don't want to recommend the same movie).
alreadyRated = movie['movieId'].isin(userSubset['movieId'])

recommended_movie = movie.loc[hasScore & ~alreadyRated]

print(recommended_movie)

      movieId                      title  year
1475     2000              Lethal Weapon  1987
1479     2004  Gremlins 2: The New Batch  1990
1482     2007             Polish Wedding  1998
