In [1]:
import pandas as pd
from math import sqrt
import numpy as np

In [2]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [3]:
userInput = [{'title':'Jumanji (1995)', 'rating':2},
             {'title':'Toy Story (1995)', 'rating':1},
             {'title':'GoldenEye (1995)', 'rating':3},
             {'title':"Barney's Great Adventure (1998)", 'rating':5},
             {'title':'Akira (1988)', 'rating':4.5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                             title  rating
0                   Jumanji (1995)     2.0
1                 Toy Story (1995)     1.0
2                 GoldenEye (1995)     3.0
3  Barney's Great Adventure (1998)     5.0
4                     Akira (1988)     4.5


In [4]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                            title  rating
0        1                 Toy Story (1995)     1.0
1        2                   Jumanji (1995)     2.0
2       10                 GoldenEye (1995)     3.0
3     1274                     Akira (1988)     4.5
4     1826  Barney's Great Adventure (1998)     5.0


In [5]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1           215     215        215
2           110     110        110
10          132     132        132
1274         39      39         39
1826          1       1          1


In [6]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(91,        userId  movieId  rating   timestamp
14121      91        1     4.0  1112713037
14122      91        2     3.0  1112713392
14125      91       10     3.5  1112713269
14316      91     1274     5.0  1112713057), (219,        userId  movieId  rating   timestamp
31524     219        1     3.5  1194681084
31525     219        2     2.5  1194740185
31527     219       10     4.5  1194932162
31628     219     1274     2.5  1194686351), (274,        userId  movieId  rating   timestamp
39229     274        1     4.0  1171410158
39230     274        2     3.5  1171934785
39233     274       10     4.0  1171428459
39448     274     1274     4.0  1205707621), (414,        userId  movieId  rating  timestamp
62294     414        1     4.0  961438127
62295     414        2     3.0  961594981
62301     414       10     3.0  961515863
62769     414     1274     4.0  997200022), (434,        userId  movieId  rating   timestamp
67080     434        1     4.0  1270604402
67081     434        

  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [7]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0
    

In [8]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0         0.604611      91
1        -0.204037     219
2         0.279073     274
3         0.096674     414
4         0.563061     434


In [9]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
84              1.0     323
13              1.0      82
55              1.0      50
60              1.0     107
63              1.0     119


In [10]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     323        1     3.5  1422640363
1               1.0     323        2     4.0  1422640110
2               1.0     323       17     3.5  1422640288
3               1.0     323       19     2.5  1422640116
4               1.0     323       22     3.0  1422640551
..              ...     ...      ...     ...         ...
95              1.0     323   115617     3.5  1422640828
96              1.0     323   116797     4.5  1422640799
97              1.0     323   117176     4.0  1422640837
98              1.0      82        1     2.5  1084467729
99              1.0      82        2     3.0  1084465035

[100 rows x 5 columns]


In [11]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     323        1     3.5  1422640363             3.5
1              1.0     323        2     4.0  1422640110             4.0
2              1.0     323       17     3.5  1422640288             3.5
3              1.0     323       19     2.5  1422640116             2.5
4              1.0     323       22     3.0  1422640551             3.0


In [12]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                  20.077929           64.451713
2                  17.487628           58.622014
3                   5.575288           13.655103
4                   0.453921            0.680881
5                   3.416620           11.060200


In [13]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.210078        1
2                                     3.352199        2
3                                     2.449219        3
4                                     1.500000        4
5                                     3.237176        5
6                                     4.084853        6
7                                     2.530171        7
8                                     3.000000        8
9                                     1.500000        9
10                                    3.952307       10


In [14]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
26073                                      5.0    26073
3246                                       5.0     3246
1111                                       5.0     1111
6442                                       5.0     6442
84273                                      5.0    84273
...                                        ...      ...
184253                                     NaN   184253
184471                                     NaN   184471
185029                                     NaN   185029
185435                                     NaN   185435
187593                                     NaN   187593

[6151 rows x 2 columns]


In [15]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                  title  \
2           3                Grumpier Old Men (1995)   
3           4               Waiting to Exhale (1995)   
4           5     Father of the Bride Part II (1995)   
5           6                            Heat (1995)   
6           7                         Sabrina (1995)   
...       ...                                    ...   
9713   188301            Ant-Man and the Wasp (2018)   
9714   188675                          Dogman (2018)   
9717   188833  The Man Who Killed Don Quixote (2018)   
9721   189381                        SuperFly (2018)   
9724   190183               The Darkest Minds (2018)   

                                      genres  
2                             Comedy|Romance  
3                       Comedy|Drama|Romance  
4                                     Comedy  
5                      Action|Crime|Thriller  
6                             Comedy|Romance  
...                                      ... 