In [48]:
import pandas as pd
from math import sqrt
import numpy as np

In [49]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.info())
print(ratings_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


In [50]:
userInput = [{'title':'Andrew Dice Clay: Dice Rules (1991)','rating':5},
             {'title':'Toy Story (1995)', 'rating':3.5},
             {'title':'Jumanji (1995)', 'rating':4.5},
             {'title':'No Game No Life: Zero (2017)', 'rating':5},
             {'title':'Bungo Stray Dogs: Dead Apple (2018)', 'rating':4.5}
            ]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                 title  rating
0  Andrew Dice Clay: Dice Rules (1991)     5.0
1                     Toy Story (1995)     3.5
2                       Jumanji (1995)     4.5
3         No Game No Life: Zero (2017)     5.0
4  Bungo Stray Dogs: Dead Apple (2018)     4.5


In [51]:

inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
inputMovies = inputMovies[['movieId','title','rating']]
print(inputMovies)

   movieId                                title  rating
0        1                     Toy Story (1995)     3.5
1        2                       Jumanji (1995)     4.5
2   193583         No Game No Life: Zero (2017)     5.0
3   193587  Bungo Stray Dogs: Dead Apple (2018)     4.5
4   193609  Andrew Dice Clay: Dice Rules (1991)     5.0


In [52]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1           215     215        215
2           110     110        110
193583        1       1          1
193587        1       1          1
193609        1       1          1


In [53]:
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[((18,),       userId  movieId  rating   timestamp
1772      18        1     3.5  1455209816
1773      18        2     3.0  1455617462), ((19,),       userId  movieId  rating  timestamp
2274      19        1     4.0  965705637
2275      19        2     3.0  965704331), ((21,),       userId  movieId  rating   timestamp
3219      21        1     3.5  1407618878
3220      21        2     3.5  1419795031), ((27,),       userId  movieId  rating  timestamp
4059      27        1     3.0  962685262
4060      27        2     4.0  962685711), ((68,),        userId  movieId  rating   timestamp
10360      68        1     2.5  1158531426
10361      68        2     2.5  1158532776)]


In [54]:
userSubsetGroup

[((18,),
        userId  movieId  rating   timestamp
  1772      18        1     3.5  1455209816
  1773      18        2     3.0  1455617462),
 ((19,),
        userId  movieId  rating  timestamp
  2274      19        1     4.0  965705637
  2275      19        2     3.0  965704331),
 ((21,),
        userId  movieId  rating   timestamp
  3219      21        1     3.5  1407618878
  3220      21        2     3.5  1419795031),
 ((27,),
        userId  movieId  rating  timestamp
  4059      27        1     3.0  962685262
  4060      27        2     4.0  962685711),
 ((68,),
         userId  movieId  rating   timestamp
  10360      68        1     2.5  1158531426
  10361      68        2     2.5  1158532776),
 ((82,),
         userId  movieId  rating   timestamp
  12730      82        1     2.5  1084467729
  12731      82        2     3.0  1084465035),
 ((91,),
         userId  movieId  rating   timestamp
  14121      91        1     4.0  1112713037
  14122      91        2     3.0  111271339

In [55]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [56]:
print(pearsonCorrelationDict)

{(18,): -1.0, (19,): -1.0, (21,): 0, (27,): 1.0, (68,): 0, (82,): 1.0, (91,): -1.0, (93,): 1.0, (103,): 0, (107,): 1.0, (112,): -1.0, (135,): -1.0, (140,): 1.0, (144,): -1.0, (153,): 0, (160,): 0, (169,): -1.0, (177,): -1.0, (184,): 0, (186,): 0, (202,): 0, (217,): -1.0, (219,): -1.0, (226,): -1.0, (232,): 1.0, (240,): 0, (249,): 0, (274,): -1.0, (276,): 0, (288,): -1.0, (298,): -1.0, (304,): -1.0, (307,): -1.0, (322,): -1.0, (323,): 1.0, (330,): -1.0, (347,): -1.0, (353,): -1.0, (357,): -1.0, (359,): -1.0, (373,): 0, (380,): 0, (381,): 1.0, (411,): -1.0, (414,): -1.0, (432,): 1.0, (434,): -1.0, (436,): 0, (448,): -1.0, (470,): -1.0, (474,): -1.0, (476,): 0, (477,): 0, (480,): 0, (483,): 0, (484,): -1.0, (517,): -1.0, (525,): -1.0, (534,): 1.0, (559,): -1.0, (561,): 0, (570,): -1.0, (573,): -1.0, (590,): -1.0, (599,): -1.0, (600,): 1.0, (604,): 1.0, (605,): -1.0, (608,): -1.0, (1,): 0, (5,): 0, (6,): 0, (7,): 0, (8,): 0, (15,): 0, (17,): 0, (20,): 0, (31,): 0, (32,): 0, (33,): 0, (40,)

In [57]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF['userId'] = pearsonDF['userId'].apply(func=lambda x : x[0])
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0             -1.0      18
1             -1.0      19
2              0.0      21
3              1.0      27
4              0.0      68


In [58]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
12              1.0     140
24              1.0     232
34              1.0     323
58              1.0     534
42              1.0     381


In [60]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     140        1     3.0   942924980
1               1.0     140        2     3.5  1085569813
2               1.0     140        6     5.0   942843185
3               1.0     140       11     4.0   949667337
4               1.0     140       21     4.0   949666898
..              ...     ...      ...     ...         ...
95              1.0     140      912     4.0  1085569768
96              1.0     140      914     4.0   942841652
97              1.0     140      919     4.0  1085569784
98              1.0     140      920     5.0  1055092887
99              1.0     140      924     4.0  1021898943

[100 rows x 5 columns]


In [61]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     140        1     3.0   942924980             3.0
1              1.0     140        2     3.5  1085569813             3.5
2              1.0     140        6     5.0   942843185             5.0
3              1.0     140       11     4.0   949667337             4.0
4              1.0     140       21     4.0   949666898             4.0


In [62]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                       12.0                38.0
2                       12.0                50.0
3                        0.0                 0.0
4                        1.0                 1.5
5                        3.0                 9.5


In [63]:
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.166667        1
2                                     4.166667        2
3                                          NaN        3
4                                     1.500000        4
5                                     3.166667        5
6                                     3.833333        6
7                                     3.500000        7
8                                          NaN        8
10                                    3.625000       10
11                                    4.500000       11


In [66]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
2936                                       5.0     2936
1199                                       5.0     1199
366                                        5.0      366
4225                                       5.0     4225
2565                                       5.0     2565
...                                        ...      ...
193579                                     NaN   193579
193581                                     NaN   193581
193583                                     NaN   193583
193585                                     NaN   193585
193587                                     NaN   193587

[4404 rows x 2 columns]


In [70]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie.head(5))

   movieId                               title                 genres
2        3             Grumpier Old Men (1995)         Comedy|Romance
3        4            Waiting to Exhale (1995)   Comedy|Drama|Romance
4        5  Father of the Bride Part II (1995)                 Comedy
5        6                         Heat (1995)  Action|Crime|Thriller
6        7                      Sabrina (1995)         Comedy|Romance
