In [None]:
import pandas as pd
from math import sqrt
import numpy as np

In [None]:
# Load the movie catalogue (movieId, title, genres) and the user ratings table.
movies_df = pd.read_csv('/content/sample_data/movies.csv')
ratings_df = pd.read_csv('/content/sample_data/ratings.csv')
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
movies_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB
None


In [None]:
# (title, rating) pairs supplied by the user we want recommendations for.
rated_titles = [
    ('Four Musketeers, The (1974)', 4),
    ('xXx: Return of Xander Cage (2017)', 3.5),
    ('The Lego Batman Movie (2017)', 4),
    ('Superman/Shazam!: The Return of Black Adam (2010)', 3),
    ('Piku (2015)', 4.5),
]
# Same records in dict form, one {'title', 'rating'} entry per movie.
userInput = [{'title': movie_title, 'rating': score} for movie_title, score in rated_titles]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                               title  rating
0                        Four Musketeers, The (1974)     4.0
1                  xXx: Return of Xander Cage (2017)     3.5
2                       The Lego Batman Movie (2017)     4.0
3  Superman/Shazam!: The Return of Black Adam (2010)     3.0
4                                        Piku (2015)     4.5


In [None]:
# Catalogue rows whose title matches one of the movies the user rated.
wanted_titles = inputMovies['title'].tolist()
inputId = movies_df.loc[movies_df['title'].isin(wanted_titles)]
# The merge joins on the columns both frames share, attaching each rating to its
# catalogue row; only the columns needed downstream are kept.
inputMovies = inputId.merge(inputMovies)[['movieId', 'title', 'rating']]
print(inputMovies)

   movieId                                              title  rating
0     7720                        Four Musketeers, The (1974)     4.0
1   133389                                        Piku (2015)     4.5
2   167738                  xXx: Return of Xander Cage (2017)     3.5
3   167746                       The Lego Batman Movie (2017)     4.0
4   167780  Superman/Shazam!: The Return of Black Adam (2010)     3.0


In [None]:
# Ratings that other users gave to any of the movies in the input list.
rated_input_mask = ratings_df['movieId'].isin(inputMovies['movieId'].tolist())
userSubset = ratings_df.loc[rated_input_mask]
# Show how many ratings exist for each input movie.
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
7720         31      31         31
133389        3       3          3
167738       18      18         18
167746       63      63         63


In [None]:
# groupby splits the frame into one sub-frame per distinct userId.
# The key is passed as a scalar, not a one-element list: with a list key pandas
# warns on iteration and newer versions yield 1-tuples such as (2084,) as group
# names, which would later end up in the userId column and break the merges.
userSubsetGroup = userSubset.groupby('userId')

def take_5_elem(x):
    """Sort key for a ``(userId, frame)`` pair: the number of rows in the frame."""
    return len(x[1])


# Sort so that users with the most movies in common with the input come first.
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

# Only the first 100 users are compared against the input to bound the work.
userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(2084,         userId  movieId  rating     timestamp
302307    2084   167738     4.5  1.520946e+09
302308    2084   167746     4.5  1.499713e+09), (2403,         userId  movieId  rating     timestamp
347851    2403   167738     3.5  1.513371e+09
347852    2403   167746     3.5  1.513372e+09), (3180,         userId  movieId  rating     timestamp
465041    3180   167738     2.5  1.490856e+09
465043    3180   167746     3.5  1.486763e+09), (4019,         userId  movieId  rating     timestamp
590495    4019   133389     3.5  1.547325e+09
590825    4019   167746     2.5  1.547325e+09), (4688,         userId  movieId  rating     timestamp
687923    4688   167738     3.0  1.484988e+09
687924    4688   167746     5.0  1.487032e+09)]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [None]:
def _pearson_from_lists(xs, ys):
    """Return the Pearson correlation of two equally long lists of ratings.

    Computes r = Sxy / sqrt(Sxx * Syy), where Sxx, Syy and Sxy are the sums of
    squares / cross-products about the mean. Returns 0.0 when the correlation is
    undefined (no pairs, or one side has no variance).
    """
    n = len(xs)
    if n == 0:
        return 0.0

    sum_x = sum(xs)
    sum_y = sum(ys)
    Sxx = sum(i**2 for i in xs) - sum_x**2 / float(n)
    Syy = sum(j**2 for j in ys) - sum_y**2 / float(n)
    Sxy = sum(i*j for i, j in zip(xs, ys)) - sum_x*sum_y / float(n)

    # "<= 0" rather than "!= 0": floating-point rounding can leave a variance
    # term slightly negative, which would make sqrt() raise or return nonsense.
    if Sxx <= 0 or Syy <= 0:
        return 0.0

    # Clamp away rounding overshoot so the result always lies in [-1, 1].
    return max(-1.0, min(1.0, Sxy / sqrt(Sxx * Syy)))


# Pearson correlation per user: key is the user id, value is the coefficient.
pearsonCorrelationDict = {}

# Sorting the input does not depend on the loop variable, so it is done once here.
inputMovies = inputMovies.sort_values(by='movieId')
inputRatings = inputMovies[['movieId', 'rating']]

# For every user group in our subset
for name, group in userSubsetGroup:

    # Pair the two users' ratings by movieId instead of by list position, so a
    # length mismatch between the two sides can never silently misalign them.
    pairedRatings = group[['movieId', 'rating']].merge(
        inputRatings, on='movieId', suffixes=('_user', '_input')
    )

    pearsonCorrelationDict[name] = _pearson_from_lists(
        pairedRatings['rating_input'].tolist(),
        pairedRatings['rating_user'].tolist(),
    )

In [None]:
# One row per compared user: the dict keys become the index, the coefficients
# become the single data column.
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
# Move the user ids out of the index into a regular column and renumber the rows from 0.
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
print(pearsonDF.head())

   similarityIndex  userId
0              0.0    2084
1              0.0    2403
2              1.0    3180
3              1.0    4019
4              1.0    4688


In [None]:
# Rank users from most to least similar and keep the first 50.
ranked_users = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = ranked_users.head(50)
print(topUsers.head())

    similarityIndex  userId
2               1.0    3180
3               1.0    4019
4               1.0    4688
0               0.0    2084
64              0.0    4481


In [None]:
# Attach every rating made by each selected user (inner join on userId).
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating     timestamp
0               1.0    3180        1     4.0  1.471811e+09
1               1.0    3180        2     2.5  1.484512e+09
2               1.0    3180       10     2.0  1.471813e+09
3               1.0    3180       13     2.5  1.471958e+09
4               1.0    3180       15     1.0  1.505795e+09
..              ...     ...      ...     ...           ...
95              1.0    3180     1203     5.0  1.471811e+09
96              1.0    3180     1206     5.0  1.471813e+09
97              1.0    3180     1207     4.5  1.471812e+09
98              1.0    3180     1210     3.0  1.483028e+09
99              1.0    3180     1214     5.0  1.471811e+09

[100 rows x 5 columns]


In [None]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating     timestamp  weightedRating
0              1.0    3180        1     4.0  1.471811e+09             4.0
1              1.0    3180        2     2.5  1.484512e+09             2.5
2              1.0    3180       10     2.0  1.471813e+09             2.0
3              1.0    3180       13     2.5  1.471958e+09             2.5
4              1.0    3180       15     1.0  1.505795e+09             1.0


In [None]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                        2.0                 8.0
2                        2.0                 6.0
3                        0.0                 0.0
5                        0.0                 0.0
6                        1.0                 4.0


In [None]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                         4.00        1
2                                         3.00        2
3                                          NaN        3
5                                          NaN        5
6                                         4.00        6
7                                          NaN        7
10                                        2.75       10
11                                         NaN       11
12                                         NaN       12
13                                        2.50       13


In [None]:
# Highest predicted score first; rows with a NaN score sort to the end.
recommendation_df = recommendation_df.sort_values(
    by=['weighted average recommendation score'],
    ascending=False,
)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
106782                                     5.0   106782
1345                                       5.0     1345
34542                                      5.0    34542
134849                                     5.0   134849
733                                        5.0      733
...                                        ...      ...
206899                                     NaN   206899
207311                                     NaN   207311
207367                                     NaN   207367
207405                                     NaN   207405
207830                                     NaN   207830

[10309 rows x 2 columns]


In [None]:
score_column = 'weighted average recommendation score'

# Filtering movies_df with isin() would return rows in catalogue order and discard
# the scores, so the ranked scores are merged into the catalogue instead.
ranked_scores = (
    recommendation_df
    # movieId is both the index name and a column; drop the index so the merge key is unambiguous.
    .reset_index(drop=True)
    # A zero similarity sum yields NaN or +/-inf scores; neither is a usable prediction.
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=[score_column])
    .sort_values(by=score_column, ascending=False)
)

# We don't want to recommend a movie the user already rated.
ranked_scores = ranked_scores.loc[~ranked_scores['movieId'].isin(userSubset['movieId'])]

# An inner merge keeps the order of the left frame, i.e. best score first.
recommended_movie = ranked_scores.merge(movies_df, on='movieId', how='inner')
recommended_movie = recommended_movie[['movieId', 'title', 'genres', score_column]]

print(recommended_movie.head(20))

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
4            5  Father of the Bride Part II (1995)   
5            6                         Heat (1995)   
...        ...                                 ...   
61790   206899             Charlie's Angels (2019)   
61936   207311               Last Christmas (2019)   
61960   207367              Little Monsters (2019)   
61967   207405                 Doctor Sleep (2019)   
62088   207830        Terminator: Dark Fate (2019)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
4                                           Comedy  
5                            Action|Crime|Thriller  
...                              