In [None]:
import pandas as pd
from math import sqrt
import numpy as np
from  google.colab import drive

In [None]:
# Locations of the two input tables.
# NOTE(review): these paths assume Google Drive is already mounted at
# /content/drive; no mount call is visible in this section -- confirm.
MOVIES_CSV = '/content/drive/MyDrive/movies.csv'
RATINGS_CSV = '/content/drive/MyDrive/ratings.csv'

# Read the movie catalogue and the ratings table.
movies_df = pd.read_csv(MOVIES_CSV)
ratings_df = pd.read_csv(RATINGS_CSV)

# Print column names, non-null counts and dtypes of the ratings table.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [None]:
# The input user's profile as (title, rating) pairs.
rated_titles = [
    ('Teenage Mutant Ninja Turtles: Out of the Shadows (2016)', 4.0),
    ('Jesus Christ Vampire Hunter (2001)', 1.0),
    ('Kung Fu Panda 3 (2016)', 5.0),
    ('Akeelah and the Bee (2006)', 3.0),
    ('Die Hard (1988)', 5.0),
]

# Same profile as a list of records, one dict per rated movie.
userInput = [{'title': title, 'rating': rating} for title, rating in rated_titles]

# Tabular form of the profile: one row per movie, columns 'title' and 'rating'.
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                               title  rating
0  Teenage Mutant Ninja Turtles: Out of the Shado...     4.0
1                 Jesus Christ Vampire Hunter (2001)     1.0
2                             Kung Fu Panda 3 (2016)     5.0
3                         Akeelah and the Bee (2006)     3.0
4                                    Die Hard (1988)     5.0


In [None]:
# Catalogue rows whose title exactly matches one of the input user's picks.
wanted_titles = inputMovies['title'].tolist()
inputId = movies_df.loc[movies_df['title'].isin(wanted_titles)]

# Join on the columns the two frames share so every pick gains its movieId,
# then keep only the three columns printed below.
inputMovies = inputId.merge(inputMovies)[['movieId', 'title', 'rating']]
print(inputMovies)


   movieId                                              title  rating
0     1036                                    Die Hard (1988)     5.0
1    27595                 Jesus Christ Vampire Hunter (2001)     1.0
2    44709                         Akeelah and the Bee (2006)     3.0
3   149406                             Kung Fu Panda 3 (2016)     5.0
4   159690  Teenage Mutant Ninja Turtles: Out of the Shado...     4.0


In [None]:
# Ratings rows (from any user) for the movies the input user has rated.
input_movie_ids = inputMovies['movieId'].tolist()
userSubset = ratings_df.loc[ratings_df['movieId'].isin(input_movie_ids)]

# Number of rating rows available for each of the input movies.
ratings_per_movie = userSubset.groupby('movieId').count()
print(ratings_per_movie)


         userId  rating  timestamp
movieId                           
1036        145     145        145
27595         1       1          1
44709         5       5          5
149406        8       8          8
159690        1       1          1


In [None]:
# Group the overlapping ratings by user. The grouper is passed as a plain
# column label rather than a one-element list: with a list, newer pandas
# yields 1-tuples such as (448,) as group keys when iterating, which would
# end up stored as the user ids further down.
userSubsetGroup = userSubset.groupby('userId')

def take_5_elem(x):
    """Sort key: number of rating rows in a ``(userId, ratings_frame)`` pair.

    x is one item produced by iterating the groupby above; x[1] is that
    user's ratings frame, so its length is how many of the input movies the
    user has rated.
    """
    return len(x[1])

# Users sharing the most movies with the input user come first.
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

# Keep at most the first 100 users of that ordering.
userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[(448,        userId  movieId  rating   timestamp
68802     448     1036     5.0  1019124211
70450     448   149406     3.0  1483989104
70495     448   159690     2.0  1490638075), (50,       userId  movieId  rating   timestamp
7136      50     1036     3.5  1514238054
7389      50   149406     2.0  1514239641), (68,        userId  movieId  rating   timestamp
10528      68     1036     3.0  1158531885
11276      68    44709     4.0  1261085771), (140,        userId  movieId  rating   timestamp
21195     140     1036     4.0   942841715
21679     140    44709     4.0  1166645423), (232,        userId  movieId  rating   timestamp
33950     232     1036     4.5  1218164976
34492     232    44709     3.0  1206992657)]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [None]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

def _pearson_similarity(xRatings, yRatings):
    """Pearson correlation of two equally long, pairwise-aligned rating lists.

    Uses the computational form
        r = Sxy / sqrt(Sxx * Syy)
    with Sxx = sum(x^2) - (sum x)^2 / n, and Syy, Sxy defined analogously.

    Returns 0.0 when there are no pairs or when either list has no variance
    (the denominator would be zero). The result is clamped to [-1, 1] so
    floating-point rounding cannot push it outside the valid range.
    """
    n = len(xRatings)
    if n == 0:
        return 0.0

    sumX = sum(xRatings)
    sumY = sum(yRatings)
    Sxx = sum(i**2 for i in xRatings) - sumX**2/float(n)
    Syy = sum(j**2 for j in yRatings) - sumY**2/float(n)
    Sxy = sum(i*j for i, j in zip(xRatings, yRatings)) - sumX*sumY/float(n)

    #"<= 0" rather than "!= 0": rounding can leave a tiny negative variance,
    #which would make sqrt() raise a math domain error.
    if Sxx <= 0 or Syy <= 0:
        return 0.0
    return max(-1.0, min(1.0, Sxy/sqrt(Sxx*Syy)))

#Sorting the input does not depend on the current user, so do it once up front
inputMovies = inputMovies.sort_values(by='movieId')

#For every user group in our subset
for name, group in userSubsetGroup:

    #A groupby built from a one-element list yields 1-tuples as keys on newer pandas; unwrap to the plain id
    userKey = name[0] if isinstance(name, tuple) else name

    #Sort the current user's ratings so pairs are processed in movieId order
    group = group.sort_values(by='movieId')

    #Pair each of this user's ratings with the input user's rating of the SAME movie.
    #Joining on movieId guarantees the alignment instead of relying on two
    #independently sorted lists happening to have the same length.
    paired = group.merge(inputMovies[['movieId', 'rating']], on='movieId',
                         suffixes=('_user', '_input'))

    #x is the input user, y is the current user (same roles as in the formula above)
    pearsonCorrelationDict[userKey] = _pearson_similarity(
        paired['rating_input'].tolist(),
        paired['rating_user'].tolist(),
    )

In [None]:
# One row per user, indexed by the dictionary key, with the coefficient as
# the single column.
similarity_by_user = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
similarity_by_user.columns = ['similarityIndex']

# Copy the user id out of the index into its own column, then replace the
# index with a plain 0..n-1 range.
pearsonDF = (
    similarity_by_user
    .assign(userId=similarity_by_user.index)
    .reset_index(drop=True)
)
print(pearsonDF.head())

   similarityIndex  userId
0         0.755929     448
1         0.000000      50
2        -1.000000      68
3         0.000000     140
4         1.000000     232


In [None]:
# Only positively correlated users are useful neighbours. A later cell
# divides the summed weighted ratings by the summed similarities, so:
#   * zero-similarity users contribute nothing but create 0/0 = NaN scores for
#     movies that only they rated;
#   * negative similarities can cancel positive ones, giving a zero or
#     negative denominator and therefore infinite or sign-flipped scores.
positiveUsers = pearsonDF[pearsonDF['similarityIndex'] > 0]

# Up to 50 most similar of the remaining users, best first.
topUsers = positiveUsers.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

    similarityIndex  userId
4          1.000000     232
5          1.000000     414
9          1.000000     599
0          0.755929     448
64         0.000000     231


In [None]:
# Attach every rating made by each selected user: one output row per
# (selected user, rated movie) pair, carrying that user's similarityIndex.
topUsersRating = pd.merge(
    topUsers,
    ratings_df,
    how='inner',
    left_on='userId',
    right_on='userId',
)
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     232        1     3.5  1076955621
1               1.0     232        2     4.0  1085351710
2               1.0     232       10     3.0  1218167397
3               1.0     232       39     3.0  1182909940
4               1.0     232       47     4.5  1241823324
..              ...     ...      ...     ...         ...
95              1.0     232     2059     3.0  1182909401
96              1.0     232     2078     3.0  1076956629
97              1.0     232     2080     2.5  1182910401
98              1.0     232     2081     2.5  1182910383
99              1.0     232     2082     3.0  1182909518

[100 rows x 5 columns]


In [None]:
#Weight each rating by the similarity of the user who gave it
similarity_weights = topUsersRating['similarityIndex']
topUsersRating['weightedRating'] = similarity_weights.mul(topUsersRating['rating'])
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     232        1     3.5  1076955621             3.5
1              1.0     232        2     4.0  1085351710             4.0
2              1.0     232       10     3.0  1218167397             3.0
3              1.0     232       39     3.0  1182909940             3.0
4              1.0     232       47     4.5  1241823324             4.5


In [None]:
#Per movie: total similarity of the users who rated it, and the total of their similarity-weighted ratings
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                   3.755929           14.279645
2                   3.755929           11.767787
3                   2.755929            7.767787
5                   1.755929            4.267787
6                   2.000000            7.500000


In [None]:
#Weighted average per movie: summed weighted ratings divided by summed similarities
weighted_average = tempTopUsersRating['sum_weightedRating'].div(
    tempTopUsersRating['sum_similarityIndex']
)

#Build the result frame from that series (it stays indexed by movieId) ...
recommendation_df = weighted_average.to_frame(name='weighted average recommendation score')

#... and also expose the movieId as a regular column
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.801894        1
2                                     3.133123        2
3                                     2.818573        3
5                                     2.430501        5
6                                     3.750000        6
7                                     2.750000        7
8                                     3.000000        8
9                                     1.500000        9
10                                    3.334386       10
11                                    3.750000       11


In [None]:
score_column = 'weighted average recommendation score'

# Highest-scoring movies first.
recommendation_df = recommendation_df.sort_values(by=score_column, ascending=False)

# Keep only the movies whose score is strictly above 4.9.
scores_above_cutoff = recommendation_df[score_column] > 4.9
recommendation_df_sorted = recommendation_df.loc[scores_above_cutoff]

print(recommendation_df_sorted)

         weighted average recommendation score  movieId
movieId                                                
1250                                       5.0     1250
3160                                       5.0     3160
1285                                       5.0     1285
2020                                       5.0     2020
2013                                       5.0     2013
...                                        ...      ...
741                                        5.0      741
132333                                     5.0   132333
2524                                       5.0     2524
750                                        5.0      750
866                                        5.0      866

[75 rows x 2 columns]


In [None]:
# Catalogue rows for the movies that passed the score cut-off.
is_recommended = movies_df['movieId'].isin(recommendation_df_sorted['movieId'])

# Movies present in userSubset are the input user's own picks; we don't want
# to recommend those back.
is_input_movie = movies_df['movieId'].isin(userSubset['movieId'])

recommended_movie = movies_df.loc[is_recommended & ~is_input_movie]

print(recommended_movie)

      movieId                         title                     genres
32         34                   Babe (1995)             Children|Drama
83         94        Beautiful Girls (1996)       Comedy|Drama|Romance
124       151                Rob Roy (1995)   Action|Drama|Romance|War
229       266    Legends of the Fall (1994)  Drama|Romance|War|Western
251       290     Once Were Warriors (1994)                Crime|Drama
...       ...                           ...                        ...
4789     7132  Night at the Opera, A (1935)     Comedy|Musical|Romance
5230     8542    Day at the Races, A (1937)             Comedy|Musical
8730   127108               Brooklyn (2015)              Drama|Romance
8839   132333                   Seve (2014)          Documentary|Drama
9497   170705       Band of Brothers (2001)           Action|Drama|War

[75 rows x 3 columns]
