In [1]:
#Dataframe manipulation library
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Storing the movie information into a pandas dataframe
movies_df = pd.read_csv("G:\mozhgan_python\movies.csv")
#Storing the user information into a pandas dataframe
ratings_df = pd.read_csv("G:\mozhgan_python\Rating.csv")
#Head is a function that gets the first N rows of a dataframe. N's default is 5.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
#Using regular expressions to find a year stored between parentheses
#We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())

In [5]:
movies_df.head(10)


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995
5,6,Heat,Action|Crime|Thriller,1995
6,7,Sabrina,Comedy|Romance,1995
7,8,Tom and Huck,Adventure|Children,1995
8,9,Sudden Death,Action,1995
9,10,GoldenEye,Action|Adventure|Thriller,1995


In [6]:
#Dropping the genres column
movies_df = movies_df.drop('genres', 1)

In [7]:
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [8]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [14]:
ratings_df.movieId[1]

2471

In [9]:
#Drop removes a specified row or column from a dataframe
ratings_df = ratings_df.drop('timestamp', 1)

In [10]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [49]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [58]:
inputMovies['title'].tolist()

['Breakfast Club, The', 'Toy Story', 'Jumanji', 'Pulp Fiction', 'Akira']

In [60]:
movies_df['title']

0                          Toy Story
1                            Jumanji
2                   Grumpier Old Men
3                  Waiting to Exhale
4        Father of the Bride Part II
                    ...             
34203                     Grand Slam
34204                     Bloodmoney
34205           The Butterfly Circus
34206                           Zero
34207          The 2000 Year Old Man
Name: title, Length: 34208, dtype: object

In [57]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputId

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
293,296,Pulp Fiction,1994
1246,1274,Akira,1988
1885,1968,"Breakfast Club, The",1985


In [62]:
#Filtering out the movies by title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputMovies = pd.merge(inputId, inputMovies)
#Dropping information we won't use from the input dataframe
inputMovies = inputMovies.drop('year', 1)
#Final input dataframe
#If a movie you added in above isn't here, then it might not be in the original 
#dataframe or it might spelled differently, please check capitalisation.
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [63]:
inputMovies['movieId'].tolist()

[1, 2, 296, 1274, 1968]

In [64]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0
...,...,...,...
1048271,11325,1,5.0
1048290,11325,296,4.0
1048383,11325,1968,3.0
1048520,11327,296,3.0


In [68]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])
userSubsetGroup

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000000006F704F0>

In [70]:
userSubsetGroup.get_group(1130)



Unnamed: 0,userId,movieId,rating
104167,1130,1,0.5
104168,1130,2,4.0
104214,1130,296,4.0
104363,1130,1274,4.5
104443,1130,1968,4.5


In [84]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [85]:
userSubsetGroup[0:5]


[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 (686,
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0),
 (815,
         userId  movieId  rating
  73747     815        1     4.5
  73748     815        2     3.0
  73922     815      296     5.0
  74362     815     1274     3.0
  74678     815     1968     4.5),
 (1040,
         userId  movieId  rating
  96689    1040        1     3.0
  96690    1040        2     1.5
  96733    1040      296     3.5
  96859    1040     1274     3.0
  96922    1

In [86]:
userSubsetGroup = userSubsetGroup[0:100]
userSubsetGroup

[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 (686,
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0),
 (815,
         userId  movieId  rating
  73747     815        1     4.5
  73748     815        2     3.0
  73922     815      296     5.0
  74362     815     1274     3.0
  74678     815     1968     4.5),
 (1040,
         userId  movieId  rating
  96689    1040        1     3.0
  96690    1040        2     1.5
  96733    1040      296     3.5
  96859    1040     1274     3.0
  96922    1

In [30]:
group = group.sort_values(by='movieId')
group

Unnamed: 0,userId,movieId,rating
169378,1848,1,5.0
169379,1848,2,4.5
169390,1848,296,4.0
169470,1848,1968,4.0


In [33]:
inputMovies = inputMovies.sort_values(by='movieId')
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [35]:
nRatings = len(group)
nRatings

4

In [37]:
temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
temp_df

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
4,1968,"Breakfast Club, The",5.0


In [39]:
tempRatingList = temp_df['rating'].tolist()
tempRatingList

[3.5, 2.0, 5.0, 5.0]

In [41]:
tempGroupList = group['rating'].tolist()
tempGroupList

[5.0, 4.5, 4.0, 4.0]

In [92]:
sum([i**2 for i in tempRatingList])

66.25

In [93]:
pow(sum(tempRatingList),2)/float(nRatings)

60.0625

In [91]:
Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
Sxx

6.1875

In [95]:
Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
Syy

0.6875

In [97]:
Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
Sxy

-1.3125

In [106]:
from scipy.stats import pearsonr
corr, _ = pearsonr(tempRatingList, tempGroupList)
print('Pearsons correlation: %.3f' % corr)

Pearsons correlation: -0.636


In [125]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [126]:
pearsonCorrelationDict

{75: 0.8272781516947562,
 106: 0.5860090386731182,
 686: 0.8320502943378437,
 815: 0.5765566601970551,
 1040: 0.9434563530497265,
 1130: 0.2891574659831201,
 1502: 0.8770580193070299,
 1599: 0.4385290096535153,
 1625: 0.716114874039432,
 1950: 0.179028718509858,
 2065: 0.4385290096535153,
 2128: 0.5860090386731196,
 2432: 0.1386750490563073,
 2791: 0.8770580193070299,
 2839: 0.8204126541423674,
 2948: -0.11720180773462392,
 3025: 0.45124262819713973,
 3040: 0.89514359254929,
 3186: 0.6784622064861935,
 3271: 0.26989594817970664,
 3429: 0.0,
 3734: -0.15041420939904673,
 4099: 0.05860090386731196,
 4208: 0.29417420270727607,
 4282: -0.4385290096535115,
 4292: 0.6564386345361464,
 4415: -0.11183835382312353,
 4586: -0.9024852563942795,
 4725: -0.08006407690254357,
 4818: 0.4885967564883424,
 5104: 0.7674257668936507,
 5165: -0.4385290096535153,
 5547: 0.17200522903844556,
 6082: -0.04728779924109591,
 6207: 0.9615384615384616,
 6366: 0.6577935144802716,
 6482: 0.0,
 6530: -0.351605423203

In [127]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF

Unnamed: 0,0
75,0.827278
106,0.586009
686,0.832050
815,0.576557
1040,0.943456
...,...
1643,0.628808
1667,-0.637793
1785,0.764093
1824,0.801784


In [128]:
pearsonDF.columns = ['similarityIndex']

In [110]:
pearsonDF['userId'] = pearsonDF.index

In [113]:
pearsonDF.index = range(len(pearsonDF))
pearsonDF

Unnamed: 0,similarityIndex,userId
0,0.827278,75
1,0.586009,106
2,0.832050,686
3,0.576557,815
4,0.943456,1040
...,...,...
95,0.628808,1643
96,-0.637793,1667
97,0.764093,1785
98,0.801784,1824


In [111]:
pearsonDF

Unnamed: 0,similarityIndex,userId
75,0.827278,75
106,0.586009,106
686,0.832050,686
815,0.576557,815
1040,0.943456,1040
...,...,...
1643,0.628808,1643
1667,-0.637793,1667
1785,0.764093,1785
1824,0.801784,1824


In [129]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [135]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers.head(10)

Unnamed: 0,similarityIndex,userId
90,1.0,1255
76,0.986928,599
92,0.971061,1519
79,0.965581,784
85,0.96506,1066
34,0.961538,6207
55,0.961538,10707
88,0.94388,1200
4,0.943456,1040
86,0.919866,1107


In [136]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head(500)

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.000000,1255,1,3.0
1,1.000000,1255,2,2.5
2,1.000000,1255,10,3.5
3,1.000000,1255,18,3.5
4,1.000000,1255,19,3.5
...,...,...,...,...
495,0.986928,599,2541,2.0
496,0.986928,599,2571,3.0
497,0.986928,599,2596,3.0
498,0.986928,599,2617,2.0


In [137]:
topUsersRating.shape

(86563, 4)

In [138]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,1255,1,3.0,3.0
1,1.0,1255,2,2.5,2.5
2,1.0,1255,10,3.5,3.5
3,1.0,1255,18,3.5,3.5
4,1.0,1255,19,3.5,3.5


In [139]:
topUsersRating

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.000000,1255,1,3.0,3.000000
1,1.000000,1255,2,2.5,2.500000
2,1.000000,1255,10,3.5,3.500000
3,1.000000,1255,18,3.5,3.500000
4,1.000000,1255,19,3.5,3.500000
...,...,...,...,...,...
86558,-0.942809,974,4616,4.0,-3.771236
86559,-0.942809,974,4643,1.0,-0.942809
86560,-0.942809,974,4696,2.0,-1.885618
86561,-0.942809,974,4787,3.0,-2.828427


In [140]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,40.275978,141.858467
2,42.043299,99.130228
3,10.910736,28.494644
4,0.625402,1.11155
5,11.764089,27.482256


In [141]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.522161,1
2,2.357813,2
3,2.611615,3
4,1.777337,4
5,2.336114,5


In [142]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
90717,1462.504702,90717
103810,252.308024,103810
83349,226.403784,83349
5147,168.138484,5147
60397,116.91791,60397
64234,95.069077,64234
103655,78.928759,103655
6638,66.245804,6638
2930,58.230194,2930
6347,55.808479,6347


In [143]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
2845,2930,Return with Honor,1998
5052,5147,Wild Strawberries (Smultronstället),1957
6249,6347,Beat Street,1984
6529,6638,Valley Girl,1983
12826,60397,Mamma Mia!,2008
13207,64234,"Guyver, The",1991
16532,83349,"Green Hornet, The",2011
18161,90717,Tower Heist,2011
21405,103655,R.I.P.D.,2013
21452,103810,Red 2,2013
