In [1]:
import numpy as np
import pandas as pd

In [2]:
#Folder that holds the ml-latest-small CSV files.
#Kept in one place so the notebook can be pointed at another location by editing a single line.
DATA_DIR = '/home/shah/Desktop/NLP_Tasks/ml-latest-small'

movies_df = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings_df = pd.read_csv(f'{DATA_DIR}/ratings.csv')

movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
#Create a new 'year' column from the "(YYYY)" part of the title.
#Note: the captured value keeps its parentheses, e.g. "(1995)".
#The pattern is a raw string so that \( and \d reach the regex engine untouched
#instead of being parsed as (invalid) Python string escapes.
movies_df['year'] = movies_df.title.str.extract(r'(\(\d\d\d\d\))', expand=False)
movies_df['year']

0       (1995)
1       (1995)
2       (1995)
3       (1995)
4       (1995)
         ...  
9737    (2017)
9738    (2017)
9739    (2017)
9740    (2018)
9741    (1991)
Name: year, Length: 9742, dtype: object

In [4]:
#Remove the "(YYYY)" year next to the title name.
#regex=True is passed explicitly: from pandas 2.0 on, str.replace treats the pattern
#as a literal string by default, which would leave every title unchanged.
movies_df['title'] = movies_df.title.str.replace(r'(\(\d\d\d\d\))', '', regex=True)
movies_df['title']

0                                Toy Story 
1                                  Jumanji 
2                         Grumpier Old Men 
3                        Waiting to Exhale 
4              Father of the Bride Part II 
                       ...                 
9737    Black Butler: Book of the Atlantic 
9738                 No Game No Life: Zero 
9739                                 Flint 
9740          Bungo Stray Dogs: Dead Apple 
9741          Andrew Dice Clay: Dice Rules 
Name: title, Length: 9742, dtype: object

In [5]:
#Strip the whitespace left behind by removing the year (e.g. "Toy Story " -> "Toy Story").
#The vectorised .str accessor passes missing titles through unchanged,
#whereas calling x.strip() on every element raises as soon as one title is NaN.
movies_df['title'] = movies_df['title'].str.strip()
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,(1995)
1,2,Jumanji,Adventure|Children|Fantasy,(1995)
2,3,Grumpier Old Men,Comedy|Romance,(1995)
3,4,Waiting to Exhale,Comedy|Drama|Romance,(1995)
4,5,Father of the Bride Part II,Comedy,(1995)


In [6]:
#The genres column is not used by the recommender, so drop it.
#The column is named by keyword: the positional axis form drop('genres', 1)
#was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
movies_df = movies_df.drop(columns='genres')
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,(1995)
1,2,Jumanji,(1995)
2,3,Grumpier Old Men,(1995)
3,4,Waiting to Exhale,(1995)
4,5,Father of the Bride Part II,(1995)


In [7]:
#Preview the first five rows of the ratings table
ratings_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
#Get rid of the timestamp column as it is not needed for the recommendation.
#The column is named by keyword: the positional axis form drop('timestamp', 1)
#was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 on.
ratings_df = ratings_df.drop(columns='timestamp')
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [9]:
#Ratings of the user we want recommendations for, written as (title, rating) pairs
ratedTitles = [
    ('The Breakfast Club', 5),
    ('Toy Story', 3.5),
    ('Jumanji', 2),
    ("Pulp Fiction", 5),
    ('Akira', 4.5),
]

#One {'title': ..., 'rating': ...} record per rated movie, then the same data as a dataframe
userInput = [{'title': title, 'rating': rating} for title, rating in ratedTitles]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,title,rating
0,The Breakfast Club,5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [11]:
#Filtering out the movies by title: keep the catalogue rows whose title exactly matches an input title
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

#Then merging on title so every input rating gets its movieId.
#Only the title and rating columns of the input are used, so re-running this cell gives the same result.
#Input titles without an exact match in movies_df are dropped by this inner merge.
inputMovies = pd.merge(inputId, inputMovies[['title', 'rating']], on='title')

#Dropping information we won't use from the input dataframe.
#The column is named by keyword because the positional axis form drop('year', 1) fails on pandas 2.0+.
inputMovies = inputMovies.drop(columns='year')

inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5


In [12]:
#Keep only the ratings given to movies that the input user has rated as well
inputMovieIds = inputMovies['movieId'].tolist()
ratedInputMovie = ratings_df['movieId'].isin(inputMovieIds)
userSubset = ratings_df[ratedInputMovie]
userSubset.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
516,5,1,4.0
533,5,296,5.0


In [34]:
#Number of input movies rated by each user
userSubset.loc[:, 'userId'].value_counts()

483    4
600    4
414    4
477    4
480    4
      ..
444    1
235    1
234    1
445    1
301    1
Name: userId, Length: 403, dtype: int64

In [37]:
#User ids that occur most often in the subset
userSubset.loc[:, 'userId'].mode()

0      91
1     177
2     219
3     274
4     298
5     414
6     434
7     474
8     477
9     480
10    483
11    599
12    600
13    608
dtype: int64

In [35]:
#Number of ratings each input movie received
userSubset.loc[:, 'movieId'].value_counts()

296     307
1       215
2       110
1274     39
Name: movieId, dtype: int64

In [36]:
#Input movie that was rated most often
userSubset.loc[:, 'movieId'].mode()

0    296
dtype: int64

In [13]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter.
#The key is passed as a plain string: with the list form ['userId'], pandas 2.0+ hands back
#1-tuples such as (91,) as group names when iterating, and those would end up as user ids downstream.
userSubsetGroup = userSubset.groupby('userId')

In [39]:
#Look at the ratings of a single user, here userId=600 (any other id in the subset works too)
userSubsetGroup.get_group(name=600)

Unnamed: 0,userId,movieId,rating
95101,600,1,2.5
95102,600,2,4.0
95154,600,296,4.5
95306,600,1274,3.5


In [40]:
#Number of input movies rated by user 600
userSubsetGroup.get_group(600).shape[0]

4

In [41]:
#Ratings of user 4 within the subset
userSubsetGroup.get_group(name=4)

Unnamed: 0,userId,movieId,rating
320,4,296,1.0


In [42]:
#Materialise the (userId, ratings) pairs and order them so that users sharing
#the most movies with the input come first
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)

In [43]:
#First entry after sorting: a (userId, ratings dataframe) tuple.
#Here it is the user with id 91, who rated all 4 of the matched input movies
userSubsetGroup[0]

(91,
        userId  movieId  rating
 14121      91        1     4.0
 14122      91        2     3.0
 14173      91      296     4.5
 14316      91     1274     5.0)

In [44]:
#Second element of the first entry: the dataframe with that user's ratings of the input movies
userSubsetGroup[0][1]

Unnamed: 0,userId,movieId,rating
14121,91,1,4.0
14122,91,2,3.0
14173,91,296,4.5
14316,91,1274,5.0


In [45]:
#Only the first 100 entries (the users with the most movies in common) are compared with the input
userSubsetGroup = userSubsetGroup[:100]

In [46]:
#Calculate the Pearson Correlation between the input user and every user of the subset group.
#The result is stored in a dictionary, where the key is the user Id and the value is the coefficient.

def pearsonCorrelation(xRatings, yRatings):
    """Return the Pearson correlation coefficient of two paired rating lists.

    Parameters
    ----------
    xRatings, yRatings : list of numbers of equal length; element k of both
        lists must be the rating of the same movie.

    Returns
    -------
    The coefficient Sxy / sqrt(Sxx * Syy), or 0 when there are no pairs or
    when either list has no variance (the coefficient is undefined then).

    An equivalent package based call is scipy.stats.pearsonr(xRatings, yRatings)[0].
    """
    nRatings = len(xRatings)
    if nRatings == 0:
        return 0

    Sxx = sum(i**2 for i in xRatings) - pow(sum(xRatings),2)/float(nRatings)
    Syy = sum(i**2 for i in yRatings) - pow(sum(yRatings),2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(xRatings, yRatings)) - sum(xRatings)*sum(yRatings)/float(nRatings)

    #Sxx and Syy are never negative mathematically; testing "> 0" instead of "!= 0" also keeps
    #a tiny negative rounding error away from np.sqrt, where it would turn into NaN
    if Sxx > 0 and Syy > 0:
        return Sxy/np.sqrt(Sxx*Syy)
    return 0


#The input does not change between users, so it is sorted once, outside of the loop
inputMovies = inputMovies.sort_values(by='movieId')
inputRatings = inputMovies[['movieId', 'rating']]

pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Pair both users' ratings movie by movie (ordered by movieId) so the values can't be mixed up
    pairedRatings = group.sort_values(by='movieId').merge(
        inputRatings, on='movieId', suffixes=('_user', '_input'))

    pearsonCorrelationDict[name] = pearsonCorrelation(
        pairedRatings['rating_input'].tolist(),
        pairedRatings['rating_user'].tolist())

In [47]:
#Inspect the computed (userId, coefficient) pairs; users without a defined correlation were stored as 0
pearsonCorrelationDict.items()

dict_items([(91, 0.9221388919541469), (177, 0.0657951694959769), (219, 0.5459486832355505), (274, 0.8510644963469901), (298, 0.9883173560569456), (414, 0.9258200997725514), (434, 0.9864036607532465), (474, 0.0657951694959769), (477, 0.7237468644557459), (480, 0.8728715609439696), (483, 0.35043832202523123), (599, 0.9341484842923421), (600, 0.18442777839082938), (608, 0.9378934722869389), (18, 1.0), (21, 0), (50, 0.9449111825230734), (57, -0.9449111825230682), (68, -0.8660254037844356), (103, 0.8660254037844402), (107, -1.0), (135, 0.8660254037844402), (140, 0.5), (144, 1.0), (153, 0.8660254037844379), (160, 0.8660254037844402), (182, 0.9449111825230684), (202, 0), (217, 0.0), (226, 0.9819805060619667), (232, 0.6546536707079778), (240, -0.8660254037844386), (249, 0), (288, 0.9332565252573829), (304, 0.8660254037844356), (307, 0.9607689228305233), (318, 0.88249750329277), (322, 0.9607689228305233), (323, 0.0), (330, 0.8660254037844386), (353, 0.8660254037844356), (357, 0.7205766921228925

In [48]:
#Turn the {userId: coefficient} dictionary into a one-column dataframe indexed by userId
pearsonDF = pd.DataFrame(
    list(pearsonCorrelationDict.values()),
    index=list(pearsonCorrelationDict.keys()),
)
pearsonDF.head()

Unnamed: 0,0
91,0.922139
177,0.065795
219,0.545949
274,0.851064
298,0.988317


In [49]:
#Name the coefficient column, copy the user ids from the index into a regular column
#and renumber the rows 0..n-1
pearsonDF.columns = ['similarityIndex']
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.922139,91
1,0.065795,177
2,0.545949,219
3,0.851064,274
4,0.988317,298


In [50]:
#Keep the 50 users with the highest similarity to the input user
usersBySimilarity = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = usersBySimilarity.iloc[:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
72,1.0,33
14,1.0,18
88,1.0,112
90,1.0,119
87,1.0,105


In [51]:
#Attach every rating the selected users gave (to any movie) to their similarity score
topUsersRating = pd.merge(topUsers, ratings_df, how='inner', on='userId')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,33,1,3.0
1,1.0,33,7,1.0
2,1.0,33,11,2.0
3,1.0,33,17,4.0
4,1.0,33,21,4.0


In [52]:
#Weight every rating by the similarity of the user who gave it
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,33,1,3.0,3.0
1,1.0,33,7,1.0,1.0
2,1.0,33,11,2.0,2.0
3,1.0,33,17,4.0,4.0
4,1.0,33,21,4.0,4.0


In [53]:
#Per movie (the grouping key is movieId), sum the similarity scores and the weighted ratings of the top users.
#The two needed columns are selected before summing so userId and rating are not summed for nothing.
tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex','weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,41.894371,144.59356
2,28.907889,84.133978
3,14.161698,43.039994
5,9.283947,26.898021
6,18.893662,74.664086


In [54]:
#Weighted average per movie: summed weighted ratings divided by the summed similarity scores
weightedAverage = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df = weightedAverage.to_frame('weighted average recommendation score')
#Also keep the movieId (currently the index) as a regular column
recommendation_df['movieId'] = recommendation_df.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.451384,1
2,2.910416,2
3,3.039183,3
5,2.897261,5
6,3.951806,6


In [55]:
#Rank the movies from the highest to the lowest recommendation score
scoreColumn = 'weighted average recommendation score'
recommendation_df = recommendation_df.sort_values(by=scoreColumn, ascending=False)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
138966,5.0,138966
290,5.0,290
95311,5.0,95311
5888,5.0,5888
172589,5.0,172589


In [56]:
#Look up title and year of the 20 best scoring movies (rows come back in movies_df order)
topMovieIds = recommendation_df['movieId'].head(20).tolist()
movies_df[movies_df['movieId'].isin(topMovieIds)]

Unnamed: 0,movieId,title,year
251,290,Once Were Warriors,(1994)
835,1096,Sophie's Choice,(1982)
993,1295,"Unbearable Lightness of Being, The",(1988)
3031,4055,Panic,(2000)
4046,5747,Gallipoli,(1981)
4108,5888,Brother (Brat),(1997)
4246,6192,Open Hearts (Elsker dig for evigt),(2002)
5013,7767,"Best of Youth, The (La meglio gioventù)",(2003)
5230,8542,"Day at the Races, A",(1937)
5448,26073,"Human Condition III, The (Ningen no joken III)",(1961)
