In [348]:
#Dataframe manipulation library
import pandas as pd
#Math functions, we'll only need the sqrt function so let's import only that
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt

In [349]:
#Loading the movie catalogue (movieId, title, genres) into a pandas dataframe.
#The path is relative, so the file is looked up in the current working directory.
movies_df = pd.read_csv('movies.csv')
#Loading the user ratings (userId, movieId, rating, timestamp) into a pandas dataframe
ratings_df = pd.read_csv('ratings.csv')

In [350]:
#head() returns the first N rows of a dataframe (N defaults to 5); used here as a quick look at the raw movie data
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Add year column

In [351]:
#Using a regular expression to find a four-digit year stored between parentheses.
#Requiring the parentheses avoids conflicts with movies that have numbers in their titles;
#only the digits are captured, so the parentheses never end up in the 'year' column.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)
#Removing the "(year)" token from the 'title' column. regex=True is spelled out because
#pandas 2.0 changed the default of Series.str.replace to literal (non-regex) matching.
movies_df['title'] = movies_df['title'].str.replace(r'\(\d{4}\)', '', regex=True)
#Stripping the whitespace left behind by the removal (.str.strip also tolerates missing titles)
movies_df['title'] = movies_df['title'].str.strip()

In [352]:
#Previewing the first rows to check the new 'year' column and the cleaned titles
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


Separate Genres

In [353]:
#The raw 'genres' value is one pipe-delimited string per movie; break it into a list of genre names
genre_lists = movies_df['genres'].str.split('|')
movies_df['genres'] = genre_lists
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [354]:
#Copying the movie dataframe into a new one, so that movies_df keeps the compact list form of the genres
moviesWithGenres_df = movies_df.copy()

#For every movie, place a 1 into the column of each genre it belongs to, remembering
#which genre columns get created (in order of first appearance) along the way
genre_columns = []
for index, genres in movies_df['genres'].items():
    for genre in genres:
        if genre not in genre_columns:
            genre_columns.append(genre)
        moviesWithGenres_df.at[index, genre] = 1
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre.
#Only the genre columns are filled: a frame-wide fillna(0) would also turn a missing 'year' into 0.
if genre_columns:
    moviesWithGenres_df[genre_columns] = moviesWithGenres_df[genre_columns].fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [355]:
#Previewing the first rows of the ratings data
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [356]:
#Drop removes a specified row or column from a dataframe; the timestamp is not used by any recommender below.
#The column is named via columns= (the positional axis argument was removed in pandas 2.0), and
#errors='ignore' lets this cell be re-run after the column is already gone.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')

We pick user 100 (an arbitrary choice) as the target user for our recommendation test.

In [357]:
#The user we build recommendations for; change this single value to target another user
TARGET_USER_ID = 100
#All ratings given by the target user
our_user_df = ratings_df[ratings_df['userId'] == TARGET_USER_ID]

In [358]:
#Previewing the first ratings of the selected user
our_user_df.head()

Unnamed: 0,userId,movieId,rating
15300,100,3,3.5
15301,100,11,4.0
15302,100,16,4.5
15303,100,17,4.5
15304,100,19,1.0


In [359]:
#Selecting the catalogue rows (title, genres, year) of the movies our user has rated
inputId = movies_df[movies_df['movieId'].isin(our_user_df['movieId'].tolist())]
#Attaching the user's ratings to those rows. movieId is the only column the two frames share,
#so it is named explicitly as the join key instead of relying on pandas to infer it.
our_user_df = pd.merge(inputId, our_user_df, on='movieId')
#Dropping information we won't use from the input dataframe
our_user_df = our_user_df.drop(columns='year')
#Final input dataframe: one row per movie rated by the user.
#A rated movieId that is missing from movies_df is silently left out by the inner merge.
our_user_df.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,3,Grumpier Old Men,"[Comedy, Romance]",100,3.5
1,11,"American President, The","[Comedy, Drama, Romance]",100,4.0
2,16,Casino,"[Crime, Drama]",100,4.5
3,17,Sense and Sensibility,"[Drama, Romance]",100,4.5
4,19,Ace Ventura: When Nature Calls,[Comedy],100,1.0


In [360]:
#Keeping every rating (from any user) that concerns a movie the input user has also rated
rated_movie_ids = our_user_df['movieId'].tolist()
rated_by_input_mask = ratings_df['movieId'].isin(rated_movie_ids)
userSubset = ratings_df[rated_by_input_mask]
userSubset.head()

Unnamed: 0,userId,movieId,rating
1,1,3,4.0
12,1,223,3.0
14,1,235,4.0
16,1,296,3.0
20,1,356,4.0


In [361]:
#Selecting the one-hot encoded genre rows of the movies the input user has rated
input_movie_ids = our_user_df['movieId'].tolist()
userMovies = moviesWithGenres_df.loc[moviesWithGenres_df['movieId'].isin(input_movie_ids)]
userMovies.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,11,"American President, The","[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,16,Casino,"[Crime, Drama]",1995,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,17,Sense and Sensibility,"[Drama, Romance]",1995,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,19,Ace Ventura: When Nature Calls,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [362]:
#Resetting the index so the rows are numbered 0..n-1, matching the row labels of our_user_df
#(the dot product in the next step pairs rows by index label)
userMovies = userMovies.reset_index(drop=True)
#Dropping the non-genre columns so only the 0/1 genre indicators remain.
#Columns are named via columns= because the positional axis argument was removed in pandas 2.0.
userGenreTable = userMovies.drop(columns=['movieId', 'title', 'genres', 'year'])
userGenreTable.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Calculate Recommendation based on Content

In [363]:
#Centring the ratings on 3, so ratings below 3 count against a genre and ratings above 3 count for it
centred_ratings = our_user_df['rating'] - 3
#Dot product of the (genre x movie) indicator matrix with the centred ratings gives one weight per genre
userProfile = userGenreTable.T.dot(centred_ratings)
#The user profile
userProfile.head()

Adventure    17.5
Animation     1.0
Children      6.0
Comedy       79.0
Fantasy       9.5
dtype: float64

In [364]:
#Now let's get the genres of every movie in our original dataframe, indexed by movieId.
#set_index('movieId') moves the column into the index, so it no longer needs a separate drop.
genreTable = moviesWithGenres_df.set_index('movieId')
#And drop the remaining non-genre information (columns= replaces the positional axis
#argument, which was removed in pandas 2.0)
genreTable = genreTable.drop(columns=['title', 'genres', 'year'])
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [365]:
#Weighting each movie's genre indicators by the user profile (columns are matched by genre name)
weighted_genres = genreTable.mul(userProfile, axis='columns')
#Summing per movie and dividing by the total profile weight gives the weighted average
recommendationTable_df = weighted_genres.sum(axis=1).div(userProfile.sum())
recommendationTable_df.head()

movieId
1    0.319209
2    0.093220
3    0.466102
4    0.670904
5    0.223164
dtype: float64

We use the NormalizeData function to rescale the content-based recommendation scores to the range 0 to 5.

In [366]:
import numpy as np

def NormalizeData(data, scale=5):
    """Min-max rescale ``data`` linearly onto the interval [0, scale].

    Parameters
    ----------
    data : numpy array or pandas Series
        Values to rescale; must support element-wise arithmetic.
    scale : number, optional
        Upper end of the target range. Defaults to 5.

    Returns
    -------
    Object of the same kind as ``data`` whose smallest value maps to 0 and
    largest to ``scale``. If all values are equal, zeros are returned
    instead of the NaNs a 0/0 division would produce.
    """
    lowest = np.min(data)
    spread = np.max(data) - lowest
    if spread == 0:
        # Constant input: there is no range to stretch, so map everything to 0
        return (data - lowest) * 0.0
    return scale * ((data - lowest) / spread)

#Rescaling the content-based scores with NormalizeData so they span the 0-5 range
recommendationTable_df = NormalizeData(recommendationTable_df)

In [367]:
#Summary statistics of the rescaled content-based scores
recommendationTable_df.describe()

count    9742.000000
mean        1.580459
std         0.969418
min         0.000000
25%         1.023689
50%         1.387479
75%         2.106599
max         5.000000
dtype: float64

In [368]:
#Sort our recommendations in descending order, so the best content-based matches come first
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)
#Just a peek at the top-scoring movieIds and their scores
recommendationTable_df.head()

movieId
4956     5.000000
4719     4.949239
26236    4.898477
42015    4.796954
1907     4.703892
dtype: float64

## Content Based Recommendation Result

In [369]:
#Looking up the catalogue rows of the 20 highest-scoring movies
#(the rows come back in movies_df order, not in score order)
top_content_ids = recommendationTable_df.head(20).index
movies_df[movies_df['movieId'].isin(top_content_ids)]

Unnamed: 0,movieId,title,genres,year
505,587,Ghost,"[Comedy, Drama, Fantasy, Romance, Thriller]",1990
743,970,Beat the Devil,"[Adventure, Comedy, Crime, Drama, Romance]",1953
1390,1907,Mulan,"[Adventure, Animation, Children, Comedy, Drama...",1998
1394,1912,Out of Sight,"[Comedy, Crime, Drama, Romance, Thriller]",1998
2903,3893,Nurse Betty,"[Comedy, Crime, Drama, Romance, Thriller]",2000
3460,4719,Osmosis Jones,"[Action, Animation, Comedy, Crime, Drama, Roma...",2001
3608,4956,"Stunt Man, The","[Action, Adventure, Comedy, Drama, Romance, Th...",1980
4655,6954,"Barbarian Invasions, The (Les invasions barbares)","[Comedy, Crime, Drama, Mystery, Romance]",2003
5029,7831,Another Thin Man,"[Comedy, Crime, Drama, Mystery, Romance]",1939
5033,7835,Song of the Thin Man,"[Comedy, Crime, Drama, Musical, Mystery, Romance]",1947


In [370]:
#Turning the score Series into a two-column dataframe: the index becomes 'movieId'
#and the scores become 'Predictions'
recommendationTable_df = (
    recommendationTable_df
    .rename_axis('movieId')
    .reset_index(name='Predictions')
)
recommendationTable_df.head()

Unnamed: 0,movieId,Predictions
0,4956,5.0
1,4719,4.949239
2,26236,4.898477
3,42015,4.796954
4,1907,4.703892


# Recommendation based on Collaborative Filtering

In [371]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter.
#The key is passed as a plain string: grouping by a one-element list makes pandas 2.x yield
#1-tuples such as (100,) as group names, which would later be stored as the user ids.
userSubsetGroup = userSubset.groupby('userId')

In [372]:
#Ordering the (userId, ratings) pairs by how many of the input user's movies each user
#has rated, largest overlap first, so those users get priority
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)

In [373]:
#Peeking at the three largest groups; each element is a (userId, ratings sub-dataframe) pair
userSubsetGroup[0:3]

[(100,        userId  movieId  rating
  15300     100        3     3.5
  15301     100       11     4.0
  15302     100       16     4.5
  15303     100       17     4.5
  15304     100       19     1.0
  15305     100       28     4.5
  15306     100       62     4.0
  15307     100       74     4.0
  15308     100       89     4.0
  15309     100       95     4.5
  15310     100      104     3.5
  15311     100      168     4.5
  15312     100      222     4.0
  15313     100      223     3.5
  15314     100      234     4.0
  15315     100      235     1.0
  15316     100      237     4.0
  15317     100      261     4.5
  15318     100      266     4.0
  15319     100      296     3.5
  15320     100      356     4.0
  15321     100      357     4.0
  15322     100      368     4.5
  15323     100      377     4.0
  15324     100      380     3.5
  15325     100      441     4.0
  15326     100      497     4.0
  15327     100      508     4.0
  15328     100      537     4.0
  153

In [374]:
#Keeping only the 100 users with the largest overlap, to bound the cost of the similarity computation
userSubsetGroup = userSubsetGroup[:100]

Calculation of the similarity between users

In [375]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#The input user's ratings sorted by movieId. This does not depend on the loop variable,
#so it is computed once instead of once per user.
inputMovies = our_user_df.sort_values(by='movieId')

#For every user group in our subset
for name, group in userSubsetGroup:
    #Sorting the current user's ratings by movieId as well, so both rating lists are in the same movie order
    group = group.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the input user's review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #x: the input user's ratings, y: the current user's ratings (plain lists to keep the arithmetic simple)
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    sum_x = sum(tempRatingList)
    sum_y = sum(tempGroupList)
    #Sums of squares and cross-products about the mean (computational form of the Pearson formula)
    Sxx = sum(i**2 for i in tempRatingList) - pow(sum_x, 2)/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - pow(sum_y, 2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sum_x*sum_y/float(nRatings)

    #Sxx and Syy are variances up to a factor, so they are never negative mathematically, but
    #rounding can leave a tiny negative residue. Requiring > 0 (rather than != 0) keeps such a
    #residue away from sqrt, which would raise a math domain error. No variance means 0 correlation.
    if Sxx > 0 and Syy > 0:
        correlation = Sxy/sqrt(Sxx*Syy)
        #Clamp rounding overshoot so the coefficient always lies in [-1, 1]
        pearsonCorrelationDict[name] = max(-1.0, min(1.0, correlation))
    else:
        pearsonCorrelationDict[name] = 0

In [376]:
#Inspecting the computed correlations, keyed by userId
pearsonCorrelationDict

{100: 1.0,
 414: -0.10478666965966302,
 474: -0.018876129865153655,
 599: -0.09949269434228988,
 68: 0.2387065055884038,
 480: 0.22803573662268903,
 288: -0.35477020208494653,
 606: 0.08720616709228293,
 603: -0.07299818336810522,
 307: 0.06709298448448268,
 177: 0.22511477297063842,
 608: 0.1350770301684615,
 590: 0.10728723407800067,
 600: -0.029259641783869122,
 448: -0.15082508457794003,
 489: 0.12783493856265266,
 182: -0.1660145629155574,
 357: 0.210881647046992,
 45: -0.09006303954404742,
 483: 0.041629395063256004,
 91: -0.13528452953490766,
 274: 0.13921404181248273,
 597: 0.031641841795823064,
 57: -0.31528449090195343,
 387: -0.26550698723836463,
 19: 0.20072336220739684,
 226: -0.058723655001291286,
 64: 0.008265968485100021,
 42: 0.3387252930627786,
 140: -0.011322725655732273,
 200: 0.04459836629685872,
 330: 0.16205570567956146,
 555: 0.06008888850803653,
 84: 0.11530568885123926,
 380: -0.25045710109732666,
 132: 0.1080845092914642,
 275: -0.2299257568401962,
 391: -0.1

In [377]:
#Laying the correlations out as a table with one row per user:
#'similarityIndex' holds the coefficient and 'userId' the user it belongs to
pearsonDF = pd.DataFrame({
    'similarityIndex': list(pearsonCorrelationDict.values()),
    'userId': list(pearsonCorrelationDict.keys()),
})
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,1.0,100
1,-0.104787,414
2,-0.018876,474
3,-0.099493,599
4,0.238707,68


In [378]:
#The input user correlates perfectly with themself, so they must not count as their own neighbour.
#They are excluded by id rather than by skipping the first sorted row, which would drop the
#wrong user if anyone else tied at the top of the ranking.
target_user_id = our_user_df['userId'].iloc[0]
topUsers = (
    pearsonDF[pearsonDF['userId'] != target_user_id]
    .sort_values(by='similarityIndex', ascending=False)
    .head(50)  #keep the 50 most similar users
)
topUsers.head()

Unnamed: 0,similarityIndex,userId
61,0.542385,292
41,0.469297,381
79,0.39795,6
28,0.338725,42
94,0.327151,117


In [379]:
#Pulling in every rating made by the selected neighbours (one row per neighbour/movie pair)
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.542385,292,1,4.0
1,0.542385,292,10,3.5
2,0.542385,292,19,1.5
3,0.542385,292,32,2.0
4,0.542385,292,34,3.5


In [380]:
#Scaling each rating by the similarity of the neighbour who gave it
similarity_weighted = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating = topUsersRating.assign(weightedRating=similarity_weighted)
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.542385,292,1,4.0,2.16954
1,0.542385,292,10,3.5,1.898348
2,0.542385,292,19,1.5,0.813578
3,0.542385,292,32,2.0,1.08477
4,0.542385,292,34,3.5,1.898348


In [381]:
#Summing, for every movie, the similarities and the weighted ratings of the neighbours who rated it
per_movie_sums = topUsersRating.groupby('movieId')[['similarityIndex', 'weightedRating']].sum()
tempTopUsersRating = per_movie_sums.rename(columns={
    'similarityIndex': 'sum_similarityIndex',
    'weightedRating': 'sum_weightedRating',
})
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,6.552439,25.217546
2,5.265704,17.452916
3,3.60672,11.758021
4,0.513255,1.539766
5,2.177112,7.890395


In [382]:
#The weighted average: summed weighted ratings divided by the summed similarities, per movie
cf_scores = tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
#Wrapping the scores in a dataframe and exposing the movieId index as a regular column too
recommendation_df = cf_scores.to_frame('weighted average recommendation score')
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.848574,1
2,3.314451,2
3,3.260032,3
4,3.0,4
5,3.624249,5


In [383]:
#Ranking the movies from highest to lowest collaborative-filtering score and renumbering the rows
recommendation_df = (
    recommendation_df
    .sort_values(by='weighted average recommendation score', ascending=False)
    .reset_index(drop=True)
)
recommendation_df.head(10)


Unnamed: 0,weighted average recommendation score,movieId
0,5.0,8008
1,5.0,7614
2,5.0,187593
3,5.0,1934
4,5.0,1979
5,5.0,4454
6,5.0,4467
7,5.0,4509
8,5.0,4564
9,5.0,1939


## Collaborative Filtering Based Recommendation Result

In [384]:
#Looking up the catalogue rows of the 10 best-scored movies
#(the rows come back in movies_df order, not in score order)
top_cf_ids = recommendation_df.head(10)['movieId'].tolist()
movies_df[movies_df['movieId'].isin(top_cf_ids)]

Unnamed: 0,movieId,title,genres,year
1413,1934,You Can't Take It with You,"[Comedy, Romance]",1938
1417,1939,"Best Years of Our Lives, The","[Drama, War]",1946
1456,1979,Friday the 13th Part VI: Jason Lives,[Horror],1986
3294,4454,More,"[Animation, Drama, Sci-Fi, IMAX]",1998
3302,4467,"Adventures of Baron Munchausen, The","[Adventure, Comedy, Fantasy]",1988
3331,4509,"Great Outdoors, The",[Comedy],1988
3360,4564,Always,"[Drama, Fantasy, Romance]",1989
4973,7614,Oklahoma!,"[Musical, Romance, Western]",1955
5082,8008,Brigadoon,"[Fantasy, Musical, Romance]",1954
9709,187593,Deadpool 2,"[Action, Comedy, Sci-Fi]",2018


In [385]:
#Previewing the movie table again before it is joined with the ratings
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [386]:
#Previewing the ratings table again (the timestamp column has been dropped)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


# Calculating Movie Rankings
Used when we don't have good personalised recommendations, so that we can still recommend movies of generally good quality.

In [387]:
#Joining every rating with the details of the movie it refers to (one row per rating)
movies_rating_df = movies_df.merge(ratings_df, on='movieId')
movies_rating_df.head()

Unnamed: 0,movieId,title,genres,year,userId,rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1,4.0
1,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,5,4.0
2,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,7,4.5
3,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,15,2.5
4,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,17,4.5


In [388]:
#The 10 titles with the highest mean rating (no minimum number of ratings is required here)
movies_rating_df.groupby('title')['rating'].mean().sort_values(ascending=False)[:10]

title
Sorority House Massacre                           5.0
Entertaining Angels: The Dorothy Day Story        5.0
Passenger, The (Professione: reporter)            5.0
Little Dieter Needs to Fly                        5.0
Human Condition III, The (Ningen no joken III)    5.0
Ex Drummer                                        5.0
When Worlds Collide                               5.0
Peaceful Warrior                                  5.0
Eva                                               5.0
Advise and Consent                                5.0
Name: rating, dtype: float64

In [389]:
#The 10 titles with the largest number of ratings
movies_rating_df.groupby('title')['rating'].count().sort_values(ascending=False)[:10]

title
Forrest Gump                          329
Shawshank Redemption, The             317
Pulp Fiction                          307
Silence of the Lambs, The             279
Matrix, The                           278
Star Wars: Episode IV - A New Hope    251
Jurassic Park                         238
Braveheart                            237
Terminator 2: Judgment Day            224
Schindler's List                      220
Name: rating, dtype: int64

In [390]:
#C: the mean, taken across titles, of each title's average rating
mean_rating_per_title = movies_rating_df.groupby('title')['rating'].mean()
C = mean_rating_per_title.mean()
print(C)

3.2659524470615393


In [391]:
#m: the 95th percentile of the number of ratings per title, used as the vote threshold
votes_per_title = movies_rating_df.groupby('title')['rating'].count()
m = votes_per_title.quantile(0.95)
print(m)

48.0


In [392]:
#Grouping once and reusing the groups for both statistics
#NOTE(review): grouping is by title, and the year was removed from titles earlier, so
#different movies that share a title would be pooled here - confirm whether that is intended.
ratings_by_title = movies_rating_df.groupby('title')['rating']
#Mean rating and number of ratings per title
ratings = ratings_by_title.mean()
count = ratings_by_title.count()
#One row per title with explicitly named columns (no reliance on merge suffixes such as rating_y)
qualified = pd.DataFrame({'vote_average': ratings, 'vote_count': count}).rename_axis('title')
#Keeping only the titles with more than m ratings, then turning the title index into a column
qualified = qualified[qualified['vote_count'] > m].reset_index()
print(qualified.head())

                        title  vote_average  vote_count
0  10 Things I Hate About You      3.527778          54
1                12 Angry Men      4.163793          58
2       2001: A Space Odyssey      3.894495         109
3               28 Days Later      3.974138          58
4                         300      3.681250          80


In [393]:
def weighted_rating(x, min_votes=None, mean_rating=None):
    """Blend a movie's own average rating with a global mean, weighted by its vote count.

    Computes ``v/(v+m) * R + m/(m+v) * C`` where ``v`` is ``x['vote_count']`` and
    ``R`` is ``x['vote_average']``. The more votes a movie has relative to ``m``,
    the closer the result is to ``R``; with few votes it stays close to ``C``.

    Parameters
    ----------
    x : mapping-like (e.g. a dataframe row)
        Must provide the keys 'vote_count' and 'vote_average'.
    min_votes : number, optional
        The vote threshold ``m``. Defaults to the notebook-level variable ``m``.
    mean_rating : number, optional
        The prior mean ``C``. Defaults to the notebook-level variable ``C``.

    Returns
    -------
    The weighted rating for ``x``.
    """
    # Fall back to the notebook globals so existing calls such as
    # qualified.apply(weighted_rating, axis=1) keep working unchanged
    if min_votes is None:
        min_votes = m
    if mean_rating is None:
        mean_rating = C
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+min_votes) * R) + (min_votes/(min_votes+v) * mean_rating)

# Result of ranking movies based on users' ratings

In [394]:
#Scoring every qualified title with weighted_rating (row by row), then ranking best first
wr_scores = qualified.apply(weighted_rating, axis=1)
qualified = qualified.assign(wr=wr_scores).sort_values('wr', ascending=False)
qualified.head()

Unnamed: 0,title,vote_average,vote_count,wr
362,"Shawshank Redemption, The",4.429022,317,4.27607
160,Fight Club,4.272936,218,4.091225
180,"Godfather, The",4.289062,192,4.08444
398,Star Wars: Episode IV - A New Hope,4.231076,251,4.07614
330,Pulp Fiction,4.197068,307,4.071171


# Merging all three recommendation systems together for a hybrid result

In [395]:
#Bringing movieId, genres and year back onto the ranked titles
#NOTE(review): the join key is the title, which had its year removed earlier; movies that
#share a title would each be matched to the same statistics - confirm this is acceptable.
movie_ranking = qualified.merge(movies_df, on='title')
movie_ranking.head()

Unnamed: 0,title,vote_average,vote_count,wr,movieId,genres,year
0,"Shawshank Redemption, The",4.429022,317,4.27607,318,"[Crime, Drama]",1994
1,Fight Club,4.272936,218,4.091225,2959,"[Action, Crime, Drama, Thriller]",1999
2,"Godfather, The",4.289062,192,4.08444,858,"[Crime, Drama]",1972
3,Star Wars: Episode IV - A New Hope,4.231076,251,4.07614,260,"[Action, Adventure, Sci-Fi]",1977
4,Pulp Fiction,4.197068,307,4.071171,296,"[Comedy, Crime, Drama, Thriller]",1994


In [396]:
#Adding the collaborative-filtering score; the inner join keeps only movies that have both
#a collaborative score and a place in the ratings-based ranking
movie_ranking = recommendation_df.merge(movie_ranking, on='movieId')
movie_ranking.head()

Unnamed: 0,weighted average recommendation score,movieId,title,vote_average,vote_count,wr,genres,year
0,5.0,3404,Titanic,3.421233,146,3.382813,"[Action, Drama]",1953
1,5.0,77846,12 Angry Men,4.163793,58,3.757224,"[Crime, Drama]",1997
2,4.590789,112852,Guardians of the Galaxy,4.050847,59,3.698745,"[Action, Adventure, Sci-Fi]",2014
3,4.577909,58559,"Dark Knight, The",4.238255,149,4.001349,"[Action, Crime, Drama, IMAX]",2008
4,4.556025,318,"Shawshank Redemption, The",4.429022,317,4.27607,"[Crime, Drama]",1994


In [397]:
#Adding the content-based score ('Predictions'); after this inner join every remaining movie
#carries all three scores
movie_ranking = recommendationTable_df.merge(movie_ranking, on='movieId')
movie_ranking.head()

Unnamed: 0,movieId,Predictions,weighted average recommendation score,title,vote_average,vote_count,wr,genres,year
0,587,4.433164,3.667722,Ghost,3.434783,115,3.385066,"[Comedy, Drama, Fantasy, Romance, Thriller]",1990
1,6373,4.230118,3.463672,Bruce Almighty,3.316901,71,3.296351,"[Comedy, Drama, Fantasy, Romance]",2003
2,2797,4.230118,3.844917,Big,3.71978,91,3.563063,"[Comedy, Drama, Fantasy, Romance]",1988
3,2324,4.170897,3.980474,Life Is Beautiful (La Vita è bella),4.147727,88,3.836513,"[Comedy, Drama, Romance, War]",1997
4,356,4.170897,4.273888,Forrest Gump,4.164134,329,4.049776,"[Comedy, Drama, Romance, War]",1994


# Final Result of our Hybrid Recommendation system based on three different systems

In [398]:
#The hybrid score is the plain average of the three individual scores
hybrid_score = (
    movie_ranking['Predictions']
    .add(movie_ranking['weighted average recommendation score'])
    .add(movie_ranking['wr'])
    .div(3)
)
#Attaching the hybrid score, removing the intermediate score columns and ranking best first
#NOTE(review): movies the input user has already rated are not filtered out anywhere in this
#pipeline, so they can appear among the recommendations - confirm whether that is intended.
movie_ranking = (
    movie_ranking
    .assign(recommendation_score=hybrid_score)
    .drop(columns=['Predictions', 'weighted average recommendation score', 'wr', 'vote_count', 'vote_average'])
    .sort_values(by='recommendation_score', ascending=False)
)
movie_ranking.head()

Unnamed: 0,movieId,title,genres,year,recommendation_score
4,356,Forrest Gump,"[Comedy, Drama, Romance, War]",1994,4.164854
16,1247,"Graduate, The","[Comedy, Drama, Romance]",1967,4.049519
18,1197,"Princess Bride, The","[Action, Adventure, Comedy, Fantasy, Romance]",1987,4.038317
3,2324,Life Is Beautiful (La Vita è bella),"[Comedy, Drama, Romance, War]",1997,3.995961
13,6711,Lost in Translation,"[Comedy, Drama, Romance]",2003,3.943703
