In [213]:
import pandas as pd
import numpy as np
import re
import math
import graphlab as gl
#Below line is required when running first time
#graphlab.get_dependencies()

# Dataframe with IMDB Movie feature Dataset containing Netflix Movie ID
df = pd.read_csv('omdb.txt', sep="|")
# Dataframe with Netflix data containing userid rating and movie id
# Iterate df_filtered_822109 and find movies similar to the movie list

#Cleansing the IMDB Data which contains netflix movie id
#filter null rows.
df = df[df.Title != 'null']
#pick first year for Series.
df['Year'] = df['Year'].str.replace('–[0-9]*$','')
#Correct Rated column
df.Rated = df.Rated.str.replace('UNRATED','NOT RATED')
df.Rated = df.Rated.str.replace('NR','NOT RATED')
df.Rated = df.Rated.str.replace('Not Rated','NOT RATED')
df.Rated = df.Rated.str.replace('Unrated','NOT RATED')

#remove commas from imdbvotes
df.imdbVotes = df.imdbVotes.str.replace(',','')

# classify runtime by hours
df.Runtime = df.Runtime.str.replace(' min','')
df.Runtime = df.Runtime.str.replace(',','')
df.Runtime = df.Runtime.str.replace('1 h 30','90')
df.Runtime = df.Runtime.str.replace('3 h 48','223')
df.Runtime = df.Runtime.str.replace('1 h 20','80')
#df.Runtime = df.Runtime.str.replace(NaN,'0')

df.Runtime = pd.to_numeric(df.Runtime, errors='coerce')
minRuntime = min(df.Runtime)
maxRuntime = max(df.Runtime)

df['Runtime'] = df['Runtime'].apply(lambda x: 4*((x - minRuntime)/(maxRuntime - minRuntime)) + 1)
df['Runtime'] = df['Runtime'].apply(lambda x: 1 if x < 1 else x)
df['Runtime'] = df['Runtime'].apply(lambda x: 2 if x > 1 and x < 2 else x)
df['Runtime'] = df['Runtime'].apply(lambda x: 3 if x > 2 and x < 3 else x)
df['Runtime'] = df['Runtime'].apply(lambda x: 4 if x > 3 and x < 4 else x)
df['Runtime'] = df['Runtime'].apply(lambda x: 5 if x > 4 and x < 5 else x)

df.imdbVotes = pd.to_numeric(df.imdbVotes, errors='coerce')
minImdbVotes = min(df.imdbVotes)
maxImdbVotes = max(df.imdbVotes)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 4*((x - minImdbVotes)/(maxImdbVotes - minImdbVotes)) + 1)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 1 if x < 1 else x)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 2 if x > 1 and x < 2 else x)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 3 if x > 2 and x < 3 else x)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 4 if x > 3 and x < 4 else x)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 5 if x > 4 and x < 5 else x)

df.imdbRating = pd.to_numeric(df.imdbRating, errors='coerce')
minImdbRating = min(df.imdbRating)
maxImdbRating = max(df.imdbRating)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 4*((x - minImdbRating)/(maxImdbRating - minImdbRating)) + 1)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 1 if x < 1 else x)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 2 if x > 1 and x < 2 else x)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 3 if x > 2 and x < 3 else x)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 4 if x > 3 and x < 4 else x)
df['imdbRating'] = df['imdbRating'].apply(lambda x: 5 if x > 4 and x < 5 else x)

df.Year = pd.to_numeric(df.Year, errors='coerce')
minYear = min(df.Year)
maxYear = max(df.Year)
df['Year'] = df['Year'].apply(lambda x: 4*((x - minYear)/(maxYear - minYear)) + 1)
df['Year'] = df['Year'].apply(lambda x: 1 if x < 1 else x)
df['Year'] = df['Year'].apply(lambda x: 2 if x > 1 and x < 2 else x)
df['Year'] = df['Year'].apply(lambda x: 3 if x > 2 and x < 3 else x)
df['Year'] = df['Year'].apply(lambda x: 4 if x > 3 and x < 4 else x)
df['Year'] = df['Year'].apply(lambda x: 5 if x > 4 and x < 5 else x)

df = df[df['Genre'].notnull()]
df = df[df['Writer'].notnull()]
df = df[df['Actors'].notnull()]
df = df[df['Director'].notnull()]
df = df[df['Country'].notnull()]
df = df[df['Language'].notnull()]

df_u= pd.read_csv('netflix-user-movie.txt', sep=",")
df_u.columns = ['movieid','userid','rating', 'date']
df_u.head()

Unnamed: 0,movieid,userid,rating,date
0,1,822109,5,2005-05-13
1,1,885013,4,2005-10-19
2,1,30878,4,2005-12-26
3,1,823519,3,2004-05-03
4,1,893988,3,2005-11-17


In [214]:
userRating_dict = {}
userMoviesCount_dict = {}
userMoviesAvgRating_dict = {}
userMoviesLiked = {}

#Iterate the dataframe and build aggregate ratings and movie counts for each user
for row in df_u.itertuples():
    userRating_dict[row.userid] = userRating_dict.get(row.userid,0) + row.rating
    userMoviesCount_dict[row.userid] = userMoviesCount_dict.get(row.userid,0) + 1
 
#Compute the average rating
for userid in userRating_dict:
    userMoviesAvgRating_dict[userid] = userRating_dict[userid]/userMoviesCount_dict[userid]

In [215]:
#list to add to the dataframe
like_list = []

#Iterate the dataframe and find if the movie is liked by the user or not
for row in df_u.itertuples():
    like_list.extend('1' if row.rating >= userMoviesAvgRating_dict[row.userid] else '0')
    
#Add a new likedmovie to the dataframe
df_u = df_u.assign(likedmovie = like_list)
df_u.head()

Unnamed: 0,movieid,userid,rating,date,likedmovie
0,1,822109,5,2005-05-13,1
1,1,885013,4,2005-10-19,1
2,1,30878,4,2005-12-26,1
3,1,823519,3,2004-05-03,1
4,1,893988,3,2005-11-17,1


In [216]:
#Number of IMDB Dataframe records
len(df)

9156

In [217]:
df.to_csv('normalized.csv', sep='|')

movieSFrame = gl.SFrame.read_csv("normalized.csv", sep='|')
#'Runtime','imdbRating','Year','imdbVotes'
knn_model = gl.nearest_neighbors.create(movieSFrame,features=['Genre','Writer','Actors','Director','Country','Language','Rated','Awards'],label='id')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,long,str,long,str,str,str,str,str,str,str,str,float,float,str,str,float,str,str,str,str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


Defaulting to brute force instead of ball tree because there are multiple distance components.


In [218]:
df_filtered_822109 = df_u[(df_u.userid == 822109) & (df_u.likedmovie == '1') & (df_u.movieid.isin(list(df['id'])))]
df_filtered_822109.head()

Unnamed: 0,movieid,userid,rating,date,likedmovie
3013481,571,822109,5,2005-04-25,1
3215550,607,822109,5,2005-04-07,1
4854997,985,822109,5,2005-04-25,1
5644197,1144,822109,5,2005-05-21,1
6202889,1220,822109,5,2005-04-25,1


In [219]:
movieTitles_dict = {}
recommendation_data_frame = pd.DataFrame(columns=['userid','movieid','rating','rank'])

for row in df.itertuples():
    movieTitles_dict[row.id] = row.Title

frame_dict = {}    
for row in df_filtered_822109.itertuples():
    frame_dict[row.movieid] = movieSFrame[movieSFrame['Title'] == movieTitles_dict[row.movieid]]

for key in frame_dict.keys():
    oframe = knn_model.query(frame_dict[key],k=3)
    rec_list = list(oframe['reference_label'])
    rank_list = list(oframe['rank'])
    i = 0
    for movieid in rec_list:
        recommendation_data_frame.loc[len(recommendation_data_frame)]=['822109', movieid, userMoviesAvgRating_dict[822109],rank_list[i]]                               
        i = i + 1
print(recommendation_data_frame)

    userid  movieid  rating  rank
0   822109   1798.0     4.0   1.0
1   822109   4883.0     4.0   2.0
2   822109  16244.0     4.0   3.0
3   822109   1810.0     4.0   1.0
4   822109  11679.0     4.0   2.0
5   822109  15373.0     4.0   3.0
6   822109   2580.0     4.0   1.0
7   822109  14574.0     4.0   2.0
8   822109  12337.0     4.0   3.0
9   822109   2698.0     4.0   1.0
10  822109    262.0     4.0   2.0
11  822109   1892.0     4.0   3.0
12  822109   2594.0     4.0   1.0
13  822109  10505.0     4.0   2.0
14  822109  16882.0     4.0   3.0
15  822109   1905.0     4.0   1.0
16  822109  14203.0     4.0   2.0
17  822109  15818.0     4.0   3.0
18  822109   2095.0     4.0   1.0
19  822109   3026.0     4.0   2.0
20  822109    413.0     4.0   3.0
21  822109   1843.0     4.0   1.0
22  822109   8372.0     4.0   2.0
23  822109  16908.0     4.0   3.0
24  822109   2612.0     4.0   1.0
25  822109  10419.0     4.0   2.0
26  822109  10952.0     4.0   3.0
27  822109   3254.0     4.0   1.0
28  822109   8

In [220]:
recommendation_data_frame['movieid'] = recommendation_data_frame['movieid'].astype(int)
print('===================================')
print('RECOMMENDED MOVIES FOR USER 822109')
print('===================================')
print(recommendation_data_frame)

RECOMMENDED MOVIES FOR USER 822109
    userid  movieid  rating  rank
0   822109     1798     4.0   1.0
1   822109     4883     4.0   2.0
2   822109    16244     4.0   3.0
3   822109     1810     4.0   1.0
4   822109    11679     4.0   2.0
5   822109    15373     4.0   3.0
6   822109     2580     4.0   1.0
7   822109    14574     4.0   2.0
8   822109    12337     4.0   3.0
9   822109     2698     4.0   1.0
10  822109      262     4.0   2.0
11  822109     1892     4.0   3.0
12  822109     2594     4.0   1.0
13  822109    10505     4.0   2.0
14  822109    16882     4.0   3.0
15  822109     1905     4.0   1.0
16  822109    14203     4.0   2.0
17  822109    15818     4.0   3.0
18  822109     2095     4.0   1.0
19  822109     3026     4.0   2.0
20  822109      413     4.0   3.0
21  822109     1843     4.0   1.0
22  822109     8372     4.0   2.0
23  822109    16908     4.0   3.0
24  822109     2612     4.0   1.0
25  822109    10419     4.0   2.0
26  822109    10952     4.0   3.0
27  822109   

In [221]:
def knn_recommendations():
    return recommendation_data_frame