In [297]:
import pandas as pd
import numpy as np
import re
import math
import graphlab as gl
# Run the line below once, the first time this notebook is used (graphlab is imported as gl)
#gl.get_dependencies()

# Dataframe with IMDB Movie feature Dataset containing Netflix Movie ID
df = pd.read_csv('omdb.txt', sep="|")

#Cleansing the IMDB Data which contains netflix movie id
#filter null rows.
df = df[df.Title != 'null']
#pick first year for Series.
df['Year'] = df['Year'].str.replace('–[0-9]*$','')
#Correct Rated column
df.Rated = df.Rated.str.replace('UNRATED','NOT RATED')
df.Rated = df.Rated.str.replace('NR','NOT RATED')
df.Rated = df.Rated.str.replace('Not Rated','NOT RATED')
df.Rated = df.Rated.str.replace('Unrated','NOT RATED')

#remove commas from imdbvotes
df.imdbVotes = df.imdbVotes.str.replace(',','')

# classify runtime by hours
df.Runtime = df.Runtime.str.replace(' min','')
df.Runtime = df.Runtime.str.replace(',','')
df.Runtime = df.Runtime.str.replace('1 h 30','90')
df.Runtime = df.Runtime.str.replace('3 h 48','223')
df.Runtime = df.Runtime.str.replace('1 h 20','80')

df.Runtime = pd.to_numeric(df.Runtime, errors='coerce')
minRuntime = min(df.Runtime)
maxRuntime = max(df.Runtime)
#Normalize the Runtime column
df['Runtime'] = df['Runtime'].apply(lambda x: 4*((x - minRuntime)/(maxRuntime - minRuntime)) + 1)
df['Runtime'] = df['Runtime'].apply(lambda x: np.ceil(x))

df.imdbVotes = pd.to_numeric(df.imdbVotes, errors='coerce')
minImdbVotes = min(df.imdbVotes)
maxImdbVotes = max(df.imdbVotes)
#Normalize the imdbVotes column
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: 4*((x - minImdbVotes)/(maxImdbVotes - minImdbVotes)) + 1)
df['imdbVotes'] = df['imdbVotes'].apply(lambda x: np.ceil(x))

df.imdbRating = pd.to_numeric(df.imdbRating, errors='coerce')
minImdbRating = min(df.imdbRating)
maxImdbRating = max(df.imdbRating)
#Normalize the imdbRating column
df['imdbRating'] = df['imdbRating'].apply(lambda x: 4*((x - minImdbRating)/(maxImdbRating - minImdbRating)) + 1)
df['imdbRating'] = df['imdbRating'].apply(lambda x: np.ceil(x))


df.Year = pd.to_numeric(df.Year, errors='coerce')
df['Year'] = df['Year'].astype(float)
minYear = min(df.Year)
maxYear = max(df.Year)
#Normalize the Year column
df['Year'] = df['Year'].apply(lambda x: 4*((x - minYear)/(maxYear - minYear)) + 1)
df['Year'] = df['Year'].apply(lambda x: np.ceil(x))

df = df[df['Genre'].notnull()]
df = df[df['Writer'].notnull()]
df = df[df['Actors'].notnull()]
df = df[df['Director'].notnull()]
df = df[df['Country'].notnull()]
df = df[df['Language'].notnull()]

# Dataframe with Netflix data containing userid rating and movie id
df_u= pd.read_csv('netflix-user-movie.txt', sep=",")
df_u.columns = ['movieid','userid','rating', 'date']
df_u.head()

Unnamed: 0,movieid,userid,rating,date
0,1,822109,5,2005-05-13
1,1,885013,4,2005-10-19
2,1,30878,4,2005-12-26
3,1,823519,3,2004-05-03
4,1,893988,3,2005-11-17


In [298]:
userRating_dict = {}
userMoviesCount_dict = {}
userMoviesAvgRating_dict = {}
userMoviesLiked = {}

#Iterate the dataframe and build aggregate ratings and movie counts for each user
for row in df_u.itertuples():
    userRating_dict[row.userid] = userRating_dict.get(row.userid,0) + row.rating
    userMoviesCount_dict[row.userid] = userMoviesCount_dict.get(row.userid,0) + 1
 
#Compute the average rating
for userid in userRating_dict:
    userMoviesAvgRating_dict[userid] = userRating_dict[userid]/userMoviesCount_dict[userid]

In [299]:
#list to add to the dataframe
like_list = []

#Iterate the dataframe and find if the movie is liked by the user or not
#A movie is considered liked if the rating given by the user is greater than or equal to his/her average rating.
for row in df_u.itertuples():
    like_list.extend('1' if row.rating >= userMoviesAvgRating_dict[row.userid] else '0')
    
#Add a new likedmovie to the dataframe
df_u = df_u.assign(likedmovie = like_list)
df_u.tail()

Unnamed: 0,movieid,userid,rating,date,likedmovie
24049260,4499,2591364,2,2005-02-16,0
24049261,4499,1791000,2,2005-02-10,0
24049262,4499,512536,5,2005-07-27,1
24049263,4499,988963,3,2005-12-20,1
24049264,4499,1704416,3,2004-06-02,1


In [300]:
#Number of IMDB Dataframe records
len(df)

9156

In [301]:
#Filling unavailable numeric data with 0
df['Runtime'].fillna(0, inplace=True)
df['imdbRating'].fillna(0, inplace=True)
df['Year'].fillna(0, inplace=True)
df['imdbVotes'].fillna(0, inplace=True)

In [302]:
#dump the pandas dataframe to a file and load it to an SFrame.
#Graphlab Knn works only on Sframes. A regular conversion for reasons unknown does not work.
df.to_csv('normalized.csv', sep='|')
movieSFrame = gl.SFrame.read_csv("normalized.csv", sep='|')

#Build the KNN model
knn_model = gl.nearest_neighbors.create(movieSFrame,features=['Genre','Writer','Actors','Director','Country','Language','Rated','Awards','Runtime','imdbRating','Year','imdbVotes'],label='id')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[long,long,str,float,str,str,str,str,str,str,str,str,float,float,str,str,float,str,str,str,str,long]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


Defaulting to brute force instead of ball tree because there are multiple distance components.


In [303]:
df_filtered_822109 = df_u[(df_u.userid == 822109) & (df_u.likedmovie == '1') & (df_u.movieid.isin(list(df['id'])))]
df_filtered_822109

Unnamed: 0,movieid,userid,rating,date,likedmovie
3013481,571,822109,5,2005-04-25,1
3215550,607,822109,5,2005-04-07,1
4854997,985,822109,5,2005-04-25,1
5644197,1144,822109,5,2005-05-21,1
6202889,1220,822109,5,2005-04-25,1
8958037,1798,822109,4,2005-04-07,1
9164811,1810,822109,4,2005-04-25,1
9312559,1843,822109,4,2005-04-25,1
9686126,1905,822109,5,2005-05-21,1
10182695,1975,822109,4,2005-04-25,1


In [304]:
movieTitles_dict = {}

#Build a map of movieid and Title
for row in df.itertuples():
    movieTitles_dict[row.id] = row.Title

#Build an Sframe for each movie which user 822109 liked. Sframes are necessary for querying the Knn model.
frame_dict = {}    
for row in df_filtered_822109.itertuples():
    frame_dict[row.movieid] = movieSFrame[movieSFrame['Title'] == movieTitles_dict[row.movieid]]

In [310]:
#initialize the result dataframe which will contain recommendations
recommendation_data_frame = pd.DataFrame(columns=['userid','movieid','predicted_rating','rank'])

# Find movies similar to the movies which user 822109 liked by querying the Knn model   
for key in frame_dict.keys():
    oframe = knn_model.query(frame_dict[key],k=4)
    print(oframe)
    rec_list = list(oframe['reference_label'])
    rank_list = list(oframe['rank'])
    #Skip the index 0 as it is the movie which was already watched by user 822109
    i = 0
    for movieid in rec_list:
        if rank_list[i] == 1:
            i = i + 1
            continue
        rating_norm_factor = 5 - userMoviesAvgRating_dict[822109]
        if rank_list[i] - 1 == 1 :
            predicted_rating = 0.5 * rating_norm_factor;# Using a weightage factor of 0.5 for rank 1
        if rank_list[i] - 1 == 2 : 
            predicted_rating = 0.35 * rating_norm_factor;# Using a weightage factor of 0.35 for rank 2
        if rank_list[i] - 1 == 3 : 
            predicted_rating = 0.15 * rating_norm_factor;# Using a weightage factor of 0.15 for rank 3            
        recommendation_data_frame.loc[len(recommendation_data_frame)]=['822109', movieid, userMoviesAvgRating_dict[822109] + predicted_rating,rank_list[i]-1]
        i = i + 1

+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |       1798      |      0.0      |  1   |
|      0      |       4883      |      77.0     |  2   |
|      0      |      16244      | 77.4142135624 |  3   |
|      0      |      14973      |      79.0     |  4   |
+-------------+-----------------+---------------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       1810      |   0.0    |  1   |
|      0      |      15373      |   94.0   |  2   |
|      0      |      11679      |   95.0   |  3   |
|      0      |       2358      |   96.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       2580      |   0.0    |  1   |
|      0      |      14574      |  113.0   |  2   |
|      0      |      12337      |  121.0   |  3   |
|      0      |      17308      |  121.0   |  4   |
|      1      |       2698      |   0.0    |  1   |
|      1      |       262       |   80.0   |  2   |
|      1      |       1892      |   86.0   |  3   |
|      1      |       1262      |   89.0   |  4   |
+-------------+-----------------+----------+------+
[8 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       2594      |   0.0    |  1   |
|      0      |      10505      |   67.0   |  2   |
|      0      |      16882      |   73.0   |  3   |
|      0      |      17482      |   73.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |       1905      |      0.0      |  1   |
|      0      |      14203      | 165.828427125 |  2   |
|      0      |      15818      | 172.236067977 |  3   |
|      0      |      12317      | 174.236067977 |  4   |
+-------------+-----------------+---------------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       2095      |   0.0    |  1   |
|      0      |       413       |   82.0   |  2   |
|      0      |       3026      |   82.0   |  3   |
|      0      |      14652      |   82.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       1843      |   0.0    |  1   |
|      0      |       8372      |  110.0   |  2   |
|      0      |      16908      |  110.0   |  3   |
|      0      |      10059      |  112.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       2612      |   0.0    |  1   |
|      0      |      10419      |  151.0   |  2   |
|      0      |      10952      |  155.0   |  3   |
|      0      |      17129      |  155.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       3254      |   0.0    |  1   |
|      0      |       8181      |  107.0   |  2   |
|      0      |      16922      |  110.0   |  3   |
|      0      |      11234      |  112.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       1975      |   0.0    |  1   |
|      0      |      10094      |  129.0   |  2   |
|      0      |       9189      |  130.0   |  3   |
|      0      |        58       |  131.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |       571       |      0.0      |  1   |
|      0      |       9037      | 73.4494897428 |  2   |
|      0      |       2478      | 74.4494897428 |  3   |
|      0      |      12468      |      80.0     |  4   |
+-------------+-----------------+---------------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       1220      |   0.0    |  1   |
|      0      |       7882      |   98.0   |  2   |
|      0      |      12191      |  100.0   |  3   |
|      0      |       3314      |  104.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |       3106      |      0.0      |  1   |
|      0      |       9037      |      85.0     |  2   |
|      0      |       7617      |      87.0     |  3   |
|      0      |       571       | 90.2360679775 |  4   |
+-------------+-----------------+---------------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       4472      |   0.0    |  1   |
|      0      |       9995      |   84.0   |  2   |
|      0      |      15058      |   90.0   |  3   |
|      0      |       2000      |   99.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |       4306      |      0.0      |  1   |
|      0      |       3333      | 66.2360679775 |  2   |
|      0      |      12145      | 95.4494897428 |  3   |
|      0      |       8376      | 101.236067977 |  4   |
+-------------+-----------------+---------------+------+
[4 rows x 4 columns]



+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |       985       |      0.0      |  1   |
|      0      |      13391      |     180.0     |  2   |
|      0      |       5939      |     186.0     |  3   |
|      0      |       8387      | 187.414213562 |  4   |
|      1      |      12124      |      0.0      |  1   |
|      1      |       4085      |      65.0     |  2   |
|      1      |       6789      |      80.0     |  3   |
|      1      |       6435      |      81.0     |  4   |
|      2      |      12173      |      0.0      |  1   |
|      2      |       6708      |     138.0     |  2   |
+-------------+-----------------+---------------+------+
[12 rows x 4 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.


+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |       607       |      0.0      |  1   |
|      0      |      16265      | 79.4494897428 |  2   |
|      0      |       571       | 82.2360679775 |  3   |
|      0      |       2880      |      85.0     |  4   |
+-------------+-----------------+---------------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       2152      |   0.0    |  1   |
|      0      |      11490      |  130.0   |  2   |
|      0      |      15472      |  135.0   |  3   |
|      0      |      12090      |  137.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       3433      |   0.0    |  1   |
|      0      |       1914      |   69.0   |  2   |
|      0      |       1546      |   74.0   |  3   |
|      0      |       3317      |   74.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       4330      |   0.0    |  1   |
|      0      |       331       |   87.0   |  2   |
|      0      |       7716      |   91.0   |  3   |
|      0      |      14480      |   94.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       3538      |   0.0    |  1   |
|      0      |       8827      |   94.0   |  2   |
|      0      |      13389      |   98.0   |  3   |
|      0      |      15985      |   99.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+---------------+------+
| query_label | reference_label |    distance   | rank |
+-------------+-----------------+---------------+------+
|      0      |       3825      |      0.0      |  1   |
|      0      |       2372      | 127.414213562 |  2   |
|      0      |       5293      |     139.0     |  3   |
|      0      |      14527      |     139.0     |  4   |
+-------------+-----------------+---------------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       1144      |   0.0    |  1   |
|      0      |      17357      |  106.0   |  2   |
|      0      |      11789      |  111.0   |  3   |
|      0      |      12593      |  112.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



+-------------+-----------------+----------+------+
| query_label | reference_label | distance | rank |
+-------------+-----------------+----------+------+
|      0      |       3860      |   0.0    |  1   |
|      0      |      10928      |  126.0   |  2   |
|      0      |      15393      |  126.0   |  3   |
|      0      |       2528      |  138.0   |  4   |
+-------------+-----------------+----------+------+
[4 rows x 4 columns]



In [311]:
recommendation_data_frame['movieid'] = recommendation_data_frame['movieid'].astype(int)
print('===================================')
print('RECOMMENDED MOVIES FOR USER 822109')
print('===================================')
print(recommendation_data_frame)

RECOMMENDED MOVIES FOR USER 822109
    userid  movieid  predicted_rating  rank
0   822109     4883              4.50   1.0
1   822109    16244              4.35   2.0
2   822109    14973              4.15   3.0
3   822109    15373              4.50   1.0
4   822109    11679              4.35   2.0
5   822109     2358              4.15   3.0
6   822109    14574              4.50   1.0
7   822109    12337              4.35   2.0
8   822109    17308              4.15   3.0
9   822109      262              4.50   1.0
10  822109     1892              4.35   2.0
11  822109     1262              4.15   3.0
12  822109    10505              4.50   1.0
13  822109    16882              4.35   2.0
14  822109    17482              4.15   3.0
15  822109    14203              4.50   1.0
16  822109    15818              4.35   2.0
17  822109    12317              4.15   3.0
18  822109      413              4.50   1.0
19  822109     3026              4.35   2.0
20  822109    14652              4.15   3

In [307]:
def knn_recommendations():
    return recommendation_data_frame