In [1]:
# Set the working directory so the relative CSV paths used later resolve.
# The folder can be overridden per machine through the DSP22_WORKDIR
# environment variable; the original course folder remains the default,
# so behavior is unchanged when the variable is not set.
import os

WORK_DIR = os.environ.get('DSP22_WORKDIR', 'E:\\Imarticus\\DSP22\\python')
os.chdir(WORK_DIR)
print(os.getcwd())

E:\Imarticus\DSP22\python


In [2]:
# Import necessary libraries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
from IPython.display import Image

In [None]:
#------------#
# Some Basics
#------------#

# A content-based approach uses a set of discrete characteristics of an
# item in order to recommend additional items with similar properties.

# Collaborative filtering approach builds a model from a user’s past behaviors 
# (items previously purchased or selected and/or numerical ratings given to those items) 
# as well as similar decisions made by other users. This model is then used to 
# predict items (or ratings for items) that the user may have an interest in.

In [6]:
# Load the movie catalogue and the user ratings from CSV, keeping only the
# columns needed downstream and pinning compact dtypes at read time.
movie_columns = {'movieId': 'int32', 'title': 'str'}
df_movies = pd.read_csv('movies.csv', usecols=list(movie_columns), dtype=movie_columns)

rating_columns = {'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'}
df_ratings = pd.read_csv('ratings.csv', usecols=list(rating_columns), dtype=rating_columns)

In [7]:
# Preview the first rows of the movies table (movieId and title columns)
df_movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
# Preview the first rows of the ratings table (one row per userId/movieId rating)
df_ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [21]:
# Show the first three ratings recorded for movieId 2
is_movie_2 = df_ratings['movieId'] == 2
df_ratings.loc[is_movie_2].head(3)

Unnamed: 0,userId,movieId,rating
560,6,2,4.0
1026,8,2,4.0
1773,18,2,3.0


In [14]:
# Report (rows, columns) for the movies table, then for the ratings table
for table in (df_movies, df_ratings):
    print(table.shape)

(9742, 2)
(100836, 3)


In [None]:
# KNN does not make any assumptions on the underlying data distribution but it 
# relies on item feature similarity. When KNN makes inference about a movie, 
# KNN will calculate the “distance” between the target movie and every other movie 
# in its database, then it ranks its distances and returns the top K nearest neighbor 
# movies as the most similar movie recommendations.

In [None]:
# Now, how do we feed the dataframe of ratings into a KNN model? First, we need to transform 
# the dataframe of ratings into a proper format that can be consumed by a KNN model

# Notice that in df_ratings both 'userId' and 'movieId' repeat across rows (one row per rating).
# A dataset with this structure will not help in finding the nearest neighbors.
# What if we pivot this table so that the unique movie IDs become rows and the user IDs become columns?
# That will help find related movies (which is our objective: to recommend relevant movies).

# If we had User IDs as rows, then kNN would find similar users. Well, even with that,
# we could find the movies that those users liked and recommend movies. The problem with this 
# approach is that, not every user rates movies but almost every movie is rated by someone or the other

In [15]:
from scipy.sparse import csr_matrix

# Reshape the long ratings table into a grid with one row per movieId and one
# column per userId; cells with no rating come out of the pivot as NaN.
ratings_grid = df_ratings.pivot(index='movieId', columns='userId', values='rating')

# Treat "not rated" as 0 so every movie has a complete numeric feature vector.
df_movie_features = ratings_grid.fillna(0)

# Hand the grid to scipy as a CSR sparse matrix. Row order matches
# df_movie_features.index, so matrix row i corresponds to the i-th movieId there.
mat_movie_features = csr_matrix(df_movie_features.values)

In [18]:
# Preview the first two movie rows of the pivoted grid (columns are userIds, 0 = no rating)
df_movie_features.head(2)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0


In [22]:
# (number of movies that appear in the ratings table, number of users)
df_movie_features.shape

(9724, 610)

In [None]:
# Notice that we now have a fairly high-dimensional dataset to deal with (one feature per user).
# The pivot was necessary, but high dimensionality is not good for kNN.

# The cosine similarity is advantageous because even if the two similar vectors or
# documents are far apart by the Euclidean distance (due to the size of the document), 
# chances are they may still be oriented closer together. 
# The smaller the angle, higher the cosine similarity.

In [23]:
from sklearn.neighbors import NearestNeighbors

# Neighbor search configuration: cosine distance, exhaustive (brute-force)
# comparison, 20 neighbors per query, and n_jobs=-1 for parallel search.
knn_settings = dict(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn = NearestNeighbors(**knn_settings)

In [24]:
# Fit the neighbor model on the sparse movie-by-user rating matrix
model_knn.fit(mat_movie_features)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [25]:
# Query the nearest neighbors (n_neighbors=20 was set on the model) of every movie row.
# The query points are the same rows the model was fitted on, so each result row
# normally begins with the movie itself at distance 0.
distances, indices = model_knn.kneighbors(mat_movie_features)

In [26]:
# Neighbor table: row i holds the row positions (in mat_movie_features) of the
# movies closest to movie row i, ordered from nearest to farthest
indices

array([[   0, 2353,  418, ..., 1182,   31,  277],
       [   1,  322,  436, ...,  217,  138,  615],
       [   2, 2578,  607, ...,  619,   71,  363],
       ...,
       [9599, 9439, 9606, ..., 9259, 9155, 9576],
       [9599, 9439, 9606, ..., 9259, 9155, 9576],
       [9723, 9665, 9603, ..., 8448, 7876, 6805]], dtype=int64)

In [27]:
# Row positions of the neighbors found for the first movie row
indices[0]

array([   0, 2353,  418,  615,  224,  314,  322,  910,  546,  963,  968,
       3189,  506,  123,  257,  897,  815, 1182,   31,  277], dtype=int64)

In [28]:
# Cosine distances to those neighbors, aligned element-by-element with indices[0]
distances[0]

array([0.        , 0.42739868, 0.4343632 , 0.4357382 , 0.44261175,
       0.452904  , 0.4588548 , 0.4589107 , 0.46108717, 0.46583116,
       0.4696188 , 0.47202337, 0.47214073, 0.4796753 , 0.48196733,
       0.48581868, 0.48775423, 0.4906829 , 0.49140745, 0.49145526],
      dtype=float32)

In [29]:
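# With the existing fitted model, you can get the top 5 related movies for a
# given movie. Note that the values in `indices` are row positions in
# df_movie_features (9724 rated movies), not row positions in df_movies
# (9742 movies), so do not look them up in df_movies directly. Instead:
#   1. find the movie's movieId in df_movies,
#   2. locate that movieId's row position in df_movie_features.index,
#   3. take that row of `indices` (skip the movie itself, which normally
#      comes first with distance 0),
#   4. translate each neighbor position back to a movieId through
#      df_movie_features.index, and look up those movieIds in df_movies
#      to get the movie names.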