In [1]:

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from scipy.sparse import csr_matrix



import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/movierecommenderdataset/movies.csv
/kaggle/input/movierecommenderdataset/ratings.csv




# Loading Data

In [2]:
movie_ratings = pd.read_csv('/kaggle/input/movierecommenderdataset/ratings.csv')

In [3]:
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
movies = pd.read_csv('/kaggle/input/movierecommenderdataset/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


**Combining the two datasets**

In [5]:

movie_data = movie_ratings.merge(movies, on ='movieId')
movie_data.head()


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [6]:
#Finding the movie with highest rating
max_rating_index = movie_data['rating'].idxmax()
highest_rating = movie_data.loc[max_rating_index]
print(highest_rating)


userId                                 1
movieId                               47
rating                               5.0
timestamp                      964983815
title        Seven (a.k.a. Se7en) (1995)
genres                  Mystery|Thriller
Name: 3, dtype: object


# Preparing the data

In [7]:
#item based filtering
user_item_matrix = movie_ratings.pivot(columns = ['movieId'], index =['userId'], values ='rating').fillna(0)
user_item_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
#optimising the above sparse matrix using function from scipy library
matrix_values = csr_matrix(user_item_matrix)
matrix_values

<610x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in Compressed Sparse Row format>

# Model Training
Here, we are going to use the `Collaborative Filtering` technique to make a movie recommendation model. This can be done using the either of algoritms: k nearest-neighbors or SVM(Support Vector Machine).In this case for the purpose of simplicity we are going to use `knn`

In [9]:
#Using Cosine similarity
knn_model= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

knn_model.fit(matrix_values)

In [10]:
#movie recommender function which takes matrix_values, no of recommendations and a movie from the dataset as arguments

def recommender( movie_name, matrix, n_recs):
    idx = process.extractOne(movie_name,movies['title'])[2] 
    print(idx)
    print('Movie selected: ',movies['title'][idx], 'Index: ', idx)
    print("Searching Recommendations......")
    distance, indices = knn_model.kneighbors(matrix[idx], n_neighbors= n_recs)
    for i in indices:
        print(movies['title'][i].where(i!=idx)) # using where to avoid printing the selected movie

In [11]:
recommender('Jumanji', matrix_values, 5)

1
Movie selected:  Jumanji (1995) Index:  1
Searching Recommendations......
1                         NaN
365       Black Beauty (1994)
416    Jimmy Hollywood (1994)
377        Clean Slate (1994)
549        Dragonheart (1996)
Name: title, dtype: object


Hence, here we used collaborative filtering approach to predict top 5 movies similar to Batman

Now, lets input a movie which is not present in the current dataset

In [12]:
recommender('Superman', matrix_values, 5)

1986
Movie selected:  Superman (1978) Index:  1986
Searching Recommendations......


IndexError: row index (1986) out of range

We can clearly see that the above code throws an error because the movie 'Superman' is not present in the dataset.This is because in collaborative filtering, recommendations are based on user-item interactions, user-ratings in case of our model.Hence, when we give 'Superman' as an input the model has no knowledge about its ratings,which makes it impossible for the model to make any recommendation.