In [1]:

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
from scipy.sparse import csr_matrix



import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/movierecommenderdataset/movies.csv
/kaggle/input/movierecommenderdataset/ratings.csv




In [2]:
#Loading data
movie_ratings = pd.read_csv('/kaggle/input/movierecommenderdataset/ratings.csv')

In [3]:
print(movie_ratings.head())

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


In [4]:
movies = pd.read_csv('/kaggle/input/movierecommenderdataset/movies.csv')
print(movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


**Combining the two datasets**

In [5]:
#Combining the two datasets
movie_data = movie_ratings.merge(movies, on ='movieId')
print(movie_data.head())


   userId  movieId  rating  timestamp                        title  \
0       1        1     4.0  964982703             Toy Story (1995)   
1       1        3     4.0  964981247      Grumpier Old Men (1995)   
2       1        6     4.0  964982224                  Heat (1995)   
3       1       47     5.0  964983815  Seven (a.k.a. Se7en) (1995)   
4       1       50     5.0  964982931   Usual Suspects, The (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                               Comedy|Romance  
2                        Action|Crime|Thriller  
3                             Mystery|Thriller  
4                       Crime|Mystery|Thriller  


In [6]:
#Finding the movie with highest rating
max_rating_index = movie_data['rating'].idxmax()
highest_rating = movie_data.loc[max_rating_index]
print(highest_rating)


userId                                 1
movieId                               47
rating                               5.0
timestamp                      964983815
title        Seven (a.k.a. Se7en) (1995)
genres                  Mystery|Thriller
Name: 3, dtype: object


In [7]:
#item based filtering
user_item_matrix = movie_ratings.pivot(columns = ['movieId'], index =['userId'], values ='rating').fillna(0)
print(user_item_matrix)

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     0.0     4.0     0.0     0.0     4.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
...         ...     ...     ...     ...     ...     ...     ...     ...   
606         2.5     0.0     0.0     0.0     0.0     0.0     2.5     0.0   
607         4.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
608         2.5     2.0     2.0     0.0     0.0     0.0     0.0     0.0   
609         3.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
610         5.0     0.0     0.0     0.0     0.0     5.0     0.0     0.0   

movieId  9       10     

In [8]:
#optimising the above sparse matrix
matrix_values = csr_matrix(user_item_matrix)
print(matrix_values)

  (0, 0)	4.0
  (0, 2)	4.0
  (0, 5)	4.0
  (0, 43)	5.0
  (0, 46)	5.0
  (0, 62)	3.0
  (0, 89)	5.0
  (0, 97)	4.0
  (0, 124)	5.0
  (0, 130)	5.0
  (0, 136)	5.0
  (0, 184)	5.0
  (0, 190)	3.0
  (0, 197)	5.0
  (0, 201)	4.0
  (0, 224)	5.0
  (0, 257)	3.0
  (0, 275)	3.0
  (0, 291)	5.0
  (0, 307)	4.0
  (0, 314)	4.0
  (0, 320)	5.0
  (0, 325)	4.0
  (0, 367)	3.0
  (0, 384)	4.0
  :	:
  (609, 9238)	5.0
  (609, 9246)	4.5
  (609, 9256)	4.0
  (609, 9268)	5.0
  (609, 9274)	3.5
  (609, 9279)	3.5
  (609, 9282)	3.0
  (609, 9288)	3.0
  (609, 9304)	3.0
  (609, 9307)	2.5
  (609, 9312)	4.5
  (609, 9317)	3.0
  (609, 9324)	3.0
  (609, 9339)	4.0
  (609, 9341)	4.0
  (609, 9348)	3.5
  (609, 9371)	3.5
  (609, 9372)	3.5
  (609, 9374)	5.0
  (609, 9415)	4.0
  (609, 9416)	4.0
  (609, 9443)	5.0
  (609, 9444)	5.0
  (609, 9445)	5.0
  (609, 9485)	3.0


In [9]:
#Using Cosine similarity
knn_model= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)

knn_model.fit(matrix_values)

In [10]:
#movie recommender function which takes matrix_values, no of recommendations and a movie from the dataset as arguments

def recommender( movie_name, matrix, n_recs):
    idx = process.extractOne(movie_name,movies['title'])[2] 
    print(idx)
    print('Movie selected: ',movies['title'][idx], 'Index: ', idx)
    print("Searching Recommendations......")
    distance, indices = knn_model.kneighbors(matrix[idx], n_neighbors= n_recs)
    for i in indices:
        print(movies['title'][i].where(i!=idx)) # using where to avoid printing the selected movie

In [11]:
recommender('Batman', matrix_values, 5)

126
Movie selected:  Batman Forever (1995) Index:  126
Searching Recommendations......
126                              NaN
187                 Cure, The (1995)
395                 Firm, The (1993)
411           In the Army Now (1994)
517    Great Day in Harlem, A (1994)
Name: title, dtype: object


Hence, here we used collaborative filtering approach to predict top 5 movies similar to Batman