<a href="https://colab.research.google.com/github/Ramya1410/Netflix-like-movie-recommender-system/blob/main/movie_recommendation_system_collaborative_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Loading data files

In [None]:
# Load the two input tables from CSV files given by relative path.
# movies.csv: one row per movie (movieId, title, genres).
movies = pd.read_csv("movies.csv")
# ratings.csv: one row per individual rating (userId, movieId, rating, timestamp).
ratings = pd.read_csv("ratings.csv")

In [None]:
# Summarise the movies table: column names, dtypes and non-null counts.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [None]:
# Summarise the ratings table: column names, dtypes and non-null counts.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [None]:
# (rows, columns) of the movies table.
movies.shape

(10329, 3)

In [None]:
# (rows, columns) of the ratings table.
ratings.shape

(105339, 4)

In [None]:
# Summary statistics for the numeric columns of movies (movieId is the only one).
# Note: statistics of an identifier column describe the ID values, not a quantity.
movies.describe()

Unnamed: 0,movieId
count,10329.0
mean,31924.282893
std,37734.741149
min,1.0
25%,3240.0
50%,7088.0
75%,59900.0
max,149532.0


In [None]:
# Summary statistics for the ratings columns. Only the `rating` row values are a
# real measurement; userId, movieId and timestamp statistics describe ID/time values.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,105339.0,105339.0,105339.0,105339.0
mean,364.924539,13381.312477,3.51685,1130424000.0
std,197.486905,26170.456869,1.044872,180266000.0
min,1.0,1.0,0.5,828565000.0
25%,192.0,1073.0,3.0,971100800.0
50%,383.0,2497.0,3.5,1115154000.0
75%,557.0,5991.0,4.0,1275496000.0
max,668.0,149532.0,5.0,1452405000.0


 From the above table we can conclude that
 
 - The average rating is about 3.5, and the minimum and maximum ratings are 0.5 and 5 respectively.
 - User IDs range from 1 to 668 and movie IDs range from 1 to 149532. These are ID values, not counts: the movies file lists 10329 movies.


In [None]:
# Collect every distinct genre label, in first-seen order.
# Each `genres` cell is a '|'-separated list such as "Crime|Drama"; dict.fromkeys
# de-duplicates while preserving order without the quadratic `in list` scan.
unique_genres = list(dict.fromkeys(
    label
    for genre_field in movies.genres
    for label in genre_field.split('|')
))
# Kept as the string representation of the list, as downstream code expects a str.
genres = str(unique_genres)

# Strip the trailing " (YYYY)" release year from each title.
# A regex anchored at the end of the string is used instead of a fixed slice
# (title[0:-7]) so that titles without a year, or with trailing whitespace,
# are not truncated in the middle of the name.
titles_without_year = movies.title.str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)
# Kept as the string representation of the list, as downstream code expects a str.
movie_title = str(titles_without_year.tolist())

In [None]:
# Enrich every rating row with the title and genres of the rated movie.
# A left join keeps all rating rows, even if a movieId has no match in `movies`.
df = ratings.merge(movies, on='movieId', how='left')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama
1,1,24,1.5,1217895807,Powder (1995),Drama|Sci-Fi
2,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,4.0,1217896556,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,4.0,1217896523,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [None]:
# Keep only the rating rows whose movie title is known
# (rows with a missing `title` are dropped; other columns are not checked).
ratings = df.dropna(subset=['title'])

# Number of ratings received by each title, as a two-column frame
# (title, TotalRatingCount) sorted by title.
movie_rating_Count = (
    ratings.groupby('title')['rating']
    .count()
    .rename('TotalRatingCount')
    .reset_index()
)

movie_rating_Count.head(10)

Unnamed: 0,title,TotalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),1
3,'Til There Was You (1997),3
4,"'burbs, The (1989)",20
5,'night Mother (1986),1
6,(500) Days of Summer (2009),37
7,*batteries not included (1987),11
8,...And Justice for All (1979),10
9,10 (1979),3


In [None]:
# Distribution of the number of ratings per title.
movie_rating_Count.describe()

Unnamed: 0,TotalRatingCount
count,10323.0
mean,10.204301
std,22.834557
min,1.0
25%,1.0
50%,3.0
75%,8.0
max,325.0


In [None]:
# Attach each title's total rating count to every individual rating row.
# Any existing 'TotalRatingCount' column is dropped first so the cell is
# idempotent: without this, re-running the cell would make pandas emit
# 'TotalRatingCount_x' / 'TotalRatingCount_y' and break the later filter.
ratings = (
    ratings
    .drop(columns=['TotalRatingCount'], errors='ignore')
    # Both frames share the key column 'title'; a left join keeps every rating row.
    .merge(movie_rating_Count, on='title', how='left')
)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,TotalRatingCount
0,1,16,4.0,1217897793,Casino (1995),Crime|Drama,84
1,1,24,1.5,1217895807,Powder (1995),Drama|Sci-Fi,38
2,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,207
3,1,47,4.0,1217896556,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,196
4,1,50,4.0,1217896523,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,228


In [None]:
# Minimum number of ratings a title must have to be kept for the similarity model.
popularity_threshold = 100
# Keep only the rating rows that belong to sufficiently popular titles.
rating_popular_movie = ratings.loc[ratings['TotalRatingCount'] >= popularity_threshold]
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,TotalRatingCount
2,1,32,4.0,1217896246,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,207
3,1,47,4.0,1217896556,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,196
4,1,50,4.0,1217896523,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,228
5,1,110,4.0,1217896150,Braveheart (1995),Action|Drama|War,248
6,1,150,3.0,1217895940,Apollo 13 (1995),Adventure|Drama|IMAX,197


In [None]:
# (rows, columns) of the ratings that survive the popularity filter.
rating_popular_movie.shape


(22856, 7)

In [None]:
# Distinct titles that passed the popularity filter.
s = set(rating_popular_movie['title'].unique())
s

{'2001: A Space Odyssey (1968)',
 'Ace Ventura: Pet Detective (1994)',
 'Aladdin (1992)',
 'Alien (1979)',
 'Aliens (1986)',
 "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",
 'American Beauty (1999)',
 'American History X (1998)',
 'American Pie (1999)',
 'Apocalypse Now (1979)',
 'Apollo 13 (1995)',
 'Armageddon (1998)',
 'Austin Powers: International Man of Mystery (1997)',
 'Austin Powers: The Spy Who Shagged Me (1999)',
 'Babe (1995)',
 'Back to the Future (1985)',
 'Back to the Future Part II (1989)',
 'Batman (1989)',
 'Batman Begins (2005)',
 'Batman Forever (1995)',
 'Beautiful Mind, A (2001)',
 'Beauty and the Beast (1991)',
 'Beetlejuice (1988)',
 'Being John Malkovich (1999)',
 'Big Lebowski, The (1998)',
 'Birdcage, The (1996)',
 'Blade Runner (1982)',
 'Bourne Identity, The (2002)',
 'Braveheart (1995)',
 'Breakfast Club, The (1985)',
 'Broken Arrow (1996)',
 "Bug's Life, A (1998)",
 'Casablanca (1942)',
 'Clear and Present Danger (1994)',
 'Clerks (1994)',
 'Cliff

In [None]:
# Build the title x user rating matrix: one row per title, one column per userId.
# Cells with no rating are filled with 0, i.e. "not rated" is encoded as 0.
features = (
    rating_popular_movie
    .pivot_table(values='rating', index='title', columns='userId')
    .fillna(0)
)
features.head()

userId,1,2,3,4,5,6,7,8,9,10,...,659,660,661,662,663,664,665,666,667,668
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,3.0
Ace Ventura: Pet Detective (1994),0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,3.5,0.0
Aladdin (1992),0.0,0.0,3.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
Alien (1979),0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,2.0,0.0,4.0,0.0,4.0
Aliens (1986),0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5


In [None]:
from scipy.sparse import csr_matrix

# Store the pivot table's values as a Compressed Sparse Row matrix;
# row order matches features.index.
features_matrix = csr_matrix(features.to_numpy())
features_matrix

<153x653 sparse matrix of type '<class 'numpy.float64'>'
	with 22856 stored elements in Compressed Sparse Row format>

In [None]:
# NearestNeighbors is the unsupervised neighbour search (not the
# KNeighbors classifier/regressor); it only indexes the rows for lookup.
from sklearn.neighbors import NearestNeighbors

# Brute-force search using cosine distance between the titles' rating vectors.
model = NearestNeighbors(algorithm="brute", metric="cosine")
model.fit(features_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [None]:
# (number of titles, number of users) in the pivot table.
features.shape


(153, 653)

In [None]:
# Pick one movie (a row position of the pivot table) to use as the query.
# A seeded generator makes the pick, and therefore the recommendations printed
# below, reproducible on Restart & Run All; change RANDOM_SEED to try another movie.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
query_index = int(rng.integers(features.shape[0]))    # uniform in [0, number of titles)
print(query_index)

113


In [None]:
# Look up the titles closest to the chosen one.
# The query must be 2-D (1 x n_users), hence the list-style row selection.
query_vector = features.iloc[[query_index]].to_numpy()
# Six neighbours are requested because the query movie is itself in the fitted
# data and comes back as its own nearest neighbour, leaving five recommendations.
distances, indices = model.kneighbors(query_vector, n_neighbors=6)

In [None]:
# Cosine distances of the returned neighbours, shape (1, 6); the first entry is
# ~0 in the output below because the query movie is part of the fitted data.
distances


array([[1.44328993e-15, 3.76512178e-01, 3.98468833e-01, 3.99802741e-01,
        4.18355051e-01, 4.53282727e-01]])

In [None]:
# Row positions (into `features`) of the returned neighbours, aligned with `distances`.
indices


array([[113, 117, 107, 120,  54,  81]], dtype=int64)

In [None]:
# Report the neighbours found for the query movie, in the order kneighbors returned them.
# Position 0 of the result is used only for the heading (it is expected to be the
# query movie itself); positions 1 onwards are printed as the recommendations.
neighbor_rows = indices.ravel()      # 1-D view of the neighbour row positions
neighbor_dists = distances.ravel()   # 1-D view of the matching distances

if neighbor_dists.size:
    print("Recommendations for {0}:\n".format(features.index[query_index]))

for rank, (row, dist) in enumerate(zip(neighbor_rows[1:], neighbor_dists[1:]), start=1):
    print("{0}: {1}, with distance of {2}:".format(rank, features.index[row], dist))

Recommendations for Schindler's List (1993):

1: Shawshank Redemption, The (1994), with distance of 0.37651217818091653:
2: Pulp Fiction (1994), with distance of 0.39846883295454893:
3: Silence of the Lambs, The (1991), with distance of 0.3998027414348021:
4: Forrest Gump (1994), with distance of 0.4183550510675107:
5: Jurassic Park (1993), with distance of 0.4532827268444225:
