Netflix Recommendation Engine Project

In [None]:
# Importing the Basic required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive inside the Colab runtime; the data files read by the
# cells below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the ratings file, keeping only its first two columns and naming them
# Cust_Id and Rating (the file has no header row of its own).
netflix_dataset = pd.read_csv(
    '/content/drive/MyDrive/combined_data_1.txt.zip',
    header=None,
    usecols=[0, 1],
    names=['Cust_Id', 'Rating'],
)
netflix_dataset.head()

In [None]:
# Display the freshly loaded frame.
netflix_dataset

In [None]:
  # Column dtypes of the loaded frame.
  # NOTE(review): this cell is indented and is repeated verbatim by the next cell.
  netflix_dataset.dtypes

In [None]:
# Column dtypes of the loaded frame.
netflix_dataset.dtypes

In [None]:
# (rows, columns) of the loaded frame.
netflix_dataset.shape

In [None]:
# Count the rows whose Rating is missing. The cells below treat this number as
# the movie count (it is subtracted from the customer/rating totals and shown
# as "Movies" in the plot title).
# The Rating column is selected first so the result is a single number:
# DataFrame.isnull().sum() would return one count per column, which turns the
# later subtractions and the plot title into multi-row Series.
movie_count = netflix_dataset['Rating'].isnull().sum()
movie_count

In [None]:
# Number of distinct values in the Cust_Id column. The rows without a rating
# are subtracted from this figure in a later cell.
customer_count = netflix_dataset['Cust_Id'].nunique(dropna=True)

In [None]:
# Show the distinct Cust_Id count computed in the previous cell.
customer_count

In [None]:
# Distinct Cust_Id values minus movie_count (the missing-Rating count from the
# earlier cell), i.e. the customer total without the rows that carry no rating.
distinct_ids = netflix_dataset['Cust_Id'].nunique()
customer_count = distinct_ids - movie_count
customer_count

In [None]:
# Non-null Cust_Id entries minus movie_count (the missing-Rating count) gives
# the number of ratings in the file.
filled_rows = netflix_dataset['Cust_Id'].count()
rating_count = filled_rows - movie_count
rating_count

In [None]:
# How many times each star value was given: one row per distinct Rating,
# with a single 'count' column.
stars = netflix_dataset.groupby('Rating')['Rating'].count().to_frame('count')

In [None]:
# Horizontal bar chart of the rating distribution; the title summarises the
# totals computed in the cells above.
ax = stars.plot.barh(legend=False, figsize=(15, 10))
ax.set_title(f'Total pool: {movie_count} Movies, {customer_count} Customers, {rating_count} ratings given', fontsize=20)
ax.grid(True)

In [None]:
# First step towards a Movie_Id column: flag, for every row, whether its
# Rating is missing. The flagged rows are used further down to decide which
# movie each rating row belongs to.
df_nan = netflix_dataset['Rating'].isnull().to_frame()

In [None]:
# Show the per-row missing-Rating flags.
df_nan

In [None]:
# Keep only the flagged rows, i.e. the rows whose Rating is missing.
df_nan = df_nan.loc[df_nan['Rating']]
df_nan

In [None]:
# (rows, columns) of the missing-Rating subset.
df_nan.shape

In [None]:
# First few rows of the missing-Rating subset.
df_nan.head()

In [None]:
# Move the row labels into a regular column (it is read back as
# df_nan['index'] below) so the positions of the missing-Rating rows are
# available as values.
df_nan = df_nan.reset_index(drop=False)

In [None]:
# Show the missing-Rating rows together with their positions in the 'index' column.
df_nan

In [None]:
# Build one movie id per rating row.
#
# df_nan['index'] holds the positions of the missing-Rating rows. Every row
# between one such row and the next belongs to the same movie, and the rows
# after the last one run to the end of the frame. Movie ids are handed out
# sequentially, starting at 1, in that order.
#
# The previous loop sized the final run with df_nan.iloc[-1, 0-1]; `0-1` is -1,
# so it read the boolean 'Rating' column instead of the 'index' position and
# produced an array longer than the number of rating rows. Computing all run
# lengths at once avoids that slip and the repeated np.append copies.
header_rows = df_nan['index'].to_numpy()

# Rating rows per movie = distance to the next header row (or to the end of the
# frame for the last movie) minus the header row itself.
run_ends = np.append(header_rows, len(netflix_dataset))
ratings_per_movie = np.diff(run_ends) - 1

movie_np = np.repeat(np.arange(1, len(header_rows) + 1), ratings_per_movie)

In [None]:
# Row count of netflix_dataset at this point (the rows with a missing Rating
# are only removed in a later cell).
len(netflix_dataset)

In [None]:
# Display the frame before the missing-Rating rows are removed.
netflix_dataset

In [None]:
# Drop the rows without a rating and attach the movie id computed for each
# remaining row. .copy() makes the filtered result an independent frame, so the
# column assignments below do not write into a slice of the original frame
# (which is what triggers pandas' SettingWithCopyWarning).
netflix_dataset = netflix_dataset[netflix_dataset['Rating'].notnull()].copy()

# Fail with a clear message if the id array is out of step with the rows.
assert len(movie_np) == len(netflix_dataset), (
    f'movie_np has {len(movie_np)} entries for {len(netflix_dataset)} rating rows'
)

netflix_dataset['Movie_Id'] = movie_np.astype(int)
netflix_dataset['Cust_Id'] = netflix_dataset['Cust_Id'].astype(int)
print("Now the dataset will look like: ")
netflix_dataset.head()

In [None]:
# Goal of the next cells: remove customers who rated few movies and movies
# that received few ratings.
# `f` lists the aggregations used for the per-movie and per-customer summaries
# below; the 'count' column is what the trimming thresholds are based on.
f=['count','mean']

In [None]:
# Per-movie count and mean of every remaining column.
# NOTE(review): this also aggregates Cust_Id, and the summary is recomputed on
# Rating alone two cells below, which is the version the trimming uses.
dataset_movie_summary = netflix_dataset.groupby(by='Movie_Id').aggregate(f)

In [None]:
# Show the per-movie summary computed over all columns.
dataset_movie_summary

In [None]:
# Per-movie rating count and mean rating, indexed by Movie_Id.
dataset_movie_summary = netflix_dataset['Rating'].groupby(netflix_dataset['Movie_Id']).agg(f)

In [None]:
# Show the per-movie rating summary (count and mean).
dataset_movie_summary

In [None]:
# 70th percentile of the per-movie rating counts (unrounded).
dataset_movie_summary["count"].quantile(0.7)

In [None]:
# Threshold for movies: the 70th percentile of ratings-per-movie, rounded to a
# whole number. Movies with fewer ratings than this are dropped further down.
movie_count_q70 = dataset_movie_summary['count'].quantile(0.7)
movie_benchmark = round(movie_count_q70, 0)
movie_benchmark

In [None]:
# Number of ratings per movie.
dataset_movie_summary['count']

In [None]:
# Movie ids whose rating count falls below the benchmark.
below_movie_benchmark = dataset_movie_summary['count'] < movie_benchmark
drop_movie_list = dataset_movie_summary.loc[below_movie_benchmark].index
drop_movie_list

In [None]:
# Same summary per customer: how many ratings each Cust_Id gave and their mean.
# Used to identify inactive customers.
dataset_cust_summary = netflix_dataset['Rating'].groupby(netflix_dataset['Cust_Id']).agg(f)
dataset_cust_summary

In [None]:
# Threshold for customers: the 70th percentile of ratings-per-customer,
# rounded to a whole number.
cust_count_q70 = dataset_cust_summary['count'].quantile(0.7)
cust_benchmark = round(cust_count_q70, 0)
cust_benchmark

In [None]:
# Customer ids whose rating count falls below the benchmark.
below_cust_benchmark = dataset_cust_summary['count'] < cust_benchmark
drop_cust_list = dataset_cust_summary.loc[below_cust_benchmark].index
drop_cust_list

In [None]:
# Report the size before trimming, for comparison with the next cell.
shape_before_trim = netflix_dataset.shape
print('The original dataframe has: ', shape_before_trim, 'shape')

In [None]:
# Keep a row only if neither its movie nor its customer is on a drop list.
in_dropped_movie = netflix_dataset['Movie_Id'].isin(drop_movie_list)
in_dropped_cust = netflix_dataset['Cust_Id'].isin(drop_cust_list)
netflix_dataset = netflix_dataset[~(in_dropped_movie | in_dropped_cust)]
print('After the triming, the shape is: {}'.format(netflix_dataset.shape))

In [None]:
# First rows of the trimmed frame.
netflix_dataset.head()

In [None]:
# Reshape the long ratings table into a customer x movie matrix:
# one row per Cust_Id, one column per Movie_Id, cells taken from Rating.
df_p = netflix_dataset.pivot_table(index='Cust_Id', columns='Movie_Id', values='Rating')
print(df_p.shape)

In [None]:
# Display the customer x movie rating matrix.
df_p

In [None]:
# Load the movie titles file (first three columns: id, year, name) and index
# it by Movie_Id so it can be joined against the ratings.
df_title = pd.read_csv(
    "/content/drive/MyDrive/movie_titles.csv",
    encoding='ISO-8859-1',
    header=None,
    usecols=[0, 1, 2],
    names=['Movie_Id', 'Year', 'Name'],
).set_index('Movie_Id')

In [None]:
# First rows of the title lookup table.
df_title.head()

In [None]:
# Install the Surprise recommender library (PyPI package: scikit-surprise),
# which the model-building cells below import.
# NOTE(review): the install is unpinned and sits mid-notebook; consider a
# pinned `%pip install scikit-surprise==<version>` in a cell at the top.
!pip install scikit-surprise

In [None]:
#model building

import math
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [None]:
# Reader describing the rating scale; 1-5 is also Surprise's default.
reader = Reader(rating_scale=(1, 5))

In [None]:
# Build the Surprise dataset from the first 100K rows only (user, item, rating
# column order is what load_from_df expects).
# NOTE(review): Movie_Id was assigned to contiguous runs of rows and no sort is
# applied afterwards, so this slice presumably covers only the first few
# movies. TODO confirm that is intended rather than a random sample.
training_columns = ['Cust_Id', 'Movie_Id', 'Rating']
data = Dataset.load_from_df(netflix_dataset[training_columns].head(100000), reader)

In [None]:
# Estimate out-of-sample error (RMSE and MAE) with 3-fold cross-validation.
svd = SVD()
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3)

# cross_validate() only trains on the training part of each fold. Refit on
# every available rating so the predictions made further down come from a
# model that has seen the whole training set.
svd.fit(data.build_full_trainset())

cv_results

In [None]:
# First rows of the trimmed ratings frame, for reference.
netflix_dataset.head()

In [None]:
# Names of the movies that customer 712664 rated 5 stars.
is_user = netflix_dataset['Cust_Id'] == 712664
is_five_star = netflix_dataset['Rating'] == 5
dataset_712664 = (
    netflix_dataset[is_user & is_five_star]
    .set_index('Movie_Id')
    .join(df_title)['Name']
)
dataset_712664

In [None]:
# Display the title lookup table.
df_title

In [None]:
# Start the recommendation table for customer 712664 from an independent copy
# of the title table, so df_title itself stays untouched.
user_712664 = df_title.copy(deep=True)
user_712664

In [None]:
# Turn the Movie_Id index back into a regular column; the next cells filter
# and predict on it.
user_712664 = user_712664.reset_index(drop=False)
user_712664

In [None]:
# Leave out the movies that were removed from the ratings data for having too
# few ratings.
is_dropped_movie = user_712664['Movie_Id'].isin(drop_movie_list)
user_712664 = user_712664.loc[~is_dropped_movie]
user_712664

In [None]:
# Predict customer 712664's rating for every remaining title.
# assign() returns a new frame, so nothing is written into the filtered slice
# produced by the previous cell (avoids pandas' SettingWithCopyWarning).
# Movie_Id is only needed to make the prediction and is dropped afterwards.
user_712664 = (
    user_712664
    .assign(
        Estimate_Score=lambda titles: titles['Movie_Id'].apply(
            lambda movie_id: svd.predict(712664, movie_id).est
        )
    )
    .drop('Movie_Id', axis=1)
)

In [None]:
# Order the titles from lowest to highest estimated score and print them.
user_712664 = user_712664.sort_values(by='Estimate_Score', ascending=True)
print(user_712664)

In [None]:
# First five rows after the ascending sort above, i.e. the five lowest
# estimated scores.
user_712664.head(5)

In [None]:
# Re-sort from highest to lowest estimated score and print the top five
# recommendations for customer 712664.
user_712664 = user_712664.sort_values(by='Estimate_Score', ascending=False)
top_five = user_712664.head(5)
print(top_five)