In [1]:
# importing the required packages

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# connecting the drive for importing the dataset:

from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Loaing the dataset

netflix_dataset = pd.read_csv('/content/drive/MyDrive/Intellipat_ML_Projects/Copy of combined_data_1.txt.zip',header=None,usecols=[0,1],names=['Cust_Id','Ratings'])

In [4]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0
...,...,...
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0


In [5]:
# first 5 records:

netflix_dataset.head()

Unnamed: 0,Cust_Id,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [6]:
# Last 5 rows:

netflix_dataset.tail()

Unnamed: 0,Cust_Id,Ratings
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0
24058262,1704416,3.0


In [7]:
# checking datatypes of each column:

netflix_dataset.dtypes

Cust_Id     object
Ratings    float64
dtype: object

In [8]:
# short summary of dataset:

netflix_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24058263 entries, 0 to 24058262
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   Cust_Id  object 
 1   Ratings  float64
dtypes: float64(1), object(1)
memory usage: 367.1+ MB


In [9]:
# checking nulls:

netflix_dataset.isnull().sum()

Cust_Id       0
Ratings    4499
dtype: int64

***Understanding The Dataset:***

In [10]:
netflix_dataset.head()

Unnamed: 0,Cust_Id,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [11]:
#. checking the nulls in 'Rating' column to know the movie counts:
# get the total number of movie count is basicaly to find the sum of NAN values in Ratings column

movie_count = netflix_dataset.isnull().sum()[1]
movie_count

4499

In [12]:
# identifying the unique Customers:

Cust_count = netflix_dataset['Cust_Id'].nunique()
Cust_count

475257

In [13]:
Cust_count-movie_count

470758

In [14]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0
...,...,...
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0


In [15]:
# Creating the new 'Movie_Id' column:

movie_id = None
movie_np = []

#Iterate over dataset in Cust_Id
for cust_Id in netflix_dataset['Cust_Id']:
  if ':' in cust_Id:
     #Update the current movie Id
    movie_id = int(cust_Id.replace(':',''))
  movie_np.append(movie_id)

In [70]:
# movie_np

In [17]:
# Adding the new column to the DataFrame:

netflix_dataset['Movie_Id'] = movie_np

In [18]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
0,1:,,1
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


In [19]:
# dropping the rows in which ':','NaN' present:
netflix_dataset = netflix_dataset[netflix_dataset['Ratings'].notna()]
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


In [20]:
# checking the dtype:
netflix_dataset.dtypes

Cust_Id      object
Ratings     float64
Movie_Id      int64
dtype: object

In [21]:
# changing the datatype of 'Cust_Id' column:

netflix_dataset['Cust_Id'] = netflix_dataset['Cust_Id'].astype(int)

In [22]:
# checking the dtype again

netflix_dataset.dtypes

Cust_Id       int64
Ratings     float64
Movie_Id      int64
dtype: object

In [23]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


In [24]:
# pre-filtering:

# will remove all those movie_Id which have rated very less in number
# will remove all those Customers who rated very less movies

In [25]:
# setting a bench mark for movie:

dataset_movie_summary = netflix_dataset.groupby('Movie_Id')['Ratings'].agg(['count'])
dataset_movie_summary

Unnamed: 0_level_0,count
Movie_Id,Unnamed: 1_level_1
1,547
2,145
3,2012
4,142
5,1140
...,...
4495,614
4496,9519
4497,714
4498,269


In [26]:
movie_benchmark = round(dataset_movie_summary['count'].quantile(0.6),0)#less than 60% value rated movie needs to be removed
movie_benchmark

908.0

In [27]:
drop_movie_list = dataset_movie_summary[dataset_movie_summary['count']<movie_benchmark].index
drop_movie_list

Index([   1,    2,    4,    7,    9,   10,   11,   12,   13,   14,
       ...
       4480, 4481, 4486, 4487, 4491, 4494, 4495, 4497, 4498, 4499],
      dtype='int64', name='Movie_Id', length=2699)

In [28]:
len(drop_movie_list)

2699

In [29]:
movie_count - len(drop_movie_list)

1800

In [30]:
# now will remove all those Cust_Id who are in-active(rated very less movies)

dataset_cust_Id_summary = netflix_dataset.groupby('Cust_Id')['Ratings'].agg(['count'])
dataset_cust_Id_summary

Unnamed: 0_level_0,count
Cust_Id,Unnamed: 1_level_1
6,153
7,195
8,21
10,49
25,4
...,...
2649404,12
2649409,10
2649421,3
2649426,74


In [31]:
# setting a benchmark:

Cust_Id_benchmark = dataset_cust_Id_summary['count'].quantile(0.6)#less than 60% value of customers rated movie needs to be removed
Cust_Id_benchmark

36.0

In [32]:
drop_cust_Id_list = dataset_cust_Id_summary[dataset_cust_Id_summary['count']<Cust_Id_benchmark].index
drop_cust_Id_list

Index([      8,      25,      33,      83,      94,     126,     130,     133,
           142,     149,
       ...
       2649337, 2649343, 2649351, 2649376, 2649379, 2649384, 2649401, 2649404,
       2649409, 2649421],
      dtype='int64', name='Cust_Id', length=282042)

In [33]:
len(drop_cust_Id_list)

282042

In [34]:
Cust_count - len(drop_cust_Id_list)

193215

In [35]:
netflix_dataset = netflix_dataset[~netflix_dataset['Movie_Id'].isin(drop_movie_list)]
netflix_dataset = netflix_dataset[~netflix_dataset['Cust_Id'].isin(drop_cust_Id_list)]

In [36]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
696,712664,5.0,3
697,1331154,4.0,3
698,2632461,3.0,3
699,44937,5.0,3
700,656399,4.0,3
...,...,...,...
24056842,1055714,5.0,4496
24056843,2643029,4.0,4496
24056844,267802,4.0,4496
24056845,1559566,3.0,4496


In [37]:
updated_movie_count = netflix_dataset['Movie_Id'].nunique()
updated_movie_count

1800

In [38]:
updated_cust_count = netflix_dataset['Cust_Id'].nunique()
updated_cust_count

188716

In [39]:
# resetting the index of dataset:

netflix_dataset.reset_index(drop=True,inplace=True)

In [40]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
0,712664,5.0,3
1,1331154,4.0,3
2,2632461,3.0,3
3,44937,5.0,3
4,656399,4.0,3
...,...,...,...
19695831,1055714,5.0,4496
19695832,2643029,4.0,4496
19695833,267802,4.0,4496
19695834,1559566,3.0,4496


In [41]:
# SVD Model Building for making further recommendations:

In [42]:
# importing the movie_name dataset:

df_title = pd.read_csv('/content/drive/MyDrive/Intellipat_ML_Projects/Copy of movie_titles.csv', encoding = 'ISO-8859-1', header=None,usecols=[0,1,2],names=['Movie_Id','Year','Name'])
df_title

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [43]:
# first 5 rows

df_title.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [44]:
# number of 'Movie_count' in df_title column:

df_title['Movie_Id'].nunique()

17770

In [45]:
# check for missing values:
df_title.isnull().sum() #year column is of no use for us

Movie_Id    0
Year        7
Name        0
dtype: int64

In [46]:
# check for duplicateds in df_title dataset:
df_title.duplicated().sum()

0

In [47]:
# dtype check
df_title.dtypes

Movie_Id      int64
Year        float64
Name         object
dtype: object

In [48]:
# dropping those movies which having less than 908 ratings from df_title dataset
df_title = df_title[~df_title['Movie_Id'].isin(drop_movie_list)]
df_title

Unnamed: 0,Movie_Id,Year,Name
2,3,1997.0,Character
4,5,2004.0,The Rise and Fall of ECW
5,6,1997.0,Sick
7,8,2004.0,What the #$*! Do We Know!?
15,16,1996.0,Screamers
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [49]:
# rearranging the index:

df_title.reset_index(drop=True,inplace=True)
df_title

Unnamed: 0,Movie_Id,Year,Name
0,3,1997.0,Character
1,5,2004.0,The Rise and Fall of ECW
2,6,1997.0,Sick
3,8,2004.0,What the #$*! Do We Know!?
4,16,1996.0,Screamers
...,...,...,...
15066,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
15067,17767,2004.0,Fidel Castro: American Experience
15068,17768,2000.0,Epoch
15069,17769,2003.0,The Company


#  SVD (Singular value Decomposition) is used for recommendation engine

"""
The surprise library in Python is used for building and analyzing recommender systems. It provides tools to work with collaborative
 filtering algorithms,
which are commonly used to recommend items (such as movies, books, or products) to users based on their past interactions or preferences.
"""

In [50]:
# installing the surprise package (for model building):

! pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357287 sha256=5ed9a7646dc68addee18c2a2b70249842764c53232fb7821ffecd74811b47348
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a

In [51]:
from surprise import Reader,Dataset, SVD
from surprise.model_selection import cross_validate

#Reader function: for reading the dataset using SVD model
#Dataset function: for loading the dataset in SVD model


In [52]:
reader = Reader()

In [53]:
netflix_dataset.shape

(19695836, 3)

In [54]:
# selecting top 300k rows for SVD model building
data = Dataset.load_from_df(netflix_dataset[['Cust_Id','Movie_Id','Ratings']][:300000],reader)

In [55]:
data

<surprise.dataset.DatasetAutoFolds at 0x7c042bb8efb0>

In [56]:
model = SVD()

In [57]:
cross_validate(model,data,measures=['RMSE'],cv=3)

{'test_rmse': array([0.98870985, 0.98923259, 0.9857191 ]),
 'fit_time': (4.903033971786499, 4.435438871383667, 4.8439531326293945),
 'test_time': (1.0474724769592285, 1.3503234386444092, 1.2847237586975098)}

In [58]:
# Making recommendation :

In [59]:
netflix_dataset

Unnamed: 0,Cust_Id,Ratings,Movie_Id
0,712664,5.0,3
1,1331154,4.0,3
2,2632461,3.0,3
3,44937,5.0,3
4,656399,4.0,3
...,...,...,...
19695831,1055714,5.0,4496
19695832,2643029,4.0,4496
19695833,267802,4.0,4496
19695834,1559566,3.0,4496


In [60]:
df_title

Unnamed: 0,Movie_Id,Year,Name
0,3,1997.0,Character
1,5,2004.0,The Rise and Fall of ECW
2,6,1997.0,Sick
3,8,2004.0,What the #$*! Do We Know!?
4,16,1996.0,Screamers
...,...,...,...
15066,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
15067,17767,2004.0,Fidel Castro: American Experience
15068,17768,2000.0,Epoch
15069,17769,2003.0,The Company


In [61]:
# making the recommendation for user_Id '2643029'

user_2643029 = df_title.copy()#making the copy of df_title for not changing the original data
user_2643029

Unnamed: 0,Movie_Id,Year,Name
0,3,1997.0,Character
1,5,2004.0,The Rise and Fall of ECW
2,6,1997.0,Sick
3,8,2004.0,What the #$*! Do We Know!?
4,16,1996.0,Screamers
...,...,...,...
15066,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
15067,17767,2004.0,Fidel Castro: American Experience
15068,17768,2000.0,Epoch
15069,17769,2003.0,The Company


In [62]:
# # based on estimation scored after predicitng the model data, the estimation score will be in the range of 1-5
# if the score is more than 3.5 we can recomend those movies to user

# SVD as it is a recomendation engione, the output of the model ie, predicted data is the estaimation score on which it wil decided the movied
# to be recomended or Not

# here in this code we are creating a new coulmns infront of each movie name on which we decide to recomend a movie

# predicting the estimation_score for all the movie_Id for this particular user


#Note: this is only for one user 2643029
user_2643029['Estimated_score'] = user_2643029['Movie_Id'].apply(lambda x:model.predict(2643029,x).est)
user_2643029

Unnamed: 0,Movie_Id,Year,Name,Estimated_score
0,3,1997.0,Character,3.483505
1,5,2004.0,The Rise and Fall of ECW,4.187366
2,6,1997.0,Sick,3.272113
3,8,2004.0,What the #$*! Do We Know!?,3.427527
4,16,1996.0,Screamers,3.438019
...,...,...,...,...
15066,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.599257
15067,17767,2004.0,Fidel Castro: American Experience,3.599257
15068,17768,2000.0,Epoch,3.599257
15069,17769,2003.0,The Company,3.599257


In [63]:
#Top 5 movies that can be recommend to the userid 2643029

user_2643029.sort_values(by='Estimated_score',ascending=False).head()

Unnamed: 0,Movie_Id,Year,Name,Estimated_score
25,68,2004.0,Invader Zim,4.533561
12,32,2004.0,ABC Primetime: Mel Gibson's The Passion of the...,4.288524
1,5,2004.0,The Rise and Fall of ECW,4.187366
26,71,1995.0,Maya Lin: A Strong Clear Vision,4.060946
19,48,2001.0,Justice League,4.041311


In [64]:
# Making recommendation for userId '293198'

user_293198	 = df_title.copy()
user_293198

Unnamed: 0,Movie_Id,Year,Name
0,3,1997.0,Character
1,5,2004.0,The Rise and Fall of ECW
2,6,1997.0,Sick
3,8,2004.0,What the #$*! Do We Know!?
4,16,1996.0,Screamers
...,...,...,...
15066,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
15067,17767,2004.0,Fidel Castro: American Experience
15068,17768,2000.0,Epoch
15069,17769,2003.0,The Company


In [65]:
# predicting the estimated_score:
user_293198['Estimated_score'] = user_293198['Movie_Id'].apply(lambda x:model.predict(293198,x).est)
user_293198

Unnamed: 0,Movie_Id,Year,Name,Estimated_score
0,3,1997.0,Character,3.408847
1,5,2004.0,The Rise and Fall of ECW,3.716833
2,6,1997.0,Sick,3.253017
3,8,2004.0,What the #$*! Do We Know!?,3.249196
4,16,1996.0,Screamers,3.395810
...,...,...,...,...
15066,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.575342
15067,17767,2004.0,Fidel Castro: American Experience,3.575342
15068,17768,2000.0,Epoch,3.575342
15069,17769,2003.0,The Company,3.575342


In [66]:
# sorting:

# Top 10 movies that should be recommend to this user

user_293198.sort_values('Estimated_score',ascending=False).reset_index(drop=True).head(10)

Unnamed: 0,Movie_Id,Year,Name,Estimated_score
0,33,2000.0,Aqua Teen Hunger Force: Vol. 1,4.60605
1,68,2004.0,Invader Zim,4.299906
2,32,2004.0,ABC Primetime: Mel Gibson's The Passion of the...,4.084298
3,106,2004.0,Stevie Ray Vaughan and Double Trouble: Live at...,4.083207
4,25,1997.0,Inspector Morse 31: Death Is Now My Neighbour,4.050456
5,88,1998.0,Record of Lodoss War: Chronicles of the Heroic...,4.029298
6,76,1952.0,I Love Lucy: Season 2,4.017759
7,44,1996.0,Spitfire Grill,3.974859
8,97,2002.0,Mostly Martha,3.969186
9,83,1983.0,Silkwood,3.966965


In [67]:
# Recommendation for user '712664'

user_712664 = df_title.copy()
user_712664

Unnamed: 0,Movie_Id,Year,Name
0,3,1997.0,Character
1,5,2004.0,The Rise and Fall of ECW
2,6,1997.0,Sick
3,8,2004.0,What the #$*! Do We Know!?
4,16,1996.0,Screamers
...,...,...,...
15066,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
15067,17767,2004.0,Fidel Castro: American Experience
15068,17768,2000.0,Epoch
15069,17769,2003.0,The Company


In [68]:
# Estimated _score:

user_712664['Estimated_score'] = user_712664['Movie_Id'].apply(lambda x:model.predict(712664,x).est)
user_712664

Unnamed: 0,Movie_Id,Year,Name,Estimated_score
0,3,1997.0,Character,3.649854
1,5,2004.0,The Rise and Fall of ECW,3.591746
2,6,1997.0,Sick,2.961733
3,8,2004.0,What the #$*! Do We Know!?,2.979413
4,16,1996.0,Screamers,3.348229
...,...,...,...,...
15066,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.594650
15067,17767,2004.0,Fidel Castro: American Experience,3.594650
15068,17768,2000.0,Epoch,3.594650
15069,17769,2003.0,The Company,3.594650


In [69]:
# top 8 movies that can be recommend to this user '712664'

user_712664.sort_values(by='Estimated_score',ascending=False).reset_index(drop=True).head(8)

Unnamed: 0,Movie_Id,Year,Name,Estimated_score
0,106,2004.0,Stevie Ray Vaughan and Double Trouble: Live at...,4.291995
1,46,1964.0,Rudolph the Red-Nosed Reindeer,4.180253
2,32,2004.0,ABC Primetime: Mel Gibson's The Passion of the...,4.162236
3,76,1952.0,I Love Lucy: Season 2,4.096584
4,68,2004.0,Invader Zim,4.066134
5,18,1994.0,Immortal Beloved,4.037356
6,44,1996.0,Spitfire Grill,3.989477
7,79,1956.0,The Killing,3.976666
