In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
df = pd.read_csv('/content/drive/MyDrive/Netflix data/Copy of combined_data_1.txt.zip', names = ['Cust_ID', 'Rating'], usecols = [0,1])

In [8]:
df

Unnamed: 0,Cust_ID,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0
...,...,...
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0


In this dataset there is customers in Cust_ID column , and 1: 2: ... is the each movie. In cust_Id column cust_id can be repeated. In rating column there is rating for particular movie of a customer,

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24058263 entries, 0 to 24058262
Data columns (total 2 columns):
 #   Column   Dtype  
---  ------   -----  
 0   Cust_ID  object 
 1   Rating   float64
dtypes: float64(1), object(1)
memory usage: 367.1+ MB


In [10]:
df.isnull().sum()

Unnamed: 0,0
Cust_ID,0
Rating,4499


In [11]:
# total number of movie count is basically no. of NaN values in rating column

movie_count = df.isnull().sum()[1]
movie_count

4499

In [12]:
# Finding the number of unique customers

no_of_customers = df['Cust_ID'].nunique()
no_of_customers

475257

In [13]:
# actual customers who rated
actual_customer  = no_of_customers - movie_count
actual_customer

470758

In [14]:
#creating a new column according to the movie

movie_np = None
movie_id = []

for Cust_ID in df['Cust_ID']:
  if ':' in Cust_ID:
    movie_np = int(Cust_ID.replace(':', ''))
  movie_id.append(movie_np)

In [15]:
# adding the list as a column with df dataset

df['Movie_ID'] = movie_id

In [16]:
df.head()

Unnamed: 0,Cust_ID,Rating,Movie_ID
0,1:,,1
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1


In [17]:
#removing the NaN values from the dataset

df =  df[df['Rating'].notna()]

In [18]:
df.head()

Unnamed: 0,Cust_ID,Rating,Movie_ID
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


In [19]:
df.dtypes

Unnamed: 0,0
Cust_ID,object
Rating,float64
Movie_ID,int64


In [20]:
# changing dtype of Cust_ID column to int

df['Cust_ID'] = df['Cust_ID'].astype(int)

In [21]:
df['Cust_ID'].dtypes

dtype('int64')

In [22]:
# 1) I will remove all the customers who rated less number of movies
# 2) Then I will remove all the movies which have less no. of rating

In [23]:
df_group = df.groupby('Movie_ID')['Rating'].agg(['count'])
df_group

Unnamed: 0_level_0,count
Movie_ID,Unnamed: 1_level_1
1,547
2,145
3,2012
4,142
5,1140
...,...
4495,614
4496,9519
4497,714
4498,269


In [24]:
df_group.sort_values(by =['count'], ascending = True)

Unnamed: 0_level_0,count
Movie_ID,Unnamed: 1_level_1
4362,36
4338,39
3656,42
915,43
4294,44
...,...
571,154832
4432,156183
3860,160454
2152,162597


In [25]:
# Now I will create a benchmark with the threshold of 60%

movie_count_benchmark = round(df_group['count'].quantile(0.6), 0)

movie_count_benchmark

908.0

In [26]:
# It means I will delete all the movies which total rating count is less than 908

In [27]:
drop_movie_list = df_group[df_group['count']< movie_count_benchmark].index
drop_movie_list

Index([   1,    2,    4,    7,    9,   10,   11,   12,   13,   14,
       ...
       4480, 4481, 4486, 4487, 4491, 4494, 4495, 4497, 4498, 4499],
      dtype='int64', name='Movie_ID', length=2699)

In [28]:
len(drop_movie_list)

2699

In [29]:
4499 - 2699

1800

In [30]:
# 1800 movie has the benchmark

# now will remove the customers who has given less no of rating

df_group1 = df.groupby('Cust_ID')['Rating'].agg(['count'])
df_group1

Unnamed: 0_level_0,count
Cust_ID,Unnamed: 1_level_1
6,153
7,195
8,21
10,49
25,4
...,...
2649404,12
2649409,10
2649421,3
2649426,74


In [31]:
customer_count_benchmark = round(df_group1['count'].quantile(0.6), 0)
customer_count_benchmark

36.0

In [32]:
drop_customer_list = df_group1[df_group1['count']< customer_count_benchmark].index
drop_customer_list

Index([      8,      25,      33,      83,      94,     126,     130,     133,
           142,     149,
       ...
       2649337, 2649343, 2649351, 2649376, 2649379, 2649384, 2649401, 2649404,
       2649409, 2649421],
      dtype='int64', name='Cust_ID', length=282042)

In [33]:
len(drop_customer_list)

282042

In [34]:
df.head(2)

Unnamed: 0,Cust_ID,Rating,Movie_ID
1,1488844,3.0,1
2,822109,5.0,1


In [35]:
df = df[~df['Movie_ID'].isin(drop_movie_list)]
df = df[~df['Cust_ID'].isin(drop_customer_list)]

In [36]:
df.shape

(19695836, 3)

In [37]:
# model making

df_title = pd.read_csv('/content/drive/MyDrive/Netflix data/Copy of movie_titles.csv',
                       encoding= 'ISO-8859-1', header = None, usecols = [0,1,2], names = ['Movie_ID', 'Year', 'Name'])

In [38]:
df_title

Unnamed: 0,Movie_ID,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [39]:
! pip install scikit-surprise



In [40]:
# we will use SVD (Singular Value Decomposition) that is used for recommandation

from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [46]:
read = Reader()
data = Dataset.load_from_df(df[['Cust_ID','Movie_ID', 'Rating']][:100000], read)

In [47]:
data

<surprise.dataset.DatasetAutoFolds at 0x7946b2f3be80>

In [48]:
model = SVD()

In [49]:
cross_validate(model, data, measures = ['RMSE'], cv = 3)

{'test_rmse': array([1.02179796, 1.01608735, 1.01643467]),
 'fit_time': (1.6687850952148438, 2.5340800285339355, 1.586259365081787),
 'test_time': (7.694182395935059, 7.112680435180664, 0.20818018913269043)}

In [50]:
df.head()

Unnamed: 0,Cust_ID,Rating,Movie_ID
696,712664,5.0,3
697,1331154,4.0,3
698,2632461,3.0,3
699,44937,5.0,3
700,656399,4.0,3


In [51]:
df_title.head()

Unnamed: 0,Movie_ID,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [53]:
# Recommandation
#Recommanding a movie to a new user

user_656399 = df_title.copy()
user_656399

Unnamed: 0,Movie_ID,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [54]:
user_656399 = user_656399[~user_656399['Movie_ID'].isin(drop_movie_list)]

In [55]:
user_656399

Unnamed: 0,Movie_ID,Year,Name
2,3,1997.0,Character
4,5,2004.0,The Rise and Fall of ECW
5,6,1997.0,Sick
7,8,2004.0,What the #$*! Do We Know!?
15,16,1996.0,Screamers
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [59]:
user_656399['Estimate Score'] = user_656399['Movie_ID'].apply(lambda x : model.predict(656399, x).est)
user_656399

Unnamed: 0,Movie_ID,Year,Name,Estimate Score
2,3,1997.0,Character,4.092642
4,5,2004.0,The Rise and Fall of ECW,3.256138
5,6,1997.0,Sick,3.423281
7,8,2004.0,What the #$*! Do We Know!?,3.296462
15,16,1996.0,Screamers,3.224171
...,...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.588967
17766,17767,2004.0,Fidel Castro: American Experience,3.588967
17767,17768,2000.0,Epoch,3.588967
17768,17769,2003.0,The Company,3.588967


In [61]:
 #this is top 5 movie that recommanded to that user
 user_656399.sort_values(by = ['Estimate Score'], ascending = False).head(5)

Unnamed: 0,Movie_ID,Year,Name,Estimate Score
17,18,1994.0,Immortal Beloved,4.482199
2,3,1997.0,Character,4.092642
24,25,1997.0,Inspector Morse 31: Death Is Now My Neighbour,3.986355
29,30,2003.0,Something's Gotta Give,3.749098
12741,12742,1984.0,Cat on a Hot Tin Roof,3.588967
