In [5]:
#Importing Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
#Warnings
import warnings
warnings.filterwarnings('ignore')

In [7]:
#mounting gdrive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
df=pd.read_csv('/content/gdrive/MyDrive/combinedNetflixData.txt',names = ["CustID", "Ratings"], usecols = [0, 1], header = None)
df.head()

Unnamed: 0,CustID,Ratings
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0


In [9]:
df.shape[0]

24058263

In [10]:
df.shape[1]

2

Fetch some relevant information

In [11]:
#How many movies we are dealing with in total?
total_count_of_movies=df.isnull().sum()['Ratings']
total_count_of_movies

4499

In [12]:
#Total Customer Count
total_customer_count=df['CustID'].nunique()-total_count_of_movies
total_customer_count

470758

How many ratings has been given

In [13]:
Total_Ratings=df['CustID'].count()-total_count_of_movies
Total_Ratings


24053764

In [14]:
#Division of Ratings
df['Ratings'].value_counts()
df.head(2)

Unnamed: 0,CustID,Ratings
0,1:,
1,1488844,3.0


Segregation of data

In [15]:
# Creating the duplicates
temp = df.copy()

temp['movie_id'] = temp['CustID'].where(temp['Ratings'].isna()).ffill()

# Dropping the irrelevant columns
temp=temp.dropna(subset=['Ratings'])

# Strings splits
temp['movie_id'] = temp['movie_id'].astype(str).str.replace(':', '', regex=False)

In [16]:
temp.head()

Unnamed: 0,CustID,Ratings,movie_id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


Preparation for Collabrative Filtering


In [17]:
#We cannot remove duplicates for any column since the requirement is completely based on the ratings of the users given, hence we will go ahead with the benchmarks

#Here there is a need of two benchmarks in the dataset

#Customers who are not frequently giving rating (non active users or maybe fake or dummy users). These are users that watch movies but don't give ratings, so we can remove them
#Those movies that has less ratings are possibly not much popular so will not recommend and remove them from the list

In [18]:
#Benchmark 1 - Movies Part

In [19]:
temp.head()

Unnamed: 0,CustID,Ratings,movie_id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


In [20]:
movie_list=temp.groupby('movie_id')['Ratings'].agg(["count"])
movie_list

Unnamed: 0_level_0,count
movie_id,Unnamed: 1_level_1
1,547
10,249
100,78
1000,760
1001,6204
...,...
995,104
996,11858
997,302
998,446


Threshold for setting the movie as good movie or to be considered

In [21]:
movie_list["count"].quantile(0.7)

1798.6

In [22]:
benchmark_movie = round(movie_list["count"].quantile(0.7), 0)

In [23]:
benchmark_movie

1799.0

In [24]:
#We can use this benchmark to decide which movie has to be there and which one to be dropped

#Any movie that is having less than 1800 ratings will be dropped, only movie having a lot of rating to be there in the list

In [25]:
drop_movie_list=movie_list[movie_list["count"]<benchmark_movie].index

In [26]:
len(drop_movie_list)

3149

In [27]:
#Benchmark 2 - User Part

In [28]:
cust_list=temp.groupby('CustID')['Ratings'].agg(["count"])
cust_list

Unnamed: 0_level_0,count
CustID,Unnamed: 1_level_1
10,49
1000004,1
1000027,30
1000033,101
1000035,20
...,...
999964,48
999972,35
999977,14
999984,38


Threshold for setting the movie as good movie or to be considered

In [29]:
cust_list["count"].quantile(0.7)

52.0

In [30]:
benchmark_users=round(cust_list["count"].quantile(0.7), 0)
benchmark_users

52.0

In [31]:
drop_cust_list=cust_list[cust_list["count"]<benchmark_users].index
drop_cust_list

Index(['10', '1000004', '1000027', '1000035', '1000038', '1000051', '1000057',
       '100006', '100007', '1000072',
       ...
       '999932', '999935', '99994', '999945', '999949', '999964', '999972',
       '999977', '999984', '999988'],
      dtype='object', name='CustID', length=327300)

In [32]:
len(drop_cust_list)

327300

Removal of data

In [33]:
temp.columns

Index(['CustID', 'Ratings', 'movie_id'], dtype='object')

In [34]:
(temp['CustID'].unique())

array(['1488844', '822109', '885013', ..., '1017887', '72311', '594210'],
      dtype=object)

In [35]:
#To remove the movies and customer from the existing data using the drop_list that we have created

In [36]:
temp.head()

Unnamed: 0,CustID,Ratings,movie_id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1


In [37]:
temp = temp[~temp["movie_id"].isin(drop_movie_list)]  # Data - > isin(check) -> True -> False (remove)
temp = temp[~temp["CustID"].isin(drop_cust_list)]

In [38]:
temp.shape

(17337458, 3)

In [39]:
#Working with recommendation

In [40]:
#Load our secondary data

In [41]:
movies_df = pd.read_csv("/content/gdrive/MyDrive/NetflixMovieData.csv", names = ["MovieID", "Year", "Name"], usecols = [0, 1, 2], header = None)

In [42]:
movies_df.head()

Unnamed: 0,MovieID,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [43]:
#Recommendation System with SVD

In [44]:
pip install scikit-surprise


Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m122.9/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357294 sha256=3a945135f5211b77f0a326d84aba6343de066daee1d0ef87ca2524c3a6171076
  Stored in directory: /root/.cache/pip/wheels/4b/3f/d

In [45]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

Read the data

In [46]:
reader=Reader()

In [47]:
data=Dataset.load_from_df(temp[['CustID', 'movie_id', 'Ratings']][:100000], reader)

Model Building

In [48]:
model=SVD()

In [49]:
cross_validate(model, data, measures=['RMSE'], cv=5, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0002  1.0051  0.9937  1.0065  1.0003  1.0012  0.0045  
Fit time          1.86    3.82    5.51    2.54    1.59    3.06    1.44    
Test time         0.12    0.37    0.29    0.11    0.11    0.20    0.11    


{'test_rmse': array([1.00023018, 1.00506039, 0.99368128, 1.00649038, 1.00033246]),
 'fit_time': (1.8578720092773438,
  3.8162899017333984,
  5.505520582199097,
  2.542386293411255,
  1.591782569885254),
 'test_time': (0.11595511436462402,
  0.3739347457885742,
  0.2905619144439697,
  0.1104280948638916,
  0.10984039306640625)}

Creating filter for recommendation¶

In [50]:
temp['CustID'].dtype

dtype('O')

In [51]:
data_1331154=temp[(temp['CustID']==1331154) & (temp['Ratings']==5.0)]

In [52]:
data_1331154.head()

Unnamed: 0,CustID,Ratings,movie_id


In [53]:
movies_df

Unnamed: 0,MovieID,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17764,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17765,17767,2004.0,Fidel Castro: American Experience
17766,17768,2000.0,Epoch
17767,17769,2003.0,The Company


In [54]:
temp.head()

Unnamed: 0,CustID,Ratings,movie_id
696,712664,5.0,3
697,1331154,4.0,3
698,2632461,3.0,3
699,44937,5.0,3
700,656399,4.0,3


In [55]:
list_of_infinite_possibility = movies_df.copy()

In [56]:
list_of_infinite_possibility.reset_index(inplace=True)

These all are the movie that a user can possibly see. Now we will use this list and the data for framing the recommendation

In [57]:
list_of_infinite_possibility=list_of_infinite_possibility[~list_of_infinite_possibility['MovieID'].isin(drop_movie_list)]

Make Recommendation

In [58]:
list_of_infinite_possibility["Estimate Score"]=list_of_infinite_possibility['MovieID'].apply(lambda x: model.predict(44937, x).est)

In [59]:
list_of_infinite_possibility

Unnamed: 0,index,MovieID,Year,Name,Estimate Score
0,0,1,2003.0,Dinosaur Planet,3.605637
1,1,2,2004.0,Isle of Man TT 2004 Review,3.605637
2,2,3,1997.0,Character,3.605637
3,3,4,1994.0,Paula Abdul's Get Up & Dance,3.605637
4,4,5,2004.0,The Rise and Fall of ECW,3.605637
...,...,...,...,...,...
17764,17764,17766,2002.0,Where the Wild Things Are and Other Maurice Se...,3.605637
17765,17765,17767,2004.0,Fidel Castro: American Experience,3.605637
17766,17766,17768,2000.0,Epoch,3.605637
17767,17767,17769,2003.0,The Company,3.605637


In [60]:
list_of_infinite_possibility = list_of_infinite_possibility.sort_values('Estimate Score', ascending=False)

In [61]:
list_of_infinite_possibility

Unnamed: 0,index,MovieID,Year,Name,Estimate Score
0,0,1,2003.0,Dinosaur Planet,3.605637
11844,11844,11846,1985.0,Prizzi's Honor,3.605637
11850,11850,11852,1991.0,Return to the Blue Lagoon,3.605637
11849,11849,11851,1946.0,The Yearling,3.605637
11848,11848,11850,2003.0,Dumb and Dumberer: When Harry Met Lloyd,3.605637
...,...,...,...,...,...
5929,5929,5930,2002.0,Sweet Sixteen,3.605637
5930,5930,5931,1979.0,Last Hurrah for Chivalry,3.605637
5931,5931,5932,1966.0,Lost in Space: Season 2: Vol. 2,3.605637
5932,5932,5933,1975.0,Tommy,3.605637
