In [437]:
import pandas as pd
import numpy as np
import os
import datetime as dt

In [438]:
df_title=pd.read_csv('movie_titles.csv', encoding='ISO-8859-1', header=None, usecols=[0,1,2], names=['movieNumber','year','title' ])

In [439]:
df_title

Unnamed: 0,movieNumber,year,title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17764,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17765,17767,2004.0,Fidel Castro: American Experience
17766,17768,2000.0,Epoch
17767,17769,2003.0,The Company


In [440]:
with open('combined_data.txt', 'r') as f:
    lines = f.readlines()

In [441]:
data = []
for line in lines:
    if ':' in line:
        movie = line.split(':')[0]
    else:
        customerId = line.split(',')[0]
        rating = line.split(',')[1]
        data.append([movie,customerId,rating])
        

df_Netflix = pd.DataFrame(data,columns = ['movieNumber','customerId','ratings'])

In [442]:
df_Netflix.head()

Unnamed: 0,movieNumber,customerId,ratings
0,1,1488844,3
1,1,822109,5
2,1,885013,4
3,1,30878,4
4,1,823519,3


In [443]:
df_Netflix.isnull().sum()

movieNumber    0
customerId     0
ratings        0
dtype: int64

In [444]:
df_Netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24053764 entries, 0 to 24053763
Data columns (total 3 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   movieNumber  object
 1   customerId   object
 2   ratings      object
dtypes: object(3)
memory usage: 550.5+ MB


In [445]:
#df_Netflix['customerId'] = df_Netflix['customerId'].astype('int')
df_Netflix['movieNumber'] = df_Netflix['movieNumber'].astype('int')
df_Netflix['ratings'] = df_Netflix['ratings'].astype('float')

In [446]:
df_Netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24053764 entries, 0 to 24053763
Data columns (total 3 columns):
 #   Column       Dtype  
---  ------       -----  
 0   movieNumber  int32  
 1   customerId   object 
 2   ratings      float64
dtypes: float64(1), int32(1), object(1)
memory usage: 458.8+ MB


In [447]:
df_Netflix.head()

Unnamed: 0,movieNumber,customerId,ratings
0,1,1488844,3.0
1,1,822109,5.0
2,1,885013,4.0
3,1,30878,4.0
4,1,823519,3.0


In [448]:
#counting number of movies
movieCount = len(df_Netflix['movieNumber'].unique())
print("Total Number of Movies : ",movieCount)

Total Number of Movies :  4499


In [449]:
#counting number of Customers
customerCount = len(df_Netflix['customerId'].unique())
print("Total Number of Customers : ",customerCount)

Total Number of Customers :  470758


In [450]:
#How much do users like the content
netflixRatingSpread= df_Netflix['ratings'].value_counts().sort_index(ascending = False)
print('The rating spread of the content :\n',netflixRatingSpread,)

The rating spread of the content :
 5.0    5506583
4.0    8085741
3.0    6904181
2.0    2439073
1.0    1118186
Name: ratings, dtype: int64


In [451]:
#finding the rating spread movie wise

ratingSpreadMovieWise = df_Netflix.groupby(['movieNumber','ratings']).size().reset_index()
ratingSpreadMovieWise.columns = ['movieNumber', 'rating', 'ratingCount']
ratingSpreadMovieWise.sort_values(by = ['movieNumber','rating'],ascending = [True,False],inplace = True,ignore_index = True)

In [452]:
ratingSpreadMovieWise.head()

Unnamed: 0,movieNumber,rating,ratingCount
0,1,5.0,145
1,1,4.0,207
2,1,3.0,136
3,1,2.0,31
4,1,1.0,28


# Populartiy based Recommender System

In [457]:
#find the Ratings count and avg Ratings of each movie
# here rating count and coustomerId count will be same so we use just Ratings to filter fake customers

popularBy_Rating_df = df_Netflix.groupby('movieNumber').agg(avgRating = ('ratings','mean'),
                                                            ratingCount = ('ratings','count')).reset_index()

popularBy_Rating_df

Unnamed: 0,movieNumber,avgRating,ratingCount
0,1,3.749543,547
1,2,3.558621,145
2,3,3.641153,2012
3,4,2.739437,142
4,5,3.919298,1140
...,...,...,...
4494,4495,3.478827,614
4495,4496,3.763000,9519
4496,4497,2.715686,714
4497,4498,2.464684,269


In [461]:
# Top 70 percentile of movies on the basis of their ratings count

#creating filter
ratingCountThreshold = round(popularBy_Rating_df['ratingCount'].quantile(0.30),0)
ratingCountFilter = (popularBy_Rating_df['ratingCount']>=ratingCountThreshold)
ratingCountFilter

0        True
1       False
2        True
3       False
4        True
        ...  
4494     True
4495     True
4496     True
4497     True
4498     True
Name: ratingCount, Length: 4499, dtype: bool

In [462]:
# Now we can sort by rating mean
popularBy_Rating_df = popularBy_Rating_df [ratingCountFilter].reset_index(drop=True)
popularBy_Rating_df.sort_values(by = ['ratingCount'],ascending = [False],inplace = True,ignore_index = True)

In [464]:
popularBy_Rating_df

Unnamed: 0,movieNumber,avgRating,ratingCount
0,1905,4.153908,193941
1,2152,3.425322,162597
2,3860,3.434517,160454
3,4432,3.755364,156183
4,571,3.962585,154832
...,...,...,...
3146,2770,3.396476,227
3147,2134,3.286344,227
3148,3834,2.066079,227
3149,3430,4.176211,227


In [465]:
# we need movieNumber to filter our search for SVD
movieList = popularBy_Rating_df['movieNumber'].values

In [466]:
popularBy_Rating_df.head(50)

Unnamed: 0,movieNumber,avgRating,ratingCount
0,1905,4.153908,193941
1,2152,3.425322,162597
2,3860,3.434517,160454
3,4432,3.755364,156183
4,571,3.962585,154832
5,3938,4.144543,153996
6,4306,4.325245,151292
7,2452,4.434708,149866
8,1962,3.750569,145519
9,3962,4.415523,140979


In [467]:
popularBy_Rating_df = popularBy_Rating_df.merge(df_title, on='movieNumber')

# 50 MOST POPULAR MOVIES

In [470]:
popularBy_Rating_df['title'].head(50)

0     Pirates of the Caribbean: The Curse of the Bla...
1                                       What Women Want
2                                        Bruce Almighty
3                                       The Italian Job
4                                       American Beauty
5                                               Shrek 2
6                                       The Sixth Sense
7         Lord of the Rings: The Fellowship of the Ring
8                                        50 First Dates
9                             Finding Nemo (Widescreen)
10                                  The Wedding Planner
11                                     The Last Samurai
12                                 The Bourne Supremacy
13                                      Men in Black II
14                                           Braveheart
15                                                Ghost
16                                          Man on Fire
17                             The Silence of th

# SVD BASED RECOMMENDER SYSTEM (Colaborative Sysytem)

In [471]:
import math
from scipy.sparse import csr_matrix
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [472]:
#filtering out the low and fake ratings before SVD

df_Netflix = df_Netflix[df_Netflix['movieNumber'].isin(movieList)]

In [473]:
df_Netflix.head()

Unnamed: 0,movieNumber,customerId,ratings
0,1,1488844,3.0
1,1,822109,5.0
2,1,885013,4.0
3,1,30878,4.0
4,1,823519,3.0


In [484]:
df_Netflix.shape

(23863973, 3)

In [474]:
#for checking if with user input
validCustomerList = df_Netflix['customerId'].values

In [475]:
#filtering titles of movies based on filterd ratings
df_filteredTitle = df_title.copy()

In [476]:
df_filteredTitle = df_filteredTitle[df_filteredTitle['movieNumber'].isin(movieList)]

In [477]:
df_filteredTitle

Unnamed: 0,movieNumber,year,title
0,1,2003.0,Dinosaur Planet
2,3,1997.0,Character
4,5,2004.0,The Rise and Fall of ECW
5,6,1997.0,Sick
7,8,2004.0,What the #$*! Do We Know!?
...,...,...,...
4494,4495,2002.0,Clifford: Happy Birthday Clifford / Puppy Love
4495,4496,1993.0,Farewell My Concubine
4496,4497,1990.0,Texasville
4497,4498,2000.0,Gonin


In [478]:
reader=Reader()

In [479]:
data=Dataset.load_from_df(df_Netflix[['movieNumber', 'customerId', 'ratings']][:100000], reader)

In [480]:
svd=SVD()
cross_validate(svd, data, measures=['RMSE','MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0491  1.0527  1.0491  1.0503  0.0017  
MAE (testset)     0.8320  0.8342  0.8353  0.8338  0.0014  
Fit time          2.37    2.09    1.90    2.12    0.19    
Test time         1.93    0.37    0.35    0.88    0.74    


{'test_rmse': array([1.0491492 , 1.05272346, 1.04913909]),
 'test_mae': array([0.83203807, 0.83420769, 0.83529349]),
 'fit_time': (2.36628794670105, 2.087416887283325, 1.9009227752685547),
 'test_time': (1.9295868873596191, 0.3730041980743408, 0.3450753688812256)}

In [481]:
trainset=data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x23ae3353400>

In [482]:
def userRecommender(custId):
    if custId in validCustomerList:
        df_custId = df_filteredTitle.copy()
        df_custId['Estimate_Score']=df_custId['movieNumber'].apply(lambda x: svd.predict(custId, x).est)
        df_custId=df_custId.drop('movieNumber', axis=1)
        df_custId=df_custId.sort_values('Estimate_Score')
        print(df_custId['title'].head(10))
    else:
        #if its a new customer then recommend top 10 most popular movies
        print(popularBy_Rating_df['title'].head(10))

In [483]:
#Search with customerId
custId = int(input('Enter Customer-ID for Movie Recommendation'))
userRecommender(custId)

Enter Customer-ID for Movie Recommendation822109
0    Pirates of the Caribbean: The Curse of the Bla...
1                                      What Women Want
2                                       Bruce Almighty
3                                      The Italian Job
4                                      American Beauty
5                                              Shrek 2
6                                      The Sixth Sense
7        Lord of the Rings: The Fellowship of the Ring
8                                       50 First Dates
9                            Finding Nemo (Widescreen)
Name: title, dtype: object
