# Objective
1.Find out the list of most popular and liked genre

2.Create Model that finds the best suited Movie for one
user in every genre.

3.Find what Genre Movies have received the best and worst ratings based on User Rating.

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
data=pd.read_csv('CapstoneDataSet.csv')

In [34]:
data.head()

Unnamed: 0,Cust_Id,Rating,Movie_Id,Genre,MovieName
0,1488844,3,1,Action,Dinosaur Planet
1,822109,5,1,Action,Dinosaur Planet
2,885013,4,1,Action,Dinosaur Planet
3,30878,4,1,Action,Dinosaur Planet
4,823519,3,1,Action,Dinosaur Planet


# Data Cleaning

In [35]:
data.isnull().sum()

Cust_Id         0
Rating          0
Movie_Id        0
Genre        1086
MovieName       0
dtype: int64

In [36]:
data.shape

(1048574, 5)

In [37]:
#How many % of data is NA ?  
(1086/1048574)*100

0.10356922830434476

In [38]:
#Since only 0.1 % data is missing , it is negligible amount. Hence, deleting those rows.

data.shape

(1048574, 5)

In [39]:
data=data.dropna()
data.shape

(1047488, 5)

In [40]:
1048574-1047488

1086

In [41]:
#Checking if there are duplicate rows
data.duplicated().sum()

0

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1047488 entries, 0 to 1048573
Data columns (total 5 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   Cust_Id    1047488 non-null  int64 
 1   Rating     1047488 non-null  int64 
 2   Movie_Id   1047488 non-null  int64 
 3   Genre      1047488 non-null  object
 4   MovieName  1047488 non-null  object
dtypes: int64(3), object(2)
memory usage: 48.0+ MB


In [43]:
#Checking the data for a particular Customer 1488844
data[data['Cust_Id']==1488844]

Unnamed: 0,Cust_Id,Rating,Movie_Id,Genre,MovieName
0,1488844,3,1,Action,Dinosaur Planet
5149,1488844,4,8,Animation,What the #$*! Do We Know!?
24352,1488844,2,17,Thriller,7 Seconds
93266,1488844,3,30,Historical,Something's Gotta Give
224744,1488844,3,44,Historical,Spitfire Grill
262264,1488844,5,58,Other,Dragonheart
287314,1488844,3,76,Sci-Fi,I Love Lucy: Season 2
312105,1488844,3,80,Documentary,Winter Kills
312388,1488844,3,81,Educational,Antarctica: IMAX
313761,1488844,3,83,Gang,Silkwood


# 1.Find out the list of most popular and liked genre

In [44]:
#Most Popular - Most Watched
#Most Like - Higher rating

# Most Watched + Highher Rating

In [65]:
#Calculated the count of no. of reviews per genre(i.e no. of times it is watched) and avg rating for each genre
genre_summary=data.groupby('Genre')['Rating'].agg(['count','mean'])
genre_summary

Unnamed: 0_level_0,count,mean
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Action,547,3.749543
Animation,123898,3.581107
Biography,88510,3.463439
Comedy,145,3.558621
Crime,97323,3.728872
Documentary,38282,3.311661
Drama,19258,3.496365
Educational,111976,3.898523
Fan,13278,3.465356
Fiction,1019,3.084396


In [46]:
#Finding top 30 percentile genre based on ratings i.e most like genres

genre_rating_benchmark=genre_summary['mean'].quantile(0.7)
genre_most_liked=genre_summary[genre_summary['mean']>genre_rating_benchmark]
genre_most_liked=np.array(genre_most_liked.index)
print('Most Higher Rated Genre BenchMark  : ',genre_rating_benchmark)
print('List of most liked Genre : ',genre_most_liked)

Most Higher Rated Genre BenchMark  :  3.610958227374906
List of most liked Genre :  ['Action' 'Crime' 'Educational' 'Gang' 'Historical' 'Horror']


In [47]:
##Finding top 30 percentile genre based on count i.e most watched/popular genres
genre_count_benchmark=genre_summary['count'].quantile(0.7)
genre_most_popular=genre_summary[genre_summary['count']>genre_count_benchmark]
genre_most_popular=np.array(genre_most_popular.index)
print('Most Watched Genre BenchMark  : ',genre_count_benchmark)
print('List of most popular Genre : ',genre_most_popular)

Most Watched Genre BenchMark  :  64955.69999999997
List of most popular Genre :  ['Animation' 'Biography' 'Crime' 'Educational' 'Historical' 'Mystery']


In [48]:
#Calculating the intersection of most liked and most popular genre
genre_most_liked_popular=np.intersect1d(genre_most_liked,genre_most_popular)
genre_most_liked_popular

array(['Crime', 'Educational', 'Historical'], dtype=object)

# Answer:  Most Popular and Liked Genre = ['Crime', 'Educational', 'Historical']

# 2.Create Model that finds the best suited Movie for one user in every genre.

In [49]:
data[data['Cust_Id']==124105]

Unnamed: 0,Cust_Id,Rating,Movie_Id,Genre,MovieName
6,124105,4,1,Action,Dinosaur Planet
717852,124105,5,191,Educational,X2: X-Men United


In [50]:
data.Genre.value_counts().count()

20

In [51]:
data_model=data[['Cust_Id','Rating','Movie_Id']]

In [52]:
data_model.head(5)

Unnamed: 0,Cust_Id,Rating,Movie_Id
0,1488844,3,1
1,822109,5,1
2,885013,4,1
3,30878,4,1
4,823519,3,1


In [53]:
!pip install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [54]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [96]:
#Initialize SVD
svd = SVD()
reader = Reader()

data_svd=Dataset.load_from_df(data[['Cust_Id', 'Movie_Id', 'Rating']],reader)
# Compute the RMSE of the SVD algorithm
cross_validate(svd, data_svd, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9855  0.9881  0.9848  0.9861  0.0014  
MAE (testset)     0.7706  0.7718  0.7708  0.7710  0.0006  
Fit time          48.48   50.09   48.70   49.09   0.71    
Test time         4.49    3.95    4.52    4.32    0.26    


{'test_rmse': array([0.98547314, 0.9881036 , 0.98481271]),
 'test_mae': array([0.77055013, 0.77182565, 0.77076293]),
 'fit_time': (48.47765517234802, 50.09334468841553, 48.70305252075195),
 'test_time': (4.4946558475494385, 3.9470605850219727, 4.516969919204712)}

In [61]:
data[data['Cust_Id']==30878]

Unnamed: 0,Cust_Id,Rating,Movie_Id,Genre,MovieName
3,30878,4,1,Action,Dinosaur Planet
2849,30878,1,5,Thriller,The Rise and Fall of ECW
31469,30878,3,18,Animation,Immortal Beloved
52546,30878,3,28,Biography,Lilo and Stitch
92838,30878,4,30,Historical,Something's Gotta Give
224734,30878,5,44,Historical,Spitfire Grill
262250,30878,3,58,Other,Dragonheart
312390,30878,3,81,Educational,Antarctica: IMAX
340984,30878,3,97,Animation,Mostly Martha
415825,30878,3,118,War,Rambo: First Blood Part II


In [66]:
movie_details=data[['Movie_Id','MovieName','Genre']]
movie_details=movie_details.drop_duplicates()
movie_details


Unnamed: 0,Movie_Id,MovieName,Genre
0,1,Dinosaur Planet,Action
547,2,Isle of Man TT 2004 Review,Comedy
692,3,Character,Horror
2704,4,Paula Abdul's Get Up & Dance,Romance
2846,5,The Rise and Fall of ECW,Thriller
...,...,...,...
1009258,237,Broken Blossoms,Animation
1009606,238,Led Zeppelin: The Song Remains the Same,Other
1011941,239,Winnie the Pooh: Springtime with Roo,Mystery
1014131,240,Woman of the Year,Drama


In [95]:
user_30878=movie_details.copy()

In [97]:
#create a training set for svd
trainset = data_svd.build_full_trainset()
svd.fit(trainset)

#Predict the ratings for user_30878
user_30878['Estimate_Score'] = user_30878['Movie_Id'].apply(lambda x: svd.predict(30878, x).est)

#Drop extra columns from the user_30878 data frame
user_30878 = user_30878.drop('Movie_Id', axis = 1)

# Sort predicted ratings for user_30878 in descending order
user_30878 = user_30878.sort_values('Estimate_Score', ascending=False)

#Print top 10 recommendations
print(user_30878.head(10))

                                                 MovieName        Genre  \
21096    Lord of the Rings: The Return of the King: Ext...     Thriller   
281536                                         Invader Zim       RomCom   
557382                                          The Chorus    Animation   
287305                               I Love Lucy: Season 2       Sci-Fi   
951590                            That '70s Show: Season 1    Biography   
358272                            Magnolia: Bonus Material  Educational   
335222                                          Elfen Lied        Other   
45182        Inspector Morse 31: Death Is Now My Neighbour        Drama   
1019315                                 North by NorthWest       Horror   
540328                                Gentlemen of Fortune    Biography   

         Estimate_Score  
21096          4.399505  
281536         4.272484  
557382         4.271894  
287305         4.234490  
951590         4.225217  
358272         4.1

In [100]:
print(user_30878.head(20))

                                                 MovieName        Genre  \
21096    Lord of the Rings: The Return of the King: Ext...     Thriller   
281536                                         Invader Zim       RomCom   
557382                                          The Chorus    Animation   
287305                               I Love Lucy: Season 2       Sci-Fi   
951590                            That '70s Show: Season 1    Biography   
358272                            Magnolia: Bonus Material  Educational   
335222                                          Elfen Lied        Other   
45182        Inspector Morse 31: Death Is Now My Neighbour        Drama   
1019315                                 North by NorthWest       Horror   
540328                                Gentlemen of Fortune    Biography   
577375                                      Reservoir Dogs   Historical   
335781   Record of Lodoss War: Chronicles of the Heroic...        Crime   
716692                   

3.Find what Genre Movies have received the best and worst ratings based on User Rating.

In [102]:
#Finding top 10 percentile genre based on ratings i.e most like genres

genre_rating_benchmark=genre_summary['mean'].quantile(0.9)
genre_most_liked=genre_summary[genre_summary['mean']>genre_rating_benchmark]
genre_most_liked=np.array(genre_most_liked.index)
print('Most Higher Rated Genre BenchMark  : ',genre_rating_benchmark)
print('List of most liked Genre : ',genre_most_liked)

Most Higher Rated Genre BenchMark  :  3.826005530985234
List of most liked Genre :  ['Educational' 'Horror']


In [104]:
##Finding bottom 10 percentile genre based on ratings i.e most like genres
genre_rating_benchmark_worst=genre_summary['mean'].quantile(0.1)
genre_most_worst=genre_summary[genre_summary['mean']<genre_rating_benchmark_worst]
genre_most_worst=np.array(genre_most_worst.index)
print('Most Lower Rated Genre BenchMark  : ',genre_rating_benchmark_worst)
print('List of most worst Genre : ',genre_most_worst)

Most Lower Rated Genre BenchMark  :  3.0832680427734536
List of most worst Genre :  ['Romance' 'Thriller']
