 Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Import ratings data

In [2]:
df=pd.read_csv('Ratings.txt',header=None, names=['Cust_Id', 'Rating'], usecols = [0,1])

In [3]:
df

Unnamed: 0,Cust_Id,Rating
0,1:,
1,1488844,3.0
2,822109,5.0
3,885013,4.0
4,30878,4.0
...,...,...
24058258,2591364,2.0
24058259,1791000,2.0
24058260,512536,5.0
24058261,988963,3.0


checking for null values

In [4]:
df.isna().sum()

Cust_Id       0
Rating     4499
dtype: int64

separating customer id and movie id into different columns
---------

In [5]:
df=df.fillna(0)

df['movie_id']=df['Cust_Id'].where(df['Rating']==0)
 
df['movie_id']=df['movie_id'].fillna(method='ffill')

df['Rating']=df['Rating'].replace({0 : np.nan})

df=df.dropna()

In [6]:
df['movie_id']=df['movie_id'].str.replace(':','')

df['movie_id']=df['movie_id'].astype(int)

df

Unnamed: 0,Cust_Id,Rating,movie_id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


Number of movies

In [7]:
df['movie_id'].nunique()

4499

Number of customers

In [8]:
df['Cust_Id'].nunique()

470758

Number of rating

In [9]:
df['Rating'].count()

24053764

Merging two data sets
---------------

In [10]:
df_title=pd.read_csv('movies_list.csv')

In [11]:
df_title

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
1,2,Jumanji (1995),Adventure
2,3,Grumpier Old Men (1995),Comedy
3,4,Waiting to Exhale (1995),Comedy
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [12]:
df=df.merge(df_title,how='left',left_on='movie_id',right_on='movieId')


In [13]:
df_merged=df.drop('movieId',axis=1)

1.Find out the list of most popular and liked genre
--------

Popular - Genre which has been rated most times

In [14]:
df_merged.describe(include='object')

Unnamed: 0,Cust_Id,title,genres
count,24053764,23813503,23813503
unique,470758,4405,19
top,305344,Marie from the Bay of Angels (Marie Baie Des A...,Drama
freq,4467,193941,6670725


Drama is the most popular genre

Liked genre - rated high

In [15]:
rate=df_merged.groupby(['genres']).mean(['Rating'])

In [16]:
rate[rate['Rating']==rate['Rating'].max()]

Unnamed: 0_level_0,Rating,movie_id
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
Musical,3.844063,1818.362647


3.Find what Genre Movies have received the best and worst ratings based on User Rating.
----


In [17]:
rate[rate['Rating']==rate['Rating'].max()]

Unnamed: 0_level_0,Rating,movie_id
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
Musical,3.844063,1818.362647


In [18]:
rate[rate['Rating']==rate['Rating'].min()]

Unnamed: 0_level_0,Rating,movie_id
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
War,3.147445,3352.701411


2.Create Model that finds the best suited Movie for one user in every genre
---------------

In [19]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [20]:
reader=Reader()

data = Dataset.load_from_df(df_merged[['Cust_Id', 'movie_id', 'Rating']][:100000], reader)

svd = SVD()

cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.0567  1.0491  1.0502  1.0520  0.0034  
MAE (testset)     0.8399  0.8300  0.8279  0.8326  0.0053  
Fit time          2.55    1.92    2.43    2.30    0.27    
Test time         0.68    0.40    0.58    0.55    0.11    


{'test_rmse': array([1.05671578, 1.04910014, 1.05024987]),
 'test_mae': array([0.83993999, 0.82998365, 0.82786892]),
 'fit_time': (2.55295991897583, 1.9175710678100586, 2.4285783767700195),
 'test_time': (0.675938606262207, 0.40250611305236816, 0.5755908489227295)}

In [21]:
user_712664 = df_title.copy()

In [22]:
user_712664 = user_712664.reset_index()

In [23]:
data = Dataset.load_from_df(df_merged[['Cust_Id', 'movie_id', 'Rating']], reader)

In [24]:
data

<surprise.dataset.DatasetAutoFolds at 0x15f80ec07d0>

In [25]:
trainset = data.build_full_trainset()

In [26]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x15ffa6f6110>

In [27]:
user_712664['Estimate_Score'] = user_712664['movieId'].apply(lambda x: svd.predict(712664, x).est)

#Drop extra columns from the user_712664 data frame
user_712664 = user_712664.drop('index', axis = 1)


In [28]:
user_712664

Unnamed: 0,movieId,title,genres,Estimate_Score
0,1,Toy Story (1995),Adventure,3.871708
1,2,Jumanji (1995),Adventure,3.779419
2,3,Grumpier Old Men (1995),Comedy,3.783955
3,4,Waiting to Exhale (1995),Comedy,2.868392
4,5,Father of the Bride Part II (1995),Comedy,3.956485
...,...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy,3.599634
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,3.599634
27275,131258,The Pirates (2014),Adventure,3.599634
27276,131260,Rentun Ruusu (2001),(no genres listed),3.599634


In [29]:
user_712664['genres'].nunique()

20

List best suited Movie for one user in every genre.
--------------

In [33]:
user_712664.groupby('genres').max(['Estimate_Score'])

Unnamed: 0_level_0,movieId,Estimate_Score
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
(no genres listed),131260,3.599634
Action,131180,4.475107
Adventure,131262,4.477167
Animation,131243,4.285115
Children,131054,4.229188
Comedy,131256,4.498832
Crime,131011,4.455889
Documentary,131110,4.245576
Drama,131176,4.673273
Fantasy,130071,3.938225


In [34]:
user_712664.groupby('genres').agg({'title':'first','Estimate_Score':'max'})

Unnamed: 0_level_0,title,Estimate_Score
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
(no genres listed),Milky Way (Tejút) (2007),3.599634
Action,Turbo: A Power Rangers Movie (1997),4.475107
Adventure,"NeverEnding Story II: The Next Chapter, The (1...",4.477167
Animation,Steamboat Willie (1928),4.285115
Children,"5,000 Fingers of Dr. T, The (1953)",4.229188
Comedy,Spaceballs (1987),4.498832
Crime,"Pelican Brief, The (1993)",4.455889
Documentary,"Haunted World of Edward D. Wood Jr., The (1996)",4.245576
Drama,"Color of Paradise, The (Rang-e khoda) (1999)",4.673273
Fantasy,Made in Heaven (1987),3.938225
