Movie Recommendation System

Objectives

- Obtain a prediction for a specific user for a particular item
- Introduce a new user, together with their ratings, into the rating matrix and make recommendations for them
- Create a function that will return the top n recommendations for a user

Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, make_scorer

Loading Dataset

In [2]:
# Load the merged movie data from a CSV file (path is relative to the working
# directory) and preview the first rows.
df = pd.read_csv("merged_movie_data.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp_x,title,genres,tag,timestamp_y,imdbId,tmdbId
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,114709,862.0
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,,,113228,15602.0
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,,,113277,949.0
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,,,114369,807.0
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,,,114814,629.0


In [3]:
# Show each column's dtype and non-null count, plus the frame's memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102677 entries, 0 to 102676
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   userId       102677 non-null  int64  
 1   movieId      102677 non-null  int64  
 2   rating       102677 non-null  float64
 3   timestamp_x  102677 non-null  int64  
 4   title        102677 non-null  object 
 5   genres       102677 non-null  object 
 6   tag          3476 non-null    object 
 7   timestamp_y  3476 non-null    float64
 8   imdbId       102677 non-null  int64  
 9   tmdbId       102664 non-null  float64
dtypes: float64(3), int64(4), object(3)
memory usage: 7.8+ MB


In [4]:
# Report the dataframe's dimensions: row count first, then column count.
print("The number of rows: {}".format(len(df)))
print("The number of columns:{}".format(len(df.columns)))

The number of rows: 102677
The number of columns:10


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Unnamed: 0,userId,movieId,rating,timestamp_x,timestamp_y,imdbId,tmdbId
count,102677.0,102677.0,102677.0,102677.0,3476.0,102677.0,102664.0
mean,327.761933,19742.712623,3.514813,1209495000.0,1323525000.0,356499.4,20476.871289
std,183.211289,35884.40099,1.043133,217011700.0,173155400.0,629571.7,54097.633332
min,1.0,1.0,0.5,828124600.0,1137179000.0,417.0,2.0
25%,177.0,1199.0,3.0,1019138000.0,1138032000.0,99710.0,710.0
50%,328.0,3005.0,3.5,1186590000.0,1279956000.0,118842.0,6950.0
75%,477.0,8366.0,4.0,1439916000.0,1498457000.0,317248.0,11673.0
max,610.0,193609.0,5.0,1537799000.0,1537099000.0,8391976.0,525662.0


In [6]:
def check_data_types(dataframe):
    """Print the data type of every column in ``dataframe``.

    Parameters
    ----------
    dataframe
        Object exposing a ``dtypes`` attribute (e.g. a pandas DataFrame).

    Returns
    -------
    None
        The dtypes are written to standard output; nothing is returned.
    """
    print(dataframe.dtypes)

# Print the dtype of each column of the loaded dataframe
check_data_types(df)

userId           int64
movieId          int64
rating         float64
timestamp_x      int64
title           object
genres          object
tag             object
timestamp_y    float64
imdbId           int64
tmdbId         float64
dtype: object


In [13]:
# Count the missing entries in each column
null_counts = df.isna().sum(axis=0)
print(null_counts)

userId             0
movieId            0
rating             0
timestamp_x        0
title              0
genres             0
tag            99201
timestamp_y    99201
imdbId             0
tmdbId            13
dtype: int64


In [14]:
# Total number of missing cells across the whole dataframe
# (per-column counts first, then summed over the columns)
total_missing = df.isna().sum(axis=0).sum()
print(f'Total missing values: {total_missing}')

Total missing values: 198415


In [15]:
# Identify the columns that contain at least one missing value
missing_columns = df.columns[df.isna().any(axis=0)]
print(missing_columns)

Index(['tag', 'timestamp_y', 'tmdbId'], dtype='object')
