In [87]:
import pandas as pd
import numpy as np
from surprise import SVD
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

from surprise.model_selection import train_test_split
from surprise import accuracy

from surprise.prediction_algorithms import SVD
from surprise.model_selection import GridSearchCV

### Reading data into python and exploring data info

In [33]:
ratings_df = pd.read_csv('../data/ratings.csv')
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [34]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [35]:
movies_df = pd.read_csv('../data/movies.csv')
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [36]:
tags_df = pd.read_csv('../data/tags.csv')
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [5]:
links_df = pd.read_csv('../data/links.csv')
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [15]:
#Instantiate algorithm from Surprise
algo = SVD()

In [19]:
X = ratings_df.drop('rating', axis=1)
y = ratings_df['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

In [22]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [31]:
len(movies_df)

9742

In [32]:
len(ratings_df)

100836

In [23]:
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [24]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [25]:
ratings_df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

### Merging dataframes

#### We need to merge the ratings and movies dataframes so we can have the combined data to use for the recommendation system. We then created a .csv version of the dataframe so we can reload it in the correct format for Surprise.

In [71]:
## merge ratings and movies df's
merged_df = pd.merge(ratings_df, movies_df, on='movieId', how='right')

In [72]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100854 entries, 0 to 100853
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  float64
 1   movieId    100854 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  float64
 4   title      100854 non-null  object 
 5   genres     100854 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 5.4+ MB


In [73]:
merged_df.isna().sum()

userId       18
movieId       0
rating       18
timestamp    18
title         0
genres        0
dtype: int64

In [76]:
merged_df = merged_df.dropna()

In [77]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100853
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  float64
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  float64
 4   title      100836 non-null  object 
 5   genres     100836 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 5.4+ MB


In [85]:
merged_df['rating'].value_counts()

4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: rating, dtype: int64

In [79]:
merged_df.to_csv('../data/ratings_and_movies.csv', index=False)

In [89]:
user_item_rating = merged_df[['userId', 'movieId', 'rating']]

In [91]:
user_item_rating.to_csv('../data/user_item_rating.csv', index=False)

In [106]:
merged_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1.0,1,4.0,9.649827e+08,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5.0,1,4.0,8.474350e+08,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7.0,1,4.5,1.106636e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15.0,1,2.5,1.510578e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17.0,1,4.5,1.305696e+09,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100849,184.0,193581,4.0,1.537109e+09,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
100850,184.0,193583,3.5,1.537110e+09,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
100851,184.0,193585,3.5,1.537110e+09,Flint (2017),Drama
100852,184.0,193587,3.5,1.537110e+09,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


### Starting with Surprise

In [107]:
reader = Reader(line_format='user item rating timestamp title genres', sep=',', skip_lines=1, rating_scale=(1, 5))

movies_all = Dataset.load_from_file('../ratings_and_movies.csv', reader=reader)

ValueError: line_format parameter is incorrect.

In [112]:
reader = Reader(line_format='user item rating timestamp title genres', sep=',', rating_scale=(0, 5))

movie_ratings_all = Dataset.load_from_df(merged_df[['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres']], reader)

ValueError: line_format parameter is incorrect.

In [95]:
trainset, testset = train_test_split(movies_and_ratings, test_size=0.25, random_state=42)

In [96]:
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x223800fe6d0>

In [98]:
preds = algo.test(testset)

In [99]:
rmse = accuracy.rmse(preds)
mae = accuracy.mae(preds)

RMSE: 0.8729
MAE:  0.6717
