# Model-Based Matrix Factorization

Technical documents:

https://towardsdatascience.com/recommender-systems-matrix-factorization-using-pytorch-bd52f46aa199

https://datajobs.com/data-science-repo/Recommender-Systems-%5BNetflix%5D.pdf

In [11]:
import pandas as pd
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

## Data preparation

In [12]:
# Read the movie catalogue and the user ratings from the working directory.
movie = pd.read_csv('movie.csv')
rating = pd.read_csv('rating.csv')

# Left join on movieId: every movie is kept, so a movie without any rating
# yields a row whose rating columns are NaN.
df = pd.merge(movie, rating, on='movieId', how='left')

In [13]:
# Preview the first five rows of the merged movie/rating frame.
df.head(n=5)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [15]:
# movieId values of the four movies used to build a small working sample.
movie_ids = [130219, 356, 4422, 541]

# Titles of the same four movies, spelled as they appear in the dataset and
# listed in the same order as `movie_ids`.
# 356 -> Forrest Gump and 541 -> Blade Runner are confirmed by the data.
# NOTE(review): 130219 -> The Dark Knight and 4422 -> Cries and Whispers is
# the presumed pairing for the remaining two ids -- verify against movie.csv.
movies = [
    'The Dark Knight (2011)',
    'Forrest Gump (1994)',
    'Cries and Whispers (Viskningar och rop) (1972)',
    'Blade Runner (1982)'
]

In [30]:
# Keep only the rows that belong to the movies listed in `movie_ids`.
is_selected_movie = df['movieId'].isin(movie_ids)
sample_df = df[is_selected_movie]
sample_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2457839,356,Forrest Gump (1994),Comedy|Drama|Romance|War,4.0,4.0,1996-08-24 09:28:42
2457840,356,Forrest Gump (1994),Comedy|Drama|Romance|War,7.0,4.0,2002-01-16 19:02:55
2457841,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,1996-06-05 13:44:19
2457842,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,4.0,2001-07-01 20:26:38
2457843,356,Forrest Gump (1994),Comedy|Drama|Romance|War,10.0,3.0,1999-11-25 02:32:02


In [31]:
# (rows, columns) of the filtered sample.
sample_df.shape

(97343, 6)

In [32]:
# User x title matrix of ratings; a NaN cell means that user has no rating
# for that title in the sample.
user_movie_df = pd.pivot_table(
    sample_df,
    values='rating',
    index='userId',
    columns='title',
)
user_movie_df.head()

title,Blade Runner (1982),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,4.0,,,
2.0,5.0,,,
3.0,5.0,,,
4.0,,,4.0,
7.0,,,4.0,


In [33]:
# The ratings in this data come in half-star steps (values such as 2.5, 3.5
# and 4.5 appear in the frames above), so the scale must start at 0.5 rather
# than 1. Surprise clips predictions to `rating_scale`; with a lower bound of
# 1 a rating of 0.5 could never be predicted.
# NOTE(review): 0.5 is the documented minimum for MovieLens ratings -- confirm
# with rating['rating'].min() on this file.
reader = Reader(rating_scale=(0.5, 5))

In [46]:
# Surprise expects exactly three columns, in this order: user, item, rating.
ratings_frame = sample_df[['userId', 'movieId', 'rating']]
data = Dataset.load_from_df(ratings_frame, reader)

## Modelling

In [47]:
# Fixed seed so that both the train/test shuffle and the random
# initialisation of the SVD model are reproducible across kernel restarts.
SEED = 42

# Hold out 25% of the ratings for evaluation.
train, test = train_test_split(data, test_size=0.25, random_state=SEED)

# Baseline model with default hyper-parameters, fitted on the training part
# and scored on the held-out part.
svd_model = SVD(random_state=SEED)
svd_model.fit(train)
predictions = svd_model.test(test)

In [50]:
# Inspect the first 15 test-set predictions (r_ui = true rating, est = estimate).
predictions[:15]

[Prediction(uid=18061.0, iid=541, r_ui=4.0, est=4.022806866335472, details={'was_impossible': False}),
 Prediction(uid=88446.0, iid=356, r_ui=2.5, est=4.102406319164842, details={'was_impossible': False}),
 Prediction(uid=66492.0, iid=541, r_ui=5.0, est=4.226048504937297, details={'was_impossible': False}),
 Prediction(uid=65964.0, iid=356, r_ui=3.5, est=4.102406319164842, details={'was_impossible': False}),
 Prediction(uid=94814.0, iid=541, r_ui=5.0, est=4.226048504937297, details={'was_impossible': False}),
 Prediction(uid=7990.0, iid=356, r_ui=4.0, est=4.102406319164842, details={'was_impossible': False}),
 Prediction(uid=56923.0, iid=541, r_ui=4.0, est=4.470642432529985, details={'was_impossible': False}),
 Prediction(uid=34823.0, iid=356, r_ui=4.0, est=4.5658801606586605, details={'was_impossible': False}),
 Prediction(uid=107414.0, iid=356, r_ui=5.0, est=4.102406319164842, details={'was_impossible': False}),
 Prediction(uid=24260.0, iid=356, r_ui=4.0, est=4.102406319164842, detai

In [51]:
# Root mean squared error of the hold-out predictions; printed and returned.
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9318


0.9318399211904502

## Model Tuning

In [53]:
# Hyper-parameter grid for the search: 3 epoch counts x 3 learning rates
# = 9 candidate configurations.
param_grid = dict(
    n_epochs=[5, 10, 20],
    lr_all=[0.002, 0.005, 0.007],
)

In [54]:
# Exhaustive search over `param_grid` for SVD, scored with RMSE and MAE on
# 3-fold cross-validation, using every available core.
gs = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=True,
)

In [55]:
# Run the grid search on the dataset: 9 parameter combinations x 3 folds
# = 27 model fits.
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:    7.8s finished


In [57]:
# Best RMSE score found by the grid search.
gs.best_score['rmse']

0.9300205884365481

In [58]:
# Parameter combination that produced the best RMSE score.
gs.best_params['rmse']

{'n_epochs': 10, 'lr_all': 0.002}

## Final Model and Prediction

In [60]:
# Re-create the model with the parameter set that gave the best RMSE
# in the grid search.
best_rmse_params = gs.best_params['rmse']
svd_model = SVD(**best_rmse_params)

In [63]:
# Turn the whole dataset into a trainset (no hold-out) for the final fit.
# The guard keeps this cell idempotent: after the first run `data` is already
# a trainset, which has no `build_full_trainset` method, so re-running the
# cell unguarded would raise AttributeError.
if hasattr(data, 'build_full_trainset'):
    data = data.build_full_trainset()

In [64]:
# Fit the tuned model on `data`, which at this point holds the full trainset
# (every rating in the sample, no hold-out split).
svd_model.fit(data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1c70191bd90>

In [66]:
# Estimate the rating of user 1 for movie 541 (Blade Runner). No true rating
# (r_ui) is passed, so it is reported as None; verbose=True prints the result.
svd_model.predict(uid=1, iid=541, verbose=True)

user: 1          item: 541        r_ui = None   est = 4.21   {'was_impossible': False}


Prediction(uid=1, iid=541, r_ui=None, est=4.2147850195011305, details={'was_impossible': False})

In [67]:
# Look up the actual ratings of user 1 in the sample for comparison
# with the model's estimate.
sample_df.loc[sample_df['userId'] == 1]

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
3612352,541,Blade Runner (1982),Action|Sci-Fi|Thriller,1.0,4.0,2005-04-02 23:30:03


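**Our prediction is 4.21; the real value is 4.0.** Note that this rating was part of the full trainset the final model was fitted on, so this is an in-sample sanity check rather than a held-out evaluation.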