# Matrix factorization

In [11]:
%load_ext autoreload
%autoreload 2

import sys; sys.path.append('../')

from pathlib import Path

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from src.models.matrix_factorization import MatrixFactorization, train
from src.util.data import get_interactions, get_sparsity_factor
from src.util.plot import Plot

# Relative locations of the input ratings CSV and of the file the trained
# model is written to at the end of the notebook.
RATINGS_PATH = Path('../data/ratings_small.csv')
OUTPUT_PATH = Path('../models/matrix-factorization.pt')

# Plotting helper; used further down to draw the loss convergence curves.
plot = Plot()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data

In [12]:
# Read the raw ratings table; later cells rely on its `userId` and `movieId`
# columns.
ratings = pd.read_csv(filepath_or_buffer=RATINGS_PATH)

In [13]:
# One encoder per id column. Both are fitted on the *full* ratings table
# (before the train/test split) so every id that shows up in either split is
# known to the encoder.
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

user_encoder.fit(ratings['userId'].to_numpy())
movie_encoder.fit(ratings['movieId'].to_numpy())

LabelEncoder()

In [14]:
# Hold out 5% of the ratings for evaluation. Stratifying on the user id keeps
# each user's share of ratings roughly equal in both splits, and the fixed
# random_state makes the split repeatable.
stratify_labels = ratings['userId'].values
train_ratings, test_ratings = train_test_split(
    ratings,
    stratify=stratify_labels,
    random_state=42,
    test_size=0.05,
)

In [15]:
# Build the interaction structure for each split with the same pair of
# encoders; the train split is processed first, then the test split.
train_interactions, test_interactions = (
    get_interactions(split_ratings, user_encoder, movie_encoder)
    for split_ratings in (train_ratings, test_ratings)
)

Building interaction matrix: 100%|██████████| 95003/95003 [00:00<00:00, 2021322.87it/s]
Building interaction matrix: 100%|██████████| 5001/5001 [00:00<00:00, 1250564.26it/s]


In [16]:
# Sparsity factor of each split's interaction structure (train first, then
# test).
train_sparsity, test_sparsity = map(
    get_sparsity_factor,
    (train_interactions, test_interactions),
)

In [17]:
# Report both factors as percentages with three decimals.
# NOTE(review): the figures printed for this run (1.562% / 0.082%) look like
# the share of *filled* entries (i.e. density) rather than of empty ones;
# confirm what get_sparsity_factor actually returns before quoting them.
print(
    f'Train sparsity: {(train_sparsity * 100):.3f}%',
    f'Test sparsity: {(test_sparsity * 100):.3f}%',
    sep='\n',
)

Train sparsity: 1.562%
Test sparsity: 0.082%


## Model

In [18]:
# Seed PyTorch's global RNG before the model is created so that a
# Restart & Run All reproduces the same run; previously only the data split
# was seeded.
# NOTE(review): this only covers randomness drawn from torch's default
# generator; if the model or the training loop also use numpy / random,
# seed those as well.
torch.manual_seed(42)

# Matrix factorization model with 20 latent factors, built from the training
# interactions. The encoders are handed to the model as well.
model = MatrixFactorization(
    train_interactions,
    n_factors=20,
    user_encoder=user_encoder,
    movie_encoder=movie_encoder,
)

In [19]:
# Training hyperparameters, gathered in one place so they are easy to tweak.
# With verbose=1 this run logged the train and test loss after each epoch
# (see the cell output).
training_options = {
    'epochs': 200,
    'batch_size': 5_000,
    'verbose': 1,
}

# Fit the model; the call returns the per-run train and test loss histories
# that are plotted in the next cell.
train_loss_history, test_loss_history = train(
    model,
    train_interactions,
    test_interactions,
    **training_options,
)

Training:   0%|          | 1/200 [00:01<03:52,  1.17s/it]Train loss: 35.370, Test loss: 36.132
Training:   1%|          | 2/200 [00:02<03:48,  1.15s/it]Train loss: 34.480, Test loss: 35.472
Training:   2%|▏         | 3/200 [00:03<03:44,  1.14s/it]Train loss: 33.623, Test loss: 34.818
Training:   2%|▏         | 4/200 [00:04<03:42,  1.14s/it]Train loss: 32.776, Test loss: 34.177
Training:   2%|▎         | 5/200 [00:05<03:41,  1.13s/it]Train loss: 31.947, Test loss: 33.551
Training:   3%|▎         | 6/200 [00:06<03:40,  1.14s/it]Train loss: 31.139, Test loss: 32.941
Training:   4%|▎         | 7/200 [00:07<03:39,  1.14s/it]Train loss: 30.355, Test loss: 32.347
Training:   4%|▍         | 8/200 [00:09<03:38,  1.14s/it]Train loss: 29.595, Test loss: 31.768
Training:   4%|▍         | 9/200 [00:10<03:37,  1.14s/it]Train loss: 28.858, Test loss: 31.204
Training:   5%|▌         | 10/200 [00:11<03:35,  1.14s/it]Train loss: 28.145, Test loss: 30.655
Training:   6%|▌         | 11/200 [00:12<03:34,  

In [20]:
# Draw the train and test loss histories together; `names` labels the
# entries of `losses` in the same order.
plot.convergence(
    names=['Train', 'Test'],
    losses=[train_loss_history, test_loss_history],
)

In [21]:
# torch.save does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): passing the module object itself pickles the whole model, so
# loading the file later requires the MatrixFactorization class to be
# importable under the same module path, and the file must only be loaded
# from a trusted source. Kept as-is so existing consumers of the file keep
# working.
torch.save(model, OUTPUT_PATH)