## Домашнее задание 5

In [1]:
import numpy as np
import os
import pandas as pd

In [2]:
from IPython.display import display, clear_output

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

# Загрузка данных

In [3]:
interactions_df = pd.read_csv('../data/interactions.csv')
users_df = pd.read_csv('../data/users.csv')
items_df = pd.read_csv('../data/items.csv')

In [4]:
interactions_df.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [5]:
interactions_df = interactions_df[interactions_df['last_watch_dt'] < '2021-04-01']

In [6]:
interactions_df.shape

(263874, 5)

# Фильтрация пользователей и айтемов по кол-ву просмотров

In [7]:
users_interactions_count_df = interactions_df.groupby(['user_id', 'item_id']).size().groupby('user_id').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['user_id']]
print('# users with at least 5 interactions: %d' % len(users_with_enough_interactions_df))

# users: 86614
# users with at least 5 interactions: 14563


In [8]:
print('# of interactions: %d' % len(interactions_df))
interactions_from_selected_users_df = interactions_df.merge(users_with_enough_interactions_df,
               how = 'right',
               left_on = 'user_id',
               right_on = 'user_id')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 263874
# of interactions from users with at least 5 interactions: 142670


# Препроцессинг

In [9]:
import math

In [10]:
def smooth_user_preference(x):
    return math.log(1+x, 2)

interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['user_id', 'item_id'])['watched_pct'].sum() \
                    .apply(smooth_user_preference).reset_index()
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df.head(10)

# of unique user/item interactions: 142670


Unnamed: 0,user_id,item_id,watched_pct
0,21,849,6.375039
1,21,4345,6.658211
2,21,10283,6.658211
3,21,12261,6.658211
4,21,15997,6.658211
5,32,952,6.044394
6,32,4382,4.954196
7,32,4807,6.658211
8,32,10436,6.658211
9,32,12132,6.658211


In [11]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['user_id'],
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 114136
# interactions on Test set: 28534


In [12]:

#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df.set_index('user_id')
interactions_train_indexed_df = interactions_train_df.set_index('user_id')
interactions_test_indexed_df = interactions_test_df.set_index('user_id')

In [13]:
def get_items_interacted(person_id, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = interactions_df.loc[person_id]['item_id']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [14]:

from IPython.display import display, clear_output

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

In [15]:

# Constants
SEED = 42 # random seed for reproducibility
LR = 1e-3 # learning rate, controls the speed of the training
WEIGHT_DECAY = 0.01 # lambda for L2 reg. ()
NUM_EPOCHS = 3 # num training epochs (how many times each instance will be processed)
GAMMA = 0.9995 # learning rate scheduler parameter
BATCH_SIZE = 3000 # training batch size
EVAL_BATCH_SIZE = 3000 # evaluation batch size.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") # device to make the calculations on
ALPHA = 0.000002  # kl_divergence coefficient

In [16]:
interactions_train_df.append(interactions_test_indexed_df.reset_index())

  interactions_train_df.append(interactions_test_indexed_df.reset_index())


Unnamed: 0,user_id,item_id,watched_pct
95878,739508,1554,6.614710
112608,867772,2360,6.658211
31419,240254,12544,5.727920
80151,616527,1358,6.658211
140262,1076435,4807,6.629357
...,...,...,...
28529,29703,475,6.658211
28530,332559,15275,3.459432
28531,806304,12614,6.629357
28532,1034992,6145,1.584963


In [17]:
total_df = interactions_train_df.append(interactions_test_indexed_df.reset_index())
total_df['user_id'], users_keys = total_df.user_id.factorize()
total_df['item_id'], items_keys = total_df.item_id.factorize()

train_encoded = total_df.iloc[:len(interactions_train_df)].values
test_encoded = total_df.iloc[len(interactions_train_df):].values

  total_df = interactions_train_df.append(interactions_test_indexed_df.reset_index())


In [18]:
users_keys

Int64Index([ 739508,  867772,  240254,  616527, 1076435,  128186,   36381,
             648736,  283016,  270205,
            ...
             553345,  281389, 1005991,  183729, 1017915,   82382,  691900,
             875576,  152180,  985284],
           dtype='int64', length=14563)

In [19]:
users_key_dict = {}
for i in range(len(users_keys)):
    users_key_dict[users_keys[i]] = i

In [20]:
users_key_dict[739508]

0

In [21]:
items_keys

Int64Index([ 1554,  2360, 12544,  1358,  4807,  7134,  1106,   758,  3042,
             5984,
            ...
            14137,  7655,  1924,  7720,  5437, 14644,   204, 13711,   115,
            16422],
           dtype='int64', length=8287)

In [22]:
items_key_dict = {}
for i in range(len(items_keys)):
    items_key_dict[items_keys[i]] = i

In [23]:
items_key_dict[2360]

1

In [24]:
from scipy.sparse import csr_matrix
shape = [int(total_df['user_id'].max()+1), int(total_df['item_id'].max()+1)]
X_train = csr_matrix((train_encoded[:, 2], (train_encoded[:, 0], train_encoded[:, 1])), shape=shape).toarray()
X_test = csr_matrix((test_encoded[:, 2], (test_encoded[:, 0], test_encoded[:, 1])), shape=shape).toarray()

In [25]:

# Initialize the DataObject, which must return an element (features vector x and target value y)
# for a given idx. This class must also have a length atribute
class UserOrientedDataset(Dataset):
    def __init__(self, X):
        super().__init__() # to initialize the parent class
        self.X = X.astype(np.float32)
        self.len = len(X)

    def __len__(self): # We use __func__ for implementing in-built python functions
        return self.len

    def __getitem__(self, index):
        return self.X[index]

In [26]:

# Initialize DataLoaders - objects, which sample instances from DataObject-s
train_dl = DataLoader(
    UserOrientedDataset(X_train),
    batch_size = BATCH_SIZE,
    shuffle = True
)

test_dl = DataLoader(
    UserOrientedDataset(X_test),
    batch_size = EVAL_BATCH_SIZE,
    shuffle = False
)

dls = {'train': train_dl, 'test': test_dl}

# 1. Значимо изменить архитектуру модели (2 балла)

Значимо изменим архитектуру автоэнкодера: сделаем его вариационным

Классы см. в torch_ae.py

In [27]:
import sys  
sys.path.insert(1, '../rec_sys/models/')  # to make torch_ae visible
from torch_ae import AEModel

In [28]:
def kl_divergence(mu, sigma):
    return (sigma**2 + mu**2 - torch.log(sigma) - 1/2).sum()

In [29]:
torch.manual_seed(SEED) # Fix random seed to have reproducible weights of model layers

model = AEModel(device=DEVICE)
model.to(DEVICE)

# Initialize GD method, which will update the weights of the model
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Initialize learning rate scheduler, which will decrease LR according to some rule
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=GAMMA)

def rmse_for_sparse(x_pred, x_true):
    mask = (x_true > 0)
    sq_diff = (x_pred * mask - x_true) ** 2
    mse = sq_diff.sum() / mask.sum()
    return mse ** (1/2)

In [30]:
# Training loop
metrics_dict = {
    "Epoch": [],
    "Train RMSE": [],
    "Test RMSE": [],
    "Train KL": [],
    "Test KL": [],
}

# Train loop
for epoch in range(NUM_EPOCHS):
    metrics_dict["Epoch"].append(epoch)
    for stage in ['train', 'test']:
        with torch.set_grad_enabled(stage == 'train'): # Whether to start building a graph for a backward pass
            if stage == 'train':
                model.train()
            else:
                model.eval()

            rmse_loss_at_stage = 0
            kl_loss_at_stage = 0
            for batch in dls[stage]:
                batch = batch.to(DEVICE)
                x_pred, mu, sigma = model(batch) # forward pass: model(x_batch) -> calls forward()
                loss_rmse = rmse_for_sparse(x_pred, batch) # ¡Important! y_pred is always the first arg
                loss_kl = kl_divergence(mu, sigma)*ALPHA
                loss = loss_rmse + loss_kl
                if stage == "train":
                    loss.backward() # Calculate the gradients of all the parameters wrt loss
                    optimizer.step() # Update the parameters
                    scheduler.step()
                    optimizer.zero_grad() # Zero the saved gradient
                rmse_loss_at_stage += loss_rmse.item() * len(batch)
                kl_loss_at_stage += loss_kl.item() * len(batch)
            rmse_at_stage = (rmse_loss_at_stage / len(dls[stage].dataset)) ** (1/2)
            kl_at_stage = (kl_loss_at_stage / len(dls[stage].dataset))
            metrics_dict[f"{stage.title()} RMSE"].append(rmse_at_stage)
            metrics_dict[f"{stage.title()} KL"].append(kl_at_stage)

    if (epoch == NUM_EPOCHS - 1) or epoch % 10 == 9:
        clear_output(wait=True)
        display(pd.DataFrame(metrics_dict))

Unnamed: 0,Epoch,Train RMSE,Test RMSE,Train KL,Test KL
0,0,2.254414,2.111707,1.33145,1.249591
1,1,1.875041,1.917848,1.474797,1.27195
2,2,1.688163,1.612222,1.250524,1.312938


In [31]:
DEVICE='cpu'
model.to(DEVICE)
model.eval()
torch.save(model.state_dict(), '../models/torch_ae.ckpt')

In [28]:
model = AEModel(device=DEVICE)
model.load_state_dict(torch.load('../models/torch_ae.ckpt'))

<All keys matched successfully>

In [29]:
with torch.no_grad():
    X_pred_test, mu, sigma = model(torch.Tensor(X_test).to(DEVICE))
X_pred_test

tensor([[6.7637e+00, 5.7678e+00, 4.4178e+00,  ..., 0.0000e+00, 0.0000e+00,
         3.4048e-01],
        [3.5510e+00, 3.0360e+00, 2.3033e+00,  ..., 0.0000e+00, 0.0000e+00,
         3.6291e-01],
        [5.6472e+00, 4.8511e+00, 3.7399e+00,  ..., 0.0000e+00, 0.0000e+00,
         9.4888e-02],
        ...,
        [4.1712e+00, 3.4522e+00, 2.6328e+00,  ..., 0.0000e+00, 0.0000e+00,
         5.5112e-01],
        [5.5008e+00, 5.6391e+00, 3.6898e+00,  ..., 8.3851e-02, 0.0000e+00,
         1.3951e+00],
        [5.4199e+00, 4.6897e+00, 3.2525e+00,  ..., 0.0000e+00, 0.0000e+00,
         1.5865e-03]])

# Класс для онлайн-инференса (см. ae_recommender.py)

In [30]:
from ae_recommender import AERecommender

In [31]:
with torch.no_grad():
    X_pred_total, mu, sigma = model(torch.Tensor(X_train + X_test).to(DEVICE))

In [32]:
ae_recommender_model = AERecommender(X_pred_total, X_train, X_test, users_key_dict, items_keys)

In [33]:
ae_recommender_model.recommend(122432)

[1554, 7753, 24, 7440, 9214, 1557, 3509, 7417, 8618, 4997]

In [34]:
ae_recommender_model.recommend(0)  # 0 пользователь, видимо, посмотрел меньше 5 фильмов

[]

In [35]:
import dill

with open('../models/autoencoder.dill', 'wb') as model_file:
    dill.dump(ae_recommender_model, model_file)

In [36]:
import pickle
with open('../models/autoencoder.pkl', 'wb') as model_file:
    pickle.dump(ae_recommender_model, model_file)