# RecSys
Modelo do tipo recuperação

## Importando bibliotecas necessárias

In [1]:
from sklearn import preprocessing
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import matplotlib.pyplot as plt
import torch.nn as nn
import pandas as pd
import numpy as np
import torch

Carregando Tensor Board

In [2]:
writer = SummaryWriter('runs/movie-lens')
writer.flush()

Configurando dispositivo para utilizar GPU se possível; caso contrário, CPU

In [3]:
mode = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(mode)
device

device(type='cuda')

In [4]:
torch.set_default_device(device)

In [5]:
generator = torch.Generator(device)

## Carregando dados do Movie Lens

In [6]:
class MovieLens(Dataset):
    '''
    Classe criada com o intuito de ajustar o dataset pandas ao
    treinamento de modelos utilizando o PyTorch, especialmente do que se
    diz respeito à utilização de lotes (batches) durante o treinamento.
    '''
    def __init__(self, dataset_path: str, device: torch.device):
        """
        Construtor da classe, responsável por ler os dados e organizar os dados
        """
        self.device = device

        dataset = pd.read_csv(dataset_path, decimal='.')

        self.user_encoder = preprocessing.LabelEncoder()
        self.users = self.user_encoder.fit_transform(dataset['userId'].values)

        self.movie_encoder = preprocessing.LabelEncoder()
        self.movies = self.movie_encoder.fit_transform(dataset['movieId'].values)
        self.ratings = dataset['rating'].values

        self.n_unique_users = len(np.unique(self.users))
        self.n_unique_movies = len(np.unique(self.movies))

    def __len__(self) -> int:
        """
        Retorna o número de avaliações do conjunto de dados
        """
        return self.ratings.shape[0]

    def __getitem__(self, item) -> dict[torch.tensor]:
        """
        Retorna itens do conjunto de dados em lotes
        """
        users = self.users[item]
        movies = self.movies[item]
        ratings = self.ratings[item]

        return {
            "users": torch.tensor(users, device=self.device, dtype=torch.long),
            "movies": torch.tensor(movies, device=self.device, dtype=torch.long),
            "ratings": torch.tensor(ratings, device=self.device, dtype=torch.long),
        }

    def get_original_user_id(self, users):
        """
        Retorna o ID original do usuário
        """
        return self.user_encoder.inverse_transform(users)
    
    def get_original_movie_id(self, movies):
        """
        Retorna o ID original do filme
        """
        return self.movie_encoder.inverse_transform(movies)

In [7]:
df_movie_lens = MovieLens('./data/movie-lens/raw/ratings.csv', device)

In [8]:
pd.Series(df_movie_lens.ratings).value_counts(normalize=True)

4.0    0.265957
3.0    0.198808
5.0    0.131015
3.5    0.130271
4.5    0.084801
2.0    0.074884
2.5    0.055040
1.0    0.027877
1.5    0.017762
0.5    0.013586
Name: proportion, dtype: float64

## Carregando dados

Dividindo dataset entre conjunto de treinamento, validação e teste

In [14]:
dataset_length = len(df_movie_lens)

train_length = int(dataset_length * 0.7)
valid_length = int(dataset_length * 0.15)
test_length  = dataset_length - train_length - valid_length

train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    dataset=df_movie_lens,
    lengths=(train_length, valid_length, test_length),
    generator=generator
)

Ajustando dataset para ser utilizado pelo PyTorch como um iterável que retorna lotes de dados a cada iteração

In [15]:
BATCH_SIZE = 4

In [16]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, generator=generator)
validation_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

Após isso, toda vez que o DataLoader for requisitado, ele retornará um lote (batch) de 8 itens

In [17]:
dataiter = iter(train_loader)
batch = next(dataiter)
batch

{'users': tensor([131, 331, 599, 379], device='cuda:0'),
 'movies': tensor([ 698,  686, 1004, 2038], device='cuda:0'),
 'ratings': tensor([3, 4, 2, 3], device='cuda:0')}

## Criando modelo de RecSys

Criando modelo de RecSys no estilo de torre-dupla

In [11]:
class MovieLensRecSys(nn.Module):
    '''
    Classe criada com o intuito de modelar a estrutura de torre-dupla,
    isto é, um dos modelos clássicos de RecSys baseado em filtragem
    colaborativa por meio de redes neurais.
    '''
    def __init__(self, n_users, n_movies, embedding_size = 32):
        super().__init__()
        # definindo embedding para clientes, produtos e categorias
        self.users_embedding = nn.Embedding(n_users, embedding_size)
        self.movies_embedding = nn.Embedding(n_movies, embedding_size)
        # definindo primeira camada de reurônios totalmente conectados
        self.fully_conn_1 = nn.Linear(embedding_size * 2, 32)
        self.relu_1 = nn.ReLU()
        self.dropout_1 = nn.Dropout(p=0.2)
        # # # definindo primeira camada de reurônios totalmente conectados
        self.fully_conn_2 = nn.Linear(32, 16)
        self.relu_2 = nn.ReLU()
        self.dropout_2 = nn.Dropout(p=0.2)
        # definindo camada de saída como um neurônio
        self.output_layer = nn.Linear(16, 1)

    def forward(self, batch):
        # criando camada de entrada a partir de embeddings de clientes e produtos
        user_embeddings = self.users_embedding(batch['users'])
        movies_embeddings = self.movies_embedding(batch['movies'])
        # concatenando embeddings de usuários e livros
        concat_embeddings = torch.cat([user_embeddings, movies_embeddings], dim=1).to(torch.float32)
        # primeira camada totalmente conectada
        output = self.fully_conn_1(concat_embeddings)
        output = self.relu_1(output)
        output = self.dropout_1(output)
        # # # segunda camada totalmente conectada
        output = self.fully_conn_2(output)
        output = self.relu_2(output)
        output = self.dropout_2(output)
        # camada de saída
        output = self.output_layer(output)

        return output


Contando quantidade de clientes e produtos distintos envolvidos em compras

In [110]:
n_users = df_movie_lens.n_unique_users
n_movies = df_movie_lens.n_unique_movies
n_users, n_movies

(610, 9724)

Instanciando modelo RecSys, configurando otimizador, taxa de aprendizado e função custo

In [111]:
model = MovieLensRecSys(n_users, n_movies).to(device)
model

MovieLensRecSys(
  (users_embedding): Embedding(610, 32)
  (movies_embedding): Embedding(9724, 32)
  (fully_conn_1): Linear(in_features=64, out_features=32, bias=True)
  (relu_1): ReLU()
  (dropout_1): Dropout(p=0.2, inplace=False)
  (fully_conn_2): Linear(in_features=32, out_features=16, bias=True)
  (relu_2): ReLU()
  (dropout_2): Dropout(p=0.2, inplace=False)
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
)

In [113]:
model.eval()
prev = model(batch)
prev

tensor([[-0.2232],
        [-0.2078],
        [-0.1396],
        [-0.1644]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [126]:
writer.add_graph(model, batch)
writer.flush()

In [114]:
step_size = 3
optimizer = torch.optim.Adam(model.parameters())
scheaduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.7)
loss_function = nn.MSELoss()

Criando o ciclo de treinamento

In [116]:
EPOCHS = 2

In [117]:
model.train()
training_loss = 0.0

for epoch in range(EPOCHS):

    for i, train_batch in enumerate(train_loader):
        # predições do modelo (y-predito)
        predictions = model(train_batch)
        # calculando tamanho do lote retornado
        batch_length = len(train_batch['ratings'])
        # reformatando y-verdeiro para fical igual ao formato da saída do modelo (y-predito)
        ratings = train_batch['ratings'].view(batch_length, -1).to(torch.float32)
        # calculando o erro do modelo
        loss = loss_function(predictions, ratings)
        # somano erro durante o treinamento
        training_loss += loss.sum().item()
        # executando ajuste dos pesos no modelo via algoritmo de retropropagação
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 1000 == 999: # a cada 1000 interações de mini-lotes

            validation_loss = 0.0

            model.eval()

            for j, validation_batch in enumerate(validation_loader):

                validation_predictions = model(validation_batch)
                # calculando tamanho do lote retornado
                validation_batch_length = len(validation_batch['ratings'])
                # reformatando y-verdeiro para fical igual ao formato da saída do modelo (y-predito)
                validation_ratings = validation_batch['ratings'].view(validation_batch_length, -1).to(torch.float32)
                # calculando o erro do modelo
                val_loss = loss_function(validation_predictions, validation_ratings)
                # somando erro de validaçao
                validation_loss += val_loss.sum().item()

            model.train()

            avg_training_loss = training_loss / (batch_length * 1000)
            avg_validation_loss = validation_loss / len(validation_loader)

            print('epoch: %d - batch: %5d - trainig loss: %.5f - validation loss: %.5f' % (epoch + 1, i + 1, avg_training_loss, avg_validation_loss))

            writer.add_scalars(
                main_tag='Training vs. Validation Loss',
                tag_scalar_dict={
                    'Training': avg_training_loss,
                    'Validation': avg_validation_loss
                },
                global_step=epoch * len(train_loader) + i
            )

            training_loss = 0.0

epoch: 1 - batch:  1000 - trainig loss: 0.66836 - validation loss: 1.20138
epoch: 1 - batch:  2000 - trainig loss: 0.40790 - validation loss: 1.12741
epoch: 1 - batch:  3000 - trainig loss: 0.36497 - validation loss: 1.09630
epoch: 1 - batch:  4000 - trainig loss: 0.34460 - validation loss: 1.08944
epoch: 1 - batch:  5000 - trainig loss: 0.31966 - validation loss: 1.03694
epoch: 1 - batch:  6000 - trainig loss: 0.30596 - validation loss: 1.02519
epoch: 1 - batch:  7000 - trainig loss: 0.30128 - validation loss: 1.01519
epoch: 1 - batch:  8000 - trainig loss: 0.31043 - validation loss: 1.01676
epoch: 1 - batch:  9000 - trainig loss: 0.29561 - validation loss: 0.99600
epoch: 1 - batch: 10000 - trainig loss: 0.27918 - validation loss: 0.99353
epoch: 1 - batch: 11000 - trainig loss: 0.28167 - validation loss: 0.98251
epoch: 1 - batch: 12000 - trainig loss: 0.29016 - validation loss: 0.97883
epoch: 1 - batch: 13000 - trainig loss: 0.27906 - validation loss: 0.99621
epoch: 1 - batch: 14000 -

In [9]:
model_path = './model/MovieLensRecSys.pt'

In [None]:
torch.save(model, model_path)

## Avaliando o modelo

In [12]:
model = torch.load(model_path, weights_only=False)
model.eval()

MovieLensRecSys(
  (users_embedding): Embedding(610, 32)
  (movies_embedding): Embedding(9724, 32)
  (fully_conn_1): Linear(in_features=64, out_features=32, bias=True)
  (relu_1): ReLU()
  (dropout_1): Dropout(p=0.2, inplace=False)
  (fully_conn_2): Linear(in_features=32, out_features=16, bias=True)
  (relu_2): ReLU()
  (dropout_2): Dropout(p=0.2, inplace=False)
  (output_layer): Linear(in_features=16, out_features=1, bias=True)
)

Calculando a raiz do erro quadrádico médio

In [18]:
from sklearn.metrics import root_mean_squared_error

model_output_list = []
target_rating_list = []

model.eval()

with torch.no_grad():

    for index, test_data in enumerate(test_loader):
        # calculando tamanho do lote retornado
        batch_length = len(test_data['ratings'])
        # predições do modelo (y-predito)
        output = model(test_data)
        # armazenando o erro do modelo
        model_output_list.append(output.sum().item() / batch_length)
        # recuperando valor original (y-verdadeiro)
        target_rating = test_data['ratings']
        target_rating_list.append(target_rating.sum().item() / batch_length)

rms = root_mean_squared_error(target_rating_list, model_output_list)
rms

0.4887937781491371

old: 0.49067842992108573

precisão e recall dos K-produtos (Precision@K and Recall@K)

In [19]:
from collections import defaultdict

users_pred_true = defaultdict(list)

with torch.no_grad():

    for i, test_data in enumerate(test_loader):

        users = test_data['users']
        movies = test_data['movies']
        ratings = test_data['ratings']

        output = model(test_data)

        for j in range(len(users)):

            user_id = users[j].item()
            book_id = movies[j].item()

            pred_ratings = output[j][0].item()
            true_ratings = ratings[j].item()

            users_pred_true[user_id].append((pred_ratings, true_ratings))

            # print(f'customer_id: {user_id}; product_id: {book_id}; true_ratings: {true_ratings}; pred_ratings: {pred_ratings}')

In [20]:
with torch.no_grad():

    precisions = dict()
    recalls = dict()

    k=100
    threshold=3.0

    for uid, user_pred_true in users_pred_true.items():

        # ordenando as predições de filmes comprados por cliente
        user_pred_true.sort(key=lambda x: x[0], reverse=True)
        # registrando o número de filmes relevantes
        n_rel = sum((true_p >= threshold) for (_, true_p) in user_pred_true)
        # regitrando o número de filmes recomendados que foram preditos como relevantes para o top K filmes
        n_rec_k = sum((pred >= threshold) for (pred, _) in user_pred_true[:k])
        # registrando o número de filmes recomendados que são realmente relevantes para o top K filmes
        n_rec_and_rec_k = sum(
            ((true_p >= threshold) and (pred >= threshold))
            for (pred, true_p) in user_pred_true[:k]
        )

        # print(f'uid: {uid}; n_rel: {n_rel}; n_rec_k: {n_rec_k}; n_rec_and_rec_k: {n_rec_and_rec_k}')

        # proporção de filmes recomendados que são relevantes
        precisions[uid] = n_rec_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        # proporção de filmes relevantes que foram recomendados
        recalls[uid] = n_rec_and_rec_k / n_rel if n_rel != 0 else 0

In [21]:
print(f'Precision@{k}: {sum(prec for prec in precisions.values()) / len(precisions)}')
print(f'Recall@{k}: {sum(rec for rec in recalls.values()) / len(recalls)}')

Precision@100: 0.862348823033688
Recall@100: 0.9074978974828789


Salvando modelo

# Investigando embeddings

In [57]:
unique_movies = np.unique(df_movie_lens.movies)
sample_movies = np.random.choice(unique_movies, size=1000, replace=False)

In [58]:
movies_movies_tensor = torch.from_numpy(sample_movies)

with torch.no_grad():
    movies_embeddings = model.movies_embedding(movies_movies_tensor)

In [59]:
sample_movies_ids = df_movie_lens.get_original_movie_id(sample_movies)
df_movies = pd.read_csv('./data/movie-lens/raw/movies.csv')
movies_genres = df_movies.loc[df_movies['movieId'].isin(sample_movies_ids), 'genres'].values.tolist()

In [60]:
writer.add_embedding(
    mat=movies_embeddings,
    metadata=movies_genres
)
writer.flush()
writer.close()

