# Будем обучать Bert из базового предобученного

### Модель будет обучаться на решение задачи прогноза следующего фильма каждого пользователя

Устанавливаем нужные пакеты

In [1]:
!pip install transformers
!pip install torch
!pip install tqdm



Импортируем библиотеки и фиксируем `seed`

In [2]:
import pandas as pd
import numpy as np
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import BertTokenizer, BertModel, BertConfig

import random

def set_global_seed(seed: int) -> None:
    """
    Set global seed for reproducibility.
    """

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
set_global_seed(42)

import warnings
warnings.filterwarnings("ignore")

In [3]:
torch.cuda.is_available()

True

Подгружаем данные

In [4]:
data = pd.read_csv('kaggle/events.csv')
data = data.sort_values(by=['user_id', 'timestamp'])
print(data.shape)
data.head(10)

(894149, 4)


Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,0
1,0,3669,3,1
2,0,584,4,2
3,0,3390,3,3
4,0,2885,4,4
5,0,79,5,5
6,0,717,4,7
7,0,187,4,8
8,0,93,5,9
9,0,3016,5,10


Для решения задачи необходимо:</br>
1) Преобразовать все item_id в пространство (0 - кол-во фильмов-1);</br>
   Это нужно, чтобы модель интерпретировала айдишники фильмов как метку класса. Генерируем и сохраняем словарь мэппинга айдишников и меток класса.</br>
   *Предположение: train содержит все фильмы, которые могут быть предложены пользователю/просмотрены пользователем.*
2) Создать таргеты для каждого наблюдения;</br>
   Для этого сортируем датафрейм для каждого пользователя по `timestamp` и создаем новую колонку с помощью шифта данных на 1.

In [5]:
unique_items = data['item_id'].unique()
item_to_idx = {item: idx for idx, item in enumerate(unique_items)}
idx_to_item = {idx: item for item, idx in item_to_idx.items()}

data['item_id'] = data['item_id'].map(item_to_idx)

print(data['item_id'].min(), data['item_id'].max())

0 3689


In [6]:
data['item_id'].nunique()

3690

Формируем test выборку как последние 2 просмотренных фильма пользователя

In [7]:
def train_test_split_by_user(df, test_size=2):
    train_list = []
    test_list = []

    for user_id, user_data in df.groupby('user_id'):
        user_data = user_data.sort_values(by='timestamp')

        test_data = user_data.tail(test_size)
        train_data = user_data.drop(test_data.index)
        train_list.append(train_data)
        test_list.append(test_data)
    train_data = pd.concat(train_list).reset_index(drop=True)
    test_data = pd.concat(test_list).reset_index(drop=True)

    return train_data, test_data

In [8]:
data['next_item_id'] = data.groupby('user_id')['item_id'].shift(-1)
data = data.dropna(subset=['next_item_id'])

data['next_item_id'] = data['next_item_id'].astype(int)

train, test = train_test_split_by_user(data)

print(f'Train size: {train.shape}')
print(f'Test size: {test.shape}')
assert len(pd.merge(train, test, on=['user_id', 'item_id', 'timestamp'], how='inner')) == 0, "Есть перечечения"

Train size: (876029, 5)
Test size: (12080, 5)


Прописываем класс MyDataset для работы с датасетом внутри модели:</br>
Наследуемся от торчового Dataset</br> Основа - метод `__get_item__`, который преобразует данные в нужный формат для output модели на каждом шаге

In [9]:
class MyDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user_data = self.df.iloc[idx]
        user_id = int(user_data['user_id'])
        item_id_str = str(user_data['item_id'])

        encoded = self.tokenizer.encode_plus(
            item_id_str,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt")

        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        return {'input_ids': torch.tensor(input_ids, dtype=torch.long),
                'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
                'target': torch.tensor(user_data['next_item_id'], dtype=torch.long),
                'user_id': torch.tensor(user_id, dtype=torch.long)}

Прописываем класс нашей модели:</br>
1) Наследуемся от дефолтного `nn.Module`;
2) Загружаем слой BERT из предобученного базового BERT-a
3) Прописываем метод `forward`

In [10]:
class RecommenderBERTModel(nn.Module):
    def __init__(self, bert_model_name="bert-base-uncased", hidden_size=768, num_items=None):
        super(RecommenderBERTModel, self).__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)
        self.linear = nn.Linear(hidden_size, num_items)

    def forward(self, input_ids, attention_mask):

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.pooler_output
        predictions = self.linear(cls_output)
        return predictions


Функция для обучения нашей модели:</br>
1) Оптимизатор - `Adam`
2) Лосс - `CrossEntropyLoss`
3) Сохранение модели раз в n шагов

In [11]:
def train_model(model,
                train_loader,
                val_loader,
                device,
                epochs=3,
                learning_rate=1e-5,
                save_every_n_steps=100,
                save_path="./RecommenderBERTModelv1.pth"):

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        print(f'Epoch {epoch + 1}/{epochs}')
        train_progress_bar = tqdm(train_loader, desc='Training', leave=False)

        step = 0

        for batch in train_progress_bar:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['target'].to(device)

            outputs = model(input_ids, attention_mask)

            assert targets.max().item() < 3690, f"Метка класса {targets.max().item()} неверна"

            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            train_progress_bar.set_postfix({'loss': loss.item()})

            if step % save_every_n_steps == 0:
                torch.save(model.state_dict(), save_path)
                print(f"Сheckpoint saved at step {step}, epoch {epoch + 1}")

            step += 1

        avg_train_loss = total_loss / len(train_loader)
        print(f"Training loss: {avg_train_loss}")

        model.eval()
        val_loss = 0
        correct_predictions = 0

        val_progress_bar = tqdm(val_loader, desc='Validating', leave=False)

        with torch.no_grad():
            for batch in val_progress_bar:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                targets = batch['target'].to(device)

                outputs = model(input_ids, attention_mask)
                loss = loss_fn(outputs, targets)
                val_loss += loss.item()

                _, preds = torch.max(outputs, dim=1)
                correct_predictions += torch.sum(preds == targets)

                val_progress_bar.set_postfix({'val_loss': loss.item()})

        avg_val_loss = val_loss / len(val_loader)
        accuracy = correct_predictions.double() / len(val_loader.dataset)
        print(f"Validation loss: {avg_val_loss}, Accuracy: {accuracy}")

    return model


Токенизируем BERT-ом данные, входящие в модель

In [12]:
def prepare_dataloaders(train_data, test_data, tokenizer, batch_size=16, max_len=128):
    train_dataset = MyDataset(train_data, tokenizer, max_len=max_len)
    test_dataset = MyDataset(test_data, tokenizer, max_len=max_len)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, test_loader

In [13]:
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')

train_loader, val_loader = prepare_dataloaders(train, test, tokenizer_bert)

Наконец-то приступаем к обучению модели

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_items = data['item_id'].nunique()
model = RecommenderBERTModel(bert_model_name='bert-base-uncased',
                             num_items=num_items)
model.bert.resize_token_embeddings(len(tokenizer_bert))

#trained_model = train_model(model, train_loader, val_loader, device, epochs=1,
#                            learning_rate=1e-5, save_every_n_steps=500,
#                            save_path="./RecommenderBERTModelv3.pth")

Embedding(30522, 768, padding_idx=0)

Сохраняем веса обученной модели

In [15]:
# torch.save(model.state_dict(), "./RecommenderBERTModelv1.pth")

In [16]:
checkpoint = torch.load("./RecommenderBERTModelv3.pth")
model.load_state_dict(checkpoint)

<All keys matched successfully>

Функция инференса модели на наших пользователей

In [17]:
user_item_history = train.groupby('user_id')['item_id'].apply(set).to_dict()

In [20]:
def top_recs(model, data_loader, device, idx_to_item, user_item_history, top_n=20):
    model.eval()
    recommendations = {}

    with torch.no_grad():
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            user_ids = batch['user_id']
            
            outputs = model(input_ids, attention_mask)
            probabilities = torch.softmax(outputs, dim=1)

            top_k_indices = torch.topk(probabilities, top_n + 10, dim=1).indices

            for i, user_id in enumerate(user_ids):
                predicted_item_indices = top_k_indices[i].tolist()
                predicted_item_ids = [idx_to_item[idx] for idx in predicted_item_indices]

                user_history = user_item_history.get(user_id, set())
                filtered_items = [item_id for item_id in predicted_item_ids if item_id not in user_history]
                recommendations[user_id] = filtered_items[:top_n]

    recommendation_df = pd.DataFrame({
        'user_id': recommendations.keys(),
        'recommendations': [' '.join(map(str, items)) for items in recommendations.values()]
    })

    return recommendation_df

In [21]:
model = model.to(device)
recs_df = top_recs(model, val_loader, device, idx_to_item, user_item_history, top_n=10)
recs_df['user_id'] = recs_df['user_id'].apply(lambda x: x.item() if torch.is_tensor(x) else x)
recs_df.drop_duplicates(subset=['user_id'], inplace=True)
recs_df

  0%|          | 0/378 [00:00<?, ?it/s]

Unnamed: 0,user_id,recommendations
0,0,331 3409 463 472 2980 1315 3677 2862 1337 2688
2,1,331 1337 1315 463 3409 2862 2980 472 3677 2297
4,2,331 463 472 3409 2980 3677 1315 2297 2862 476
6,3,3022 472 1583 3529 3013 463 1223 3409 476 2757
8,4,472 640 3529 2067 2256 2732 2564 2646 1223 2862
...,...,...
12070,6035,3022 1583 3529 3013 2210 512 1223 463 472 922
12072,6036,3022 1583 2210 3529 3013 512 1223 169 2688 463
12074,6037,3022 1583 472 3529 3013 1223 640 2757 463 584
12076,6038,331 2397 3002 1044 3648 3409 3435 983 96 1919


In [30]:
from torch.utils.data import DataLoader, ConcatDataset

train_dataset = train_loader.dataset
val_dataset = val_loader.dataset

combined_dataset = ConcatDataset([train_dataset, val_dataset])

combined_loader = DataLoader(combined_dataset, batch_size=32, shuffle=True)

In [31]:
model = model.to(device)
recs_df = top_recs(model, combined_loader, device, idx_to_item, user_item_history, top_n=10)
recs_df['user_id'] = recs_df['user_id'].apply(lambda x: x.item() if torch.is_tensor(x) else x)
recs_df

  0%|          | 0/27754 [00:00<?, ?it/s]

Unnamed: 0,user_id,recommendations
0,4957,331 3409 2397 3435 2862 1337 2354 1044 3002 983
1,186,3022 1583 463 472 3677 3409 3529 3013 476 169
2,1792,3022 1583 472 3529 640 3013 463 1223 2757 584
3,3385,1583 3022 472 3529 640 463 1223 3013 2757 2210
4,3610,3022 1583 472 3529 640 3013 1223 463 2757 584
...,...,...
888104,3687,3022 1583 472 3529 3013 1223 640 2757 463 584
888105,2711,331 3409 1337 2862 3435 2397 2256 472 3600 3677
888106,5916,3022 1583 472 3529 3013 463 640 1223 2757 584
888107,1578,3409 331 1315 463 2688 472 3677 2980 2646 2297


In [42]:
df_last = recs_df.groupby('user_id').tail(1)

In [43]:
submit_res = df_last.sort_values('user_id', ascending=True)
submit_res.columns = ['user_id', 'item_id']
submit_res

Unnamed: 0,user_id,item_id
881607,0,3022 1583 472 3529 640 3013 1223 463 2757 584
886016,1,331 3409 463 472 1337 2862 2297 1315 2980 3677
886236,2,331 2862 1337 2256 3409 472 2980 463 3677 2297
887651,3,3022 1583 472 3529 640 3013 1223 463 2757 584
882812,4,472 3022 463 1583 3529 3013 3409 3677 476 1223
...,...,...
888042,6035,3022 1583 472 3529 640 3013 463 1223 2757 584
824871,6036,3022 1583 472 3529 640 3013 463 1223 2757 584
881829,6037,3409 463 3022 3677 472 1315 2402 3529 3013 476
887054,6038,331 3409 463 3677 472 1337 2862 1315 2397 2688


In [44]:
submit_res['user_id'] = submit_res['user_id'].astype(str)
submit_res.to_csv('submit_The_boysV4.csv', index=False)