In [18]:
import torch
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
import pandas as pd
from IPython.display import clear_output
from tqdm import tqdm

Глобальная идея здесь такая: я хочу построить эмбеддинги, которые помогут мне далее.
1. Я напишу код для обучения эмбеддингов с нуля и поскладываю их как вектора, чтобы оченить их смысл.
2. Я найду хорошие предобученные эмбеддинги
3. Я найду датасет для классификации текстов и дообучу предобученные эмбеддинги на нем

4. Сравню качество для задачи классификации на базовых предобученных эмбеддингах с качеством на дообученных эмбеддингах.


Итак, вспоминаем torch
Для начала создадим датасет и все вспомогательные для работы с ним функции

In [19]:
# TODO текст - последовательность токенов, так что сначала набо предобработать датасето, токнезироваав текст, добавив служебные символы и обрезав тексты
class MoviesDataset(Dataset):
    def __init__(self, path, max_len, train=True):
        super().__init__()
        self.max_len = max_len
        full_df = pd.read_csv(path)["Plot"]
        processed_df = MoviesDataset.process_data(full_df, max_len)

        train_df, test_df = train_test_split(processed_df, train_size=0.8, random_state=0)
        if train:
            self.data = train_df
        else:
            self.data = test_df

    def __getitem__(self, item):
        return self.data[item]

    def __len__(self):
        return len(self.data)

    def get_data(self):
        return self.data.copy()

    @staticmethod
    def process_data(data, max_len):
        processed_data = []
        slit_pattern = re.compile(r"[,. 0-9()\[\]\-\"\'\\]")
        for text in data:
            processed_text = list(filter(lambda token: token != '', re.split(slit_pattern, text.lower())))
            if len(processed_text) < max_len:
                processed_text.extend(["<pad>"] * (max_len - len(processed_text)))
            else:
                processed_text = processed_text[:max_len]
            processed_data.append(["<start>"] + processed_text)
        return processed_data

In [20]:
MAX_TEXT_LEN = 256
SKIPGRAM_WINDOW_SIZE = 5
BATCH_SIZE = 32
EMBEDDING_DIM = 25
MIN_TOKEN_FREQ = 5
NUM_EPOCHS = 10

In [21]:
movies_dataset_path = "wiki_movie_plots_deduped.csv"
train_dataset = MoviesDataset(movies_dataset_path, train=True, max_len=MAX_TEXT_LEN)
test_dataset = MoviesDataset(movies_dataset_path, train=False, max_len=MAX_TEXT_LEN)

In [22]:
vocab = build_vocab_from_iterator(train_dataset.get_data(), specials=["<unk>"], min_freq=MIN_TOKEN_FREQ)
vocab.set_default_index(vocab["<unk>"])

In [23]:
def my_embeddings_collate_fn(batch):
    x_batch = []
    y_batch = []
    padding = int((SKIPGRAM_WINDOW_SIZE - 1)/2)
    for text in batch:
        text = text[:MAX_TEXT_LEN].split()
        for window_start in range(len(text) - SKIPGRAM_WINDOW_SIZE + 1):
            tokens = text[window_start:window_start + SKIPGRAM_WINDOW_SIZE]
            x_current = tokens.pop(padding)

            x_batch.extend([x_current] * (SKIPGRAM_WINDOW_SIZE - 1))
            y_batch.extend(tokens)
    return torch.tensor(x_batch, dtype=torch.int64), torch.tensor(y_batch, dtype=torch.int64)

In [24]:
train_loader = DataLoader(train_dataset, collate_fn=my_embeddings_collate_fn, shuffle=True, batch_size=BATCH_SIZE, num_workers=6)
test_loader = DataLoader(test_dataset, collate_fn=my_embeddings_collate_fn, shuffle=True, batch_size=BATCH_SIZE, num_workers=6)

In [25]:
class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, x):
        x = self.embeddings(x)
        logits = self.linear(x)
        return logits

In [26]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [27]:
model = Word2VecModel(vocab_size=len(vocab), embedding_dim=EMBEDDING_DIM)
optimizer = torch.optim.Adam(params=model.parameters())
criterion = nn.CrossEntropyLoss()

In [28]:
def plot_losses(train_history, valid_history):
    clear_output()
    plt.plot(train_history, label='train', color='green')
    plt.plot(valid_history, label='valid', color='red')
    plt.legend()
    plt.show()


def training_epoch(model, optimizer, criterion, train_loader, tqdm_desc):
    cumulative_loss = 0
    model.train()
    for inputs, outputs in tqdm(train_loader, desc=tqdm_desc):
        inputs = inputs.to(device)
        outputs = outputs.to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, outputs)
        loss.backward()
        optimizer.step()

        cumulative_loss += loss.item()

    return cumulative_loss / len(train_loader)


@torch.no_grad()
def validation_epoch(model, criterion, valid_loader, tqdm_desc):
    cumulative_loss = 0
    model.eval()
    for inputs, outputs in tqdm(valid_loader, desc=tqdm_desc):
        inputs = inputs.to(device)
        outputs = outputs.to(device)

        logits = model(inputs)
        loss = criterion(logits, outputs)

        cumulative_loss += loss.item()
    return cumulative_loss / len(valid_loader)


def train(train_loader, valid_loader, num_epochs, model, optimizer, criterion, scheduler=None):
    train_history, valid_history = [], []

    for epoch in range(1, num_epochs + 1):
        train_loss = training_epoch(
            model, optimizer, criterion, train_loader,
            tqdm_desc=f'Training {epoch}/{num_epochs}'
        )
        valid_loss = validation_epoch(
            model, criterion, valid_loader,
            tqdm_desc=f'Validating {epoch}/{num_epochs}'
        )

        if scheduler is not None:
            scheduler.step()

        train_history.append(train_loss)
        valid_history.append(valid_loss)
        plot_losses(train_history, valid_history)

In [29]:
train(
    train_loader,
    test_loader,
    NUM_EPOCHS,
    model,
    optimizer,
    criterion
)

Training 1/10:   0%|          | 0/873 [00:00<?, ?it/s]Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'MoviesDataset' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
Training 1/10:   0%|          | 0/873 [03:33<?, ?it/s]


KeyboardInterrupt: 