In [85]:
import re
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Классификация фамилий (RNN)

Датасет: https://disk.yandex.ru/d/frNchuaBQVLxyA?w=1

1.1 Используя класс `nn.RNNCell` (абстракцию для отдельного временного шага RNN), реализуйте простейшую рекуррентную сеть Элмана в виде класса `RNN`. Используя созданный класс `RNN`, решите задачу классификации фамилий. 


In [18]:
class Vocab:
  def __init__(self, data):
    tokens = set()
    max_seq_len = 0
    for item in data:
        max_seq_len = max(max_seq_len, len(item))
        tokens.update(item)

    self.idx_to_token = {0: '<PAD>'}
    self.token_to_idx = {'<PAD>': 0}
    for idx, token in enumerate(tokens, start=1):
        self.idx_to_token[idx] = token
        self.token_to_idx[token] = idx
    self.vocab_len = len(self.idx_to_token)
    self.max_seq_len = max_seq_len

In [19]:
class SurnamesDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, surname):
    surname_t = torch.zeros(self.vocab.max_seq_len, dtype=torch.int64)
    for i, token in enumerate(surname):
        if i >= self.vocab.max_seq_len:
            break
        surname_t[i] = self.vocab.token_to_idx.get(token, 0)
    return surname_t

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    surname = self.X.iloc[idx]
    label = self.y.iloc[idx]
    surname_t = self.vectorize(surname)
    return surname_t, label

In [20]:
surnames = pd.read_csv("data/surnames.csv")
surnames['nationality'], _ = pd.factorize(surnames['nationality'])

X = surnames['surname'].str.lower()
y = surnames['nationality']
n_classes = y.nunique()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

vocab = Vocab(X)

train_dataset = SurnamesDataset(X_train, y_train, vocab)
test_dataset = SurnamesDataset(X_test, y_test, vocab)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [60]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, train_dataloader, test_dataloader, criterion, optimizer, num_epochs):
    model.to(device)
    train_losses, test_losses = [], []

    for epoch in range(num_epochs):
        model.train()
        train_loss, test_loss = 0, 0
        for inputs, labels in train_dataloader:
            x = inputs.to(device)
            y = labels.to(device)

            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()

        # Валидация на val_loader
        model.eval()
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(outputs, labels)

                test_loss += loss.item()

        train_losses.append(train_loss/len(train_dataloader))
        test_losses.append(test_loss/len(test_dataloader))

        print(f'Epoch {epoch+1}, Train Loss: {train_losses[-1]:.4f}, Test Loss: {test_losses[-1]:.4f}')

def evaluate_model(model, dataloader):
    #model.to(device)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for surnames, labels in dataloader:
            x = surnames#.to(device)
            y = labels#.to(device)

            logits = model(x)
            _, predicted = torch.max(logits, 1)
            correct += (predicted == y).sum().item()
            total += y.size(0)

    accuracy = correct / total
    print(f'Test Accuracy: {accuracy:.5f}')

def predict(model, dataset, surname):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    with torch.no_grad():
        vectorized = dataset.vectorize(surname)
        tensor = vectorized.unsqueeze(0).to(device)

        logits = model(tensor)

        probs = torch.softmax(logits, dim=1).squeeze()
        top3_probs, top3_indices = torch.topk(probs, k=3)
        print(top3_probs, top3_indices)

        top3_nationalities = _[top3_indices.detach().cpu().numpy()]
        print(f'{surname}: {top3_nationalities.values[0]} ({top3_probs[0].item():.4f}), {top3_nationalities.values[1]} ({top3_probs[1].item():.4f}), {top3_nationalities.values[2]} ({top3_probs[2].item():.4f})')

In [55]:
rnn = nn.RNNCell(10, 20)
input = torch.randn(6, 3, 10)

print(input)
print(input.shape)
hx = torch.randn(3, 20)
print(hx, hx.shape)

output = []
for i in range(1):
    hx = rnn(input[i], hx)
    print(hx, hx.shape)
    output.append(hx)

output

tensor([[[-0.5150,  0.0611, -1.5019, -1.3999,  1.8583,  1.1652,  1.8009,
           0.8552,  0.7180,  2.2732],
         [ 0.9502,  1.8316, -0.1161, -1.3544,  0.6022,  0.6594, -0.2064,
           1.6444, -0.8585,  0.1131],
         [ 0.3644, -0.3035,  1.6394,  0.2357, -1.8279, -1.1180,  2.5095,
          -0.3691, -1.2616, -0.3589]],

        [[-0.8034,  0.3764, -0.6987,  0.3136, -1.1621,  0.0486,  0.4882,
          -2.0484, -1.2965, -0.5811],
         [ 1.4812, -0.3783,  1.1985,  0.2440,  1.0847,  0.6997, -1.5057,
          -0.2568,  0.9280, -1.2979],
         [ 0.3636, -1.2794, -1.0562, -1.7243,  0.6841,  0.3693,  1.0832,
          -1.1520,  0.0174,  1.0618]],

        [[ 0.9456, -0.2165,  1.0915,  1.2426, -0.3918, -0.0583,  1.1039,
           0.1533, -0.5311,  0.5147],
         [-0.0715,  0.9403, -1.9025,  0.5476,  2.0563, -0.8737, -0.2694,
           0.9178, -0.8736,  1.6076],
         [-0.3311, -0.0619,  0.2531, -0.3562, -0.0111,  1.1841,  0.2079,
          -1.2520,  0.4783,  0.8634

[tensor([[-0.0441,  0.5508, -0.3919, -0.6514, -0.0477, -0.6857,  0.1197, -0.8900,
           0.4010, -0.7310,  0.9600,  0.7814, -0.9683,  0.2209, -0.6828,  0.0841,
           0.2634,  0.0894,  0.0892,  0.6381],
         [-0.1870,  0.4524, -0.7628, -0.2666,  0.8309,  0.6979, -0.7682, -0.3966,
           0.7184,  0.4711, -0.6722, -0.1217, -0.5239, -0.4409,  0.3023, -0.3200,
           0.3488, -0.0509,  0.1939,  0.6333],
         [-0.4637, -0.6085,  0.2977, -0.5181, -0.0406, -0.0474, -0.1065, -0.7905,
           0.2190,  0.3209,  0.7994, -0.6154, -0.0743,  0.0762, -0.6211,  0.5210,
          -0.1941, -0.5675, -0.0287, -0.3122]], grad_fn=<TanhBackward0>)]

In [23]:
class RNN(nn.Module):
  def __init__(self, input_size, embedding_dim, hidden_size, output_size):
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, embedding_dim)
    self.rnn_cell = nn.RNNCell(embedding_dim, hidden_size)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    '''
    x.shape = (batch_size, seq_len) - тензор входных данных
    h.shape = (batch_size, hidden_size) - тензор со скрытым состоянием RNN
    '''
    batch_size, seq_len = x.shape

    h = torch.zeros(batch_size, self.hidden_size)

    hidden_states = []
    for t in range(seq_len):
      # получаем эмбеддинг текущего символа
      x_t = self.embedding(x[:, t])
      # обновляем скрытое состояние
      h = self.rnn_cell(x_t, h)
      hidden_states.append(h)

    # конкатенируем скрытые состояния и применяем полносвязный слой
    hidden_states = torch.stack(hidden_states, dim=1)
    output = self.fc(hidden_states[:, -1, :])

    return output

In [24]:
train_dataset.vectorize("valyaev")

tensor([15, 38,  2,  6, 38, 27, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [25]:
model = RNN(
            input_size=vocab.vocab_len,
            embedding_dim=256,
            hidden_size=1024,
            output_size=len(set(y_train)))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [26]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}")

Количество обучаемых параметров: 1345554


In [27]:
model.forward(train_dataset.vectorize("valyaev").unsqueeze(0))

tensor([[ 0.2276, -0.1097,  0.2088,  0.0419, -0.0660, -0.2510,  0.3035, -0.0100,
          0.2670, -0.0007, -0.1839, -0.1158, -0.0880,  0.2225, -0.0344,  0.1456,
          0.2933, -0.4632]], grad_fn=<AddmmBackward0>)

In [28]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 10)

Epoch 1, Train Loss: 2.3323, Test Loss: 2.2180
Epoch 2, Train Loss: 2.2998, Test Loss: 2.3518
Epoch 3, Train Loss: 2.3117, Test Loss: 2.4430
Epoch 4, Train Loss: 2.3174, Test Loss: 2.3229
Epoch 5, Train Loss: 2.3151, Test Loss: 2.2831
Epoch 6, Train Loss: 2.2317, Test Loss: 2.1473
Epoch 7, Train Loss: 2.1554, Test Loss: 2.0496
Epoch 8, Train Loss: 2.0836, Test Loss: 2.0341
Epoch 9, Train Loss: 2.1080, Test Loss: 2.0658
Epoch 10, Train Loss: 2.0116, Test Loss: 2.0219


In [31]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.40118


1.2 Замените модуль `RNN` из 1.1 на модули `nn.RNN`, `nn.LSTM` и `nn.GRU` (не забудьте указать аргумент `batch_first=True`). Сравните результаты работы.

In [79]:
class RNN(nn.Module):
  def __init__(self, input_size, embedding_dim, hidden_size, output_size):
    super(RNN, self).__init__()
    self.hidden_size = hidden_size
    self.embedding = nn.Embedding(input_size, embedding_dim)
    self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
    self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    '''
    x.shape = (batch_size, seq_len) - тензор входных данных
    h.shape = (batch_size, hidden_size) - тензор со скрытым состоянием RNN
    '''
    batch_size, seq_len = x.shape

    # h = torch.zeros(1, batch_size, self.hidden_size)  # RNN, GRU
    h = (torch.zeros(1, batch_size, self.hidden_size, device=device), torch.zeros(1, batch_size, self.hidden_size, device=device))  # LSTM

    # получаем эмбеддинг всех символов
    x = self.embedding(x)

    # обновляем скрытое состояние
    # output, h = self.rnn(x, h)
    output, (h, c) = self.lstm(x, h)
    # output, h = self.gru(x, h)

    # применяем полносвязный слой
    output = self.fc(output[:, -1, :])

    return output

In [80]:
model = RNN(
            input_size=vocab.vocab_len,
            embedding_dim=64,
            hidden_size=256,
            output_size=len(set(y_train)))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [81]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}")

Количество обучаемых параметров: 667666


In [82]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 10)

Epoch 1, Train Loss: 2.1615, Test Loss: 1.9282
Epoch 2, Train Loss: 1.6983, Test Loss: 1.4070
Epoch 3, Train Loss: 1.3010, Test Loss: 1.2051
Epoch 4, Train Loss: 1.0760, Test Loss: 1.0553
Epoch 5, Train Loss: 0.9250, Test Loss: 0.9332
Epoch 6, Train Loss: 0.8041, Test Loss: 0.8761
Epoch 7, Train Loss: 0.7401, Test Loss: 0.8512
Epoch 8, Train Loss: 0.6642, Test Loss: 0.8069
Epoch 9, Train Loss: 0.6058, Test Loss: 0.7892
Epoch 10, Train Loss: 0.5376, Test Loss: 0.7740


#### Результаты для nn.RNN

In [40]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.57195


#### Результаты для nn.LSTM

In [84]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.76685


#### Результаты для nn.GRU

In [45]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.77231


1.3 Загрузите предобученные эмбеддинги (https://disk.yandex.ru/d/BHuT2tEXr_yBOQ?w=1) в модуль `nn.Embedding` и обучите модели из 1.2.

In [62]:
embedding_path = "data/embeddings/glove.6B.50d.txt"

embeddings = {}
with open(embedding_path, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = torch.tensor([float(val) for val in values[1:]])
        embeddings[word] = vector

input_size = len(embeddings)
embedding_dim = len(next(iter(embeddings.values())))
embedding_matrix = torch.zeros(input_size, embedding_dim)

for i, word in enumerate(embeddings):
    embedding_matrix[i] = embeddings[word]

embedding_layer = nn.Embedding(input_size, embedding_dim)
embedding_layer.weight.data.copy_(embedding_matrix)

tensor([[ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        [ 0.0134,  0.2368, -0.1690,  ..., -0.5666,  0.0447,  0.3039],
        [ 0.1516,  0.3018, -0.1676,  ..., -0.3565,  0.0164,  0.1022],
        ...,
        [-0.5118,  0.0587,  1.0913,  ..., -0.2500, -1.1250,  1.5863],
        [-0.7590, -0.4743,  0.4737,  ...,  0.7895, -0.0141,  0.6448],
        [ 0.0726, -0.5139,  0.4728,  ..., -0.1891, -0.5902,  0.5556]])

In [63]:
next(iter(embeddings))

'the'

In [74]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, output_size, pretrained_embeddings):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)
        self.rnn = nn.RNN(embedding_dim, hidden_size, batch_first=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, h=None):
        '''
        x.shape = (batch_size, seq_len) - тензор входных данных
        h.shape = (batch_size, hidden_size) - тензор со скрытым состоянием RNN
        '''
        batch_size, seq_len = x.shape

        # h = torch.zeros(1, batch_size, self.hidden_size, device=device)  # RNN, GRU
        h = (torch.zeros(1, batch_size, self.hidden_size, device=device), torch.zeros(1, batch_size, self.hidden_size, device=device))  # LSTM

        x = self.embedding(x)

        # обновляем скрытое состояние
        # output, h = self.rnn(x, h)
        output, (h, c) = self.lstm(x, h)
        # output, h = self.gru(x, h)

        output = self.fc(output[:, -1, :])

        return output

In [75]:
input_size = vocab.vocab_len
embedding_dim = len(next(iter(embeddings.values())))
hidden_size = 128
output_size = len(set(y_train))
model = RNN(input_size, embedding_dim, hidden_size, output_size, pretrained_embeddings=embedding_matrix)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

In [76]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}")

Количество обучаемых параметров: 186642


In [77]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 10)

Epoch 1, Train Loss: 2.2528, Test Loss: 2.2244
Epoch 2, Train Loss: 2.2306, Test Loss: 2.2346
Epoch 3, Train Loss: 2.2308, Test Loss: 2.2325
Epoch 4, Train Loss: 2.2241, Test Loss: 2.2092
Epoch 5, Train Loss: 2.0209, Test Loss: 1.9304
Epoch 6, Train Loss: 1.7222, Test Loss: 1.5226
Epoch 7, Train Loss: 1.3084, Test Loss: 1.1830
Epoch 8, Train Loss: 1.0871, Test Loss: 1.0718
Epoch 9, Train Loss: 0.9469, Test Loss: 1.0425
Epoch 10, Train Loss: 0.8608, Test Loss: 0.9125


#### Результаты для nn.RNN

In [73]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.31740


#### Результаты для nn.LSTM

In [78]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.72678


#### Результаты для nn.GRU

In [68]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.75137


## 2. Классификация обзоров на фильмы (RNN)

Датасет: https://disk.yandex.ru/d/tdinpb0nN_Dsrg

2.1 Создайте набор данных на основе файлов polarity/positive_reviews.csv (положительные отзывы) и polarity/negative_reviews.csv (отрицательные отзывы). Разбейте на обучающую и тестовую выборку.
  * токен = __слово__
  * данные для обучения в датасете представляются в виде последовательности индексов токенов
  * словарь создается на основе _только_ обучающей выборки. Для корректной обработки ситуаций, когда в тестовой выборке встретится токен, который не хранится в словаре, добавьте в словарь специальный токен `<UNK>`
  * добавьте предобработку текста

2.2. Обучите классификатор.
  
  * Для преобразования последовательности индексов в последовательность векторов используйте `nn.Embedding` 
    - подберите адекватную размерность вектора эмбеддинга: 
    - модуль `nn.Embedding` обучается

  * Используйте рекуррентные слои (`nn.RNN`, `nn.LSTM`, `nn.GRU`)


2.3 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)
* Целевое значение accuracy на валидации - 70+%

In [105]:
with open("polarity/positive_reviews.txt") as f:
    positive_reviews = [elem for elem in sent_tokenize(f.read()) if elem != "."]

with open("polarity/negative_reviews.txt") as f:
    negative_reviews = [elem for elem in sent_tokenize(f.read()) if elem != "."]

In [106]:
len(positive_reviews), len(negative_reviews)

(6319, 6120)

In [107]:
reviews_df = pd.DataFrame()

reviews_df["text"] = positive_reviews + negative_reviews
reviews_df["category"] = [1 for i in range(len(positive_reviews))] + [0 for i in range(len(negative_reviews))]

reviews_df = reviews_df
reviews_df

Unnamed: 0,text,category
0,"simplistic , silly and tedious .",1
1,"it's so laddish and juvenile , only teenage bo...",1
2,exploitative and largely devoid of the depth o...,1
3,[garbus] discards the potential for pathologic...,1
4,a visually flashy but narratively opaque and e...,1
...,...,...
12434,may prove to be [tsai's] masterpiece .,0
12435,mazel tov to a film about a family's joyous li...,0
12436,standing in the shadows of motown is the best ...,0
12437,it's nice to see piscopo again after all these...,0


In [108]:
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = text.lower()
    text = ''.join([' ' if not char.isalpha() and char not in ['.', ',', '!', '?'] else char for char in text])

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    preprocessed_text = ' '.join(lemmatized_tokens)
    return preprocessed_text

reviews_df["text"] = reviews_df["text"].apply(lambda x: preprocess_text(x))
reviews_df

Unnamed: 0,text,category
0,"simplistic , silly and tedious .",1
1,"it s so laddish and juvenile , only teenage bo...",1
2,exploitative and largely devoid of the depth o...,1
3,garbus discard the potential for pathological ...,1
4,a visually flashy but narratively opaque and e...,1
...,...,...
12434,may prove to be tsai s masterpiece .,0
12435,mazel tov to a film about a family s joyous li...,0
12436,standing in the shadow of motown is the best k...,0
12437,it s nice to see piscopo again after all these...,0


In [109]:
from sklearn.model_selection import train_test_split

X = reviews_df['text'].str.lower()
y = reviews_df['category']

n_classes = y.nunique()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [110]:
class Vocab:
  def __init__(self, data):
    self.idx_to_token = {}
    self.token_to_idx = {}
    self.vocab_len = 0
    self.max_seq_len = 0

    for item in data:
      self.max_seq_len = max(self.max_seq_len, len(item))

    # Добавляем токен для неизвестных слов
    self.idx_to_token = {0: '<UNK>'}
    self.token_to_idx = {'<UNK>': 0}
    self.vocab_len += 1

    all_words = [word for sentence in data for word in word_tokenize(sentence)]

    for word in all_words:
        if word not in self.token_to_idx:
            self.idx_to_token[self.vocab_len] = word
            self.token_to_idx[word] = self.vocab_len
            self.vocab_len += 1

vocab = Vocab(X)
vocab.vocab_len, vocab.max_seq_len

(16510, 413)

In [111]:
class ReviewDataset(Dataset):
  def __init__(self, X, y, vocab: Vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    '''Генерирует представление отзыва review при помощи бинарного кодирования (см. 1.2)'''
    vec = torch.zeros(self.vocab.max_seq_len, dtype=torch.int64)

    for i, word in enumerate(word_tokenize(review)):
      if i >= self.vocab.max_seq_len:
        break
      vec[i] = self.vocab.token_to_idx.get(word, 0)

    return vec

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    vec = self.vectorize(self.X[idx])
    label = self.y[idx]
    return vec, label

In [112]:
train_dataset = ReviewDataset(X_train, y_train, vocab)
test_dataset = ReviewDataset(X_test, y_test, vocab)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [113]:
next(iter(test_dataloader))

[tensor([[ 2999,  2398,     2,  ...,     0,     0,     0],
         [  343, 13999,     2,  ...,     0,     0,     0],
         [  173,    29,   428,  ...,     0,     0,     0],
         ...,
         [  340,     2,   373,  ...,     0,     0,     0],
         [   23,   105,   425,  ...,     0,     0,     0],
         [   61,  2615,     2,  ...,     0,     0,     0]]),
 tensor([0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 1, 0, 0])]

In [119]:
class RNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, output_size, num_layers=1, dropout=0.2):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size*2, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        '''
        x.shape = (batch_size, seq_len) - тензор входных данных
        h.shape = (batch_size, hidden_size) - тензор со скрытым состоянием RNN
        '''
        batch_size, seq_len = x.shape

        h = torch.zeros(2*self.num_layers, batch_size, self.hidden_size, device=device)  # RNN, GRU
        # h = (torch.zeros(2*num_layers, batch_size, self.hidden_size, device=device), torch.zeros(2*num_layers, batch_size, self.hidden_size, device=device))  # LSTM

        x = self.embedding(x)
        x = self.dropout(x)

        # output, h = self.rnn(x, h)
        # output, (h, c) = self.lstm(x, h)
        output, h = self.gru(x, h)
        output = self.dropout(output)

        output = self.fc(output[:, -1, :])
        output = self.softmax(output)


        return output

In [120]:
vocab.max_seq_len, vocab.vocab_len

(413, 16510)

In [121]:
model = RNN(
            input_size=vocab.vocab_len,
            embedding_dim=128,
            hidden_size=128,
            output_size=2)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)



In [122]:
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Количество обучаемых параметров: {total_params}")

Количество обучаемых параметров: 2642178


In [123]:
train_model(model, train_dataloader, test_dataloader, criterion, optimizer, 5)

Epoch 1, Train Loss: 0.7032, Test Loss: 0.6938
Epoch 2, Train Loss: 0.6969, Test Loss: 0.6938
Epoch 3, Train Loss: 0.6948, Test Loss: 0.6820
Epoch 4, Train Loss: 0.6530, Test Loss: 0.6335
Epoch 5, Train Loss: 0.5608, Test Loss: 0.6599


In [124]:
evaluate_model(model, test_dataloader)

Test Accuracy: 0.68368


In [125]:
rating_labels = ["Positive", "Negative"]

def predict(model, dataset, review):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    model.eval()
    with torch.no_grad():
        vectorized = dataset.vectorize(review)
        tensor = vectorized.unsqueeze(0).to(device)
        logits = model(tensor)
        probs = torch.softmax(logits, dim=1).squeeze()
        print(probs)
        print(f'{rating_labels[probs.argmax()]} ({probs.max():.4f}), {rating_labels[probs.argmin()]} ({probs.min():.4f}) \n{review}')

In [126]:
predict(model, train_dataset, "This restaurant is simply amazing! The food is delicious and the service is outstanding.")

tensor([0.7518, 0.2482])
Positive (0.7518), Negative (0.2482) 
This restaurant is simply amazing! The food is delicious and the service is outstanding.


In [127]:
predict(model, train_dataset, "The menu at this restaurant is very limited and the food is nothing special. I wouldn't go back.")

tensor([0.1056, 0.8944])
Negative (0.8944), Positive (0.1056) 
The menu at this restaurant is very limited and the food is nothing special. I wouldn't go back.


In [128]:
predict(model, train_dataset, "I had the best dining experience in this restaurant. The ambiance is perfect and the staff is very friendly.")

tensor([0.7721, 0.2279])
Positive (0.7721), Negative (0.2279) 
I had the best dining experience in this restaurant. The ambiance is perfect and the staff is very friendly.
