### Урок 7. Рекурентные сети для обработки последовательностей

## Домашнее задание

1. Попробуйте обучить нейронную сеть GRU/LSTM для предсказания сентимента сообщений с твитера на примере https://www.kaggle.com/datasets/arkhoshghalb/twitter-sentiment-analysis-hatred-speech

2. Опишите, какой результат вы получили? Что помогло вам улучшить ее точность?

У кого нет возможности работать через каггл (нет верификации), то можете данные взять по ссылке: https://disk.yandex.ru/d/LV1cYS1orMyRWA

## Подключение библиотек

In [1]:
!pip install torchmetrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchmetrics
  Downloading torchmetrics-0.9.0-py3-none-any.whl (418 kB)
[K     |████████████████████████████████| 418 kB 8.0 MB/s 
Installing collected packages: torchmetrics
Successfully installed torchmetrics-0.9.0


In [2]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 3.3 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 20.1 MB/s 
Installing collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchmetrics

import re
import nltk
from string import punctuation
from pymorphy2 import MorphAnalyzer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

from sklearn.model_selection import train_test_split

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Загрузка и подготовка данных

In [7]:
train_df = pd.read_csv('/content/drive/MyDrive/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/test.csv')

In [8]:
df_train, df_valid = train_test_split(train_df, test_size=0.25, random_state=42)

In [9]:
sw = set(stopwords.words('english'))
punct = set(punctuation)
lemmatizer = WordNetLemmatizer()

In [10]:
def preprocess_text(txt):
    #преобразуем входные данные в строку
    txt = str(txt)
    #удяляем знаки пунктуации
    txt = "".join(c for c in txt if c not in punct)
    #приводим все к нижнему регистру
    txt = txt.lower()
    #лематизация и удаление стоп-слов
    txt = [lemmatizer.lemmatize(w) for w in txt.split() if w not in sw]
    return " ".join(txt)

In [11]:
tqdm.pandas()

df_train['tweet'] = df_train['tweet'].progress_apply(preprocess_text)
df_valid['tweet'] = df_valid['tweet'].progress_apply(preprocess_text)
test_df['tweet'] = test_df['tweet'].progress_apply(preprocess_text)

100%|██████████| 23971/23971 [00:04<00:00, 5701.74it/s]
100%|██████████| 7991/7991 [00:01<00:00, 6992.03it/s]
100%|██████████| 17197/17197 [00:02<00:00, 8586.78it/s]


In [12]:
train_corpus = " ".join(df_train["tweet"])
train_corpus = train_corpus.lower()
tokens = word_tokenize(train_corpus)
tokens[:10]

['weekend',
 'world',
 'really',
 'going',
 'bonkers',
 'really',
 'bad',
 'atm',
 'shooting',
 'deathstroke']

In [13]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [14]:
dist = FreqDist(tokens_filtered)

In [15]:
max_words = 20000
max_len = 10
num_classes = 1

# обучение
epochs = 10
batch_size = 512
print_batch_n = 100

In [16]:
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]  # -1 - padding
len(tokens_filtered_top), tokens_filtered_top[:10]

(19999,
 ['user', 'love', 'day', 'u', 'happy', 'amp', 'time', 'life', 'im', 'today'])

In [17]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top,1)).items()}

In [18]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())  #токенизация
    tokens_filtered = [word for word in tokens if word.isalnum()] #фильтруем (только буквы и цифры)
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word]) #если слово в топе токенов, то добавляем его индекс в результат

    padding = [0] * (maxlen-len(result)) #нули дополняющие до maxlen
    return result[-maxlen:] + padding

In [19]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train["tweet"]], dtype=np.int32)
x_valid = np.asarray([text_to_sequence(text, max_len) for text in df_valid["tweet"]], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in test_df["tweet"]], dtype=np.int32)

x_train.shape, x_valid.shape, x_test.shape

((23971, 10), (7991, 10), (17197, 10))

In [20]:
class DataWrapper(Dataset):
    def __init__(self, data, target, transform=None):
        self.data = torch.from_numpy(data).long() #преобразуем в целочисленный тензор
        self.target = torch.from_numpy(target).long() #преобразуем в целочисленный тензор
        self.transform = transform
        
    def __getitem__(self, index):
        x = self.data[index]  #индексация данных
        y = self.target[index]  #индексация целевой переменной
        
        if self.transform:
            x = self.transform(x)
            
        return x, y
    
    def __len__(self):
        return len(self.data)

In [21]:
train_dataset = DataWrapper(x_train, df_train['label'].values)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

valid_dataset = DataWrapper(x_valid, df_valid['label'].values)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=True)

## Построение и обучение нейронной сети

In [22]:
class GRUFixedLen(nn.Module) :
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, use_last=True):
        super().__init__()
        self.use_last = use_last
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, num_layers=2, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        gru_out, ht = self.gru(x)
       
        if self.use_last:
            last_tensor = gru_out[:,-1,:]
        else:
            # use mean
            last_tensor = torch.mean(gru_out[:,:], dim=1)
    
        out = self.linear(last_tensor)
        return torch.sigmoid(out)

In [23]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [24]:
model = GRUFixedLen(max_words).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

In [25]:
model.train()
th = 0.5

train_loss_history = []
test_loss_history = []


for epoch in range(epochs): 
    model.train() 
    running_items, running_right = 0.0, 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data[0].to(device), data[1].to(device)

        # обнуляем градиент
        optimizer.zero_grad()
        outputs = model(inputs)
        
        loss = criterion(outputs, labels.float().view(-1, 1))
        loss.backward()
        optimizer.step()

        # подсчет ошибки на обучении
        loss = loss.item()
        running_items += len(labels)
        # подсчет метрики на обучении
        pred_labels = torch.squeeze((outputs > th).int())
        running_right += (labels == pred_labels).sum()
        
    # выводим статистику о процессе обучения
    model.eval()
    
    print(f'Epoch [{epoch + 1}/{epochs}]. ' \
          f'Step [{i + 1}/{len(train_loader)}]. ' \
          f'Loss: {loss:.3f}. ' \
          f'Acc: {running_right / running_items:.3f}', end='. ')
    running_loss, running_items, running_right = 0.0, 0.0, 0.0
    train_loss_history.append(loss)

    # выводим статистику на тестовых данных
    test_running_right, test_running_total, test_loss = 0.0, 0.0, 0.0
    for j, data in enumerate(valid_loader):
        test_labels = data[1].to(device)
        test_outputs = model(data[0].to(device))
        
        # подсчет ошибки на тесте
        test_loss = criterion(test_outputs, test_labels.float().view(-1, 1))
        # подсчет метрики на тесте
        test_running_total += len(data[1])
        pred_test_labels = torch.squeeze((test_outputs > th).int())
        test_running_right += (test_labels == pred_test_labels).sum()
    
    test_loss_history.append(test_loss.item())
    print(f'Test loss: {test_loss:.3f}. Test acc: {test_running_right / test_running_total:.3f}')
            
print('Training is finished!')

Epoch [1/10]. Step [47/47]. Loss: 0.244. Acc: 0.910. Test loss: 0.172. Test acc: 0.938
Epoch [2/10]. Step [47/47]. Loss: 0.178. Acc: 0.941. Test loss: 0.036. Test acc: 0.944
Epoch [3/10]. Step [47/47]. Loss: 0.181. Acc: 0.950. Test loss: 0.542. Test acc: 0.952
Epoch [4/10]. Step [47/47]. Loss: 0.109. Acc: 0.958. Test loss: 0.014. Test acc: 0.955
Epoch [5/10]. Step [47/47]. Loss: 0.084. Acc: 0.967. Test loss: 0.022. Test acc: 0.955
Epoch [6/10]. Step [47/47]. Loss: 0.132. Acc: 0.972. Test loss: 0.011. Test acc: 0.957
Epoch [7/10]. Step [47/47]. Loss: 0.066. Acc: 0.977. Test loss: 0.005. Test acc: 0.955
Epoch [8/10]. Step [47/47]. Loss: 0.025. Acc: 0.980. Test loss: 0.002. Test acc: 0.955
Epoch [9/10]. Step [47/47]. Loss: 0.018. Acc: 0.985. Test loss: 0.002. Test acc: 0.959
Epoch [10/10]. Step [47/47]. Loss: 0.053. Acc: 0.986. Test loss: 0.002. Test acc: 0.958
Training is finished!


Точность модели на основе GRU сети с первого раза получилась довольно хорошей. Видно, что переобучение отсутствует.