In [27]:
import re
import nltk
import typing as t

import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import torch.optim as optim
import numpy as np

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 1. Классификация фамилий (RNN)

Датасет: https://disk.yandex.ru/d/frNchuaBQVLxyA?w=1

In [28]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


1.1 Используя класс `nn.RNNCell` (абстракцию для отдельного временного шага RNN), реализуйте простейшую рекуррентную сеть Элмана в виде класса `RNN`. Используя созданный класс `RNN`, решите задачу классификации фамилий. 


In [4]:
surnames_df=pd.read_csv('drive/MyDrive/ML_FU/Lab_7/data/surnames.csv')
surnames_df.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [5]:
# поделим выборку на тестовую и обучающую 
X_train, X_test, y_train, y_test = train_test_split(surnames_df['surname'].values, surnames_df['nationality'].values, test_size=0.2, random_state=123)
print(f'Размер обучающей выборки: {len(X_train)} \nРазмер тестовой выборки: {len(X_test)}')

Размер обучающей выборки: 8784 
Размер тестовой выборки: 2196


In [6]:
# формирование словарь и добавляем токены
class Vocab:
    
  def __init__(self, data):
    surnames = data["surname"]
    self.nationalities = list(set(data["nationality"])) #берем все национальности
    self.nationality_to_idx = dict(zip(self.nationalities, range(len(self.nationalities)))) #формируем словарь
    self.rexp = re.compile(r'[\s\n\t]+') #регулярное выражение
    self.index_to_token = ["<PAD>", "<UNK>"] #необходимые токены
    self.token_to_idx = {"<PAD>": 0, "<UNK>": 1}
    self.max_seq_len = 0 #здесь будет храниться максимальная длина последовательности
    for surname in surnames:
      surname = self.rexp.sub('', surname).lower()
      for i, token in enumerate(surname, start=1):
        self.max_seq_len = max(self.max_seq_len, i)
        if not(token in self.token_to_idx):
          self.token_to_idx[token] = len(self.index_to_token)
          self.index_to_token.append(token)
                    
  def __len__(self):
    return len(self.index_to_token)

In [7]:
class SurnameDataset(Dataset):

  def __init__(self, X, y, vocab: Vocab):
    self.vocab = vocab
    self.X = X
    self.y = y

  def vectorize(self, surname):
    seq_t = torch.zeros(self.vocab.max_seq_len).to(torch.int64)
    surname = self.vocab.rexp.sub('', surname).lower()
    for i, t in enumerate(surname):
      try:
        seq_t[i] = self.vocab.token_to_idx.get(t, 1)
      except IndexError:
        break
    return seq_t

  def __len__(self):
    return len(self.y)

  def __getitem__(self, index):
    surname = self.X[index]
    nationality = self.y[index]
    return self.vectorize(surname), self.vocab.nationality_to_idx[nationality]

In [8]:
vocab = Vocab(surnames_df) # преобразуем в словарь

In [9]:
# разделяем на обучающую и тестовую выборки
train_data = SurnameDataset(X_train, y_train, vocab)
test_data = SurnameDataset(X_test, y_test, vocab)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, drop_last=True)
test_loader = DataLoader(test_data, batch_size=32, drop_last=True)

In [10]:
#проверим, что все получилось
test_data[0]

(tensor([ 3,  7, 20, 18, 17, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 15)

### RNNCell:

In [17]:
class RNN(nn.Module):
    
  def __init__(self, input_size, hidden_size, num_embeddings, n_classes, aggregate=False):
    super(RNN, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.hidden_state_history = None
    self.aggregate_flag = aggregate
        
    #эмбеддинг   
    self.embedding = nn.Embedding.from_pretrained(
    embeddings=torch.tensor([list(map(lambda x: float(x), row.split()[1:])) for row in open('drive/MyDrive/ML_FU/Lab_7/data/globe_100_rows.txt').readlines()], dtype=torch.float32), freeze=False, padding_idx=0)
    self.rnn_cell = nn.RNNCell(input_size=self.input_size, hidden_size=self.hidden_size)
    self.fc = nn.Linear(hidden_size, n_classes)

    #здесь инициализируется тензор скрытых состояний
  def forward(self, x, h=None):
    self.hidden_state_history = torch.empty(x.shape[0], x.shape[1], self.hidden_size)
    x = self.embedding(x)
    if h is None:
      h = torch.randn(x.shape[0], self.hidden_size, requires_grad=True)

      # затем обновляется скрытое состояние при прохождении по каждому элементу последовательностей s в батче
    for i in range(x.shape[1]):
      h = self.rnn_cell(x[:, i, :], h)
      self.hidden_state_history[:, i, :] = h
    if self.aggregate_flag:
      h = torch.sum(self.hidden_state_history, dim=1)
    classes_logits = self.fc(h)
    return classes_logits, self.hidden_state_history, h # возвращать будем тензор всех наблюдавшихся скрытых состояний размера (batch_size, seq_len, hidden_size), а также и тензор скрытых состояний

In [18]:
# Модель
model_rnn_cell = RNN(
    input_size=50,
    hidden_size=32,
    num_embeddings=len(vocab.index_to_token),
    n_classes=len(vocab.nationality_to_idx),
    aggregate=True
)

optimizer_rnn_cell = optim.Adam(model_rnn_cell.parameters(), lr=0.0005) # оптимизатор обновляет веса моделей на основании ошибки
loss_fn = nn.CrossEntropyLoss() # функция ошибки

In [29]:
def get_accuracy(preds, a): # получение точности
  return torch.sum(torch.argmax(torch.softmax(preds, dim=1), dim=1) == a) / len(a)

In [30]:
def train(n_epochs, train_loader, test_loader, model, loss_fn, optimizer, printing):
  for epoch in range(n_epochs):
    accuracy = 0
    losses = 0
    model.train() # задаем модели состояние "будем обучать"
    for x_batch, y_batch in train_loader:
      pred = model(x_batch)[0] # вызываем forward
      loss = loss_fn(pred, y_batch) # считаем ошибку     
      
      optimizer.zero_grad() # это чтобы сбросить состояние оптимизатора
      loss.backward() # обратное распространение ошибки
      optimizer.step() # устанавливаем новые веса
      losses += loss.item()
      accuracy += get_accuracy(pred, y_batch)
    losses /= len(train_loader)
    accuracy /= len(train_loader)
    if (epoch + 1) % printing == 0:
      test_accuracy = 0.0
      test_losses = 0.0
      model.eval() # перевод состояния модели в "не будем обучать"
      # производим подсчет показателей на тесте
      for x_batch, y_batch in test_loader:
        test_pred = model(x_batch)[0]
        test_acc = get_accuracy(test_pred, y_batch)
        test_loss = loss_fn(test_pred, y_batch)
        test_accuracy += test_acc
        test_losses += test_loss
      test_accuracy /= len(test_loader)
      test_losses /= len(test_loader)
      print("_" * 40)
      print('Train:')
      print(f'Loss: {losses}, Accuracy: {accuracy}')
      print("-" * 40)
      print('Test:')            
      print(f'Loss: {test_losses}, Accuracy: {test_accuracy}')

In [28]:
train(100, train_loader, test_loader, model_rnn_cell, loss_fn, optimizer_rnn_cell, printing=10)

________________________________________
Train:
Loss: 0.7924162846629637, Accuracy: 0.7658531069755554
----------------------------------------
Test:
Loss: 0.9018857479095459, Accuracy: 0.732536792755127
________________________________________
Train:
Loss: 0.7699961349041793, Accuracy: 0.7697308659553528
----------------------------------------
Test:
Loss: 0.876166820526123, Accuracy: 0.7421875
________________________________________
Train:
Loss: 0.7618763303778467, Accuracy: 0.7740647792816162
----------------------------------------
Test:
Loss: 0.8750185966491699, Accuracy: 0.7431066036224365
________________________________________
Train:
Loss: 0.7306746935648639, Accuracy: 0.7803375720977783
----------------------------------------
Test:
Loss: 0.8944239616394043, Accuracy: 0.7426470518112183
________________________________________
Train:
Loss: 0.7119454021436454, Accuracy: 0.7814781069755554
----------------------------------------
Test:
Loss: 0.8787550926208496, Accuracy: 0.741

In [19]:
def curr_accuracy_m(model): # определение итоговой точности модели
  correct = 0
  total = 0
  with torch.no_grad():
    for X_batch, y_batch in test_loader:
      out = model(X_batch)[0]
      _, pred = torch.max(out, dim=1)
      total += y_batch.shape[0]
      correct += int((pred == y_batch).sum())
  print(correct / total)

In [30]:
curr_accuracy_m(model_rnn_cell)

0.7490808823529411


Точность модели 75%.

In [40]:
def total_conclusion(surname, model):
  new_surname = train_data.vectorize(surname)
  new_surname = new_surname.unsqueeze(dim=0)
  with torch.inference_mode():
    model.eval()
    print(f'{surname} --> {vocab.nationalities[torch.argmax(model(new_surname)[0], dim=1).item()]}')

In [41]:
# Проверка
students_PI19_3 = [
  "Alexandrova",
  "Baranov",
  "Brusova",
  "Volkova",
  "Gasanova",
  "Danilin",
  "Demenchuk",
  "Egorov",
  "Popova",
  "Polikarpova",
  "Khamikoeva",
]

for surname in students_PI19_3:
  total_conclusion(surname=surname, model=model_rnn_cell)

Alexandrova --> Italian
Baranov --> Russian
Brusova --> Czech
Volkova --> Czech
Gasanova --> Spanish
Danilin --> Russian
Demenchuk --> Russian
Egorov --> Russian
Popova --> Czech
Polikarpova --> Czech
Khamikoeva --> Russian


1.2 Замените модуль `RNN` из 1.1 на модули `nn.RNN`, `nn.LSTM` и `nn.GRU` (не забудьте указать аргумент `batch_first=True`). Сравните результаты работы.

### nn.RNN

In [42]:
class RNN_rnn(nn.Module):
    
  def __init__(self, input_size, hidden_size, n_layers, num_embeddings, n_classes, aggregate: bool = False):
    super(RNN_rnn, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.aggregate_flag = aggregate 
        
    # загрузка предобученного эмбендинга из п.1.3.

    self.embedding = nn.Embedding.from_pretrained(embeddings=torch.tensor([list(map(lambda x: float(x), row.split()[1:])) for row in open('drive/MyDrive/ML_FU/Lab_7/data/globe_100_rows.txt').readlines()], dtype=torch.float32), freeze=False, padding_idx=0)
    self.rnn_block = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=self.n_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size * self.n_layers, n_classes)

  def forward(self, x, h=None):
    x = self.embedding(x)
    all_h, h = self.rnn_block(x, h)
    h = h.permute(1, 0, 2)
    if self.aggregate_flag:
      h = torch.sum(all_h, dim=1)
    else:
      h = torch.flatten(h, start_dim=1, end_dim=-1)
    classes_logits = self.fc(h)
    return classes_logits, all_h, h

In [43]:
model_rnn = RNN_rnn( 
    input_size=50,
    hidden_size=32,
    n_layers=1,
    num_embeddings=len(vocab.index_to_token),
    n_classes=len(vocab.nationality_to_idx),
    aggregate=True
)
optimizer_rnn = optim.Adam(model_rnn.parameters(), lr=0.001) # оптимизатор обновляет веса моделей на основании ошибки
loss_fn = nn.CrossEntropyLoss() # функция потерь

In [44]:
train(100, train_loader, test_loader, model_rnn, loss_fn, optimizer_rnn, printing=10) # обучение

________________________________________
Train:
Loss: 1.1816601634678179, Accuracy: 0.6418795585632324
----------------------------------------
Test:
Loss: 1.2699986696243286, Accuracy: 0.6240808963775635
________________________________________
Train:
Loss: 0.9349972747103141, Accuracy: 0.72661954164505
----------------------------------------
Test:
Loss: 1.0158412456512451, Accuracy: 0.7063419222831726
________________________________________
Train:
Loss: 0.8200940758424954, Accuracy: 0.7512545585632324
----------------------------------------
Test:
Loss: 0.9346230626106262, Accuracy: 0.724724292755127
________________________________________
Train:
Loss: 0.7578400999960238, Accuracy: 0.7712135314941406
----------------------------------------
Test:
Loss: 0.8846026659011841, Accuracy: 0.748161792755127
________________________________________
Train:
Loss: 0.6968776715824204, Accuracy: 0.7863823175430298
----------------------------------------
Test:
Loss: 0.8704829216003418, Accuracy

In [45]:
curr_accuracy_m(model_rnn) # итоговая точность

0.7449448529411765


**Точность модели 74%.**

In [46]:
# проверка на примерах
for surname in students_PI19_3:
  total_conclusion(surname=surname, model=model_rnn)

Alexandrova --> Spanish
Baranov --> Russian
Brusova --> Czech
Volkova --> Czech
Gasanova --> Russian
Danilin --> Russian
Demenchuk --> Russian
Egorov --> Russian
Popova --> Czech
Polikarpova --> Greek
Khamikoeva --> English


## nn.LSTM

In [47]:
class LSTM(nn.Module):
    
  def __init__(self, input_size, hidden_size, n_layers, num_embeddings, n_classes, aggregate=False):
    super(LSTM, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.aggregate_flag = aggregate
        

        # загрузка предобученного эмбендинга из п.1.3.

    self.embedding = nn.Embedding.from_pretrained(embeddings=torch.tensor([list(map(lambda x: float(x), row.split()[1:])) for row in open('drive/MyDrive/ML_FU/Lab_7/data/globe_100_rows.txt').readlines()], dtype=torch.float32), freeze=False, padding_idx=0)
    self.rnn_block = nn.LSTM(
        input_size=input_size,
        hidden_size=hidden_size,
        num_layers=self.n_layers,
        batch_first=True
    )
    self.fc = nn.Linear(hidden_size * self.n_layers, n_classes)

  def forward(self, x, h=None, c=None):
    x = self.embedding(x)
    if h and c:
      all_h, (h_n, c_n) = self.rnn_block(x, (h, c))
    else:
      all_h, (h_n, c_n) = self.rnn_block(x)
    h = h_n.permute(1, 0, 2)
    if self.aggregate_flag:
      h = torch.sum(all_h, dim=1)
    else:
      h = torch.flatten(h, start_dim=1, end_dim=-1)
    classes_logits = self.fc(h)
    return classes_logits, all_h, h_n, c_n

In [48]:
model_lstm = LSTM(
    input_size=50,
    hidden_size=32,
    n_layers=1,
    num_embeddings=len(vocab.index_to_token),
    n_classes=len(vocab.nationality_to_idx),
    aggregate=True
)

optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=0.00025) # оптимизатор
loss_fn = nn.CrossEntropyLoss() # функция потерь

In [49]:
train(100, train_loader, test_loader, model_lstm, loss_fn, optimizer_lstm, printing=10)

________________________________________
Train:
Loss: 1.2989819598023908, Accuracy: 0.619297444820404
----------------------------------------
Test:
Loss: 1.328439712524414, Accuracy: 0.6148896813392639
________________________________________
Train:
Loss: 1.0769135177570537, Accuracy: 0.6851049065589905
----------------------------------------
Test:
Loss: 1.1302987337112427, Accuracy: 0.673713207244873
________________________________________
Train:
Loss: 0.9461797128846176, Accuracy: 0.7229698896408081
----------------------------------------
Test:
Loss: 1.0212701559066772, Accuracy: 0.6948529481887817
________________________________________
Train:
Loss: 0.856815772226257, Accuracy: 0.7424726486206055
----------------------------------------
Test:
Loss: 0.9392174482345581, Accuracy: 0.7215073704719543
________________________________________
Train:
Loss: 0.7988362882259118, Accuracy: 0.754676103591919
----------------------------------------
Test:
Loss: 0.8919363021850586, Accuracy:

In [50]:
curr_accuracy_m(model_lstm) # итоговая точность

0.7532169117647058


Модель показала точность = 75%

In [51]:
# проверка на примерах
for surname in students_PI19_3:
  total_conclusion(surname=surname, model=model_lstm)

Alexandrova --> Czech
Baranov --> Russian
Brusova --> Czech
Volkova --> Czech
Gasanova --> Spanish
Danilin --> Russian
Demenchuk --> Russian
Egorov --> Russian
Popova --> Czech
Polikarpova --> Czech
Khamikoeva --> Russian


### nn.GRU

In [52]:
class GRU(nn.Module):
    
  def __init__(self, input_size, hidden_size, n_layers, num_embeddings, n_classes, aggregate=False):
    super(GRU, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.aggregate_flag = aggregate
        
# загрузка предобученного эмбендинга из п.1.3.
        
    self.embedding = nn.Embedding.from_pretrained(embeddings=torch.tensor([list(map(lambda x: float(x), row.split()[1:])) for row in open('drive/MyDrive/ML_FU/Lab_7/data/globe_100_rows.txt').readlines()], dtype=torch.float32), freeze=False, padding_idx=0)
    self.rnn_block = nn.GRU(
      input_size=input_size,
      hidden_size=hidden_size,
      num_layers=self.n_layers,
      batch_first=True
    )
    self.fc = nn.Linear(hidden_size * self.n_layers, n_classes)


  def forward(self, x, h=None):
    x = self.embedding(x)
    if h:
      all_h, h = self.rnn_block(x, (h, c))
    else:
      all_h, h = self.rnn_block(x)
    h = h.permute(1, 0, 2)
    if self.aggregate_flag:
      h = torch.sum(all_h, dim=1)
    else:
      h = torch.flatten(h, start_dim=1, end_dim=-1)
    classes_logits = self.fc(h)
    return classes_logits, all_h, h

In [55]:
model_gru = GRU(
    input_size=50,
    hidden_size=32,
    n_layers=1,
    num_embeddings=len(vocab.index_to_token),
    n_classes=len(vocab.nationality_to_idx),
    aggregate=True
)
optimizer_gru = optim.Adam(model_gru.parameters(), lr=0.00035) # оптимизатор
loss_fn = nn.CrossEntropyLoss() # функция потери

In [56]:
train(100, train_loader, test_loader, model_gru, loss_fn, optimizer_gru, printing=10) # обучение

________________________________________
Train:
Loss: 1.138289686537137, Accuracy: 0.666172444820404
----------------------------------------
Test:
Loss: 1.1663275957107544, Accuracy: 0.6603860259056091
________________________________________
Train:
Loss: 0.8975173664571595, Accuracy: 0.7335766553878784
----------------------------------------
Test:
Loss: 1.0007020235061646, Accuracy: 0.7040441036224365
________________________________________
Train:
Loss: 0.7701357183230184, Accuracy: 0.7693886756896973
----------------------------------------
Test:
Loss: 0.8976666331291199, Accuracy: 0.728400707244873
________________________________________
Train:
Loss: 0.6954400322950669, Accuracy: 0.788093090057373
----------------------------------------
Test:
Loss: 0.8578768372535706, Accuracy: 0.7486213445663452
________________________________________
Train:
Loss: 0.6418624639946179, Accuracy: 0.8058850169181824
----------------------------------------
Test:
Loss: 0.8439155220985413, Accuracy

In [57]:
curr_accuracy_m(model_gru) # итоговая точность

0.7665441176470589


**Точность модели 77%.**

In [58]:
# проверка на примерах
for surname in students_PI19_3:
  total_conclusion(surname=surname, model=model_lstm)

Alexandrova --> Czech
Baranov --> Russian
Brusova --> Czech
Volkova --> Czech
Gasanova --> Spanish
Danilin --> Russian
Demenchuk --> Russian
Egorov --> Russian
Popova --> Czech
Polikarpova --> Czech
Khamikoeva --> Russian


## Вывод

Все модели показали отличную точность, однако лучшие показатели (точность, время обучения) у GRU.

1.3 Загрузите предобученные эмбеддинги (https://disk.yandex.ru/d/BHuT2tEXr_yBOQ?w=1) в модуль `nn.Embedding` и обучите модели из 1.2.

Сделано выше.

## 2. Классификация новостей на основе заголовка

Датасет: https://disk.yandex.ru/d/FN-EgWGIpyjLxQ?w=1
<br>Эмбеддинги: https://nlp.stanford.edu/projects/glove/ (находите ссылку на архив
glove.6B.zip, в нем несколько файлов с эмбеддингами слов, выбираете один из файлов в
архиве)
<br><br>2.1 Загрузите набор данных train.csv. Выполните предобработку столбца Title
<br><br>2.2 На основе этих данных создайте датасет NewsDataset . Не забудьте добавить
специальные токены <PAD> для дополнения последовательностей до нужной длины и
<UNK> для корректной обработке ранее не встречавшихся токенов. В данной задаче
рассматривайте отдельные слова как токены. Разбейте датасет на обучающее и
валидационное множество.
<br><br>2.3 Создайте модель для классификации, используя слой nn.Embedding и слой nn.RNN .
эмбеддинги инициализируйте случайным образом
не забудьте указать аргумент padding_idx для nn.Embedding
<br><br>2.4 Переобучите модель, заменив слой nn.RNN на nn.LSTM и nn.GRU . Сравните качество
на тестовой выборке. Результаты сведите в таблицу (модель/метрика качества на
тестовом множестве).
<br><br>2.5 Выполните пункты 2.3 и 2.4, используя предобученные эмбеддинги Glove.
Прокомментируйте результат.
Эмбеддинги из скачанного файла загрузите в виде двумерного тензора
pretrained_embeddings .
Обратите внимание, что номер строки в этом тензоре должен соответствовать
токену (слову), имеющему такой индекс в вашем словаре.
для слов, которых нет в файле с эмбеддингами, инициализуйте эмбеддинг
случайным образом

## 2.1

In [5]:
def preprocess(text): # предобработка текста
  result_text = re.sub(r'[^A-Za-z\!\?\s]', r'', text.lower())
  return result_text.strip()

In [6]:
data = pd.read_csv('drive/MyDrive/ML_FU/Lab_7/data/train.csv') # затягиваем данные
data.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [7]:
titles = [] # это мы будем добавлять предобработанный текст в наш датасет
for i in range(len(data['Title'])):
  titles.append(preprocess(data['Title'][i].strip('\n')))
data['Title'] = titles

In [8]:
data = data[:2000]
data.head()

Unnamed: 0,Class Index,Title,Description
0,3,wall st bears claw back into the black reuters,"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,carlyle looks toward commercial aerospace reuters,Reuters - Private investment firm Carlyle Grou...
2,3,oil and economy cloud stocks outlook reuters,Reuters - Soaring crude prices plus worries\ab...
3,3,iraq halts oil exports from main southern pipe...,Reuters - Authorities have halted oil export\f...
4,3,oil prices soar to alltime record posing new m...,"AFP - Tearaway world oil prices, toppling reco..."


In [9]:
# делим данные на тестовую и обучающую выборки
X_train, X_test, y_train, y_test = train_test_split(data['Title'].values, data['Class Index'].values, test_size=0.2, random_state=123)
print(f'Train size: {X_train.shape[0]}, test size: {X_test.shape[0]}')

Train size: 1600, test size: 400


In [10]:
class Vocab: # создадим словарь с заголовками новостей
  def __init__(self, data):
    self.max_seq_len = 0
    self.idx_to_token = ["<PAD>", "<UNK>"]
    self.token_to_idx = {"<PAD>": 0, "<UNK>": 1}
    self.lemmatizer = WordNetLemmatizer() # для формирования лемм слов
    for row in data:
      for i, t in enumerate(word_tokenize(row), start=1):
        t = self.lemmatizer.lemmatize(t)
        if i > self.max_seq_len:
          self.max_seq_len = i
        if not(t in self.token_to_idx):
          self.token_to_idx[t] = len(self.idx_to_token)
          self.idx_to_token.append(t)

  def __len__(self):
    return len(self.idx_to_token)

In [11]:
class NewDataset(Dataset):
  def __init__(self, X, y, vocab):
    self.X = X
    self.y = y
    self.vocab = vocab

  def vectorize(self, review):
    vectorized = [0] * self.vocab.max_seq_len
    for i, t in enumerate(word_tokenize(review)):
      t = self.vocab.lemmatizer.lemmatize(t)
      try:
        vectorized[i] = self.vocab.token_to_idx.get(t, 1)
      except IndexError:
        break
    return torch.tensor(vectorized, dtype=torch.int64)
    
  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    x, y = self.X[idx], self.y[idx]
    x = self.vectorize(x)
    return x, y

In [12]:
vocab = Vocab(X_train)

In [13]:
train_rdata = NewDataset(X_train, y_train, vocab)
test_rdata = NewDataset(X_test, y_test, vocab)
train_rloader = DataLoader(train_rdata, batch_size=32, shuffle=True)
test_rloader = DataLoader(test_rdata, batch_size=32)

In [14]:
class RNN_rnn(nn.Module):
    
  def __init__(self, input_size, hidden_size, n_layers, num_embeddings, n_classes, aggregate: bool = False):
    super(RNN_rnn, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.aggregate_flag = aggregate
        
    # загружаем предобученный эмбеддинг
    self.embedding = nn.Embedding.from_pretrained(embeddings=torch.tensor([list(map(lambda x: float(x), row.split()[1:])) for row in open('drive/MyDrive/ML_FU/Lab_7/data/glove.6B.100d.txt').readlines()], dtype=torch.float32), freeze=False, padding_idx=0)
    self.rnn_block = nn.RNN(
      input_size=input_size,
      hidden_size=hidden_size,
      num_layers=self.n_layers,
      batch_first=True
    )
    self.fc = nn.Linear(hidden_size * self.n_layers, n_classes)

  def forward(self, x, h=None):
    x = self.embedding(x)
    all_h, h = self.rnn_block(x, h)
    h = h.permute(1, 0, 2)
    if self.aggregate_flag:
      h = torch.sum(all_h, dim=1)
    else:
      h = torch.flatten(h, start_dim=1, end_dim=-1)
    classes_logits = self.fc(h)
    return classes_logits, all_h, h

In [15]:
model_rnn = RNN_rnn(
    input_size=100,
    hidden_size=32,
    n_layers=1,
    num_embeddings=len(vocab.idx_to_token),
    n_classes=len(vocab.token_to_idx),
    aggregate=True
)
optimizer_rnn = optim.Adam(model_rnn.parameters(), lr=0.001) # оптимизатор
loss_fn = nn.CrossEntropyLoss() # функция потерь

In [20]:
train(100, train_rloader, test_rloader, model_rnn, loss_fn, optimizer_rnn, printing=10)

________________________________________
Train:
Loss: 0.23933689758181573, Accuracy: 0.9325000047683716
----------------------------------------
Test:
Loss: 1.003892183303833, Accuracy: 0.6177884340286255
________________________________________
Train:
Loss: 0.017730000196024776, Accuracy: 0.9950000047683716
----------------------------------------
Test:
Loss: 2.611769437789917, Accuracy: 0.6153846383094788
________________________________________
Train:
Loss: 0.012246164697280619, Accuracy: 0.9950000047683716
----------------------------------------
Test:
Loss: 3.5142271518707275, Accuracy: 0.6225961446762085
________________________________________
Train:
Loss: 0.010500106659019365, Accuracy: 0.9962499737739563
----------------------------------------
Test:
Loss: 3.748441457748413, Accuracy: 0.6153846383094788
________________________________________
Train:
Loss: 0.007369048601831309, Accuracy: 0.9975000023841858
----------------------------------------
Test:
Loss: 4.2391438484191895

In [47]:
def curr_accuracy_m2(model): # определение итоговой точности модели
  correct = 0
  total = 0
  with torch.no_grad():
    for X_batch, y_batch in test_rloader:
      out = model(X_batch)[0]
      _, pred = torch.max(out, dim=1)
      total += y_batch.shape[0]
      correct += int((pred == y_batch).sum())
  return correct/total

In [48]:
curr_accuracy_m2(model_rnn) # итоговая точность

0.61

**Точность модели 61%.**

In [24]:
class LSTM_rnn(nn.Module):
    
  def __init__(self, input_size, hidden_size, n_layers, num_embeddings, n_classes, aggregate=False):
    super(LSTM_rnn, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.aggregate_flag = aggregate   

    # загружаем предобученный эмбеддинг
    self.embedding = nn.Embedding.from_pretrained(embeddings=torch.tensor([list(map(lambda x: float(x), row.split()[1:])) for row in open('drive/MyDrive/ML_FU/Lab_7/data/glove.6B.100d.txt').readlines()], dtype=torch.float32), freeze=False, padding_idx=0)
    self.rnn_block = nn.LSTM(
      input_size=input_size,
      hidden_size=hidden_size,
      num_layers=self.n_layers,
      batch_first=True
    )
    self.fc = nn.Linear(hidden_size * self.n_layers, n_classes)


  def forward(self, x, h=None, c=None):
    x = self.embedding(x)
    if h and c:
      all_h, (h_n, c_n) = self.rnn_block(x, (h, c))
    else:
      all_h, (h_n, c_n) = self.rnn_block(x)
    h = h_n.permute(1, 0, 2)
    if self.aggregate_flag:
      h = torch.sum(all_h, dim=1)
    else:
      h = torch.flatten(h, start_dim=1, end_dim=-1)
    classes_logits = self.fc(h)
    return classes_logits, all_h, h_n, c_n

In [25]:
model_lstm = LSTM_rnn(
    input_size=100,
    hidden_size=32,
    n_layers=1,
    num_embeddings=len(vocab.idx_to_token),
    n_classes=len(vocab.token_to_idx),
    aggregate=True
)
optimizer_lstm = optim.Adam(model_lstm.parameters(), lr=0.001) # оптимизатор
loss_fn = nn.CrossEntropyLoss() # функция ошибки

In [31]:
#  обучение модели
train(100, train_rloader, test_rloader, model_lstm, loss_fn, optimizer_lstm, printing=10)

________________________________________
Train:
Loss: 0.006177381951638381, Accuracy: 0.996874988079071
----------------------------------------
Test:
Loss: 3.0776071548461914, Accuracy: 0.6586538553237915
________________________________________
Train:
Loss: 0.007084304593445267, Accuracy: 0.996874988079071
----------------------------------------
Test:
Loss: 2.796293020248413, Accuracy: 0.6490384340286255
________________________________________
Train:
Loss: 0.004926841203559889, Accuracy: 0.9975000023841858
----------------------------------------
Test:
Loss: 2.946899175643921, Accuracy: 0.6538461446762085
________________________________________
Train:
Loss: 0.004962964605365414, Accuracy: 0.9975000023841858
----------------------------------------
Test:
Loss: 3.0219717025756836, Accuracy: 0.6466346383094788
________________________________________
Train:
Loss: 0.004066937662501004, Accuracy: 0.9975000023841858
----------------------------------------
Test:
Loss: 2.9895970821380615

In [34]:
curr_accuracy_m2(model_lstm) # итоговая точность

0.6625


**Точность модели 66%.**

In [35]:
class GRU_rnn(nn.Module):
    
  def __init__(self, input_size, hidden_size, n_layers, num_embeddings, n_classes, aggregate=False):
    super(GRU_rnn, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.aggregate_flag = aggregate
        

    # загружаем предобученный эмбеддинг
    self.embedding = nn.Embedding.from_pretrained(embeddings=torch.tensor([list(map(lambda x: float(x), row.split()[1:])) for row in open('drive/MyDrive/ML_FU/Lab_7/data/glove.6B.100d.txt').readlines()], dtype=torch.float32), freeze=False, padding_idx=0)
    self.rnn_block = nn.GRU(
      input_size=input_size,
      hidden_size=hidden_size,
      num_layers=self.n_layers,
      batch_first=True
    )
    self.fc = nn.Linear(hidden_size * self.n_layers, n_classes)

  def forward(self, x, h=None):
    x = self.embedding(x)
    if h:
      all_h, h = self.rnn_block(x, (h, c))
    else:
      all_h, h = self.rnn_block(x)
    h = h.permute(1, 0, 2)
    if self.aggregate_flag:
      h = torch.sum(all_h, dim=1)
    else:
      h = torch.flatten(h, start_dim=1, end_dim=-1)
    classes_logits = self.fc(h)
    return classes_logits, all_h, h

In [39]:
model_gru = GRU_rnn(
    input_size=100,
    hidden_size=32,
    n_layers=1,
    num_embeddings=len(vocab.idx_to_token),
    n_classes=len(vocab.token_to_idx),
    aggregate=True
)
optimizer_gru = optim.Adam(model_gru.parameters(), lr=0.001) # оптимизатор
loss_fn = nn.CrossEntropyLoss() # функция ошибки

In [40]:
train(100, train_rloader, test_rloader, model_gru, loss_fn, optimizer_gru, printing=10) # обучаем модель 

________________________________________
Train:
Loss: 0.06738208796828986, Accuracy: 0.9868749976158142
----------------------------------------
Test:
Loss: 1.2541390657424927, Accuracy: 0.6418269276618958
________________________________________
Train:
Loss: 0.014703454421833158, Accuracy: 0.9950000047683716
----------------------------------------
Test:
Loss: 1.6005685329437256, Accuracy: 0.6682692170143127
________________________________________
Train:
Loss: 0.009154227080289274, Accuracy: 0.996874988079071
----------------------------------------
Test:
Loss: 2.1719167232513428, Accuracy: 0.6586538553237915
________________________________________
Train:
Loss: 0.006034274957492016, Accuracy: 0.9975000023841858
----------------------------------------
Test:
Loss: 1.915507197380066, Accuracy: 0.6658653616905212
________________________________________
Train:
Loss: 0.00980524693208281, Accuracy: 0.9956250190734863
----------------------------------------
Test:
Loss: 2.1456961631774902

In [42]:
curr_accuracy_m2(model_gru) # итоговая точность

0.6725


**Точноть составила 67%.**

In [49]:
results = pd.DataFrame({'name': ['RNN', 'LSTM', 'GRU'], 'accuracy': [curr_accuracy_m2(model_rnn), curr_accuracy_m2(model_lstm), curr_accuracy_m2(model_gru)]})
results

Unnamed: 0,name,accuracy
0,RNN,0.61
1,LSTM,0.6625
2,GRU,0.6725


## Вывод
После обучения моделей можем сделать вывод, что лучшая из них - GRU, точность которой составила 67%, является более простой версией сетей долгой краткосрочной памяти (LSTM). При одинаковом размере скрытого слоя обучается быстрее, потому что имеет куда меньше параметров.

Хуже всего показала себя модель RNN, результат которой: точность = 61% .