### Загружаем данные для обучения: `train_texts`, `val_texts`, `test_texts`

In [None]:
from datasets import load_dataset

from sklearn.model_selection import train_test_split

from os.path import isfile

from src.data_utils import clean_text

import pandas as pd

import os


train_texts, val_texts, test_texts = None, None, None
foldername = 'data'
raw_dataset_filepath = os.path.join(foldername, 'raw_dataset.csv')
processed_dataset_filepath = os.path.join(foldername, 'processed_dataset.csv')
train_filepath = os.path.join(foldername, 'train.csv')
val_filepath = os.path.join(foldername, 'val.csv')
test_filepath = os.path.join(foldername, 'test.csv')
text_column = 'text'
# One seed for BOTH splits: without it the val/test partition changes on every
# fresh run and the saved metrics cannot be reproduced.
split_seed = 42

# If the split files already exist, load the train/val/test texts from them
if isfile(train_filepath) and isfile(val_filepath) and isfile(test_filepath):
    train_texts = pd.read_csv(train_filepath).dropna()[text_column].to_list()
    val_texts = pd.read_csv(val_filepath).dropna()[text_column].to_list()
    test_texts = pd.read_csv(test_filepath).dropna()[text_column].to_list()
# Otherwise download the dataset, clean it, split it and save the splits
else:
    # Make sure the output folder exists before the first CSV is written
    os.makedirs(foldername, exist_ok=True)

    dataset = load_dataset('sentiment140', trust_remote_code=True, split='train', cache_dir='data')['text']
    dataset_df = pd.DataFrame({ text_column: dataset })
    dataset_df.to_csv(raw_dataset_filepath, index=False)

    texts = list(map(clean_text, dataset))
    print(len(texts))
    # The processed file keeps one row per raw row (including empty results)
    texts_df = pd.DataFrame({ text_column: texts })
    texts_df.to_csv(processed_dataset_filepath, index=False)

    # Drop empty cleaned texts before splitting: an empty string is written to
    # CSV as a missing value and removed by `dropna()` when the splits are
    # reloaded, so keeping it here would make the first run differ from reruns.
    texts = [text for text in texts if text]

    val_test = 0.2  # share of the data held out for validation + test
    test = 0.5      # share of the held-out part that becomes the test set
    train_texts, val_test_texts = train_test_split(texts, test_size=val_test, random_state=split_seed)
    val_texts, test_texts = train_test_split(val_test_texts, test_size=test, random_state=split_seed)

    train_df = pd.DataFrame({ text_column: train_texts })
    train_df.to_csv(train_filepath, index=False)

    test_df = pd.DataFrame({ text_column: test_texts })
    test_df.to_csv(test_filepath, index=False)

    val_df = pd.DataFrame({ text_column: val_texts })
    val_df.to_csv(val_filepath, index=False)

  from .autonotebook import tqdm as notebook_tqdm


### Из полученных данных формируем датасеты:

In [3]:
from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

from src.next_token_dataset import NextTokenDataset, EvalROUGEDataset, collate_fn


# Pretrained BERT tokenizer, extended with an explicit end-of-sequence token
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer.add_special_tokens({'eos_token': '[EOS]'})
print('EOS token added:', tokenizer.eos_token, tokenizer.eos_token_id)

# Next-token datasets, one per split
train_dataset = NextTokenDataset(train_texts, tokenizer)
val_dataset = NextTokenDataset(val_texts, tokenizer)
test_dataset = NextTokenDataset(test_texts, tokenizer)

# Dataset used to compute ROUGE metrics on the validation data
val_rouge_dataset = EvalROUGEDataset(val_texts, tokenizer)

# Loaders: only the training loader shuffles; val/test share the same settings
eval_loader_kwargs = dict(batch_size=8, collate_fn=collate_fn)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, **eval_loader_kwargs)
test_dataloader = DataLoader(test_dataset, **eval_loader_kwargs)

EOS token added: [EOS] 30522


In [5]:
from src.lstm_model import LSTMAutoComplete
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Model definition:
#   - 4 hidden layers
#   - hidden-state size of 128
vocab_size = len(tokenizer.get_vocab())
model = LSTMAutoComplete(vocab_size, hidden_size=128, num_layers=4)

### Обучаем модель:

In [None]:
from src.lstm_model import LSTMAutoComplete
from src.model_train import train_one_epoch
from src.model_eval import evaluate

from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR

import matplotlib.pyplot as plt

import torch.nn as nn
import torch


device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Model definition:
#   - 4 hidden layers
#   - hidden-state size of 128
model = LSTMAutoComplete(len(tokenizer.get_vocab()), hidden_size=128, num_layers=4)

optimizer = Adam(model.parameters(), lr=0.01, weight_decay=0.01)
# Multiply the learning rate by 0.1 every 5 scheduler steps.
# NOTE(review): the recorded run shows the validation loss frozen at exactly
# the same value from epoch 1 onwards. If `train_one_epoch` steps the scheduler
# once per batch rather than once per epoch, the learning rate collapses to ~0
# within the first epoch - confirm where `scheduler.step()` is called.
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
# Ignore padding positions in the loss. The pad id is taken from the tokenizer
# instead of a hard-coded 0 (for bert-base-uncased it is 0).
# NOTE(review): confirm `collate_fn` pads with the same id.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
epochs = 20
test_phrase = "what are you talking"

# Look at the validation metrics before any training
print('=== Before training ===')
evaluate(model, tokenizer, device, val_dataloader, val_rouge_dataset, criterion, test_phrase=test_phrase)
print('===================================\n')

train_losses = []
val_losses = []

# Train for `epochs` epochs, evaluating on the validation set after each one
for epoch in range(1, epochs+1):
    train_loss = train_one_epoch(model, device, epoch, train_dataloader, optimizer, criterion, scheduler)
    val_loss = evaluate(model, tokenizer, device, val_dataloader, val_rouge_dataset, criterion, test_phrase=test_phrase)
    print('===================================\n')
    train_losses.append(train_loss)
    val_losses.append(val_loss)

# Loss curves. The x-axis is derived from the recorded history so the plot
# always matches the number of completed epochs; labels and a legend make the
# two curves distinguishable.
epoch_axis = list(range(1, len(train_losses) + 1))
fig, ax = plt.subplots()
ax.plot(epoch_axis, train_losses, color='b', label='train loss')
ax.plot(epoch_axis, val_losses, color='g', label='validation loss')
ax.set_xlabel('epoch')
ax.set_ylabel('loss')
ax.set_title('LSTM autocomplete: training vs validation loss')
ax.legend()
plt.show()

=== Before training ===


Evaluating: 100%|██████████| 106/106 [00:10<00:00,  9.72it/s]


Validation loss: 10.326239720830378
Rouge metrics:
rouge1: 0.0002
rouge2: 0.0000
rougeL: 0.0002
rougeLsum: 0.0001
well my friend branching distorted lawyerspipe nellie continued owes × darkeningdoor promotingrove started rebelsetched boxingbolic



Training epoch 1: 100%|██████████| 423/423 [04:47<00:00,  1.47it/s]


Epoch: 1, training loss: 10.1672


Evaluating: 100%|██████████| 106/106 [00:10<00:00, 10.20it/s]


Validation loss: 10.165345524841884
Rouge metrics:
rouge1: 0.0003
rouge2: 0.0000
rougeL: 0.0003
rougeLsum: 0.0003
well my friend nascar vector sighting mammal කgram classrooms concurrency booth callerᅧ pastortree csilithic vicious vibrating



Training epoch 2: 100%|██████████| 423/423 [04:43<00:00,  1.49it/s]


Epoch: 2, training loss: 10.1642


Evaluating: 100%|██████████| 106/106 [00:10<00:00, 10.36it/s]


Validation loss: 10.165345524841884
Rouge metrics:
rouge1: 0.0004
rouge2: 0.0000
rougeL: 0.0004
rougeLsum: 0.0004
well my friend teachings hyper eurasian unanimously pajamas nowadays advocacy ported stairsio opened prom lebanon soviets mouth terrified wealth



Training epoch 3: 100%|██████████| 423/423 [04:43<00:00,  1.49it/s]


Epoch: 3, training loss: 10.1644


Evaluating: 100%|██████████| 106/106 [00:09<00:00, 10.89it/s]


Validation loss: 10.165345524841884
Rouge metrics:
rouge1: 0.0002
rouge2: 0.0000
rougeL: 0.0002
rougeLsum: 0.0002
well my friend jew silly explorer© voivodeship dinamo willy browne clarityber nouvelle [unused841]ף cages bottomsₛ stevie



Training epoch 4: 100%|██████████| 423/423 [04:45<00:00,  1.48it/s]


Epoch: 4, training loss: 10.1644


Evaluating: 100%|██████████| 106/106 [00:09<00:00, 11.01it/s]


Validation loss: 10.165345524841884
Rouge metrics:
rouge1: 0.0003
rouge2: 0.0000
rougeL: 0.0003
rougeLsum: 0.0003
well my friend boone broncos garden principality relate martian mazdailingnder extremely debbie preoccupied cluster morality matched ᆷ 健



Training epoch 5: 100%|██████████| 423/423 [04:09<00:00,  1.69it/s]


Epoch: 5, training loss: 10.1643


Evaluating: 100%|██████████| 106/106 [00:08<00:00, 12.37it/s]


KeyboardInterrupt: 

In [6]:
from transformers import pipeline

# Text-generation pipeline backed by the `distilgpt2` checkpoint
generator = pipeline(task='text-generation', model='distilgpt2')
generator_type = type(generator)
print(generator_type)

Device set to use cpu


<class 'transformers.pipelines.text_generation.TextGenerationPipeline'>


In [9]:
# Show which autograd context the pipeline uses while running inference
inference_context = generator.get_inference_context()
print(inference_context)

<class 'torch.autograd.grad_mode.no_grad'>


### Теперь на тестовых данных сравним работу LSTM модели и GPT2:

In [None]:
from src.gpt_vs_lstm_eval import evaluate_gpt_vs_lstm

# Compare the LSTM model with GPT-2 on the test texts using the project helper
# from `src.gpt_vs_lstm_eval`.
# NOTE(review): the helper's internals (how many test texts it samples, which
# GPT-2 checkpoint it loads) are not visible here - check the module.
evaluate_gpt_vs_lstm(model, tokenizer, test_texts, device=device)

Device set to use cpu
Autocompleting inputs: 100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


Rouge metrics for test (GPT2):
rouge1: 0.0337
rouge2: 0.0000
rougeL: 0.0337
rougeLsum: 0.0333
Rouge metrics for test (LSTM-based):
rouge1: 0.0000
rouge2: 0.0000
rougeL: 0.0000
rougeLsum: 0.0000
=== Samples ===
Input: "i cant believe that i am still waiting for a damn"
	GPT2-autocomplete: " good time."
	LSTM-autocomplete: "rebelled toxic silently locomotives sliding gunmen cross engaging convoys supporters [unused361] zoo"

Input: "tchat with jake lmfao its frikkin awesome i miss this effin boy so much"
	GPT2-autocomplete: " i love it i love it im going to be a guy i love it i love it i love it"
	LSTM-autocomplete: "tori achievements incredible authority improper meath← obituary airessters yingupt tiger internship remarked humble cortex slack pains suspicion announce"

Input: "i find my bk in the fd bks section and n the pho section because"
	GPT2-autocomplete: " that way it's not a bk anymore and i just want to get rid of it"
	LSTM-autocomplete: "smashed warehouse archival kia yoga bul