In [1]:
!pip install -q sentencepiece
!pip install -q transformers

# Explore Dataset

In [2]:
import pandas as pd

In [3]:
# Load the preprocessed sentence pairs from the attached Kaggle input dataset.
# The absolute path only resolves inside a Kaggle kernel with that dataset mounted.
df = pd.read_csv('/kaggle/input/sentences-preprocessed/sentences_preprocessed.csv')

In [4]:
# Map the source column names onto the input/output naming used by the rest of the notebook.
column_aliases = {'corrected': 'output', 'text': 'input'}
df = df.rename(columns=column_aliases)

In [5]:
# Keep only the sentence pair columns and discard rows where either side is missing.
df = df.loc[:, ['input', 'output']].dropna()

In [6]:
df.head()

Unnamed: 0,input,output
0,Загрязнение тяжелыми металлами Дальнозоркого р...,Загрязнение тяжелыми металлами Дальнегорского ...
1,Одной из самых главных экологических проблем н...,Одной из самых главных экологических проблем н...
2,Эта проблема особенно характерна для тех местн...,Эта проблема особеннo характерна для тех местн...
3,Рудная Пристань .,Рудная Пристань .
4,Согласно проведенным исследованиям Тихоокеанск...,Согласно проведенным исследованиям Тихоокеанск...


# Train

## Installing libraries


In [7]:
!pip install datasets tqdm pandas -q

In [8]:
!pip install wandb -q

In [9]:
!nvidia-smi

Wed May 15 18:18:38 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0              27W / 250W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Importing libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

from tqdm.auto import tqdm
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import numpy as np
import torch

from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq,
    get_linear_schedule_with_warmup, AdamW
  )

from torch.utils.data import Dataset, DataLoader

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2024-05-15 18:18:48.213846: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-15 18:18:48.213965: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-15 18:18:48.335372: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Set seed

In [None]:
def set_random_seed(seed):
    """Seed every random number generator the notebook relies on.

    Covers Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU plus all CUDA devices), and switches cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every visible GPU rather than only the current one;
    # it is a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Deterministic kernels alone are not enough: with benchmark mode on, the
    # cuDNN autotuner may still select different algorithms from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_random_seed(69)

## The dataset

**For T5**

In [12]:
model_name = "cointegrated/rut5-base-multitask"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/977M [00:00<?, ?B/s]

In [13]:
def calc_token_len(example):
    """Return how many token ids the module-level ``tokenizer`` yields for ``example``."""
    encoded = tokenizer(example)
    token_ids = encoded.input_ids
    return len(token_ids)

In [14]:
# Token count of every input sentence under the loaded tokenizer; the length
# statistics and the length filter in the following cells are based on this column.
df['input_token_len'] = df['input'].apply(calc_token_len)

In [15]:
df['input_token_len'].describe()

count    30163.000000
mean        27.624872
std         17.961295
min          3.000000
25%         15.000000
50%         24.000000
75%         35.000000
max        315.000000
Name: input_token_len, dtype: float64

In [16]:
df['input_token_len'].quantile([0.001]), df['input_token_len'].quantile([0.95])

(0.001    4.0
 Name: input_token_len, dtype: float64,
 0.95    61.0
 Name: input_token_len, dtype: float64)

In [17]:
# Keep sentences longer than 2 tokens and no longer than 61 tokens
# (61 is the 0.95 quantile reported by the previous cell).
df = df.query("2 < input_token_len <= 61")

In [18]:
from sklearn.model_selection import train_test_split

# First cut: 60% train, 40% hold-out. shuffle=False keeps the original row
# order, so each split is a contiguous slice of the filtered frame.
train_df, test_df = train_test_split(df, test_size=0.4, shuffle=False)
# Second cut: halve the hold-out into validation and test (20% / 20% overall).
val_df, test_df = train_test_split(test_df, test_size=0.5, shuffle=False)
# Display the resulting shapes as a sanity check.
train_df.shape, val_df.shape, test_df.shape

((17222, 3), (5741, 3), (5741, 3))

In [19]:
# Import the Hugging Face class under an alias: the imports cell already binds
# the name `Dataset` to torch.utils.data.Dataset, and a plain
# `from datasets import Dataset` would silently replace it for any cell run afterwards.
from datasets import Dataset as HFDataset

# One Hugging Face dataset per split, built straight from the pandas frames.
train_dataset = HFDataset.from_pandas(train_df)
val_dataset = HFDataset.from_pandas(val_df)
test_dataset = HFDataset.from_pandas(test_df)

In [20]:
from torch.utils.data import Dataset, DataLoader

class GrammarDataset(Dataset):
    """Map-style dataset that tokenizes (input, output) sentence pairs on access.

    Args:
        dataset: Indexable collection whose items expose ``'input'`` and
            ``'output'`` strings.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` tensors and has a ``pad_token_id`` attribute.
        max_len: Length every sequence is truncated or padded to.
    """

    def __init__(self, dataset, tokenizer, max_len=74):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of sentence pairs in the wrapped dataset."""
        return len(self.dataset)

    def _encode(self, text):
        """Tokenize one string, truncated/padded to ``self.max_len``, as tensors."""
        return self.tokenizer(text, max_length=self.max_len,
                              truncation=True, padding='max_length', return_tensors="pt")

    def tokenize_data(self, example):
        """Turn one example into ``input_ids``, ``attention_mask`` and ``labels`` tensors.

        Padding positions in the labels are replaced with -100.
        """
        source = self._encode(example['input'])
        target = self._encode(example['output'])

        # Overwrite padding in the target ids in place before dropping the batch axis.
        label_ids = target['input_ids']
        padding_positions = label_ids == self.tokenizer.pad_token_id
        label_ids[padding_positions] = -100

        item = {}
        item["input_ids"] = source['input_ids'].squeeze(0)
        item["attention_mask"] = source['attention_mask'].squeeze(0)
        item["labels"] = label_ids.squeeze(0)
        return item

    def __getitem__(self, index):
        """Fetch the example at ``index`` and tokenize it."""
        example = self.dataset[index]
        return self.tokenize_data(example)


## Train Model

In [21]:
!pip install rouge-score -q

In [22]:
!pip install evaluate -q

In [23]:
import evaluate
from rouge_score import rouge_scorer
from nltk.translate.gleu_score import sentence_gleu

In [24]:
# Map every token string in the tokenizer's vocabulary to its integer id.
vocab_dict = dict(
    (tokenizer.convert_ids_to_tokens(token_id), token_id)
    for token_id in range(tokenizer.vocab_size)
)
# Despite the name, this set holds the ids (the dict's values), not the token strings.
vocab_tokens = set(vocab_dict.values())

In [25]:
def safe_decode(tokens, tokenizer, vocab_tokens=vocab_tokens):
    """Decode a sequence of token ids, ignoring ids outside the known vocabulary.

    Args:
        tokens: Iterable of integer token ids.
        tokenizer: Tokenizer whose ``decode`` method turns ids into text.
        vocab_tokens: Collection of ids considered valid; anything else
            (for example the -100 label mask) is dropped before decoding.

    Returns:
        The decoded string with special tokens skipped.
    """
    known_ids = [tid for tid in tokens if tid in vocab_tokens]
    return tokenizer.decode(known_ids, skip_special_tokens=True, truncation=True)

In [26]:
rouge = evaluate.load('rouge')


def compute_metrics(eval_pred):
    """Compute ROUGE scores for one batch of predicted and reference token ids.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays; label
            positions equal to -100 are treated as padding.

    Returns:
        The dictionary produced by the ROUGE metric's ``compute`` call.
    """
    pred_ids, label_ids = eval_pred

    # Put the pad id back where labels were masked so they can be decoded.
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    reference_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True, truncation=True)

    predicted_texts = []
    for seq in pred_ids.tolist():
        predicted_texts.append(safe_decode(seq, tokenizer))

    # Whitespace splitting replaces the metric's default tokenizer.
    return rouge.compute(
        predictions=predicted_texts,
        references=reference_texts,
        use_aggregator=True,
        tokenizer=lambda x: x.split(),
    )

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [27]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap each split so that examples are tokenized when they are indexed.
train_data, val_data, test_data = (
    GrammarDataset(split, tokenizer)
    for split in (train_dataset, val_dataset, test_dataset)
)

# Only the training loader shuffles; validation and test keep their row order.
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
val_loader = DataLoader(val_data, batch_size=16, shuffle=False)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(30000, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(30000, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=768, out_features=2048, bias=False)
              (wi_1): Linear(in_features=768, out_features=2048, bias=False)
              (wo):

In [28]:
!pip3 install -q pytorch_lightning torchmetrics

In [29]:
from transformers import get_scheduler
from pytorch_lightning.loggers import WandbLogger

In [30]:
from collections import defaultdict

In [31]:
import wandb


wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [32]:
wandb.init(project='seq2seq_model_training')

[34m[1mwandb[0m: Currently logged in as: [33ma-zamyshevskaya[0m ([33mmine_nimble[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [33]:
lr = 0.0001
NUM_EPOCH = 1
WARMUP_STEPS = 100

# NOTE(review): this Lightning logger is not referenced by the manual loop
# below, which reports through wandb.log directly. Kept in case later cells use it.
wandb_logger = WandbLogger(project='seq2seq_model_training', log_model='all')

# Store the hyperparameters on the active run. Assigning `wandb.config = {...}`
# rebinds the module attribute to a plain dict instead of writing to the run's
# config; update() is the supported call. allow_val_change lets this cell be
# re-run after tweaking a value.
wandb.config.update(
    {
        "learning_rate": lr,
        "epochs": NUM_EPOCH,
        "batch_size": train_loader.batch_size,
        "model_type": model_name,
    },
    allow_val_change=True,
)

# Per-epoch loss history (one entry appended per epoch).
train_losses = []
val_losses = []

# NOTE(review): this AdamW comes from transformers, where it is deprecated in
# favour of torch.optim.AdamW; consider switching together with the import.
optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8, weight_decay=0.05)
# The scheduler is stepped once per batch, so its horizon has to span every
# epoch; otherwise the learning rate decays to zero after the first epoch.
scheduler = get_scheduler(
    "linear",
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=len(train_loader) * NUM_EPOCH,
)

for epoch in range(1, NUM_EPOCH + 1):
    # ---- training pass ----
    model.train()
    train_loss = 0
    total_train = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch}"):
        inputs = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Weight the batch loss by its size so the epoch average is per example.
        train_loss += loss.item() * inputs.size(0)
        total_train += inputs.size(0)

    avg_train_loss = train_loss / total_train
    train_losses.append(avg_train_loss)
    wandb.log({"train_loss": avg_train_loss})

    # ---- validation pass ----
    model.eval()
    val_loss = 0
    total_val = 0
    # Running sums of batch metrics, weighted by batch size like the loss.
    metrics = defaultdict(float)
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validation Epoch {epoch}"):
            inputs = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            batch_size = inputs.size(0)

            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
            # NOTE(review): these are argmax tokens from a forward pass that
            # receives the labels, not free-running generate() output, so the
            # ROUGE values here are likely optimistic compared to test time.
            predictions = torch.argmax(outputs.logits, dim=-1)
            eval_pred = (predictions.cpu(), labels.cpu())
            batch_metrics = compute_metrics(eval_pred)
            for key, value in batch_metrics.items():
                metrics[key] += float(value) * batch_size

            loss = outputs.loss
            val_loss += loss.item() * batch_size
            total_val += batch_size

    avg_val_loss = val_loss / total_val
    val_losses.append(avg_val_loss)
    avg_metrics = {key: total / total_val for key, total in metrics.items()}
    wandb.log({"validation_loss": avg_val_loss,
               **avg_metrics})

    print(f'Epoch {epoch}: Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')
    print('-' * 75)

Training Epoch 1:   0%|          | 0/1077 [00:00<?, ?it/s]

Validation Epoch 1:   0%|          | 0/359 [00:00<?, ?it/s]

Epoch 1: Train Loss: 0.5439, Validation Loss: 0.2789
---------------------------------------------------------------------------


In [34]:
import torch

# Generate corrections for the held-out test split and collect them as text.
model.eval()

predictions = []
references = []

with torch.no_grad():
    for batch in tqdm(test_loader):
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only decoded on the host, so they are left on the CPU
        # instead of being copied to the GPU and straight back.
        labels = batch['labels']

        outputs = model.generate(max_length=84, input_ids=inputs, attention_mask=attention_mask)

        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # safe_decode drops ids outside the vocabulary, which removes the -100
        # padding mask from the labels before decoding.
        decoded_labels = [safe_decode(seq, tokenizer) for seq in labels.tolist()]

        predictions.extend(decoded_preds)
        references.extend(decoded_labels)


  0%|          | 0/359 [00:00<?, ?it/s]

In [35]:
# Start from the test split's index so every column lines up with its source row,
# then attach the model output, the reference correction and the raw input.
df = pd.DataFrame(index=test_df.index)
df['Predicted output'] = predictions
df['Manual corrections'] = references
df['Initial sentences'] = test_df['input']

df.head()


Unnamed: 0,Predicted output,Manual corrections,Initial sentences
24702,"Томе Таим, что в РФ можно обучать тряска к на ...","Отме тим, что в РФ можно обуча ться ка к на ру...","Томе Таим , что в РФ можно , обучаю тряс ка к ..."
24703,Почему я хочу учиться в России?,Почему я хочу учиться в России?,Почему я хочу учиться в России ?
24704,"Известно, что цель образования — не только зна...","Известно, что цель образования — не только зна...","Известно , что цель образования — не только зн..."
24705,А главное — верные действия.,А главное — верные действия.,А главное — верные действия .
24706,Мы в тысячи раз больше беспокоимся о приобрете...,Мы в тысячу раз больше беспокоимся о приобрете...,Мы в тысячи раз больше беспокоимся о приобрете...


In [36]:
# Persist the prediction table to the Kaggle working directory; index=False
# leaves the original row index out of the file.
df.to_csv("/kaggle/working/seq2seq_t5_preproc_predictions.csv", index=False)

In [37]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.018 MB uploaded\r'), FloatProgress(value=0.07505015996963288, max=1.…

0,1
rouge1,▁
rouge2,▁
rougeL,▁
rougeLsum,▁
train_loss,▁
validation_loss,▁

0,1
rouge1,0.92017
rouge2,0.84477
rougeL,0.9202
rougeLsum,0.92014
train_loss,0.54391
validation_loss,0.27891
