# Finetuning ruDialoGPT-medium

Finetuning [tinkoff-ai/ruDialoGPT-medium](https://huggingface.co/tinkoff-ai/ruDialoGPT-medium) conversational language model on a custom toxic (!) dialogue dataset arranged from [raw data](https://github.com/Koziev/NLP_Datasets).  
See [this notebook](https://github.com/RakePants/nerdless/blob/main/notebooks/dataset.ipynb) for dataset arrangement.

Training on Colab T4 runtime with 350000 train/val samples.

## Setup

In [None]:
# Install or upgrade the libraries used in this notebook (quiet output)
!pip install -U transformers accelerate datasets --quiet

In [2]:
import json
import pandas as pd
import numpy as np

from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset, DatasetDict

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

In [None]:
# Pretrained checkpoint to fine-tune; the tokenizer and the model are both loaded from it
checkpoint = "tinkoff-ai/ruDialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# device_map='cuda' asks for the model weights to be placed on the GPU at load time
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map='cuda')

Downloading the custom dataset.

In [None]:
# Download the dataset file from Google Drive by its file id
# (presumably saved as io_dialogues.csv, which is read in the next cell — verify)
!gdown 'https://drive.google.com/uc?id=19Opycplqv-2oGUFGsjNPFwGHmaYgaEoh'

In [9]:
# Read to pandas DataFrame first; the 'input' and 'output' columns are transformed later
df = pd.read_csv("io_dialogues.csv")

In [10]:
# Display the raw DataFrame
df

Unnamed: 0,input,output
0,"Себорея, витилиго, нейродермит, псориаз и проч...",От всориаза - нет. От вторичной инфекции - да.
1,Могу хуй отсосать. Куда подъезжать?\nна декабр...,"А я думал улица пушкина, дом калатушкина"
2,Го\nНадо что-то придумать интересное\nГо КАЗАК...,А еще что есть?
3,Ура!,Ура!
4,Надеюсь с G&K и BNW?,есесна
...,...,...
592543,А сегодня не будет иммиграции в бессонную лощи...,Насколько? думаю мне стоило бы их нарубить
592544,"Как поживаете, уютные?\nНервно.\nФеназепамчику...",Не пиши мне такие ужасные вещи.
592545,Уютного утречка бодрствующим уютняшкам и сладк...,Ага. А ведь еще совсем недавно тред был живой.
592546,"С какой части нужно начать, чтобы постичь всел...",Не нужно.


Dialogue lines in the 'input' column are divided by '\n'.

## Data transformation

In [6]:
# Maximum length of a sample in tokens, including special tokens.
# Samples longer than this are dropped by the transform functions and the rest
# are padded to this length at tokenization time. With a value of 2048 (the
# default model's limit, per the original note) or more, the length check is
# skipped and all samples are kept.
max_length = 96

Use special tokens 50257 and 50258 as dialogue-line separators. 50257 is also the default EOS token.

In [12]:
# Function to transform 'input' column
def transform_text(row):
    """Turn a newline-separated dialogue context into a speaker-tagged prompt.

    Each dialogue line of ``row['input']`` is prefixed with a speaker token.
    The two tokens alternate and are assigned so that the last line is always
    tagged with token 50257 (@@ПЕРВЫЙ@@); token 50258 (@@ВТОРОЙ@@) is then
    appended so the prompt ends where the second speaker's reply begins.

    Args:
        row: a mapping (DataFrame row) with an 'input' entry.

    Returns:
        The tagged prompt string, or None when the sample must be dropped:
        the input is missing / not a string, or (only when the global
        ``max_length`` is below 2048) the prompt exceeds ``max_length`` tokens.
    """
    input_text = row['input']

    # Empty CSV cells are read by pandas as NaN (a float). Returning None lets
    # the later dropna() discard the row instead of crashing the whole apply().
    if not isinstance(input_text, str):
        return None

    token_first = tokenizer.convert_ids_to_tokens(50257)  # @@ПЕРВЫЙ@@
    token_second = tokenizer.convert_ids_to_tokens(50258)  # @@ВТОРОЙ@@

    # str.split already yields a one-element list when there is no newline
    lines = input_text.split('\n')

    # Count speakers from the end: the last line belongs to the first speaker,
    # the one before it to the second speaker, and so on.
    last_index = len(lines) - 1
    tagged_lines = [
        (token_first if (last_index - i) % 2 == 0 else token_second) + line
        for i, line in enumerate(lines)
    ]

    # Append the second-speaker token at the end to open the reply turn
    transformed_text = ''.join(tagged_lines) + token_second

    # Drop samples longer than max length (no need to check at 2048 and above)
    if max_length < 2048 and len(tokenizer.tokenize(transformed_text)) > max_length:
        return None

    return transformed_text


# Function to transform 'output' column
def transform_output(row):
    """Append the end-of-reply token to the target reply.

    Token 50257 (@@ПЕРВЫЙ@@) is appended to ``row['output']`` to mark the end
    of the reply.

    Args:
        row: a mapping (DataFrame row) with an 'output' entry.

    Returns:
        The reply string with the token appended, or None when the sample must
        be dropped: the output is missing / not a string, or (only when the
        global ``max_length`` is below 2048) it exceeds ``max_length`` tokens.
    """
    output_text = row['output']

    # Empty CSV cells are read by pandas as NaN (a float). Returning None lets
    # the later dropna() discard the row instead of raising on `nan + str`.
    if not isinstance(output_text, str):
        return None

    token_first = tokenizer.convert_ids_to_tokens(50257)  # @@ПЕРВЫЙ@@
    transformed_text = output_text + token_first

    # Drop samples longer than max length (no need to check at 2048 and above)
    if max_length < 2048 and len(tokenizer.tokenize(transformed_text)) > max_length:
        return None

    return transformed_text

In [13]:
# Apply transformations to each row in both columns
df['input'] = df.apply(transform_text, axis=1)
df['output'] = df.apply(transform_output, axis=1)
# The transforms return None for samples to discard; drop those rows and renumber the index
df = df.dropna().reset_index(drop=True)

In [14]:
# Display the transformed DataFrame
df

Unnamed: 0,input,output
0,"@@ВТОРОЙ@@Себорея, витилиго, нейродермит, псор...",От всориаза - нет. От вторичной инфекции - да....
1,@@ВТОРОЙ@@Могу хуй отсосать. Куда подъезжать?@...,"А я думал улица пушкина, дом калатушкина@@ПЕРВ..."
2,@@ПЕРВЫЙ@@Го@@ВТОРОЙ@@Надо что-то придумать ин...,А еще что есть?@@ПЕРВЫЙ@@
3,@@ПЕРВЫЙ@@Ура!@@ВТОРОЙ@@,Ура!@@ПЕРВЫЙ@@
4,@@ПЕРВЫЙ@@Надеюсь с G&K и BNW?@@ВТОРОЙ@@,есесна@@ПЕРВЫЙ@@
...,...,...
592452,@@ВТОРОЙ@@А сегодня не будет иммиграции в бесс...,Насколько? думаю мне стоило бы их нарубить@@ПЕ...
592453,"@@ВТОРОЙ@@Как поживаете, уютные?@@ПЕРВЫЙ@@Нерв...",Не пиши мне такие ужасные вещи.@@ПЕРВЫЙ@@
592454,@@ПЕРВЫЙ@@Уютного утречка бодрствующим уютняшк...,Ага. А ведь еще совсем недавно тред был живой....
592455,"@@ПЕРВЫЙ@@С какой части нужно начать, чтобы по...",Не нужно.@@ПЕРВЫЙ@@


In [15]:
# Convert the DataFrame to a Dataset and shuffle it with a fixed seed
dataset = Dataset.from_pandas(df)
shuffled_dataset = dataset.shuffle(seed=42)

limit = 350000  # Limit train & val size, 350000 is safe for T4 Colab configuration RAM
test_size = 500  # Limit test size

# The first `limit` shuffled rows feed train/val; the next `test_size` rows are held out for testing
limited_dataset = Dataset.from_dict(shuffled_dataset[:limit])
test_dataset = Dataset.from_dict(shuffled_dataset[limit:limit+test_size])  # Create test dataset of fixed length

In [16]:
# Split the limited dataset into train (80%) and validation (20%) sets.
# A fixed seed keeps the split reproducible, in line with the seeded shuffle
# applied when the dataset was built.
train_dataset_dict = limited_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)

# Combine the splits (plus the separately held-out test set) into a single DatasetDict
splits = DatasetDict({
    'train': train_dataset_dict['train'],
    'val': train_dataset_dict['test'],
    'test': test_dataset,
})

In [17]:
# Display the split sizes
splits

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 280000
    })
    val: Dataset({
        features: ['input', 'output'],
        num_rows: 70000
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 500
    })
})

## Tokenization and dataset creation

In [None]:
# Pad every sample to the same max_length

def _encode_fixed_length(texts):
    """Tokenize texts into PyTorch tensors padded/truncated to max_length tokens."""
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )


# Contexts (X) and replies (y) are tokenized separately for each split
X_train_tokenized = _encode_fixed_length(splits['train']['input'])
y_train_tokenized = _encode_fixed_length(splits['train']['output'])

X_val_tokenized = _encode_fixed_length(splits['val']['input'])
y_val_tokenized = _encode_fixed_length(splits['val']['output'])

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Causal-LM training samples built from tokenized contexts and replies.

    A causal language model expects ``labels`` to be aligned with
    ``input_ids`` (the model shifts them internally). Each item is therefore
    the context tokens immediately followed by the reply tokens, with:

    * ``labels`` equal to the reply tokens at the reply positions and
      ``IGNORE_INDEX`` everywhere else, so neither the context nor the padding
      contributes to the loss;
    * ``attention_mask`` covering context + reply only.

    Every item has a fixed length equal to the padded context width plus the
    padded reply width, i.e. twice ``max_length`` when both were padded to it.
    Lower the batch size if that no longer fits in GPU memory.

    NOTE: this class shadows ``datasets.Dataset`` imported at the top of the
    notebook; cells calling ``Dataset.from_pandas`` / ``Dataset.from_dict``
    must be run before this one.

    Args:
        inputs: mapping with "input_ids" and "attention_mask" 2-D tensors
            for the contexts.
        targets: mapping with "input_ids" (and, normally, "attention_mask")
            2-D tensors for the replies. Without an attention mask every
            target token is treated as a real token.
    """

    IGNORE_INDEX = -100  # label value ignored by the language-modeling loss

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        return len(self.inputs["input_ids"])

    def __getitem__(self, index):
        context_row = self.inputs["input_ids"][index]
        context_mask_row = self.inputs["attention_mask"][index]
        reply_row = self.targets["input_ids"][index]

        # Strip padding so the reply directly follows the context
        context = context_row[context_mask_row.bool()]
        if "attention_mask" in self.targets:
            reply = reply_row[self.targets["attention_mask"][index].bool()]
        else:
            reply = reply_row

        total_len = context_row.shape[0] + reply_row.shape[0]
        reply_start = context.shape[0]
        reply_end = reply_start + reply.shape[0]

        # Padding positions hold id 0; they are masked out of attention and loss
        input_ids = context_row.new_zeros(total_len)
        input_ids[:reply_start] = context
        input_ids[reply_start:reply_end] = reply

        attention_mask = context_mask_row.new_zeros(total_len)
        attention_mask[:reply_end] = 1

        # Only the reply tokens are training targets
        labels = reply_row.new_full((total_len,), self.IGNORE_INDEX)
        labels[reply_start:reply_end] = reply

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [None]:
# Wrap the tokenized contexts (X) and replies (y) into torch datasets for the Trainer
train_dataset = Dataset(X_train_tokenized, y_train_tokenized)
val_dataset = Dataset(X_val_tokenized, y_val_tokenized)

## Training

In [None]:
def compute_metrics(p):
    """Compute token-level metrics for a causal language model.

    Args:
        p: a (predictions, labels) pair. Predictions are expected to be
            logits whose last axis is the vocabulary (or a tuple whose first
            element is such an array); labels are token ids with the same
            leading shape, where -100 marks positions to ignore.

    Returns:
        dict with "accuracy", "precision", "recall" and "f1". Precision,
        recall and F1 are macro-averaged over the token ids that occur,
        because token prediction is a multiclass problem.
    """
    logits, labels = p
    if isinstance(logits, tuple):
        logits = logits[0]

    # The predicted token is the argmax over the vocabulary (last) axis
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Logits at position i predict the token at position i + 1
    predictions = predictions[..., :-1]
    labels = labels[..., 1:]

    # Score only the positions that carry a real target
    keep = labels != -100
    y_true = labels[keep]
    y_pred = predictions[keep]

    if y_true.size == 0:
        return {"accuracy": 0.0, "precision": 0.0, "recall": 0.0, "f1": 0.0}

    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    recall = recall_score(y_true=y_true, y_pred=y_pred, average="macro", zero_division=0)
    precision = precision_score(y_true=y_true, y_pred=y_pred, average="macro", zero_division=0)
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average="macro", zero_division=0)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

Training for 1 epoch with a 4e-7 learning rate and 100 warmup steps.  
A train batch size of 24 is safe for a T4 GPU; with 16 gradient-accumulation steps the effective batch size is 384.

In [None]:
# Define Trainer
# With a per-device batch of 24 and 16 accumulation steps, each optimizer step
# covers 24 * 16 = 384 samples per device.
args = TrainingArguments(
    output_dir="output",
    learning_rate=4e-7,  # a heuristic for 350,000 dataset length
    num_train_epochs=1,  # number of training epochs
    per_device_train_batch_size=24,  # batch size for training
    per_device_eval_batch_size=24,  # batch size for evaluation
    warmup_steps=100,  # number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=16,  # to make "virtual" batch size larger
    fp16=True,
)

# NOTE(review): no evaluation schedule is configured in `args`, so compute_metrics
# presumably only runs on an explicit trainer.evaluate() call — confirm.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Train: run the fine-tuning loop with the Trainer configured above
trainer.train()

In [None]:
# Save the model and the tokenizer into the same directory
model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

## Testing

In [None]:
# Function for inferencing
def test_model(model, input):
    """Generate a reply for a speaker-tagged dialogue context.

    Args:
        model: the causal language model to generate with.
        input: the prompt string, e.g. "@@ПЕРВЫЙ@@...@@ВТОРОЙ@@". The name
            shadows the builtin but is kept so keyword calls keep working.

    Returns:
        A list of decoded strings (context followed by the generated
        response, special tokens included), one per returned sequence.
    """
    # The model may still be in training mode after trainer.train();
    # switch to eval mode so dropout does not perturb generation.
    model.eval()

    # Move the encoded prompt to wherever the model lives instead of assuming CUDA
    inputs = tokenizer(input, return_tensors='pt').to(model.device)

    generated_token_ids = model.generate(
        **inputs,
        top_k=10,
        top_p=0.95,
        num_beams=3,
        num_return_sequences=1,
        do_sample=True,
        no_repeat_ngram_size=2,
        temperature=0.7,
        repetition_penalty=1.2,
        length_penalty=1.0,
        eos_token_id=50257,  # stop at the end-of-reply token (@@ПЕРВЫЙ@@)
        max_new_tokens=400,
        pad_token_id=0
    )

    context_with_response = [tokenizer.decode(sample_token_ids) for sample_token_ids in generated_token_ids]
    return context_with_response

In [None]:
# Quick smoke test on a single hand-written prompt
print(test_model(model, input="""@@ПЕРВЫЙ@@Привет, как дела?@@ВТОРОЙ@@"""))

In [None]:
data = []

# Run on test split.
# Iterate over the rows directly: this covers exactly the rows present in the
# split and avoids looking each row up twice by index.
for sample in tqdm(splits["test"], desc="Testing"):
    # Get the input and output for the current row
    input_value = sample['input']
    output_value = sample['output']

    # The decoded text is context + response; the response is whatever
    # follows the last second-speaker token
    predicted_output = test_model(model, input_value)[0].split("@@ВТОРОЙ@@")[-1]

    # Collect the row; the DataFrame is built once after the loop
    data.append({
        'input': input_value,
        'predict': predicted_output,
        'output': output_value
    })

test_df = pd.DataFrame(data)

In [None]:
# Display inputs, predictions and reference outputs side by side
test_df

In [None]:
# Save testing results (input, predict, output columns) to a CSV file
test_df.to_csv("results.csv")

Done with finetuning. Don't forget to copy the saved model elsewhere.