In [None]:
!nvidia-smi

In [1]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
import numpy as np
import torch
from torchtext.data.metrics import bleu_score
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, recall_score
import json
import random

In [None]:
models_path = "C:/Users/Stud/Documents/models/"

MODEL_NAMES = {
    "t5-large": models_path + "ruT5-large",
    "mt": models_path + "mt",
    "dialogpt3": models_path + "dialogpt3",
    "t5-base-trained": models_path + "t5-base-trained",
    "mt-trained": models_path + "mt-trained",
    }

you_token = '<you>'
other_token = '<oth>'
persona_token = '<per>'
ATTR_TO_SPECIAL_TOKEN = {'additional_special_tokens': [you_token, other_token, persona_token]}

data_path = "C:/Users/Stud/Documents/datasets/toloka_speller.txt"
aug_path = "/content/drive/MyDrive/test_both_original_aug_ru2.json"

save_path = models_path + "rudialogpt3-trained"

max_length=128

In [3]:
path = MODEL_NAMES['dialogpt3']

tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)
#model = AutoModelForSeq2SeqLM.from_pretrained(path, max_length=max_length, output_attentions=True)
model = AutoModelForCausalLM.from_pretrained(path, max_length=max_length, output_attentions=True)

In [4]:
num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)

model.resize_token_embeddings(new_num_tokens=model.config.vocab_size + num_added_tokens)

#Only for MT model
#model.target_vocab_size=model.config.vocab_size

Embedding(50260, 1024)

In [None]:
tokenizer.vocab_size

In [None]:
tokenizer.all_special_ids

In [None]:
tokenizer.eos_token

In [None]:
model.config.vocab_size

In [None]:
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}, {param.dtype}\|  grad-{param.requires_grad}")

In [None]:
model.num_parameters()

In [None]:
#Augmented data

with open(aug_path, 'r') as infile:
    lines = infile.readlines()

aug_context = []
aug_labels = []
original_persona1 = None
original_persona2 = None
for line in lines:
    data = json.loads(line)

    if original_persona1 is None:
        original_persona1 = data['persona']
    elif original_persona2 is None:
        original_persona2 = data['persona']
    elif original_persona1 != data['persona'] and original_persona2 != data['persona']:
        original_persona1 = data['persona']
        original_persona2 = None

    dialogue = data['context']

    other_persona = None
    your_persona = []
    if original_persona1 == data['persona']:
        d_len = max_length*2 - len(original_persona1)
        other_persona = original_persona2
    else:
        d_len = max_length*2 - len(original_persona2)
        other_persona = original_persona1
    for persona in data['persona_aug']:
        your_persona.append(persona[random.randint(0, len(persona)-1)])

    your_persona = (persona_token + persona_token.join(your_persona)).replace('.', '')

    label = you_token + data['responce_aug'][random.randint(0, len(data['responce_aug'])-1)]

    dialogue_history = ""

    start_token = you_token if len(data['context']) % 2 == 0 else other_token
    for j in range(len(dialogue)-1, 0, -1):
        if len(dialogue[j] + dialogue_history) <= d_len:
            dialogue_history = start_token + dialogue[j] + dialogue_history
            start_token = you_token if start_token != you_token else other_token
        else:
            break

    if dialogue_history != "":
        aug_context.append(your_persona + dialogue_history)
        aug_labels.append(label)

aug_dataset = {
    "context": aug_context,
    "labels": aug_labels
}

In [5]:
labels = []
context = []

with open(data_path, 'r', encoding="utf-8") as infile:
    lines = infile.readlines()
    lines = [line.rstrip() for line in lines]

    for i in range(0, len(lines), 3):
        persona1 = lines[i].replace('\n', '').replace('|', persona_token).replace('.', '')
        persona2 = lines[i+1].replace('\n', '').replace('|', persona_token).replace('.', '')

        dialogue = lines[i+2].replace("Пользователь 1: ", "\n<p1>").replace("Пользователь 2: ", "\n<p2>").split("\n")
        dialogue.pop(0)

        for i in range(1, len(dialogue) - 1):
            your_persona = ""
            persona_id = ""
            if dialogue[i+1][:4] == "<p1>":
                d_len = max_length*2 - len(persona1)
                your_persona = persona1
                persona_id = "<p1>"
            else:
                d_len = max_length*2 - len(persona2)
                your_persona = persona2
                persona_id = "<p2>"
            
            label = you_token + dialogue[i + 1][4:]
            dialogue_history = ""

            for j in range(i, 0, -1):
                if len(dialogue[j][4:] + dialogue_history) <= d_len:
                    if dialogue[j][:4] == persona_id:
                        dialogue_history = you_token + dialogue[j][4:] + dialogue_history
                    else:
                        dialogue_history = other_token + dialogue[j][4:] + dialogue_history
                else:
                    break

            if dialogue_history != "":
                #'<s>' only for DialoGPT3
                context.append('<s>' + your_persona + dialogue_history)
                labels.append(dialogue_history)

dataset = {
    "context": context, 
    "labels": labels
    }

In [None]:
dataset["context"][:10]

In [None]:
tokenizer("<s><per>Я фитнестренер<per>")

In [6]:
print(tokenizer.all_special_ids)
print(tokenizer.all_special_tokens)

[1, 2, 3, 0, 4, 50257, 50258, 50259]
['<s>', '</s>', '<unk>', '<pad>', '<mask>', '<you>', '<oth>', '<per>']


In [7]:
def collate_fn(data):
    texts, labels = zip(*data)
    
    other_token = 50258
    you_token = 50257
    bos_token = 1
    eos_token = 2
    
    labels = tokenizer(list(labels), max_length=max_length, truncation=True)["input_ids"]

    inputs = tokenizer(list(texts), return_tensors='pt', padding='max_length', max_length=max_length, truncation=True)
    
    for i in range(len(labels)):
        temp = []
        j = 1
        while inputs['input_ids'][i][j+1] != other_token and inputs['input_ids'][i][j+1] != you_token:
            temp += [-100]
            j += 1
        labels[i] = temp + [bos_token] + labels[i]
        labels[i].extend([eos_token])
        labels[i].extend([-100 for _ in range(len(labels[i]), max_length)])
    labels = torch.LongTensor(labels)

    if 'token_type_ids' in inputs:
        inputs.pop('token_type_ids')

    inputs['labels'] = labels

    return inputs

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        super().__init__()
        self.texts = list(data['context'])
        self.labels = list(data['labels'])
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        if hasattr(self, 'labels'):
            return self.texts[idx], self.labels[idx]
        else:
            return self.texts[idx], []

In [8]:
dataset_size = len(dataset['labels'])

train_size = int(0.8*dataset_size)
eval_size = dataset_size - train_size
print(train_size)
print(eval_size)
data = TextDataset(dataset)
train, val = torch.utils.data.random_split(
    data, 
    [train_size, eval_size], 
    generator=torch.Generator().manual_seed(42)
)

189236
47309


In [9]:
train_loader = torch.utils.data.DataLoader(train, batch_size=4, shuffle=True, num_workers=0, collate_fn=collate_fn)
val_loader = torch.utils.data.DataLoader(val, batch_size=4, shuffle=True, num_workers=0, collate_fn=collate_fn)

In [10]:
#from datasets import load_metric

def compute_bleu_f1(model, inputs, labels):
    #metric = load_metric("bleu")
    results = {'bleu': [],
               'f1': [],
               'recall': []}

    preds = model.generate(
        **inputs,
        do_sample=True,
        temperature=1.0,
        top_p=0.9,
        no_repeat_ngram_size=3
    ).to('cpu').detach().numpy()
    
    preds = list(preds)
    str_preds = []
    str_labels = []
    for i in range(len(preds)):
        line = []
        for j in range(len(preds[i])):
            line.append(str(preds[i][j]))
        str_preds.append(line)
            
    for i in range(len(labels)):
        line = []
        for j in range(len(labels[i])):
            line.append(str(labels[i][j]))
        str_labels.append(line)
        while '-100' in str_labels[i]:
            str_labels[i].remove('-100')

    for pred, label in zip(str_preds, str_labels):
        #print(f"{len(label)}, {len(pred)}")
        if len(label) < len(pred):
            label.extend(['-100' for _ in range(len(label), len(pred))])
        elif len(label) > len(pred):
            pred.extend([str(tokenizer.pad_token_id) for _ in range(len(pred), len(label))])

        results['bleu'].append(bleu_score([pred], [[label]], max_n=4, weights=[0.25, 0.25, 0.25, 0.25]))
        results['f1'].append(f1_score(label, pred, average='macro'))
        results['recall'].append(recall_score(label, pred, average='macro'))
    return results
    

def eval_model(model, val_loader, skip_generation=False):
    print("Validation")
    losses = []
    bleu = []
    f1 = []
    recall = []

    #if not skip_generation:
    #    _, val = torch.utils.data.random_split(data, [dataset_size - 320, 320])
    #    val_loader = torch.utils.data.DataLoader(val, batch_size=4, shuffle=True, num_workers=0, collate_fn=collate_fn)

    progress_bar = tqdm(range(len(val_loader)))

    model.eval()
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits.to('cpu')

        labels = batch.pop('labels').to('cpu').detach().numpy()

        if not skip_generation:
            results = compute_bleu_f1(model, batch, labels)
            bleu.extend(results['bleu'])
            f1.extend(results['f1'])
            recall.extend(results['recall'])


        loss = outputs.loss
        losses.append(loss.to('cpu').detach().numpy())

        progress_bar.update(1)

    print(f"val loss: {np.mean(losses)}")
    print(f"val ppl: {np.exp(np.mean(losses))}")
    if not skip_generation:
        print(f"val bleu: {np.mean(bleu)}")
        print(f"val f1: {np.mean(f1)}")
        print(f"val recall: {np.mean(recall)}")

    #if np.exp(np.mean(losses)) < 80:
    #    model.save_pretrained("/content/drive/MyDrive/BB_90_BPE/")
    #    tokenizer.save_pretrained("/content/drive/MyDrive/BB_90_BPE/")

def train_model(model, train_loader, val_loader,
                num_epochs, optimizer, scheduler,
                accumulation_steps, fp16):
    ppl = []

    for epoch in range(num_epochs):
        print(f"Epoch: {epoch+1}")
        print("Training")
        progress_bar = tqdm(range(len(train_loader)))
        optimizer.zero_grad()
        
        i = 1
        model.train()
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss / accumulation_steps
            loss.backward()
            
            if i % accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                
            progress_bar.update(1)
            i += 1
        
        eval_model(model, val_loader, skip_generation=True)

In [11]:
from torch.optim import AdamW
from transformers import get_scheduler


def config_training(layers, lr, num_epochs, num_warmup_steps):
    if layers != None:
        for i, param in model.named_parameters():
            param.requires_grad = False
            for layer in layers:
                if layer in i and param.requires_grad == False:
                    param.requires_grad = True
                    print(f"{i}:   {param.shape}")

    optimizer = AdamW(model.parameters(), lr=lr)

    num_training_steps = num_epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
    )

    return optimizer, lr_scheduler


device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)

layers = None
num_epochs = 5
optimizer, lr_scheduler = config_training(layers, lr=5e-5,
                                          num_epochs=num_epochs,
                                          num_warmup_steps=500)

train_model(model, train_loader, val_loader,
            num_epochs, optimizer, lr_scheduler,
            accumulation_steps=16, fp16=False
           )

Epoch: 1
Training


  0%|          | 0/47309 [00:00<?, ?it/s]

Validation


  0%|          | 0/11828 [00:00<?, ?it/s]

val loss: 4.322206974029541
val ppl: 75.35475158691406
Epoch: 2
Training


  0%|          | 0/47309 [00:00<?, ?it/s]

Validation


  0%|          | 0/11828 [00:00<?, ?it/s]

val loss: 3.9818365573883057
val ppl: 53.61540985107422
Epoch: 3
Training


  0%|          | 0/47309 [00:00<?, ?it/s]

Validation


  0%|          | 0/11828 [00:00<?, ?it/s]

val loss: 3.6982533931732178
val ppl: 40.3767204284668
Epoch: 4
Training


  0%|          | 0/47309 [00:00<?, ?it/s]

Validation


  0%|          | 0/11828 [00:00<?, ?it/s]

val loss: 3.4333341121673584
val ppl: 30.979761123657227
Epoch: 5
Training


  0%|          | 0/47309 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
import warnings
warnings.filterwarnings('ignore')
torch.cuda.empty_cache()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)

eval_model(model, val_loader, skip_generation=False)

In [None]:
from bertviz import head_view, model_view

encoder_input_ids = tokenizer("А откуда ты приехал к нам?", return_tensors="pt", add_special_tokens=True).input_ids.to('cuda')
decoder_input_ids = tokenizer("Я в Краснодаре жил 5 лет, потом вот сюда приехал.", return_tensors="pt", add_special_tokens=True).input_ids.to('cuda')

outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])

model_view(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens= encoder_text,
    decoder_tokens = decoder_text
)

In [None]:
head_view(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens= encoder_text,
    decoder_tokens = decoder_text
)

# Тестирование модели

In [12]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model.to(device)

persona = ["Я программист",
           "Живу в Ростове",
           "Не умею играть на гитаре",
           "Обожаю готовить",
           "Не люблю кошек"]

personas = ["<per>Я программист<per>Живу в Ростове<per>Не умею играть на гитаре<per>Работаю два дня<per>Не люблю кошек",
            "<per>Я домохозяйка<per>Вышла замуж после школы<per>Есть двое детей<per>Обожаю готовить<per>Мечтаю об отпуске"]

utterance = "Ты умеешь играть на гитаре?"

for i in persona:
    print(i)
print("________________________________________")
print(utterance + '\n')
outs = []

context = "<per>" + "<per>".join(persona) + "<oth>" + utterance

inputs = tokenizer.encode(
    context,
    return_tensors="pt"
    ).to(device)
for i in range(5):
    outs.append(model.generate(
        inputs,
        #min_length=5,
        #max_length=25,
        do_sample=True,
        temperature=1.0,
        top_k=100,
        no_repeat_ngram_size=3,
        num_return_sequences=2
        ))
for i in range(5):
    for j in range(2):
        print(f"Candidate {i*2 + j + 1}: " + tokenizer.decode(outs[i][j], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Я программист
Живу в Ростове
Не умею играть на гитаре
Обожаю готовить
Не люблю кошек
________________________________________
Ты умеешь играть на гитаре?



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Candidate 1: Я программистЖиву в РостовеНе умею играть на гитареОбожаю готовитьНе люблю кошекТы умеешь играть на гитаре?Да У есть гита тебя какие?у естьКК?КЕ?МТ вообще работаюстом но в время рыба в хожу собира модели их зовутияимаатимяинурое���ха�к������������
Candidate 2: Я программистЖиву в РостовеНе умею играть на гитареОбожаю готовитьНе люблю кошекТы умеешь играть на гитаре?Нится,)Ну бывает подру)А любишь?)Не не, на играю на,х освоить у есть то умеешь нету на больше умею ты������������(����))��������������������😄��
Candidate 3: Я программистЖиву в РостовеНе умею играть на гитареОбожаю готовитьНе люблю кошекТы умеешь играть на гитаре?Лю играть гита люблю, не гита, мараючок поциюсо)))ты?Седи,юемцыФты?мят,к я естьч в уппеч тыуптаправраканммм������ло
Candidate 4: Я программистЖиву в РостовеНе умею играть на гитареОбожаю готовитьНе люблю кошекТы умеешь играть на гитаре?Я играть гита, я не играть песни музыке слушаюНе гита вотюрик ты?ДауУ да своеобраз голосДа отличною уяс!����������)����

In [None]:
persona = "<per>Я программист<per>Люблю кошек<per>Есть двое детей<per>Работаю два дня<per>Ем пиццу"
dialogue_session = persona

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

stop_word = "stop"
inp = input("In: ")
while inp != stop_word:
    dialogue_session += other_token + inp
    inputs = tokenizer.encode(dialogue_session, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs,
        #min_length=5,
        #max_length=25,
        do_sample=True,
        temperature=1.0,
        top_p=0.9,
        no_repeat_ngram_size=3,
        num_return_sequences=1
    )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Out: " + decoded + "\n")
    dialogue_session += you_token + decoded

    inp = input("In: ")