In [1]:
!pip install transformers
!pip install rouge

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 23.8 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 33.6 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.5-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 31.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [132]:
import pandas as pd
import numpy as np
import csv
import random
import os
import re

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
import pickle

## Prepare data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
os.chdir("/content/drive/My Drive/ds-texts")

In [45]:
# загружаем датафрейм с описаниями предметов и выкидываем нулевые строки
ds_info = pd.read_csv('/content/drive/MyDrive/ds-texts/ds3.csv')
ds_info = ds_info.dropna()

In [46]:
# разбиваем на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(ds_info['description'], ds_info['name'], test_size=0.1, random_state=42)

## Prepare dataset

In [56]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [61]:
# загружаем токенизатор и модель distilgpt2
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model = model.to(device)

In [62]:
# класс датасетов для модели, в котором данные токенизируются
class DescDataset(Dataset):
    
    def __init__(self, X, y, max_length=1024, 
                 tokenizer=tokenizer):
        super().__init__()

        X.reset_index(drop=True, inplace = True)
        y.reset_index(drop=True, inplace = True)

        self.tokenizer = tokenizer
        self.end_token = "<|endoftext|>"
        self.start_token = "<|startoftext|>"
        self.descriptions = []

        for i in range(y.shape[0]):
          self.descriptions.append(torch.tensor(
                self.tokenizer.encode(
                    f"{self.start_token}{y[i]}\n{X[i][:max_length]}{self.end_token}")))
                
        
    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, item):
        return self.descriptions[item]

In [63]:
dataset = DescDataset(X_train, y_train)
dataset_test = DescDataset(X_test, y_test)
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

## Model training

In [64]:
batch_size = 16
epochs = 8
learning_rate = 3e-5
warmup_steps = 100
max_seq_len = 400

In [65]:
# метод обучения модели
def train(model, output_path = "trained_models"):

    model = model.to(device)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps,
                                                num_training_steps = -1)
    tmp_items_tens = None
    for epoch in range(epochs):
        proc_seq_count = 0
        sum_loss = 0.0
        for _, item in tqdm(enumerate(data_loader), total=len(data_loader)):


            # хотим запихнуть как можно больше токенизированных итемов в последовательность длины max_seq_len
            item_tens = item.to(device)
            # пропускаем если он длиннее max_seq_len
            if item_tens.size()[1] > max_seq_len:
                continue
            
            # кладем во временный накопительный тензор первый элемент
            if not torch.is_tensor(tmp_items_tens):
                tmp_items_tens = item_tens
                continue
            else:
                # если новый элемент не помещается в накопительный тензор, то мы кладем его во временный
                # а продолжаем работать с заполненным
                if tmp_items_tens.size()[1] + item_tens.size()[1] > max_seq_len:
                    work_items_tens = tmp_items_tens
                    tmp_items_tens = item_tens
                else:
                    # иначе кладем в накопительный тензор
                    tmp_items_tens = torch.cat([tmp_items_tens, item_tens[:,1:]], dim=1)
                    continue

            # обучаем модель
            outputs = model(work_items_tens, labels=work_items_tens)
            loss, logits = outputs[:2]                        
            loss.backward()
            sum_loss += loss.detach().data
                          
            if proc_seq_count % batch_size == 0:  
                optimizer.step()
                scheduler.step() 
                optimizer.zero_grad()
                model.zero_grad()
            proc_seq_count +=  1
        print( f"Epoch {epoch+1} | Train loss: {sum_loss}")

        # сохраняем чекпоинты
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        torch.save(model.state_dict(), os.path.join(output_path, f"distilgpt2_ds_{epoch+1}.pt"))
train(model)
            

  0%|          | 0/1451 [00:00<?, ?it/s]

Epoch 1 | Train loss: 1251.16796875


  0%|          | 0/1451 [00:00<?, ?it/s]

Epoch 2 | Train loss: 1114.3729248046875


  0%|          | 0/1451 [00:00<?, ?it/s]

Epoch 3 | Train loss: 1007.4694213867188


  0%|          | 0/1451 [00:00<?, ?it/s]

Epoch 4 | Train loss: 967.27490234375


  0%|          | 0/1451 [00:00<?, ?it/s]

Epoch 5 | Train loss: 915.7435302734375


  0%|          | 0/1451 [00:00<?, ?it/s]

Epoch 6 | Train loss: 866.21728515625


  0%|          | 0/1451 [00:00<?, ?it/s]

Epoch 7 | Train loss: 829.076416015625


  0%|          | 0/1451 [00:00<?, ?it/s]

Epoch 8 | Train loss: 839.59423828125


In [15]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=35,
    top_p=0.8,
    temperature=0.8,
    model_path = "trained_models",
    model_epoch = 8):

    # загружаем обученную ранее модель
    model.load_state_dict(torch.load(os.path.join(
        model_path, f"distilgpt2_ds_{model_epoch}.pt"), 
        map_location=torch.device(device)))

    output_file_path = f'generated_{model_epoch}.txt'
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    model.eval()
    model = model.to('cpu')
        
    generated_num = 0
    generated_list = []
    with torch.no_grad():
      
            for idx in range(entry_count):
            
                description_finished = False

                # достаем поданный на вход промт и генерируем текст необходимой длины
                cur_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

                for i in range(entry_length):
                    outputs = model(cur_ids, labels=cur_ids)
                    loss, logits = outputs[:2]
                    # преподготавливаем logits - скоры для всех словарных токенов
                    logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                    sorted_indices_to_remove = cumulative_probs > top_p
                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                        ..., :-1
                    ].clone()
                    sorted_indices_to_remove[..., 0] = 0

                    indices_to_remove = sorted_indices[sorted_indices_to_remove]
                    logits[:, indices_to_remove] = -float("Inf")
                    
                    # случайно выбираем токен
                    next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                    cur_ids = torch.cat((cur_ids, next_token), dim=1)
                    if next_token in tokenizer.encode('<|endoftext|>'):
                        description_finished = True

                    # если встретили завершающий токен, значит завершаем генерацию
                    # и кладем результат в generated_list
                    if description_finished:
                        
                        generated_num += 1
                        
                        output_list = list(cur_ids.squeeze().numpy())
                        output_text = tokenizer.decode(output_list)
                        generated_list.append(output_text)

                        with open(output_file_path, 'a') as f:
                            f.write(f"{output_text}\n")
                        break
                # если до конца текста необходимой длины генерация не завершилась,
                # завершаем принудительно
                if not description_finished:
                    output_list = list(cur_ids.squeeze().numpy())
                    output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
                    generated_list.append(output_text)
    return generated_list
              

In [139]:
# генерация текста для списка названий предметов
def text_generation(test_data, entry_count = 1):
  generated_descriptions = []
  for i in range(len(test_data)):
    prompt = f'<|startoftext|>'+test_data[i]+ f'\n'
    x = generate(model, tokenizer, prompt, entry_count=entry_count)
    for j in range(0, entry_count):
        x[j]=x[j].replace(prompt,'')
    generated_descriptions.append((x,test_data[i]))
  return generated_descriptions

In [136]:
gens = text_generation(['Your mom', 'Your dad'])

In [137]:
# обрезание сгенерированных предложений до последней точки
# или до последней запятой, если точек нет

def prepare_results(descriptions):
    labels, descs = [], []
    for i in range(len(descriptions)):
        cur_descriptions=[]
        for j in range(len(descriptions[i][0])):
            remains = descriptions[i][0][j]
            remains = re.sub('[<|>]', '', remains)
            remains = re.sub(r'startoftext', '', remains)
            remains = re.sub(r'endoftext', '', remains)
            result = remains.replace(remains.split('.')[-1],'')
            if result == '':
                result = remains.replace(","+remains.split(',')[-1],'.')

            cur_descriptions.append(result)
        descs.append(cur_descriptions)
        labels.append(descriptions[i][1])
    return labels, descs

In [134]:
# проверим, как модель справляется с генерацией
labels, descs = prepare_results(gens)
for i in range(len(labels)):
    print(labels[i])
    for j in range(len(descs[i])):
        print(descs[i][j])



Your mom
Falling on the river from the depths of the Black Sea. Your mom, a proud mother, is a great lady.
Your dad
A very unusual, blue-haired blacksmith who is the last of the Order of the Gondor, who is the last of the Order of the Gondor.


## Analysis of performance

In [140]:
# посчитаем среднее значение метрики BLEU на тестовом датасете
def bleu_analysis(X_test, y_test):
    scores=[]
    generated_list = []
    y_test_list = y_test.tolist()
    for num, item in tqdm(enumerate(X_test), total = len(X_test)):
      reference = item
      candidate = prepare_results(text_generation([y_test_list[num]]))[1][0][0]
      generated_list.append(candidate)
      scores.append(sentence_bleu(reference, candidate))
    return scores, generated_list
scores_ft, generated_list_ft = bleu_analysis(X_test, y_test)

  0%|          | 0/162 [00:00<?, ?it/s]

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [141]:
# сохраним результаты чтобы потом сравнить их с результатами distilgpt без файнтьюна
if not os.path.exists('reports'):
    os.mkdir('reports')
with open("reports/test_generation_score", "wb") as fp:
    pickle.dump(scores_ft, fp)
with open("reports/test_generation", "wb") as fp:
    pickle.dump(generated_list_ft, fp)


In [145]:
# считаем значение метрики Rouge
rouge=Rouge()
with open('reports/test_generation_score', "rb") as fp:
    scores = pickle.load(fp)
print(np.mean(scores))
with open('reports/test_generation', "rb") as fp:
    generated_list = pickle.load(fp)

rouge_score = rouge.get_scores(generated_list, X_test.tolist(), avg=True)

0.6800679419694969


## Without finetune

In [148]:
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

In [149]:
# генерация предложений с помощью исходного distilgpt2
def generate_no_finetune(prompt_text, model, tokenizer, n_seqs=1, max_length=35):

  encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")

  output_sequences = model.generate(
      input_ids=encoded_prompt,
      max_length=max_length,
      temperature=0.8,
      top_k=0,
      top_p=0.8,
      repetition_penalty = 1.0,
      do_sample=True,
      num_return_sequences=n_seqs
  ) 

  # детокенизируем получившиеся последовательности в строку
  generated_sequences = []
  for generated_sequence in output_sequences:
    generated_sequence = generated_sequence.tolist()
    text = tokenizer.decode(generated_sequence)
    total_sequence = (
        prompt_text + text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True, )) :]
    )
    generated_sequences.append(total_sequence)
  return generated_sequences

In [150]:
# методы аналогичные методам для finetuned gpt:
# для генерации описаний по списку названий и для преобразования в более удобный для вывода вид
def text_generation_no_finetune(test_data, entry_count = 1):
    generated_descriptions = []
    for i in range(len(test_data)):
      prompt = test_data[i]
      x = generate_no_finetune(prompt, model, tokenizer, n_seqs=entry_count)
      for j in range(0, entry_count):
          x[j]=x[j].replace(prompt,'')
      generated_descriptions.append((x,test_data[i]))
    return generated_descriptions


def prepare_results_no_finetune(descriptions):
    labels, descs = [], []
    for i in range(len(descriptions)):
        cur_descriptions=[]
        for j in range(len(descriptions[i][0])):
            remains = descriptions[i][0][j]
            cur_descriptions.append(remains)
        descs.append(cur_descriptions)
        labels.append(descriptions[i][1])
    return labels, descs

In [None]:

def bleu_analysis_no_finetune(X_test, y_test):
    scores=[]
    generated_list = []
    y_test_list = y_test.tolist()
    for num, item in tqdm(enumerate(X_test), total = len(X_test)):
      reference = item
      candidate = prepare_results_no_finetune(
          text_generation_no_finetune([y_test_list[num]]))[1][0][0]
      generated_list.append(candidate)

      scores.append(sentence_bleu(reference, candidate))

    mean_score = statistics.mean(scores)
    return scores, generated_list
scores, generated_list = bleu_analysis_no_finetune(X_test, y_test)

with open("reports/test_generation_score_no_finetune", "wb") as fp:
    pickle.dump(scores, fp)
with open("reportst/test_generation_no_finetune", "wb") as fp:
    pickle.dump(generated_list, fp)


In [152]:
rouge=Rouge()

with open('reports/test_generation_no_finetune', "rb") as fp:
    generated_list = pickle.load(fp)

rouge_score_no_finetune = rouge.get_scores(generated_list, X_test.tolist(), avg=True)

In [153]:
# наконец сравним значения метрик для distilgpt2 с finetune и без:
with open("reports/test_generation_score", "rb") as fp:
    scores = pickle.load(fp)
mean_score = np.mean(scores)

with open("reports/test_generation_score_no_finetune", "rb") as fp:
    scores_no_finetune = pickle.load(fp)
mean_score_no_finetune = np.mean(scores_no_finetune)


print('BLEU scores on test dataset: \ndistilgpt2 with finetune = {} \
 \ndistilgpt2 without finetune = {}\n'.format(mean_score, mean_score_no_finetune))

print('Rouge scores on test dataset: \ndistilgpt2 with finetune: \n{} \
 \ndistilgpt2 without finetune: \n{}'.format(rouge_score, rouge_score_no_finetune))


BLEU scores on test dataset: 
distilgpt2 with finetune = 0.6800679419694969  
distilgpt2 without finetune = 0.5248875355174933

Rouge scores on test dataset: 
distilgpt2 with finetune: 
{'rouge-1': {'r': 0.10809590673019505, 'p': 0.28539422735282427, 'f': 0.1494857160767181}, 'rouge-2': {'r': 0.016483544299957027, 'p': 0.061028605536951386, 'f': 0.02402135474553153}, 'rouge-l': {'r': 0.09999876134119325, 'p': 0.2667124449259214, 'f': 0.13844507041090506}}  
distilgpt2 without finetune: 
{'rouge-1': {'r': 0.0440963820031414, 'p': 0.1441243007630924, 'f': 0.061045051756254846}, 'rouge-2': {'r': 0.00425779761681156, 'p': 0.015001627556858228, 'f': 0.006233271749088017}, 'rouge-l': {'r': 0.03988230763942435, 'p': 0.13461678622538462, 'f': 0.055434764218912365}}
