# Automatic predictor

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

## Dataset

In [2]:
class MetricsCorrelationDataset(Dataset):

    def __init__(self, texts, summaries, labels, tokenizer, max_length):
        self.texts = texts
        self.summaries = summaries
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]
        label = self.labels[idx]
        text_encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        summary_encoding = self.tokenizer(summary, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        encoding = self.tokenizer(text, summary, truncation='only_first')
        ans = {
            # 'text_input_ids': text_encoding['input_ids'].flatten(), 
            # 'text_attention_mask': text_encoding['attention_mask'].flatten(), 
            # 'summary_input_ids': summary_encoding['input_ids'].flatten(),
            # 'summary_attention_mask': summary_encoding['attention_mask'].flatten(),
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'label': torch.tensor(label),
            # 'labels': torch.tensor(label)
        }
        
        return ans

## Model

## Data

In [3]:
import pandas as pd
import numpy as np

index = "Ind"
title = "title"
article = "text"
ground_truth = "summary"

files = [
    "mbart_predictions.txt",
    "mt5_predictions.txt",
    "summarunner_predictions.txt",
    "llama_7b_predictions.csv",
    "starling_predictions.csv",
    "yagpt_predictions.csv",
    "yagpt3_predictions.csv"
]
summaries_fields = []
human_metrics = [
    "Актуальность",
    "Последовательность",
    "Беглость",
    "Согласованность",
    "Комментарий"
]

for file in files:
    model_name = file.split(".")[0]
    summaries_fields.append(model_name)
    for metric in human_metrics:
        summaries_fields.append(f"{model_name}_{metric}")

summaries_fields_types = {field : ('Float64' if field.split("_")[-1] != human_metrics[-1] and field + ".txt" not in files and field + ".csv" not in files else str) for field in summaries_fields}
print(summaries_fields_types)
summaries_fields_types[index] = 'Int64'
summaries_fields_types[article] = summaries_fields_types[ground_truth] = str

metrics_data = pd.read_csv("metrics_data.csv", dtype=summaries_fields_types)
expert_data = pd.read_csv("compiled_expert_data.csv", dtype=summaries_fields_types)
# data = pd.read_csv("export_data.csv")

{'mbart_predictions': <class 'str'>, 'mbart_predictions_Актуальность': 'Float64', 'mbart_predictions_Последовательность': 'Float64', 'mbart_predictions_Беглость': 'Float64', 'mbart_predictions_Согласованность': 'Float64', 'mbart_predictions_Комментарий': <class 'str'>, 'mt5_predictions': <class 'str'>, 'mt5_predictions_Актуальность': 'Float64', 'mt5_predictions_Последовательность': 'Float64', 'mt5_predictions_Беглость': 'Float64', 'mt5_predictions_Согласованность': 'Float64', 'mt5_predictions_Комментарий': <class 'str'>, 'summarunner_predictions': <class 'str'>, 'summarunner_predictions_Актуальность': 'Float64', 'summarunner_predictions_Последовательность': 'Float64', 'summarunner_predictions_Беглость': 'Float64', 'summarunner_predictions_Согласованность': 'Float64', 'summarunner_predictions_Комментарий': <class 'str'>, 'llama_7b_predictions': <class 'str'>, 'llama_7b_predictions_Актуальность': 'Float64', 'llama_7b_predictions_Последовательность': 'Float64', 'llama_7b_predictions_Бегло

In [4]:
metrics_data.head()

Unnamed: 0,title,text,summary,mbart_predictions,mt5_predictions,summarunner_predictions,llama_7b_predictions,starling_predictions,yagpt_predictions,yagpt3_predictions,...,mbart_predictions_meteor,mt5_predictions_meteor,summarunner_predictions_meteor,llama_7b_predictions_meteor,starling_predictions_meteor,yagpt_predictions_meteor,yagpt3_predictions_meteor,mbart_predictions_bleu.1,mt5_predictions_bleu.1,summarunner_predictions_bleu.1
0,Названа опасность постоянно включенного Blueto...,Постоянно включенный Bluetooth на смартфоне гр...,Активированный в смартфоне Bluetooth может пре...,Постоянно включенный Bluetooth на смартфоне гр...,"Эксперты предупреждают о том, что отключать Bl...",постоянно включенный bluetooth на смартфоне гр...,"Bluetooth постоянно включен, это опасно, так к...",Постоянно включенный Bluetooth может создавать...,- Постоянно включенный Bluetooth на смартфоне ...,Доцент кафедры информатики РЭУ им. Плеханова А...,...,0.126459,0.053648,0.168245,0.207257,0.127202,0.168304,0.139373,,,
1,Колесникова проходит подозреваемой по делу о з...,Член президиума координационного совета оппози...,Члена президиума оппозиционного Координационно...,Член президиума координационного совета оппози...,Член президиума координационного совета оппози...,член президиума координационного совета оппози...,"Мария Колесникова, член президиума координацио...","Мария Колесникова, член президиума координацио...",- Член президиума координационного совета оппо...,"Мария Колесникова, член президиума координацио...",...,0.248829,0.101734,0.263807,0.231588,0.310913,0.327753,0.224195,,,
2,Deutsche Bank: в мире наступает эпоха беспорядка,Аналитики Deutsche Bank обнародовали исследова...,В истории человечества наступает эпоха беспоря...,В 2020 году в мире наступит новая эпоха беспор...,Пандемия коронавируса привела к появлениям нов...,аналитики deutsche bank обнародовали исследова...,Аналитики Deutsche Bank предсказали наступлени...,"Аналитики Deutsche Bank предполагают, что 2020...",- Аналитики Deutsche Bank предсказали наступле...,Аналитики Deutsche Bank предсказали наступлени...,...,0.309631,0.086664,0.314012,0.262029,0.23865,0.295642,0.311487,,,
3,«Ъ»: Минтранс подготовил проект поправок к ПДД,ГИБДД совместно с Минтрансом разработала масшт...,Масштабный проект изменений в ПДД подготовили ...,ГИБДД совместно с Минтрансом разработала масшт...,В Госдуму вступили в силу поправки в правила д...,гибдд совместно с минтрансом разработала масшт...,ГИБДД и Минтранс разработали проект изменений ...,ГИБДД и Минтранс разработали проект изменений ...,- ГИБДД и Минтранс разработали масштабный прое...,ГИБДД совместно с Минтрансом разработали масшт...,...,0.306315,0.082645,0.233184,0.256739,0.270406,0.338463,0.296722,,,
4,Tesla выпустит бюджетный беспилотник в 2023 году,Вечером 22 сентября на конференции Battery Day...,Миллиардер Илон Маск пообещал представить бюдж...,На конференции Battery Day глава Tesla Илон Ма...,Илон Маск рассчитывает вывести на рынок бюджет...,вечером 22 сентября на конференции battery day...,Илон Маск представил планы Tesla по выходу на ...,"В ходе конференции Tesla Battery Day, Илон Мас...",- Глава Tesla Илон Маск анонсировал вывод на р...,На конференции Battery Day Илон Маск объявил о...,...,0.098891,0.100573,0.146562,0.227205,0.19343,0.202198,0.251442,,,


In [5]:
expert_data.head()

Unnamed: 0,text,summary,summary_Актуальность,summary_Последовательность,summary_Беглость,summary_Согласованность,mbart_predictions,mbart_predictions_Актуальность,mbart_predictions_Последовательность,mbart_predictions_Беглость,...,yagpt_predictions,yagpt_predictions_Актуальность,yagpt_predictions_Последовательность,yagpt_predictions_Беглость,yagpt_predictions_Согласованность,yagpt3_predictions,yagpt3_predictions_Актуальность,yagpt3_predictions_Последовательность,yagpt3_predictions_Беглость,yagpt3_predictions_Согласованность
0,Постоянно включенный Bluetooth на смартфоне гр...,Активированный в смартфоне Bluetooth может пре...,5.0,5.0,5.0,5.0,Постоянно включенный Bluetooth на смартфоне гр...,4.0,4.0,5.0,...,- Постоянно включенный Bluetooth на смартфоне ...,4.0,4.0,4.0,4.0,Доцент кафедры информатики РЭУ им. Плеханова А...,4.0,4.0,5.0,5.0
1,Член президиума координационного совета оппози...,Члена президиума оппозиционного Координационно...,5.0,5.0,5.0,5.0,Член президиума координационного совета оппози...,4.0,4.0,5.0,...,- Член президиума координационного совета оппо...,3.0,5.0,5.0,5.0,"Мария Колесникова, член президиума координацио...",3.0,3.0,5.0,5.0
2,Аналитики Deutsche Bank обнародовали исследова...,В истории человечества наступает эпоха беспоря...,5.0,5.0,5.0,5.0,В 2020 году в мире наступит новая эпоха беспор...,5.0,5.0,5.0,...,- Аналитики Deutsche Bank предсказали наступле...,2.0,5.0,5.0,4.0,Аналитики Deutsche Bank предсказали наступлени...,4.0,5.0,5.0,5.0
3,ГИБДД совместно с Минтрансом разработала масшт...,Масштабный проект изменений в ПДД подготовили ...,4.0,5.0,5.0,5.0,ГИБДД совместно с Минтрансом разработала масшт...,5.0,5.0,4.0,...,- ГИБДД и Минтранс разработали масштабный прое...,3.0,4.0,4.0,4.0,ГИБДД совместно с Минтрансом разработали масшт...,3.0,4.0,5.0,4.0
4,Вечером 22 сентября на конференции Battery Day...,Миллиардер Илон Маск пообещал представить бюдж...,4.0,4.0,5.0,4.0,На конференции Battery Day глава Tesla Илон Ма...,5.0,5.0,5.0,...,- Глава Tesla Илон Маск анонсировал вывод на р...,2.0,5.0,5.0,4.0,На конференции Battery Day Илон Маск объявил о...,4.0,5.0,5.0,5.0


In [6]:
import math

human_metrics = human_metrics[:-1]

models = {
    "mbart_predictions",
    "mt5_predictions",
    "summarunner_predictions",
    "llama_7b_predictions",
    "starling_predictions",
    "yagpt_predictions",
    "yagpt3_predictions"
}
metrics = {
    "bleu",
    "rouge1",
    "meteor",
    "bertscore_f1"
}

texts = []
summaries = []
labels = {k: {"bleu": [], "bertscore": [], "rouge": [], "meteor": []} for k in [0.25, 0.5, 0.75]}
human_scores = []
auto_scores = {"bleu": [], "bertscore": [], "rouge": [], "meteor": []}
totals = {"bleu": 0, "bertscore": 0, "rouge": 0, "meteor": 0}
deviations = {"bleu": [], "bertscore": [], "rouge": [], "meteor": []}


for model in models:
    for (mteric_index, metric_row), (expert_index, expert_row)  in zip(metrics_data.iterrows(), expert_data.iterrows()):
        # if any([row[f"{model}_{metric}"] is None or math.isnan(row[f"{model}_{metric}"]) for metric in metrics]) or row[model] is None or type(row[model]) != str:
        #     continue
        texts.append(metric_row["summary"])
        summaries.append(metric_row[model])
        human_scores.append(np.mean([expert_row[f"{model}_{metric}"] for metric in human_metrics]) / 5)
        # print(model, human_scores[-1])

        auto_scores["bleu"].append(metric_row[f"{model}_bleu"])
        auto_scores["rouge"].append(metric_row[f"{model}_rouge1"])
        auto_scores["meteor"].append(metric_row[f"{model}_meteor"])
        auto_scores["bertscore"].append(metric_row[f"{model}_bertscore_f1"])

mean_human = np.mean(human_scores)
mean_bleu = np.mean(auto_scores["bleu"])
mean_rouge = np.mean(auto_scores["rouge"])
mean_meteor = np.mean(auto_scores["meteor"])
mean_bertscore = np.mean(auto_scores["bertscore"])

for i in range(len(texts)):
    human_scores[i] -= mean_human
    auto_scores["bleu"][i] -= mean_bleu
    auto_scores["rouge"][i] -= mean_rouge
    auto_scores["meteor"][i] -= mean_meteor
    auto_scores["bertscore"][i] -= mean_bertscore

    deviations["bleu"].append(abs(auto_scores["bleu"][i] - human_scores[i]))
    deviations["rouge"].append(abs(auto_scores["rouge"][i] - human_scores[i]))
    deviations["meteor"].append(abs(auto_scores["meteor"][i] - human_scores[i]))
    deviations["bertscore"].append(abs(auto_scores["bertscore"][i] - human_scores[i]))

print("Median devs:")
print("\tBLEU: ", np.median(deviations["bleu"]), np.max(deviations["bleu"]))
print("\tROUGE: ", np.median(deviations["rouge"]), np.max(deviations["rouge"]))
print("\tMETEOR: ", np.median(deviations["meteor"]), np.max(deviations["meteor"]))
print("\tBERTSCORE: ", np.median(deviations["bertscore"]), np.max(deviations["bertscore"]))

for k in [0.25, 0.5, 0.75]:
    labels[k]["bleu"] = [1.0 if deviations["bleu"][i] < np.quantile(deviations["bleu"], k) else 0.0 for i in range(len(texts))]
    labels[k]["rouge"] = [1.0 if deviations["rouge"][i] < np.quantile(deviations["rouge"], k) else 0.0 for i in range(len(texts))]
    labels[k]["meteor"] = [1.0 if deviations["meteor"][i] < np.quantile(deviations["meteor"], k) else 0.0 for i in range(len(texts))]
    labels[k]["bertscore"] = [1.0 if deviations["bertscore"][i] < np.quantile(deviations["bertscore"], k) else 0.0  for i in range(len(texts))]

Median devs:
	BLEU:  0.08652315462924738 0.5889453306820962
	ROUGE:  0.08388135593220336 0.69087546980561
	METEOR:  0.0933422032178387 0.5547894310323825
	BERTSCORE:  0.08324829339981082 0.510270966206278


In [20]:
print(summaries[695:])
print(texts[695:])

['Попробуйте обработать утюжок по всей длине волос, от корней до кончиков, а не карандаш.\n', 'В очередной подборке интересных научных новостей недели:\n', 'Если вы хотите покопаться в гнезде роющих ос, то наверняка узнаете, что они могут укусить вас.\n', 'В мире есть множество способов повысить физическую форму.\n', 'Таиландские лабео - одна из самых опасных рыб в мире, однако, как выяснила обозреватель BBC Earth, они очень агрессивные к представителям своего вида.\n']
['Расчешите волосы расческой или щеткой. Выделите тонкую прядь волос и завейте ее вокруг карандаша. Возьмите утюжок для выпрямления волос и прижмите им волосы, обернутые вокруг карандаша. Без спешки распустите волосы, сняв их с карандаша. Нанесите лак для волос.', 'По возможности расспросите ветеринара о состоянии рыбки. Изучите информацию о виде своей рыбки. Держите больную рыбу в спокойной обстановке. Не кормите рыбку за 24–48 часов до медикаментозной эвтаназии. Проведите эвтаназию одним из указанных далее способов.',

## Dataset instance

In [7]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer

def get_dataset(tokenizer, metric_name, k):
    texts_len = len(texts)
    train = int(texts_len * 0.9)
    test_val = int(texts_len * 0.1)
    # max_length = 512
    return {
        "train": MetricsCorrelationDataset(texts=texts[0:train], summaries=summaries[0:train], labels=labels[k][metric_name][0:train], tokenizer=tokenizer, max_length=max_length),
        "test": MetricsCorrelationDataset(texts=texts[train:texts_len], summaries=summaries[train:texts_len], labels=labels[k][metric_name][train:texts_len], tokenizer=tokenizer, max_length=max_length),
        #"val": MetricsCorrelationDataset(texts=texts[train + test_val:train + 2 * test_val], summaries=summaries[train + test_val:train + 2 * test_val], labels=labels[metric_name][train + test_val:train + 2 * test_val], tokenizer=tokenizer, max_length=max_length)
    }

## Trainer

In [8]:
from transformers import TrainingArguments
from transformers import Trainer

In [9]:
training_args = TrainingArguments(
    output_dir="automatic_predictor",
    learning_rate=3e-4,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=2,
    num_train_epochs=15,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

## Work

In [10]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from sklearn.preprocessing import label_binarize
import numpy as np

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    probabilities = np.exp(predictions) / np.sum(np.exp(predictions), axis=-1, keepdims=True)
    predictions = torch.tensor([float(round(x)) for x in predictions.flatten()])
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")
    p = precision_score(labels, predictions, average="macro")
    r = recall_score(labels, predictions, average="macro")
    return {"precision": p, "recall": r, "f1": f1, "accuracy": accuracy}

In [11]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
    LoraConfig,
    LoraModel
)
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=30, encoder_hidden_size=256)

# config = LoraConfig(
#     task_type="SEQ_2_SEQ_LM",
#     r=8,
#     lora_alpha=32,
#     target_modules=["query", "value"],
#     lora_dropout=0.01,
# )

# BERT

## K=0.5

In [12]:
k = 0.5

In [13]:
model

'mt5_predictions'

In [14]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split


model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruRoberta-large", num_labels=1)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

# model = LoraModel(model, config, "default")
#model = get_peft_model(model, config)

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruRoberta-large")
# tokenizer.model_max_length=514

rouge_dataset = get_dataset(tokenizer, "bleu", 0.5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [458,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [458,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [458,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [458,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [458,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [458,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1289: indexSelectLargeIndex: block: [458,

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [15]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

In [None]:
import gc
import torch

model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache() 

### Rouge

In [None]:
k = 0.25

In [37]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruRoberta-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruRoberta-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "rouge", 0.5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,672,705 || all params: 357,033,474 || trainable%: 0.4685


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.382715,0.257143,0.5,0.339623,0.514286
2,No log,0.258838,0.450175,0.453431,0.445833,0.457143
3,0.434200,0.268265,0.23913,0.485294,0.320388,0.471429
4,0.434200,0.355408,0.242857,0.5,0.326923,0.485714
5,0.303900,0.281692,0.401042,0.468954,0.353741,0.457143
6,0.303900,0.243493,0.470516,0.470588,0.470456,0.471429
7,0.303900,0.247887,0.558908,0.533497,0.481961,0.542857
8,0.297200,0.263486,0.531746,0.511438,0.412611,0.5
9,0.297200,0.25916,0.401042,0.468954,0.353741,0.457143
10,0.314100,0.257431,0.532389,0.519608,0.471517,0.528571


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TypeError: 'method' object is not subscriptable

In [49]:
model

PeftModelForSequenceClassification(
  (base_model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaS

In [38]:
import gc
import torch

model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache() 

### Bertscore

In [39]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruRoberta-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruRoberta-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "bertscore", k=0.5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,672,705 || all params: 357,033,474 || trainable%: 0.4685


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.247336,0.487791,0.490385,0.479741,0.557143
2,No log,0.225914,0.818841,0.519231,0.426418,0.642857
3,0.437800,0.239834,0.314286,0.5,0.385965,0.628571
4,0.437800,0.258104,0.314286,0.5,0.385965,0.628571
5,0.304500,0.33613,0.185714,0.5,0.270833,0.371429
6,0.304500,0.591543,0.185714,0.5,0.270833,0.371429
7,0.304500,0.246795,0.549959,0.553322,0.54593,0.557143
8,0.298300,0.231391,0.623077,0.534965,0.482095,0.642857
9,0.298300,0.237157,0.575,0.539336,0.513889,0.628571
10,0.300800,0.232065,0.311594,0.488636,0.380531,0.614286


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TypeError: 'method' object is not subscriptable

In [41]:
import gc
import torch

model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache() 

### METEOR

In [42]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruRoberta-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruRoberta-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "meteor", k=0.5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.253531,0.278571,0.5,0.357798,0.557143
2,No log,0.255761,0.278571,0.5,0.357798,0.557143
3,0.571600,0.331028,0.278571,0.5,0.357798,0.557143
4,0.571600,0.309653,0.278571,0.5,0.357798,0.557143
5,0.327700,0.247108,0.278571,0.5,0.357798,0.557143
6,0.327700,0.305088,0.221429,0.5,0.306931,0.442857
7,0.327700,0.25276,0.221429,0.5,0.306931,0.442857


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

In [None]:
import gc
import torch

model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache() 

## K=0.75

# ELECTRA

In [None]:
K = 0.75

## BLEU

In [15]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruElectra-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruElectra-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "bleu", 0.50)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruElectra-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,672,705 || all params: 428,582,402 || trainable%: 0.3903


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.305537,0.486486,0.473684,0.48,0.442857
2,No log,0.294317,0.487179,0.5,0.493506,0.442857
3,0.348000,0.289797,0.56,0.368421,0.444444,0.5
4,0.348000,0.259369,0.526316,0.789474,0.631579,0.5
5,0.272200,0.313369,0.542857,1.0,0.703704,0.542857
6,0.272200,0.300057,0.454545,0.263158,0.333333,0.428571
7,0.272200,0.275571,0.575758,0.5,0.535211,0.528571
8,0.253500,0.325151,0.4,0.052632,0.093023,0.442857
9,0.253500,0.285546,0.615385,0.421053,0.5,0.542857
10,0.242900,0.290005,0.64,0.421053,0.507937,0.557143




TypeError: 'method' object is not subscriptable

In [None]:
model

In [None]:
import gc
import torch

model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache() 

### Rouge

In [16]:
k = 0.5

In [17]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruElectra-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruElectra-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "rouge", 0.50)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruElectra-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,672,705 || all params: 428,582,402 || trainable%: 0.3903


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.371094,0.5,0.617647,0.552632,0.514286
2,No log,0.291069,0.551724,0.470588,0.507937,0.557143
3,0.344300,0.271982,0.473684,0.794118,0.593407,0.471429
4,0.344300,0.353613,0.333333,0.029412,0.054054,0.5
5,0.276100,0.280487,0.541667,0.382353,0.448276,0.542857
6,0.276100,0.283094,0.513514,0.558824,0.535211,0.528571
7,0.276100,0.274621,0.5,0.529412,0.514286,0.514286
8,0.255500,0.299621,0.4375,0.205882,0.28,0.485714
9,0.255500,0.293541,0.5,0.323529,0.392857,0.514286
10,0.235000,0.291606,0.5,0.529412,0.514286,0.514286




TypeError: 'method' object is not subscriptable

In [None]:
model

In [None]:
import gc
import torch

model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache() 

### Bertscore

In [18]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruElectra-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruElectra-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "bertscore", 0.50)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruElectra-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,672,705 || all params: 428,582,402 || trainable%: 0.3903


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.265064,0.528302,0.823529,0.643678,0.557143
2,No log,0.240751,0.526316,0.294118,0.377358,0.528571
3,0.370500,0.256923,0.463415,0.558824,0.506667,0.471429
4,0.370500,0.281591,0.666667,0.058824,0.108108,0.528571
5,0.270300,0.273623,0.541667,0.764706,0.634146,0.571429
6,0.270300,0.242673,0.489796,0.705882,0.578313,0.5
7,0.270300,0.23993,0.521739,0.705882,0.6,0.542857
8,0.261500,0.245713,0.6,0.529412,0.5625,0.6
9,0.261500,0.240915,0.588235,0.588235,0.588235,0.6
10,0.240100,0.242676,0.62069,0.529412,0.571429,0.614286




TypeError: 'method' object is not subscriptable

In [None]:
import gc
import torch

model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache() 

### METEOR

In [22]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruRoberta-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruRoberta-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "meteor", k=0.5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,672,705 || all params: 357,033,474 || trainable%: 0.4685


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.304983,0.456522,0.617647,0.525,0.457143
2,No log,0.262649,0.428571,0.088235,0.146341,0.5
3,0.422800,0.271917,0.333333,0.117647,0.173913,0.457143
4,0.422800,0.356842,0.0,0.0,0.0,0.514286
5,0.327400,0.296755,0.492754,1.0,0.660194,0.5
6,0.327400,0.287252,0.5,0.970588,0.66,0.514286
7,0.327400,0.393698,0.477612,0.941176,0.633663,0.471429
8,0.292200,0.279386,0.428571,0.441176,0.434783,0.442857
9,0.292200,0.286523,0.517241,0.441176,0.47619,0.528571
10,0.273500,0.287622,0.513514,0.558824,0.535211,0.528571


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TypeError: 'method' object is not subscriptable

In [None]:
import gc
import torch

model = None
tokenizer = None
gc.collect()
torch.cuda.empty_cache() 

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

path = "RLHFlow/ArmoRM-Llama3-8B-v0.1"
model = AutoModelForSequenceClassification.from_pretrained(path, device_map="cuda", 
                               trust_remote_code=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=True)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 

In [None]:
rouge_dataset = get_dataset(tokenizer, "meteor", k=0.75)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Multiclass

# Most correlated

In [8]:
import math

human_metrics = [x for x in human_metrics[:-1] if "Последовательность" in x]

models = {
    "mbart_predictions",
    "mt5_predictions",
    "summarunner_predictions",
    "llama_7b_predictions",
    "starling_predictions",
    "yagpt_predictions",
    "yagpt3_predictions"
}
metrics = {
    "meteor"
}

texts = []
summaries = []
labels = {k: {"meteor": []} for k in [0.5]}
human_scores = []
auto_scores = {"meteor": []}
totals = {"meteor": 0}
deviations = {"meteor": []}


for model in models:
    for (mteric_index, metric_row), (expert_index, expert_row)  in zip(metrics_data.iterrows(), expert_data.iterrows()):
        # if any([row[f"{model}_{metric}"] is None or math.isnan(row[f"{model}_{metric}"]) for metric in metrics]) or row[model] is None or type(row[model]) != str:
        #     continue
        texts.append(metric_row["text"])
        summaries.append(metric_row[model])
        human_scores.append(np.mean([expert_row[f"{model}_{metric}"] for metric in human_metrics]) / 5)
        # print(model, human_scores[-1])

        auto_scores["meteor"].append(metric_row[f"{model}_meteor"])
mean_human = np.mean(human_scores)
mean_meteor = np.mean(auto_scores["meteor"])

for i in range(len(texts)):
    human_scores[i] -= mean_human
    auto_scores["meteor"][i] -= mean_meteor

    deviations["meteor"].append(abs(auto_scores["meteor"][i] - human_scores[i]))
   
print("Median devs:")
print("\tMETEOR: ", np.median(deviations["meteor"]), np.max(deviations["meteor"]))

for k in [0.5]:
    labels[k]["meteor"] = [1.0 if deviations["meteor"][i] < np.quantile(deviations["meteor"], k) else 0.0 for i in range(len(texts))]

Median devs:
	METEOR:  0.15108287373274837 0.8115955489002815


In [9]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer

def get_dataset(tokenizer, metric_name, k):
    texts_len = len(texts)
    train = int(texts_len * 0.9)
    test_val = int(texts_len * 0.1)
    max_length = 512
    return {
        "train": MetricsCorrelationDataset(texts=texts[0:train], summaries=summaries[0:train], labels=labels[k][metric_name][0:train], tokenizer=tokenizer, max_length=max_length),
        "test": MetricsCorrelationDataset(texts=texts[train:texts_len], summaries=summaries[train:texts_len], labels=labels[k][metric_name][train:texts_len], tokenizer=tokenizer, max_length=max_length),
        #"val": MetricsCorrelationDataset(texts=texts[train + test_val:train + 2 * test_val], summaries=summaries[train + test_val:train + 2 * test_val], labels=labels[metric_name][train + test_val:train + 2 * test_val], tokenizer=tokenizer, max_length=max_length)
    }

## Trainer

In [15]:
from transformers import TrainingArguments
from transformers import Trainer

In [16]:
training_args = TrainingArguments(
    output_dir="automatic_predictor",
    learning_rate=3e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

## Work

In [17]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from sklearn.preprocessing import label_binarize
import numpy as np

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    probabilities = np.exp(predictions) / np.sum(np.exp(predictions), axis=-1, keepdims=True)
    predictions = torch.tensor([float(round(x)) for x in predictions.flatten()])
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")
    p = precision_score(labels, predictions, average="macro")
    r = recall_score(labels, predictions, average="macro")
    return {"precision": p, "recall": r, "f1": f1, "accuracy": accuracy}

In [18]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
)
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=30, encoder_hidden_size=256)

In [19]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruRoberta-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruRoberta-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "meteor", k=0.5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,672,705 || all params: 357,033,474 || trainable%: 0.4685


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.466658,0.221429,0.5,0.306931,0.442857
2,No log,0.267806,0.570768,0.563275,0.536807,0.542857
3,0.309800,0.276626,0.417004,0.449132,0.403409,0.485714
4,0.309800,0.281267,0.557765,0.550455,0.520647,0.528571
5,0.281000,0.270674,0.504934,0.504963,0.499898,0.5
6,0.281000,0.272758,0.488095,0.48842,0.48734,0.5
7,0.281000,0.265915,0.501068,0.500827,0.481947,0.528571
8,0.267600,0.275343,0.492142,0.492142,0.485714,0.485714
9,0.267600,0.268625,0.520833,0.520678,0.520647,0.528571
10,0.246600,0.268643,0.520833,0.520678,0.520647,0.528571


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TypeError: 'method' object is not subscriptable

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

## Dataset

In [2]:
class MetricsCorrelationDataset(Dataset):

    def __init__(self, texts, summaries, labels, tokenizer, max_length):
        self.texts = texts
        self.summaries = summaries
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]
        label = self.labels[idx]
        text_encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        summary_encoding = self.tokenizer(summary, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        encoding = self.tokenizer(text, summary, truncation='only_first')
        ans = {
            # 'text_input_ids': text_encoding['input_ids'].flatten(), 
            # 'text_attention_mask': text_encoding['attention_mask'].flatten(), 
            # 'summary_input_ids': summary_encoding['input_ids'].flatten(),
            # 'summary_attention_mask': summary_encoding['attention_mask'].flatten(),
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'label': torch.tensor(label)
        }
        
        return ans

## Model

## Data

In [3]:
import pandas as pd
import numpy as np

index = "Ind"
title = "title"
article = "text"
ground_truth = "summary"

files = [
    f"{ground_truth}.txt", # Dummy for summary field
    "mbart_predictions.txt",
    "mt5_predictions.txt",
    "summarunner_predictions.txt",
    "llama_7b_predictions.csv",
    "starling_predictions.csv",
    "yagpt_predictions.csv",
    "yagpt3_predictions.csv"
]
summaries_fields = []
human_metrics = [
    "Актуальность",
    "Последовательность",
    "Беглость",
    "Согласованность",
    "Комментарий"
]

for file in files:
    model_name = file.split(".")[0]
    summaries_fields.append(model_name)
    for metric in human_metrics:
        summaries_fields.append(f"{model_name}_{metric}")

summaries_fields_types = {field : ('Float64' if field.split("_")[-1] != human_metrics[-1] and field + ".txt" not in files and field + ".csv" not in files else str) for field in summaries_fields}
print(summaries_fields_types)
summaries_fields_types[index] = 'Int64'
summaries_fields_types[article] = summaries_fields_types[ground_truth] = str

metrics_data = pd.read_csv("metrics_data.csv", dtype=summaries_fields_types)
expert_data = pd.read_csv("compiled_expert_data.csv", dtype=summaries_fields_types)
# data = pd.read_csv("export_data.csv")

{'summary': <class 'str'>, 'summary_Актуальность': 'Float64', 'summary_Последовательность': 'Float64', 'summary_Беглость': 'Float64', 'summary_Согласованность': 'Float64', 'summary_Комментарий': <class 'str'>, 'mbart_predictions': <class 'str'>, 'mbart_predictions_Актуальность': 'Float64', 'mbart_predictions_Последовательность': 'Float64', 'mbart_predictions_Беглость': 'Float64', 'mbart_predictions_Согласованность': 'Float64', 'mbart_predictions_Комментарий': <class 'str'>, 'mt5_predictions': <class 'str'>, 'mt5_predictions_Актуальность': 'Float64', 'mt5_predictions_Последовательность': 'Float64', 'mt5_predictions_Беглость': 'Float64', 'mt5_predictions_Согласованность': 'Float64', 'mt5_predictions_Комментарий': <class 'str'>, 'summarunner_predictions': <class 'str'>, 'summarunner_predictions_Актуальность': 'Float64', 'summarunner_predictions_Последовательность': 'Float64', 'summarunner_predictions_Беглость': 'Float64', 'summarunner_predictions_Согласованность': 'Float64', 'summarunner_

In [4]:
metrics_data.head()

Unnamed: 0,title,text,summary,mbart_predictions,mt5_predictions,summarunner_predictions,llama_7b_predictions,starling_predictions,yagpt_predictions,yagpt3_predictions,...,mbart_predictions_meteor,mt5_predictions_meteor,summarunner_predictions_meteor,llama_7b_predictions_meteor,starling_predictions_meteor,yagpt_predictions_meteor,yagpt3_predictions_meteor,mbart_predictions_bleu.1,mt5_predictions_bleu.1,summarunner_predictions_bleu.1
0,Названа опасность постоянно включенного Blueto...,Постоянно включенный Bluetooth на смартфоне гр...,Активированный в смартфоне Bluetooth может пре...,Постоянно включенный Bluetooth на смартфоне гр...,"Эксперты предупреждают о том, что отключать Bl...",постоянно включенный bluetooth на смартфоне гр...,"Bluetooth постоянно включен, это опасно, так к...",Постоянно включенный Bluetooth может создавать...,- Постоянно включенный Bluetooth на смартфоне ...,Доцент кафедры информатики РЭУ им. Плеханова А...,...,0.126459,0.053648,0.168245,0.207257,0.127202,0.168304,0.139373,,,
1,Колесникова проходит подозреваемой по делу о з...,Член президиума координационного совета оппози...,Члена президиума оппозиционного Координационно...,Член президиума координационного совета оппози...,Член президиума координационного совета оппози...,член президиума координационного совета оппози...,"Мария Колесникова, член президиума координацио...","Мария Колесникова, член президиума координацио...",- Член президиума координационного совета оппо...,"Мария Колесникова, член президиума координацио...",...,0.248829,0.101734,0.263807,0.231588,0.310913,0.327753,0.224195,,,
2,Deutsche Bank: в мире наступает эпоха беспорядка,Аналитики Deutsche Bank обнародовали исследова...,В истории человечества наступает эпоха беспоря...,В 2020 году в мире наступит новая эпоха беспор...,Пандемия коронавируса привела к появлениям нов...,аналитики deutsche bank обнародовали исследова...,Аналитики Deutsche Bank предсказали наступлени...,"Аналитики Deutsche Bank предполагают, что 2020...",- Аналитики Deutsche Bank предсказали наступле...,Аналитики Deutsche Bank предсказали наступлени...,...,0.309631,0.086664,0.314012,0.262029,0.23865,0.295642,0.311487,,,
3,«Ъ»: Минтранс подготовил проект поправок к ПДД,ГИБДД совместно с Минтрансом разработала масшт...,Масштабный проект изменений в ПДД подготовили ...,ГИБДД совместно с Минтрансом разработала масшт...,В Госдуму вступили в силу поправки в правила д...,гибдд совместно с минтрансом разработала масшт...,ГИБДД и Минтранс разработали проект изменений ...,ГИБДД и Минтранс разработали проект изменений ...,- ГИБДД и Минтранс разработали масштабный прое...,ГИБДД совместно с Минтрансом разработали масшт...,...,0.306315,0.082645,0.233184,0.256739,0.270406,0.338463,0.296722,,,
4,Tesla выпустит бюджетный беспилотник в 2023 году,Вечером 22 сентября на конференции Battery Day...,Миллиардер Илон Маск пообещал представить бюдж...,На конференции Battery Day глава Tesla Илон Ма...,Илон Маск рассчитывает вывести на рынок бюджет...,вечером 22 сентября на конференции battery day...,Илон Маск представил планы Tesla по выходу на ...,"В ходе конференции Tesla Battery Day, Илон Мас...",- Глава Tesla Илон Маск анонсировал вывод на р...,На конференции Battery Day Илон Маск объявил о...,...,0.098891,0.100573,0.146562,0.227205,0.19343,0.202198,0.251442,,,


In [5]:
expert_data.head()

Unnamed: 0,text,summary,summary_Актуальность,summary_Последовательность,summary_Беглость,summary_Согласованность,mbart_predictions,mbart_predictions_Актуальность,mbart_predictions_Последовательность,mbart_predictions_Беглость,...,yagpt_predictions,yagpt_predictions_Актуальность,yagpt_predictions_Последовательность,yagpt_predictions_Беглость,yagpt_predictions_Согласованность,yagpt3_predictions,yagpt3_predictions_Актуальность,yagpt3_predictions_Последовательность,yagpt3_predictions_Беглость,yagpt3_predictions_Согласованность
0,Постоянно включенный Bluetooth на смартфоне гр...,Активированный в смартфоне Bluetooth может пре...,5.0,5.0,5.0,5.0,Постоянно включенный Bluetooth на смартфоне гр...,4.0,4.0,5.0,...,- Постоянно включенный Bluetooth на смартфоне ...,4.0,4.0,4.0,4.0,Доцент кафедры информатики РЭУ им. Плеханова А...,4.0,4.0,5.0,5.0
1,Член президиума координационного совета оппози...,Члена президиума оппозиционного Координационно...,5.0,5.0,5.0,5.0,Член президиума координационного совета оппози...,4.0,4.0,5.0,...,- Член президиума координационного совета оппо...,3.0,5.0,5.0,5.0,"Мария Колесникова, член президиума координацио...",3.0,3.0,5.0,5.0
2,Аналитики Deutsche Bank обнародовали исследова...,В истории человечества наступает эпоха беспоря...,5.0,5.0,5.0,5.0,В 2020 году в мире наступит новая эпоха беспор...,5.0,5.0,5.0,...,- Аналитики Deutsche Bank предсказали наступле...,2.0,5.0,5.0,4.0,Аналитики Deutsche Bank предсказали наступлени...,4.0,5.0,5.0,5.0
3,ГИБДД совместно с Минтрансом разработала масшт...,Масштабный проект изменений в ПДД подготовили ...,4.0,5.0,5.0,5.0,ГИБДД совместно с Минтрансом разработала масшт...,5.0,5.0,4.0,...,- ГИБДД и Минтранс разработали масштабный прое...,3.0,4.0,4.0,4.0,ГИБДД совместно с Минтрансом разработали масшт...,3.0,4.0,5.0,4.0
4,Вечером 22 сентября на конференции Battery Day...,Миллиардер Илон Маск пообещал представить бюдж...,4.0,4.0,5.0,4.0,На конференции Battery Day глава Tesla Илон Ма...,5.0,5.0,5.0,...,- Глава Tesla Илон Маск анонсировал вывод на р...,2.0,5.0,5.0,4.0,На конференции Battery Day Илон Маск объявил о...,4.0,5.0,5.0,5.0


In [13]:
import math
import numpy as np

human_metrics = human_metrics[:-1]

models = {
    "mbart_predictions",
    "mt5_predictions",
    "summarunner_predictions",
    "llama_7b_predictions",
    "starling_predictions",
    "yagpt_predictions",
    "yagpt3_predictions"
}
metrics = {
    "bleu",
    "rouge1",
    "meteor",
    "bertscore_f1"
}

texts = []
summaries = []
labels = {k: {"multi": [], "bleu": [], "bertscore": [], "rouge": [], "meteor": []} for k in [0.25, 0.5, 0.75]}
human_scores = []
auto_scores = {"bleu": [], "bertscore": [], "rouge": [], "meteor": []}
totals = {"bleu": 0, "bertscore": 0, "rouge": 0, "meteor": 0}
deviations = {"bleu": [], "bertscore": [], "rouge": [], "meteor": []}


for model in models:
    for (mteric_index, metric_row), (expert_index, expert_row)  in zip(metrics_data.iterrows(), expert_data.iterrows()):
        # if any([row[f"{model}_{metric}"] is None or math.isnan(row[f"{model}_{metric}"]) for metric in metrics]) or row[model] is None or type(row[model]) != str:
        #     continue
        texts.append(metric_row["text"])
        summaries.append(metric_row[model])
        human_scores.append(np.mean([expert_row[f"{model}_{metric}"] for metric in human_metrics]) / 5)
        # print(model, human_scores[-1])

        auto_scores["bleu"].append(metric_row[f"{model}_bleu"])
        auto_scores["rouge"].append(metric_row[f"{model}_rouge1"])
        auto_scores["meteor"].append(metric_row[f"{model}_meteor"])
        auto_scores["bertscore"].append(metric_row[f"{model}_bertscore_f1"])

mean_human = np.mean(human_scores)
mean_bleu = np.mean(auto_scores["bleu"])
mean_rouge = np.mean(auto_scores["rouge"])
mean_meteor = np.mean(auto_scores["meteor"])
mean_bertscore = np.mean(auto_scores["bertscore"])

for i in range(len(texts)):
    human_scores[i] -= mean_human
    auto_scores["bleu"][i] -= mean_bleu
    auto_scores["rouge"][i] -= mean_rouge
    auto_scores["meteor"][i] -= mean_meteor
    auto_scores["bertscore"][i] -= mean_bertscore

    deviations["bleu"].append(abs(auto_scores["bleu"][i] - human_scores[i]))
    deviations["rouge"].append(abs(auto_scores["rouge"][i] - human_scores[i]))
    deviations["meteor"].append(abs(auto_scores["meteor"][i] - human_scores[i]))
    deviations["bertscore"].append(abs(auto_scores["bertscore"][i] - human_scores[i]))

print("Median devs:")
print("\tBLEU: ", np.median(deviations["bleu"]), np.max(deviations["bleu"]))
print("\tROUGE: ", np.median(deviations["rouge"]), np.max(deviations["rouge"]))
print("\tMETEOR: ", np.median(deviations["meteor"]), np.max(deviations["meteor"]))
print("\tBERTSCORE: ", np.median(deviations["bertscore"]), np.max(deviations["bertscore"]))

for k in [0.5]:
    for i in range(len(texts)):
        abs_bleu_dev = abs(deviations["bleu"][i] - np.quantile(deviations["bleu"], k)) / np.quantile(deviations["bleu"], k) 
        abs_rouge_dev = abs(deviations["rouge"][i] - np.quantile(deviations["rouge"], k)) / np.quantile(deviations["rouge"], k) 
        abs_meteor_dev = abs(deviations["meteor"][i] - np.quantile(deviations["meteor"], k)) / np.quantile(deviations["meteor"], k) 
        abs_bertscore_dev = abs(deviations["bertscore"][i] - np.quantile(deviations["bertscore"], k)) / np.quantile(deviations["bertscore"], k) 
    
        argm = np.argmin([abs_bleu_dev, abs_rouge_dev, abs_meteor_dev, abs_bertscore_dev])
        # dta = [0.0] * 4
        # dta[argm] = 1
        labels[k]["multi"].append(argm)    

Median devs:
	BLEU:  0.08804808220346548 0.5697786640154294
	ROUGE:  0.09462855215448177 0.6717088031389432
	METEOR:  0.09814062886468539 0.5356227643657158
	BERTSCORE:  0.0897634963194529 0.4911042995396112


## Dataset instance

In [14]:
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer

def get_dataset(tokenizer, metric_name, k):
    texts_len = len(texts)
    train = int(texts_len * 0.9)
    test_val = int(texts_len * 0.1)
    max_length = 512
    return {
        "train": MetricsCorrelationDataset(texts=texts[0:train], summaries=summaries[0:train], labels=labels[k][metric_name][0:train], tokenizer=tokenizer, max_length=max_length),
        "test": MetricsCorrelationDataset(texts=texts[train:texts_len], summaries=summaries[train:texts_len], labels=labels[k][metric_name][train:texts_len], tokenizer=tokenizer, max_length=max_length),
        #"val": MetricsCorrelationDataset(texts=texts[train + test_val:train + 2 * test_val], summaries=summaries[train + test_val:train + 2 * test_val], labels=labels[metric_name][train + test_val:train + 2 * test_val], tokenizer=tokenizer, max_length=max_length)
    }

## Trainer

In [15]:
from transformers import TrainingArguments
from transformers import Trainer

In [16]:
training_args = TrainingArguments(
    output_dir="automatic_predictor",
    learning_rate=3e-4,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

## Work

In [17]:
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score
)
from sklearn.preprocessing import label_binarize
import numpy as np

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    probabilities = np.exp(predictions) / np.sum(np.exp(predictions), axis=-1, keepdims=True)
    predictions = torch.tensor([float(round(x)) for x in predictions.flatten()])
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="macro")
    p = precision_score(labels, predictions, average="macro")
    r = recall_score(labels, predictions, average="macro")
    return {"precision": p, "recall": r, "f1": f1, "accuracy": accuracy}

In [18]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptEncoderConfig,
)
peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=30, encoder_hidden_size=256)

In [19]:
from transformers import AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split



model = AutoModelForSequenceClassification.from_pretrained("ai-forever/ruRoberta-large", num_labels=1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

tokenizer = AutoTokenizer.from_pretrained("ai-forever/ruRoberta-large")
tokenizer.model_max_length=482

rouge_dataset = get_dataset(tokenizer, "multi", k=0.5)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=rouge_dataset["train"],
    eval_dataset=rouge_dataset["test"],
    tokenizer=rouge_dataset["train"].tokenizer,
    # data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,672,705 || all params: 357,033,474 || trainable%: 0.4685




RuntimeError: Found dtype Long but expected Float