# Metrics calculation (LLM)

In [1]:
import pandas as pd
# summary_dataset = pd.read_csv("summary_dataset_with_bertscore.csv")
# columns = [column for column in summary_dataset.columns if "bertscore" not in column] 
# print(columns)
# summary_dataset = summary_dataset[columns]
# summary_dataset.to_csv("summary_dataset.csv", index=False)
# summary_dataset = pd.read_csv("summary_dataset.csv")
# summary_dataset.head()

In [2]:
summary_dataset = pd.read_csv("summary_dataset_with_bertscore_bleu_rouge_meteor_indexed.csv")
summary_dataset = summary_dataset.rename(columns={"Unnamed: 0": "Ind"})
import random

random.seed(43)
indices_for_evaluation = random.sample(range(0, len(summary_dataset)), 55)
indices_for_evaluation = sorted(indices_for_evaluation)
summary_dataset = summary_dataset[summary_dataset["Ind"].isin(indices_for_evaluation)]

In [3]:
summary_files = [
    "llama_7b_predictions.csv",
    "starling_predictions.csv",
    "yagpt_predictions.csv",
    "yagpt3_predictions.csv"
]

llm_data = pd.DataFrame()
for summary_file in summary_files:
    model_name = summary_file.split(".")[0]
    df = pd.read_csv(summary_file)[[model_name]]
    llm_data = pd.concat([llm_data, df], axis=1)

llm_data.head()

Unnamed: 0,llama_7b_predictions,starling_predictions,yagpt_predictions,yagpt3_predictions
0,"Чистите зубы каждый день, пользовайтесь зубной...",Для сохранения белизны и здоровья зубов важно ...,"- Чистить зубы каждый день, использовать зубну...",Для поддержания здоровья зубов и предотвращени...
1,После доставки травы домой ее следует подготов...,"После приобретения травы, её необходимо промыт...","К сожалению, я не могу ничего сказать об этом....","К сожалению, я не могу ничего сказать об этом...."
2,"Ваши руки - ваше сокровище, ухаживайте за ними...",Важнейшая инструкция для ухода за руками и их ...,- Руки являются важным инструментом для моделе...,"Автор текста призывает заботиться о руках, пос..."
3,Используйте посудомоечную машину для стирки од...,Важно учить детей умению стирки и прививать им...,- Задача пользователя - генерировать короткий ...,Нужно положить грязную одежду в стиральную маш...
4,"Записывайте сны сразу после пробуждения, чтобы...","Сны - это частый явление, но часто трудно запо...","- Все люди видят сны, но часто забывают их.\n-...","Всем снятся сны, но не всегда их удаётся запом..."


In [4]:
summary_dataset = pd.concat([summary_dataset, llm_data.set_index(summary_dataset.index)], axis=1)
summary_dataset = summary_dataset[[column for column in summary_dataset.columns if "bertscore" not in column and "rouge" not in column and "meteor" not in column]]
summary_dataset = summary_dataset[summary_dataset["yagpt_predictions"] != "К сожалению, я не могу ничего сказать об этом. Давайте сменим тему?"]
summary_dataset.head()
print("Total: ", len(summary_dataset))

Total:  51


## BERTScore

In [5]:
from evaluate import load
bertscore = load("bertscore")
files = [
    "mbart_predictions.txt",
    "mt5_predictions.txt",
    "summarunner_predictions.txt",
    "llama_7b_predictions.csv",
    "starling_predictions.csv",
    "yagpt_predictions.csv",
    "yagpt3_predictions.csv"
]

In [6]:
results = {}
references = list(summary_dataset["summary"])
for filename in files:
    predictions_field = filename.split(".")[0]
    predictions = list(summary_dataset[predictions_field])
    results[predictions_field] = bertscore.compute(predictions=predictions, references=references, lang="ru", verbose=True, model_type="microsoft/mdeberta-v3-base", nthreads=16, batch_size=16, use_fast_tokenizer=False)

calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 364215.17 seconds, 0.00 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 364215.30 seconds, 0.00 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 364215.44 seconds, 0.00 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 364215.73 seconds, 0.00 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 364216.11 seconds, 0.00 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 364216.39 seconds, 0.00 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 364216.68 seconds, 0.00 sentences/sec


In [7]:
data_keys = list(results.keys())
for data_key in data_keys:
    old_keys = list(results[data_key].keys())
    for old_metric_key in old_keys:
        results[data_key][f"{data_key}_bertscore_" + old_metric_key] = results[data_key].pop(old_metric_key)

In [8]:
print(results[filename.split(".")[0]].keys())
print(len(results[filename.split(".")[0]]))

dict_keys(['yagpt3_predictions_bertscore_precision', 'yagpt3_predictions_bertscore_recall', 'yagpt3_predictions_bertscore_f1', 'yagpt3_predictions_bertscore_hashcode'])
4


In [9]:
print(len(results[filename.split(".")[0]][f"{filename.split('.')[0]}_bertscore_precision"]))

51


In [10]:
new_data = {}
for data_key in results.keys():
    for metric_key in results[data_key].keys():
        if "hashcode" in metric_key:
            continue
        new_data[metric_key] = pd.Series(results[data_key][metric_key])
new_data_df = pd.DataFrame(new_data)
new_data_df.head()

Unnamed: 0,mbart_predictions_bertscore_precision,mbart_predictions_bertscore_recall,mbart_predictions_bertscore_f1,mt5_predictions_bertscore_precision,mt5_predictions_bertscore_recall,mt5_predictions_bertscore_f1,summarunner_predictions_bertscore_precision,summarunner_predictions_bertscore_recall,summarunner_predictions_bertscore_f1,llama_7b_predictions_bertscore_precision,...,llama_7b_predictions_bertscore_f1,starling_predictions_bertscore_precision,starling_predictions_bertscore_recall,starling_predictions_bertscore_f1,yagpt_predictions_bertscore_precision,yagpt_predictions_bertscore_recall,yagpt_predictions_bertscore_f1,yagpt3_predictions_bertscore_precision,yagpt3_predictions_bertscore_recall,yagpt3_predictions_bertscore_f1
0,0.631593,0.624381,0.627966,0.673249,0.605494,0.637577,0.687089,0.689886,0.688485,0.726297,...,0.744807,0.643891,0.736579,0.687123,0.647951,0.749931,0.695221,0.668113,0.78476,0.721754
1,0.644282,0.575288,0.607833,0.64215,0.577963,0.608368,0.676075,0.680641,0.67835,0.63588,...,0.656465,0.592378,0.669815,0.628721,0.605613,0.660121,0.631693,0.608544,0.651451,0.629267
2,0.681743,0.589087,0.632037,0.681641,0.586275,0.630371,0.641098,0.658043,0.64946,0.628046,...,0.661089,0.60998,0.653202,0.630851,0.640942,0.676423,0.658205,0.638269,0.68621,0.661372
3,0.631319,0.656633,0.643727,0.620151,0.616955,0.618549,0.652692,0.708193,0.679311,0.552914,...,0.623575,0.603925,0.72202,0.657713,0.649337,0.715555,0.68084,0.577001,0.686377,0.626954
4,0.595945,0.63999,0.617182,0.614952,0.591488,0.602992,0.594532,0.699398,0.642715,0.588521,...,0.662679,0.587893,0.704453,0.640917,0.608653,0.678829,0.641828,0.586071,0.746065,0.656461


In [11]:
new_summary_dataset = pd.concat([summary_dataset, new_data_df.set_index(summary_dataset.index)], axis=1)
new_summary_dataset.head()
print(len(new_summary_dataset))

51


In [12]:
summary_dataset = pd.DataFrame(new_summary_dataset)
del new_summary_dataset

In [13]:
summary_dataset.head()
print(len(summary_dataset))

51


In [14]:
from statistics import mean 
from collections import defaultdict
score_stats = defaultdict(dict) if (not "score_stats" in locals() and not "score_stats" in globals()) else score_stats
for filename in files:
    predictions_field = filename.split(".")[0]
    
    mean_f1 = mean(results[predictions_field][f"{predictions_field}_bertscore_f1"])
    var_f1 = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_bertscore_f1"])) - mean_f1 ** 2
    
    mean_precision = mean(results[predictions_field][f"{predictions_field}_bertscore_precision"])
    var_precision = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_bertscore_precision"])) - mean_precision ** 2
    
    mean_recall = mean(results[predictions_field][f"{predictions_field}_bertscore_recall"])
    var_recall = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_bertscore_recall"])) - mean_recall ** 2
    
    score_stats[predictions_field].update({"mean_precision": mean_precision, "var_precision": var_precision, "mean_recall": mean_recall, "var_recall": var_recall, "mean_f1": mean_f1, "var_f1": var_f1})
print(dict(score_stats))

{'mbart_predictions': {'mean_precision': 0.6136284266032425, 'var_precision': 0.002562404920058825, 'mean_recall': 0.6459829117737564, 'var_recall': 0.0020065185461710655, 'mean_f1': 0.6278165169790679, 'var_f1': 0.0014819173965948673}, 'mt5_predictions': {'mean_precision': 0.6229039161813026, 'var_precision': 0.002883070243606789, 'mean_recall': 0.5968168857050877, 'var_recall': 0.001236734393415706, 'mean_f1': 0.6083403989380481, 'var_f1': 0.0013447706139774196}, 'summarunner_predictions': {'mean_precision': 0.6202631779745513, 'var_precision': 0.002442860837790939, 'mean_recall': 0.6661550426015667, 'var_recall': 0.001323556002591153, 'mean_f1': 0.6411289011730867, 'var_f1': 0.0012757973201810158}, 'llama_7b_predictions': {'mean_precision': 0.6178074286264532, 'var_precision': 0.0028896956557149522, 'mean_recall': 0.7163438925556108, 'var_recall': 0.0022519779464809098, 'mean_f1': 0.6624223227594414, 'var_f1': 0.002046067511045413}, 'starling_predictions': {'mean_precision': 0.59787

In [15]:
results.keys()

dict_keys(['mbart_predictions', 'mt5_predictions', 'summarunner_predictions', 'llama_7b_predictions', 'starling_predictions', 'yagpt_predictions', 'yagpt3_predictions'])

In [16]:
stats_pd = pd.DataFrame(score_stats)

In [17]:
stats_pd.head(None)

Unnamed: 0,mbart_predictions,mt5_predictions,summarunner_predictions,llama_7b_predictions,starling_predictions,yagpt_predictions,yagpt3_predictions
mean_precision,0.613628,0.622904,0.620263,0.617807,0.597878,0.622112,0.609768
var_precision,0.002562,0.002883,0.002443,0.00289,0.002965,0.002084,0.001972
mean_recall,0.645983,0.596817,0.666155,0.716344,0.69702,0.705082,0.712831
var_recall,0.002007,0.001237,0.001324,0.002252,0.001594,0.002066,0.002063
mean_f1,0.627817,0.60834,0.641129,0.662422,0.642646,0.660237,0.656557
var_f1,0.001482,0.001345,0.001276,0.002046,0.001885,0.001643,0.001592


In [18]:
summary_dataset.to_csv("summary_dataset_with_bertscore.csv", index=False)

In [19]:
len(summary_dataset)

51

## BLEU

In [20]:
import razdel
from evaluate import load
from tqdm import tqdm
import nltk

# results = {}
bleu = load('bleu')
references = list(summary_dataset["summary"])

def a(x):
    tokens = list(razdel.tokenize(x))
    return [_.text for _ in tokens]
    # return x.split()

for filename in files:
    predictions_field = filename.split(".")[0]
    predictions = list(summary_dataset[predictions_field])
    # results[predictions_field] = rouge.compute(predictions=predictions, references=references, use_aggregator=False, tokenizer=lambda x : list(razdel.tokenize(x)))
    results[predictions_field]["bleu"] = []
    for ref, pred in tqdm(zip(references, predictions)):
        bl = bleu.compute(predictions=[pred], references=[ref], tokenizer=a, smooth=True)
        # bl = nltk.translate.bleu(pred, ref)
        results[predictions_field]["bleu"].append(bl["bleu"])

51it [00:00, 648.19it/s]
51it [00:00, 687.36it/s]
51it [00:00, 649.74it/s]
51it [00:00, 571.67it/s]
51it [00:00, 569.43it/s]
51it [00:00, 581.20it/s]
51it [00:00, 580.70it/s]


In [21]:
data_keys = list(results.keys())
print(data_keys)
for data_key in data_keys:
    old_keys = list(results[data_key].keys())
    print(old_keys)
    for old_metric_key in old_keys:
        results[data_key][f"{data_key}_" + old_metric_key] = results[data_key].pop(old_metric_key)

['mbart_predictions', 'mt5_predictions', 'summarunner_predictions', 'llama_7b_predictions', 'starling_predictions', 'yagpt_predictions', 'yagpt3_predictions']
['mbart_predictions_bertscore_precision', 'mbart_predictions_bertscore_recall', 'mbart_predictions_bertscore_f1', 'mbart_predictions_bertscore_hashcode', 'bleu']
['mt5_predictions_bertscore_precision', 'mt5_predictions_bertscore_recall', 'mt5_predictions_bertscore_f1', 'mt5_predictions_bertscore_hashcode', 'bleu']
['summarunner_predictions_bertscore_precision', 'summarunner_predictions_bertscore_recall', 'summarunner_predictions_bertscore_f1', 'summarunner_predictions_bertscore_hashcode', 'bleu']
['llama_7b_predictions_bertscore_precision', 'llama_7b_predictions_bertscore_recall', 'llama_7b_predictions_bertscore_f1', 'llama_7b_predictions_bertscore_hashcode', 'bleu']
['starling_predictions_bertscore_precision', 'starling_predictions_bertscore_recall', 'starling_predictions_bertscore_f1', 'starling_predictions_bertscore_hashcode',

In [22]:
print(results[filename.split(".")[0]].keys())

dict_keys(['yagpt3_predictions_yagpt3_predictions_bertscore_precision', 'yagpt3_predictions_yagpt3_predictions_bertscore_recall', 'yagpt3_predictions_yagpt3_predictions_bertscore_f1', 'yagpt3_predictions_yagpt3_predictions_bertscore_hashcode', 'yagpt3_predictions_bleu'])


In [23]:
print(results[filename.split(".")[0]][f"{filename.split('.')[0]}_bleu"])

[0.026750816423785816, 0.03202289411234805, 0.038028422576808, 0.014206586137818344, 0.018042471224739884, 0.030529807855792186, 0.023915238684182282, 0.045680352631985464, 0.031983487823250265, 0.042020425849160166, 0.01408357381457623, 0.029096451811953992, 0.041134845763007126, 0.02645541924746324, 0.011302915099255477, 0.01648193227038116, 0.0584192288329162, 0.02664094361367871, 0.021156719190824593, 0.03200512175383545, 0.014063537945480188, 0.02744917703739533, 0.057446387286086247, 0.01879243431596414, 0.02289355938731725, 0.06108557268562174, 0.02465448790945244, 0.020448357708217915, 0.029883432477608395, 0.019996667521972298, 0.032739755503851865, 0.014018080775451417, 0.023615074687245826, 0.02160090509900319, 0.016470076454180194, 0.03225641780355764, 0.04124160134389764, 0.01909058567104395, 0.035621546005625296, 0.023199901472763716, 0.02106055451080167, 0.06616259609591661, 0.011079891322284109, 0.020578146917622196, 0.03946507878446118, 0.04899017558375205, 0.016851206

In [24]:
new_data = {}
for data_key in results.keys():
    for metric_key in results[data_key].keys():
        if not "bleu" in metric_key:
            continue
        new_data[metric_key] = pd.Series(results[data_key][metric_key])
new_data_df = pd.DataFrame(new_data)
new_data_df.head()

Unnamed: 0,mbart_predictions_bleu,mt5_predictions_bleu,summarunner_predictions_bleu,llama_7b_predictions_bleu,starling_predictions_bleu,yagpt_predictions_bleu,yagpt3_predictions_bleu
0,0.017496,0.005383,0.060654,0.042217,0.024531,0.019532,0.026751
1,0.016922,0.01195,0.075167,0.025555,0.019571,0.015097,0.032023
2,0.023233,0.002937,0.036801,0.030596,0.032465,0.032633,0.038028
3,0.066542,0.058565,0.039209,0.014875,0.02445,0.02837,0.014207
4,0.051349,0.049504,0.029521,0.017103,0.01592,0.017944,0.018042


In [25]:
len(new_data_df)

51

In [27]:
new_summary_dataset = pd.concat([summary_dataset, new_data_df.set_index(summary_dataset.index)], axis=1)
new_summary_dataset.head()

Unnamed: 0,Ind,title,text,summary,mbart_predictions,mt5_predictions,summarunner_predictions,mbart_predictions_bleu,mt5_predictions_bleu,summarunner_predictions_bleu,...,yagpt3_predictions_bertscore_precision,yagpt3_predictions_bertscore_recall,yagpt3_predictions_bertscore_f1,mbart_predictions_bleu.1,mt5_predictions_bleu.1,summarunner_predictions_bleu.1,llama_7b_predictions_bleu,starling_predictions_bleu,yagpt_predictions_bleu,yagpt3_predictions_bleu
156,156,wikilingua_multilingual-val-36988,Лучший путь поддержания здоровья зубов и предо...,Чистите зубы и используйте зубную нить. Не заб...,"Российские стоматологи рассказали о том, как з...",Существует множество способов защитить зубы от...,выберите синюю водолазку и темный блеск для гу...,0.017496,0.005383,0.060654,...,0.668113,0.78476,0.721754,0.017496,0.005383,0.060654,0.042217,0.024531,0.019532,0.026751
400,400,wikilingua_multilingual-val-37232,"Ваши руки - ваше сокровище, ухаживайте за ними...",Чрезмерный уход за руками. Не бросайте свою ос...,"На этой неделе мы расскажем о том, как правиль...","Если вы когда-нибудь станете топ моделью, то н...",регулярно мажьте руки питательным кремом. если...,0.016922,0.01195,0.075167,...,0.608544,0.651451,0.629267,0.016922,0.01195,0.075167,0.025555,0.019571,0.015097,0.032023
472,472,wikilingua_multilingual-val-37304,Или поместите ее в посудомоечную машину. Когда...,После еды сложите посуду в раковину и споласки...,"В сегодняшнем обзоре мы расскажем о том, как п...",Вы можете вывести пятна из стиральной машины.\n,"когда вы заметите, что она заполнена, запустит...",0.023233,0.002937,0.036801,...,0.638269,0.68621,0.661372,0.023233,0.002937,0.036801,0.030596,0.032465,0.032633,0.038028
500,500,wikilingua_multilingual-val-37332,"Все люди видят сны, и часто не по одному за но...",Ведите дневник сновидений. Делайте записи в дн...,"В преддверии ночи, когда вы проснетесь из-за с...",Сношения во сне могут помочь вам лучше понимат...,"записывайте все мельчайшие детали, а также фик...",0.066542,0.058565,0.039209,...,0.577001,0.686377,0.626954,0.066542,0.058565,0.039209,0.014875,0.02445,0.02837,0.014207
501,501,wikilingua_multilingual-val-37333,. Прежде всего вам необходимо начинать поиски ...,Найдите свое определение любви Учитывайте разн...,"В сегодняшнем материале мы поговорим о том, ка...","Любовь - это явление, которое можно описать ка...",в контексте романтической любви многие нередко...,0.051349,0.049504,0.029521,...,0.586071,0.746065,0.656461,0.051349,0.049504,0.029521,0.017103,0.01592,0.017944,0.018042


In [28]:
print(len(new_summary_dataset), len(summary_dataset))

51 51


In [29]:
summary_dataset = pd.DataFrame(new_summary_dataset)
del new_summary_dataset

In [30]:
summary_dataset.head()

Unnamed: 0,Ind,title,text,summary,mbart_predictions,mt5_predictions,summarunner_predictions,mbart_predictions_bleu,mt5_predictions_bleu,summarunner_predictions_bleu,...,yagpt3_predictions_bertscore_precision,yagpt3_predictions_bertscore_recall,yagpt3_predictions_bertscore_f1,mbart_predictions_bleu.1,mt5_predictions_bleu.1,summarunner_predictions_bleu.1,llama_7b_predictions_bleu,starling_predictions_bleu,yagpt_predictions_bleu,yagpt3_predictions_bleu
156,156,wikilingua_multilingual-val-36988,Лучший путь поддержания здоровья зубов и предо...,Чистите зубы и используйте зубную нить. Не заб...,"Российские стоматологи рассказали о том, как з...",Существует множество способов защитить зубы от...,выберите синюю водолазку и темный блеск для гу...,0.017496,0.005383,0.060654,...,0.668113,0.78476,0.721754,0.017496,0.005383,0.060654,0.042217,0.024531,0.019532,0.026751
400,400,wikilingua_multilingual-val-37232,"Ваши руки - ваше сокровище, ухаживайте за ними...",Чрезмерный уход за руками. Не бросайте свою ос...,"На этой неделе мы расскажем о том, как правиль...","Если вы когда-нибудь станете топ моделью, то н...",регулярно мажьте руки питательным кремом. если...,0.016922,0.01195,0.075167,...,0.608544,0.651451,0.629267,0.016922,0.01195,0.075167,0.025555,0.019571,0.015097,0.032023
472,472,wikilingua_multilingual-val-37304,Или поместите ее в посудомоечную машину. Когда...,После еды сложите посуду в раковину и споласки...,"В сегодняшнем обзоре мы расскажем о том, как п...",Вы можете вывести пятна из стиральной машины.\n,"когда вы заметите, что она заполнена, запустит...",0.023233,0.002937,0.036801,...,0.638269,0.68621,0.661372,0.023233,0.002937,0.036801,0.030596,0.032465,0.032633,0.038028
500,500,wikilingua_multilingual-val-37332,"Все люди видят сны, и часто не по одному за но...",Ведите дневник сновидений. Делайте записи в дн...,"В преддверии ночи, когда вы проснетесь из-за с...",Сношения во сне могут помочь вам лучше понимат...,"записывайте все мельчайшие детали, а также фик...",0.066542,0.058565,0.039209,...,0.577001,0.686377,0.626954,0.066542,0.058565,0.039209,0.014875,0.02445,0.02837,0.014207
501,501,wikilingua_multilingual-val-37333,. Прежде всего вам необходимо начинать поиски ...,Найдите свое определение любви Учитывайте разн...,"В сегодняшнем материале мы поговорим о том, ка...","Любовь - это явление, которое можно описать ка...",в контексте романтической любви многие нередко...,0.051349,0.049504,0.029521,...,0.586071,0.746065,0.656461,0.051349,0.049504,0.029521,0.017103,0.01592,0.017944,0.018042


In [31]:
# from statistics import mean 
# from collections import defaultdict
# score_stats = defaultdict(dict) if (not "score_stats" in locals() and not "score_stats" in globals()) else score_stats
# for filename in files:
#     predictions_field = filename.split(".")[0]
#     mean_rouge1 = mean(results[predictions_field][f"{predictions_field}_rouge1"])
#     var_rouge1 = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_rouge1"])) - mean_rouge1 ** 2
    
#     mean_rouge2 = mean(results[predictions_field][f"{predictions_field}_rouge2"])
#     var_rouge2 = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_rouge2"])) - mean_rouge2 ** 2
    
#     mean_rougeL = mean(results[predictions_field][f"{predictions_field}_rougeL"])
#     var_rougeL = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_rougeL"])) - mean_rougeL ** 2
    
#     mean_rougeLsum = mean(results[predictions_field][f"{predictions_field}_rougeLsum"])
#     var_rougeLsum = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_rougeLsum"])) - mean_rougeLsum ** 2
#     score_stats[predictions_field].update({"mean_rouge1": mean_rouge1, "var_rouge1": var_rouge1, "mean_rouge2": mean_rouge2, "var_rouge2": var_rouge2, "mean_rougeL": mean_rougeL, "var_rougeL": var_rougeL, "mean_rougeLsum": mean_rougeLsum, "var_rougeLsum": var_rougeLsum})
# print(dict(score_stats))

In [32]:
# stats_pd = pd.DataFrame(score_stats)

In [33]:
# stats_pd.head(len(stats_pd))

In [34]:
summary_dataset.to_csv("summary_dataset_with_bertscore_bleu.csv", index=False)

In [35]:
len(summary_dataset)

51

## ROUGE

In [36]:
import pandas as pd
summary_dataset = pd.read_csv("summary_dataset_with_bertscore_bleu.csv")
files = [
    "mbart_predictions.txt",
    "mt5_predictions.txt",
    "summarunner_predictions.txt",
    "llama_7b_predictions.csv",
    "starling_predictions.csv",
    "yagpt_predictions.csv",
    "yagpt3_predictions.csv"
]

In [37]:
import razdel
from evaluate import load
# results = {}
rouge = load('rouge')
references = list(summary_dataset["summary"])

def a(x):
    tokens = list(razdel.tokenize(x))
    return [_.text for _ in tokens]
    # return x.split()

for filename in files:
    predictions_field = filename.split(".")[0]
    predictions = list(summary_dataset[predictions_field])
    # results[predictions_field] = rouge.compute(predictions=predictions, references=references, use_aggregator=False, tokenizer=lambda x : list(razdel.tokenize(x)))
    results[predictions_field] = rouge.compute(predictions=predictions, references=references, use_aggregator=False, tokenizer=a)

In [38]:
data_keys = list(results.keys())
for data_key in data_keys:
    old_keys = list(results[data_key].keys())
    for old_metric_key in old_keys:
        results[data_key][f"{data_key}_" + old_metric_key] = results[data_key].pop(old_metric_key)

In [39]:
print(results[filename.split(".")[0]].keys())

dict_keys(['yagpt3_predictions_rouge1', 'yagpt3_predictions_rouge2', 'yagpt3_predictions_rougeL', 'yagpt3_predictions_rougeLsum'])


In [40]:
print(len(results[filename.split(".")[0]][f"{filename.split('.')[0]}_rouge1"]))

51


In [41]:
new_data = {}
for data_key in results.keys():
    for metric_key in results[data_key].keys():
        if "hashcode" in metric_key:
            continue
        new_data[metric_key] = pd.Series(results[data_key][metric_key])
new_data_df = pd.DataFrame(new_data)
new_data_df.head()

Unnamed: 0,mbart_predictions_rouge1,mbart_predictions_rouge2,mbart_predictions_rougeL,mbart_predictions_rougeLsum,mt5_predictions_rouge1,mt5_predictions_rouge2,mt5_predictions_rougeL,mt5_predictions_rougeLsum,summarunner_predictions_rouge1,summarunner_predictions_rouge2,...,starling_predictions_rougeL,starling_predictions_rougeLsum,yagpt_predictions_rouge1,yagpt_predictions_rouge2,yagpt_predictions_rougeL,yagpt_predictions_rougeLsum,yagpt3_predictions_rouge1,yagpt3_predictions_rouge2,yagpt3_predictions_rougeL,yagpt3_predictions_rougeLsum
0,0.129032,0.0,0.096774,0.096774,0.109091,0.037736,0.109091,0.109091,0.225,0.051282,...,0.135593,0.135593,0.202128,0.021505,0.159574,0.106383,0.194286,0.034682,0.182857,0.125714
1,0.101695,0.0,0.067797,0.067797,0.142857,0.0,0.142857,0.142857,0.282353,0.072289,...,0.134831,0.11236,0.179775,0.0,0.11236,0.089888,0.183333,0.033898,0.15,0.083333
2,0.145455,0.037736,0.145455,0.145455,0.041667,0.0,0.041667,0.041667,0.222222,0.0,...,0.161616,0.161616,0.234234,0.018349,0.144144,0.09009,0.257426,0.020202,0.178218,0.217822
3,0.190476,0.0,0.142857,0.142857,0.095238,0.0,0.095238,0.095238,0.196721,0.0,...,0.113208,0.113208,0.176471,0.04,0.176471,0.117647,0.089552,0.0,0.074627,0.059701
4,0.169492,0.0,0.135593,0.135593,0.122449,0.0,0.081633,0.081633,0.228571,0.019417,...,0.115607,0.127168,0.153846,0.0,0.123077,0.061538,0.166667,0.012987,0.115385,0.128205


In [42]:
new_summary_dataset = pd.concat([summary_dataset, new_data_df.set_index(summary_dataset.index)], axis=1)
new_summary_dataset.head()

Unnamed: 0,Ind,title,text,summary,mbart_predictions,mt5_predictions,summarunner_predictions,mbart_predictions_bleu,mt5_predictions_bleu,summarunner_predictions_bleu,...,starling_predictions_rougeL,starling_predictions_rougeLsum,yagpt_predictions_rouge1,yagpt_predictions_rouge2,yagpt_predictions_rougeL,yagpt_predictions_rougeLsum,yagpt3_predictions_rouge1,yagpt3_predictions_rouge2,yagpt3_predictions_rougeL,yagpt3_predictions_rougeLsum
0,156,wikilingua_multilingual-val-36988,Лучший путь поддержания здоровья зубов и предо...,Чистите зубы и используйте зубную нить. Не заб...,"Российские стоматологи рассказали о том, как з...",Существует множество способов защитить зубы от...,выберите синюю водолазку и темный блеск для гу...,0.017496,0.005383,0.060654,...,0.135593,0.135593,0.202128,0.021505,0.159574,0.106383,0.194286,0.034682,0.182857,0.125714
1,400,wikilingua_multilingual-val-37232,"Ваши руки - ваше сокровище, ухаживайте за ними...",Чрезмерный уход за руками. Не бросайте свою ос...,"На этой неделе мы расскажем о том, как правиль...","Если вы когда-нибудь станете топ моделью, то н...",регулярно мажьте руки питательным кремом. если...,0.016922,0.01195,0.075167,...,0.134831,0.11236,0.179775,0.0,0.11236,0.089888,0.183333,0.033898,0.15,0.083333
2,472,wikilingua_multilingual-val-37304,Или поместите ее в посудомоечную машину. Когда...,После еды сложите посуду в раковину и споласки...,"В сегодняшнем обзоре мы расскажем о том, как п...",Вы можете вывести пятна из стиральной машины.\n,"когда вы заметите, что она заполнена, запустит...",0.023233,0.002937,0.036801,...,0.161616,0.161616,0.234234,0.018349,0.144144,0.09009,0.257426,0.020202,0.178218,0.217822
3,500,wikilingua_multilingual-val-37332,"Все люди видят сны, и часто не по одному за но...",Ведите дневник сновидений. Делайте записи в дн...,"В преддверии ночи, когда вы проснетесь из-за с...",Сношения во сне могут помочь вам лучше понимат...,"записывайте все мельчайшие детали, а также фик...",0.066542,0.058565,0.039209,...,0.113208,0.113208,0.176471,0.04,0.176471,0.117647,0.089552,0.0,0.074627,0.059701
4,501,wikilingua_multilingual-val-37333,. Прежде всего вам необходимо начинать поиски ...,Найдите свое определение любви Учитывайте разн...,"В сегодняшнем материале мы поговорим о том, ка...","Любовь - это явление, которое можно описать ка...",в контексте романтической любви многие нередко...,0.051349,0.049504,0.029521,...,0.115607,0.127168,0.153846,0.0,0.123077,0.061538,0.166667,0.012987,0.115385,0.128205


In [43]:
summary_dataset = pd.DataFrame(new_summary_dataset)
del new_summary_dataset

In [44]:
summary_dataset.head()

Unnamed: 0,Ind,title,text,summary,mbart_predictions,mt5_predictions,summarunner_predictions,mbart_predictions_bleu,mt5_predictions_bleu,summarunner_predictions_bleu,...,starling_predictions_rougeL,starling_predictions_rougeLsum,yagpt_predictions_rouge1,yagpt_predictions_rouge2,yagpt_predictions_rougeL,yagpt_predictions_rougeLsum,yagpt3_predictions_rouge1,yagpt3_predictions_rouge2,yagpt3_predictions_rougeL,yagpt3_predictions_rougeLsum
0,156,wikilingua_multilingual-val-36988,Лучший путь поддержания здоровья зубов и предо...,Чистите зубы и используйте зубную нить. Не заб...,"Российские стоматологи рассказали о том, как з...",Существует множество способов защитить зубы от...,выберите синюю водолазку и темный блеск для гу...,0.017496,0.005383,0.060654,...,0.135593,0.135593,0.202128,0.021505,0.159574,0.106383,0.194286,0.034682,0.182857,0.125714
1,400,wikilingua_multilingual-val-37232,"Ваши руки - ваше сокровище, ухаживайте за ними...",Чрезмерный уход за руками. Не бросайте свою ос...,"На этой неделе мы расскажем о том, как правиль...","Если вы когда-нибудь станете топ моделью, то н...",регулярно мажьте руки питательным кремом. если...,0.016922,0.01195,0.075167,...,0.134831,0.11236,0.179775,0.0,0.11236,0.089888,0.183333,0.033898,0.15,0.083333
2,472,wikilingua_multilingual-val-37304,Или поместите ее в посудомоечную машину. Когда...,После еды сложите посуду в раковину и споласки...,"В сегодняшнем обзоре мы расскажем о том, как п...",Вы можете вывести пятна из стиральной машины.\n,"когда вы заметите, что она заполнена, запустит...",0.023233,0.002937,0.036801,...,0.161616,0.161616,0.234234,0.018349,0.144144,0.09009,0.257426,0.020202,0.178218,0.217822
3,500,wikilingua_multilingual-val-37332,"Все люди видят сны, и часто не по одному за но...",Ведите дневник сновидений. Делайте записи в дн...,"В преддверии ночи, когда вы проснетесь из-за с...",Сношения во сне могут помочь вам лучше понимат...,"записывайте все мельчайшие детали, а также фик...",0.066542,0.058565,0.039209,...,0.113208,0.113208,0.176471,0.04,0.176471,0.117647,0.089552,0.0,0.074627,0.059701
4,501,wikilingua_multilingual-val-37333,. Прежде всего вам необходимо начинать поиски ...,Найдите свое определение любви Учитывайте разн...,"В сегодняшнем материале мы поговорим о том, ка...","Любовь - это явление, которое можно описать ка...",в контексте романтической любви многие нередко...,0.051349,0.049504,0.029521,...,0.115607,0.127168,0.153846,0.0,0.123077,0.061538,0.166667,0.012987,0.115385,0.128205


Посчитаем статистики для представления о качестве датасета

In [45]:
from statistics import mean 
from collections import defaultdict
score_stats = defaultdict(dict) if (not "score_stats" in locals() and not "score_stats" in globals()) else score_stats
for filename in files:
    predictions_field = filename.split(".")[0]
    mean_rouge1 = mean(results[predictions_field][f"{predictions_field}_rouge1"])
    var_rouge1 = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_rouge1"])) - mean_rouge1 ** 2
    
    mean_rouge2 = mean(results[predictions_field][f"{predictions_field}_rouge2"])
    var_rouge2 = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_rouge2"])) - mean_rouge2 ** 2
    
    mean_rougeL = mean(results[predictions_field][f"{predictions_field}_rougeL"])
    var_rougeL = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_rougeL"])) - mean_rougeL ** 2
    
    mean_rougeLsum = mean(results[predictions_field][f"{predictions_field}_rougeLsum"])
    var_rougeLsum = mean(map(lambda x: x**2, results[predictions_field][f"{predictions_field}_rougeLsum"])) - mean_rougeLsum ** 2
    score_stats[predictions_field].update({"mean_rouge1": mean_rouge1, "var_rouge1": var_rouge1, "mean_rouge2": mean_rouge2, "var_rouge2": var_rouge2, "mean_rougeL": mean_rougeL, "var_rougeL": var_rougeL, "mean_rougeLsum": mean_rougeLsum, "var_rougeLsum": var_rougeLsum})
print(dict(score_stats))

{'mbart_predictions': {'mean_precision': 0.6136284266032425, 'var_precision': 0.002562404920058825, 'mean_recall': 0.6459829117737564, 'var_recall': 0.0020065185461710655, 'mean_f1': 0.6278165169790679, 'var_f1': 0.0014819173965948673, 'mean_rouge1': 0.17117003932067482, 'var_rouge1': 0.004341147921558534, 'mean_rouge2': 0.02344682199891833, 'var_rouge2': 0.0011444597245821816, 'mean_rougeL': 0.13289182462640708, 'var_rougeL': 0.002422151570893989, 'mean_rougeLsum': 0.13289182462640708, 'var_rougeLsum': 0.002422151570893989}, 'mt5_predictions': {'mean_precision': 0.6229039161813026, 'var_precision': 0.002883070243606789, 'mean_recall': 0.5968168857050877, 'var_recall': 0.001236734393415706, 'mean_f1': 0.6083403989380481, 'var_f1': 0.0013447706139774196, 'mean_rouge1': 0.12115537965951172, 'var_rouge1': 0.004318886772201885, 'mean_rouge2': 0.01688636431316882, 'var_rouge2': 0.0009999806910292405, 'mean_rougeL': 0.10811023400453072, 'var_rougeL': 0.0025408305074679956, 'mean_rougeLsum': 

In [46]:
stats_pd = pd.DataFrame(score_stats)

In [47]:
stats_pd.head(len(stats_pd))

Unnamed: 0,mbart_predictions,mt5_predictions,summarunner_predictions,llama_7b_predictions,starling_predictions,yagpt_predictions,yagpt3_predictions
mean_precision,0.613628,0.622904,0.620263,0.617807,0.597878,0.622112,0.609768
var_precision,0.002562,0.002883,0.002443,0.00289,0.002965,0.002084,0.001972
mean_recall,0.645983,0.596817,0.666155,0.716344,0.69702,0.705082,0.712831
var_recall,0.002007,0.001237,0.001324,0.002252,0.001594,0.002066,0.002063
mean_f1,0.627817,0.60834,0.641129,0.662422,0.642646,0.660237,0.656557
var_f1,0.001482,0.001345,0.001276,0.002046,0.001885,0.001643,0.001592
mean_rouge1,0.17117,0.121155,0.188155,0.184048,0.163554,0.173754,0.184505
var_rouge1,0.004341,0.004319,0.003888,0.005549,0.004344,0.003705,0.004085
mean_rouge2,0.023447,0.016886,0.025835,0.031596,0.022257,0.024918,0.035398
var_rouge2,0.001144,0.001,0.000804,0.001282,0.000571,0.000626,0.001011


In [48]:
summary_dataset.to_csv("summary_dataset_with_bertscore_bleu_rouge.csv", index=False)

## METEOR

https://huggingface.co/spaces/evaluate-metric/meteor

In [49]:
import pandas as pd
summary_dataset = pd.read_csv("summary_dataset_with_bertscore_bleu_rouge.csv")
files = [
    "mbart_predictions.txt",
    "mt5_predictions.txt",
    "summarunner_predictions.txt",
    "llama_7b_predictions.csv",
    "starling_predictions.csv",
    "yagpt_predictions.csv",
    "yagpt3_predictions.csv"
]

In [50]:
summary_dataset.columns

Index(['Ind', 'title', 'text', 'summary', 'mbart_predictions',
       'mt5_predictions', 'summarunner_predictions', 'mbart_predictions_bleu',
       'mt5_predictions_bleu', 'summarunner_predictions_bleu',
       'llama_7b_predictions', 'starling_predictions', 'yagpt_predictions',
       'yagpt3_predictions', 'mbart_predictions_bertscore_precision',
       'mbart_predictions_bertscore_recall', 'mbart_predictions_bertscore_f1',
       'mt5_predictions_bertscore_precision',
       'mt5_predictions_bertscore_recall', 'mt5_predictions_bertscore_f1',
       'summarunner_predictions_bertscore_precision',
       'summarunner_predictions_bertscore_recall',
       'summarunner_predictions_bertscore_f1',
       'llama_7b_predictions_bertscore_precision',
       'llama_7b_predictions_bertscore_recall',
       'llama_7b_predictions_bertscore_f1',
       'starling_predictions_bertscore_precision',
       'starling_predictions_bertscore_recall',
       'starling_predictions_bertscore_f1',
       'yag

In [51]:
len(summary_dataset)

51

In [52]:
from tqdm import tqdm
from threading import Thread
from evaluate import load
import os, contextlib

import nltk
nltk.download('wordnet')


def calculate_meteor(result_list, predictions, references, thread_no, step):
    meteor = None
    # with open(os.devnull, 'w') as devnull:
    #     with contextlib.redirect_stdout(devnull), contextlib.redirect_stderr(devnull):
    
    meteor = load('meteor')
    index = thread_no
    disable_tqdm = (thread_no != 0)
    
    
    with tqdm(total=len(predictions), disable=disable_tqdm) as pbar:
        pbar.update(index)
        while index < len(predictions):
            result_list[index] = meteor.compute(predictions=[predictions[index]], references=[references[index]])
            index += step
            pbar.update(step)
    print(f"Thread {thread_no} finished")

results = {}
references = list(summary_dataset["summary"])
for filename in files:
    predictions_field = filename.split(".")[0]
    predictions = list(summary_dataset[predictions_field])
    results[predictions_field] = [0] * len(predictions)
    
    thread_num = 1
    thread_pool = []
    for thread in range(thread_num):
        thread_pool.append(Thread(target=calculate_meteor, args=[results[predictions_field], predictions, references, thread, thread_num], daemon=True))
        thread_pool[-1].start()

    for thread in range(thread_num):
        thread_pool[thread].join()

[nltk_data] Downloading package wordnet to /home/skatori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/skatori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/skatori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/skatori/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|██████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 59.82it/s]


Thread 0 finished


[nltk_data] Downloading package wordnet to /home/skatori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/skatori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/skatori/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|█████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 523.03it/s]


Thread 0 finished


[nltk_data] Downloading package wordnet to /home/skatori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/skatori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/skatori/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|█████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 464.33it/s]


Thread 0 finished


[nltk_data] Downloading package wordnet to /home/skatori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/skatori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/skatori/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|█████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 350.18it/s]


Thread 0 finished


[nltk_data] Downloading package wordnet to /home/skatori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/skatori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/skatori/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|█████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 336.63it/s]


Thread 0 finished


[nltk_data] Downloading package wordnet to /home/skatori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/skatori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/skatori/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|█████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 366.60it/s]


Thread 0 finished


[nltk_data] Downloading package wordnet to /home/skatori/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/skatori/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/skatori/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
100%|█████████████████████████████████████████████████████████| 51/51 [00:00<00:00, 371.33it/s]

Thread 0 finished





In [53]:
print(results)

{'mbart_predictions': [{'meteor': 0.047393364928909956}, {'meteor': 0.037220843672456566}, {'meteor': 0.0992}, {'meteor': 0.10309278350515465}, {'meteor': 0.10600706713780918}, {'meteor': 0.1471140603480656}, {'meteor': 0.13636363636363635}, {'meteor': 0.2321977705170094}, {'meteor': 0.09485094850948508}, {'meteor': 0.20855379188712522}, {'meteor': 0.06147540983606558}, {'meteor': 0.078125}, {'meteor': 0.1739130434782609}, {'meteor': 0.10101010101010102}, {'meteor': 0.09067357512953368}, {'meteor': 0.044642857142857144}, {'meteor': 0.1348314606741573}, {'meteor': 0.19230769230769232}, {'meteor': 0.11627906976744187}, {'meteor': 0.07425742574257424}, {'meteor': 0.1627006880733945}, {'meteor': 0.05115089514066496}, {'meteor': 0.09328358208955226}, {'meteor': 0.07481296758104737}, {'meteor': 0.1724726775956284}, {'meteor': 0.0909090909090909}, {'meteor': 0.06993006993006994}, {'meteor': 0.04464285714285715}, {'meteor': 0.021067415730337075}, {'meteor': 0.06726457399103139}, {'meteor': 0.1

In [54]:
for key in results.keys():
    results[key] = {f"{key}_meteor": [val["meteor"] if not isinstance(val, int) else 0 for val in results[key]]}
    print(key, len(results[key]))

mbart_predictions 1
mt5_predictions 1
summarunner_predictions 1
llama_7b_predictions 1
starling_predictions 1
yagpt_predictions 1
yagpt3_predictions 1


In [55]:
new_data = {}
for data_key in results.keys():
    for metric_key in results[data_key].keys():
        if "hashcode" in metric_key:
            continue
        new_data[metric_key] = pd.Series(results[data_key][metric_key])
new_data_df = pd.DataFrame(new_data)
new_data_df.head()

Unnamed: 0,mbart_predictions_meteor,mt5_predictions_meteor,summarunner_predictions_meteor,llama_7b_predictions_meteor,starling_predictions_meteor,yagpt_predictions_meteor,yagpt3_predictions_meteor
0,0.047393,0.036145,0.181536,0.182509,0.115063,0.231934,0.204158
1,0.037221,0.05,0.18386,0.165975,0.170541,0.162835,0.169991
2,0.0992,0.013587,0.121951,0.192115,0.143198,0.194789,0.199416
3,0.103093,0.051546,0.22516,0.174309,0.226337,0.250197,0.104895
4,0.106007,0.054945,0.182371,0.241204,0.163728,0.169492,0.171053


In [56]:
new_summary_dataset = pd.concat([summary_dataset, new_data_df.set_index(summary_dataset.index)], axis=1)
new_summary_dataset.head()

Unnamed: 0,Ind,title,text,summary,mbart_predictions,mt5_predictions,summarunner_predictions,mbart_predictions_bleu,mt5_predictions_bleu,summarunner_predictions_bleu,...,yagpt3_predictions_rouge2,yagpt3_predictions_rougeL,yagpt3_predictions_rougeLsum,mbart_predictions_meteor,mt5_predictions_meteor,summarunner_predictions_meteor,llama_7b_predictions_meteor,starling_predictions_meteor,yagpt_predictions_meteor,yagpt3_predictions_meteor
0,156,wikilingua_multilingual-val-36988,Лучший путь поддержания здоровья зубов и предо...,Чистите зубы и используйте зубную нить. Не заб...,"Российские стоматологи рассказали о том, как з...",Существует множество способов защитить зубы от...,выберите синюю водолазку и темный блеск для гу...,0.017496,0.005383,0.060654,...,0.034682,0.182857,0.125714,0.047393,0.036145,0.181536,0.182509,0.115063,0.231934,0.204158
1,400,wikilingua_multilingual-val-37232,"Ваши руки - ваше сокровище, ухаживайте за ними...",Чрезмерный уход за руками. Не бросайте свою ос...,"На этой неделе мы расскажем о том, как правиль...","Если вы когда-нибудь станете топ моделью, то н...",регулярно мажьте руки питательным кремом. если...,0.016922,0.01195,0.075167,...,0.033898,0.15,0.083333,0.037221,0.05,0.18386,0.165975,0.170541,0.162835,0.169991
2,472,wikilingua_multilingual-val-37304,Или поместите ее в посудомоечную машину. Когда...,После еды сложите посуду в раковину и споласки...,"В сегодняшнем обзоре мы расскажем о том, как п...",Вы можете вывести пятна из стиральной машины.\n,"когда вы заметите, что она заполнена, запустит...",0.023233,0.002937,0.036801,...,0.020202,0.178218,0.217822,0.0992,0.013587,0.121951,0.192115,0.143198,0.194789,0.199416
3,500,wikilingua_multilingual-val-37332,"Все люди видят сны, и часто не по одному за но...",Ведите дневник сновидений. Делайте записи в дн...,"В преддверии ночи, когда вы проснетесь из-за с...",Сношения во сне могут помочь вам лучше понимат...,"записывайте все мельчайшие детали, а также фик...",0.066542,0.058565,0.039209,...,0.0,0.074627,0.059701,0.103093,0.051546,0.22516,0.174309,0.226337,0.250197,0.104895
4,501,wikilingua_multilingual-val-37333,. Прежде всего вам необходимо начинать поиски ...,Найдите свое определение любви Учитывайте разн...,"В сегодняшнем материале мы поговорим о том, ка...","Любовь - это явление, которое можно описать ка...",в контексте романтической любви многие нередко...,0.051349,0.049504,0.029521,...,0.012987,0.115385,0.128205,0.106007,0.054945,0.182371,0.241204,0.163728,0.169492,0.171053


In [57]:
summary_dataset = pd.DataFrame(new_summary_dataset)
del new_summary_dataset

In [58]:
summary_dataset.head()

Unnamed: 0,Ind,title,text,summary,mbart_predictions,mt5_predictions,summarunner_predictions,mbart_predictions_bleu,mt5_predictions_bleu,summarunner_predictions_bleu,...,yagpt3_predictions_rouge2,yagpt3_predictions_rougeL,yagpt3_predictions_rougeLsum,mbart_predictions_meteor,mt5_predictions_meteor,summarunner_predictions_meteor,llama_7b_predictions_meteor,starling_predictions_meteor,yagpt_predictions_meteor,yagpt3_predictions_meteor
0,156,wikilingua_multilingual-val-36988,Лучший путь поддержания здоровья зубов и предо...,Чистите зубы и используйте зубную нить. Не заб...,"Российские стоматологи рассказали о том, как з...",Существует множество способов защитить зубы от...,выберите синюю водолазку и темный блеск для гу...,0.017496,0.005383,0.060654,...,0.034682,0.182857,0.125714,0.047393,0.036145,0.181536,0.182509,0.115063,0.231934,0.204158
1,400,wikilingua_multilingual-val-37232,"Ваши руки - ваше сокровище, ухаживайте за ними...",Чрезмерный уход за руками. Не бросайте свою ос...,"На этой неделе мы расскажем о том, как правиль...","Если вы когда-нибудь станете топ моделью, то н...",регулярно мажьте руки питательным кремом. если...,0.016922,0.01195,0.075167,...,0.033898,0.15,0.083333,0.037221,0.05,0.18386,0.165975,0.170541,0.162835,0.169991
2,472,wikilingua_multilingual-val-37304,Или поместите ее в посудомоечную машину. Когда...,После еды сложите посуду в раковину и споласки...,"В сегодняшнем обзоре мы расскажем о том, как п...",Вы можете вывести пятна из стиральной машины.\n,"когда вы заметите, что она заполнена, запустит...",0.023233,0.002937,0.036801,...,0.020202,0.178218,0.217822,0.0992,0.013587,0.121951,0.192115,0.143198,0.194789,0.199416
3,500,wikilingua_multilingual-val-37332,"Все люди видят сны, и часто не по одному за но...",Ведите дневник сновидений. Делайте записи в дн...,"В преддверии ночи, когда вы проснетесь из-за с...",Сношения во сне могут помочь вам лучше понимат...,"записывайте все мельчайшие детали, а также фик...",0.066542,0.058565,0.039209,...,0.0,0.074627,0.059701,0.103093,0.051546,0.22516,0.174309,0.226337,0.250197,0.104895
4,501,wikilingua_multilingual-val-37333,. Прежде всего вам необходимо начинать поиски ...,Найдите свое определение любви Учитывайте разн...,"В сегодняшнем материале мы поговорим о том, ка...","Любовь - это явление, которое можно описать ка...",в контексте романтической любви многие нередко...,0.051349,0.049504,0.029521,...,0.012987,0.115385,0.128205,0.106007,0.054945,0.182371,0.241204,0.163728,0.169492,0.171053


Посчитаем статистики для представления о качестве датасета

In [59]:
from collections import defaultdict

score_stats = defaultdict(dict) if (not "score_stats" in locals() and not "score_stats" in globals()) else score_stats
for filename in files:
    predictions_field = filename.split(".")[0]
    score_stats[predictions_field].update({f"meteor": sum(results[predictions_field][f"{predictions_field}_meteor"]) / len(results[predictions_field][f"{predictions_field}_meteor"])})

In [60]:
stats_pd = pd.DataFrame(score_stats)

In [61]:
stats_pd.head(len(stats_pd))

Unnamed: 0,mbart_predictions,mt5_predictions,summarunner_predictions,llama_7b_predictions,starling_predictions,yagpt_predictions,yagpt3_predictions
mean_precision,0.613628,0.622904,0.620263,0.617807,0.597878,0.622112,0.609768
var_precision,0.002562,0.002883,0.002443,0.00289,0.002965,0.002084,0.001972
mean_recall,0.645983,0.596817,0.666155,0.716344,0.69702,0.705082,0.712831
var_recall,0.002007,0.001237,0.001324,0.002252,0.001594,0.002066,0.002063
mean_f1,0.627817,0.60834,0.641129,0.662422,0.642646,0.660237,0.656557
var_f1,0.001482,0.001345,0.001276,0.002046,0.001885,0.001643,0.001592
mean_rouge1,0.17117,0.121155,0.188155,0.184048,0.163554,0.173754,0.184505
var_rouge1,0.004341,0.004319,0.003888,0.005549,0.004344,0.003705,0.004085
mean_rouge2,0.023447,0.016886,0.025835,0.031596,0.022257,0.024918,0.035398
var_rouge2,0.001144,0.001,0.000804,0.001282,0.000571,0.000626,0.001011


In [62]:
summary_dataset.to_csv("summary_dataset_with_bertscore_bleu_rouge_meteor.csv", index=False)

In [63]:
summary_dataset = summary_dataset.head(35)

In [64]:
summary_dataset.to_csv("summary_dataset_with_bertscore_bleu_rouge_meteor.csv", index=False)

In [66]:
res = {}
for column in summary_dataset.columns:
    if "bleu" in column:
        res[column] = summary_dataset.loc[:, column].mean()

In [67]:
res

{'mbart_predictions_bleu': 0.0461162148004562,
 'mt5_predictions_bleu': 0.04063504403212619,
 'summarunner_predictions_bleu': 0.04393171673480969,
 'mbart_predictions_bleu.1': 0.0461162148004562,
 'mt5_predictions_bleu.1': 0.04063504403212619,
 'summarunner_predictions_bleu.1': 0.04393171673480969,
 'llama_7b_predictions_bleu': 0.03097015985650897,
 'starling_predictions_bleu': 0.024204768855108062,
 'yagpt_predictions_bleu': 0.024437143160110992,
 'yagpt3_predictions_bleu': 0.028146147471788834}