In [None]:
import os
import nltk
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForLanguageModeling
import numpy as np
import torch
import pandas as pd

from bert_score import score as bert_score_compute
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.chrf_score import corpus_chrf

nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

True

# Загрузка и подготовка датасета

In [None]:
VALID_FILE = '/content/drive/MyDrive/final_data/test.csv'
MODEL_NAME = 'ai-forever/rugpt3medium_based_on_gpt2'
OUTPUT_DIR = '/content/drive/MyDrive/checkpoints/ruGPT3medium_sum'

MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 64
BATCH_SIZE_PER_DEVICE = 2

In [8]:
rouge = Rouge()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [9]:
def preprocess_function(examples):
    inputs = []
    for t in examples['text']:
        prompt = 'Текст: ' + str(t) + ' Краткое содержание:'

        ids = tokenizer(prompt, truncation=True, max_length=MAX_SOURCE_LENGTH, add_special_tokens=False)['input_ids']
        inputs.append(ids)
    return {'input_ids': inputs}

In [11]:
# Подготовка датасета
raw_eval = load_dataset('csv', data_files={'validation': VALID_FILE})['validation']
print(f"Количество примеров: {len(raw_eval)}")
tokenized_eval = raw_eval.map(preprocess_function, batched=True, remove_columns=raw_eval.column_names)

Generating validation split: 0 examples [00:00, ? examples/s]

Количество примеров: 3228


Map:   0%|          | 0/3228 [00:00<?, ? examples/s]

# Подсчет метрик

In [10]:
def compute_metrics(preds, labels):
    # Декодируем в строки
    decoded = tokenizer.batch_decode(preds, skip_special_tokens=True)
    summaries = [d.split('Краткое содержание:')[-1].strip() for d in decoded]
    refs = [str(l).strip() for l in labels]

    filt_preds, filt_labels = [], []
    tok_preds, tok_refs = [], []
    for hyp, ref in zip(summaries, refs):
        if hyp and ref:
            filt_preds.append(hyp)
            filt_labels.append(ref)
            tok_preds.append(nltk.word_tokenize(hyp, language='russian'))
            tok_refs.append([nltk.word_tokenize(ref, language='russian')])

    results = {}
    # gen_len
    gen_lens = [len(h.split()) for h in filt_preds]
    results['gen_len'] = round(float(np.mean(gen_lens)) if gen_lens else 0.0, 4)

    # Rouge
    sc = rouge.get_scores(hyps=filt_preds, refs=filt_labels, avg=True)
    results['rouge1_f'] = round(sc['rouge-1']['f'] * 100, 4)
    results['rouge2_f'] = round(sc['rouge-2']['f'] * 100, 4)
    results['rougel_f'] = round(sc['rouge-l']['f'] * 100, 4)

    # BERTScore
    P, R, F1 = bert_score_compute(filt_preds, filt_labels, lang='ru', device=device)
    results['bert_score_f1'] = round(F1.mean().item() * 100, 4)

    # CHRF++
    results['chrf++'] = round(corpus_chrf(filt_labels, filt_preds) * 100, 4)

    # BLEU
    results['bleu'] = round(corpus_bleu(tok_refs, tok_preds) * 100, 4) if tok_preds else 0.0

    # METEOR
    meteor_scores = [meteor_score(r, p) for p, r in zip(tok_preds, tok_refs)]
    results['meteor'] = round(np.mean(meteor_scores) * 100, 4) if meteor_scores else 0.0

    return results

In [12]:
checkpoint_folders = sorted(
    [os.path.join(OUTPUT_DIR, d) for d in os.listdir(OUTPUT_DIR) if d.startswith('checkpoint-') and os.path.isdir(os.path.join(OUTPUT_DIR, d))],
    key=lambda x: int(x.split('-')[-1]))

checkpoint_folders

['/content/drive/MyDrive/checkpoints/ruGPT3medium_sum/checkpoint-1',
 '/content/drive/MyDrive/checkpoints/ruGPT3medium_sum/checkpoint-2',
 '/content/drive/MyDrive/checkpoints/ruGPT3medium_sum/checkpoint-3']

In [None]:
all_results1 = []

device = 'cuda' if torch.cuda.is_available() else 'cpu'

for ckpt_path in checkpoint_folders:
    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(ckpt_path).to(device)
    model.eval()

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)

    args = Seq2SeqTrainingArguments(
        output_dir='./tmp_eval_output',
        per_device_eval_batch_size=BATCH_SIZE_PER_DEVICE,
        predict_with_generate=True,
        generation_max_length=MAX_SOURCE_LENGTH + MAX_TARGET_LENGTH,
        generation_num_beams=4,
        fp16=torch.cuda.is_available(),
        report_to='none'
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        data_collator=data_collator
    )

    # Предсказания
    pred_out = trainer.predict(test_dataset=tokenized_eval)
    raw_preds = pred_out.predictions 

    # Очищаем невалидные токены
    vocab_size = tokenizer.vocab_size
    pad_id = tokenizer.pad_token_id
    preds = np.where((raw_preds >= 0) & (raw_preds < vocab_size), raw_preds, pad_id).astype(int)

    # Считаем метрики
    labels = raw_eval['summary']
    m = compute_metrics(preds, labels)

    print(f"Метрики для {os.path.basename(ckpt_path)}:")
    for k, v in m.items():
        print(f"  {k}: {v}")

    result_entry = {'checkpoint': os.path.basename(ckpt_path)}
    result_entry.update(m)
    all_results1.append(result_entry)


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Метрики для checkpoint-1:
  gen_len: 16.8448
  rouge1_f: 27.6208
  rouge2_f: 13.1655
  rougel_f: 24.7906
  bert_score_f1: 76.6617
  chrf++: 34.5505
  bleu: 10.2944
  meteor: 27.2294


Метрики для checkpoint-2:
  gen_len: 17.0465
  rouge1_f: 27.5998
  rouge2_f: 13.1305
  rougel_f: 24.8251
  bert_score_f1: 76.8588
  chrf++: 35.2836
  bleu: 10.349
  meteor: 27.5693




Метрики для checkpoint-3:
  gen_len: 17.3789
  rouge1_f: 28.0156
  rouge2_f: 13.5553
  rougel_f: 25.2542
  bert_score_f1: 76.9728
  chrf++: 35.6485
  bleu: 10.6785
  meteor: 27.8397


In [None]:
checkpoint_folders = sorted(
    [os.path.join(OUTPUT_DIR, d) for d in os.listdir(OUTPUT_DIR) if d.startswith('checkpoint-') and os.path.isdir(os.path.join(OUTPUT_DIR, d))],
    key=lambda x: int(x.split('-')[-1]))

checkpoint_folders

['/content/drive/MyDrive/checkpoints/ruGPT3medium_sum/checkpoint-5',
 '/content/drive/MyDrive/checkpoints/ruGPT3medium_sum/checkpoint-10',
 '/content/drive/MyDrive/checkpoints/ruGPT3medium_sum/checkpoint-20']

In [None]:
all_results2 = []

device = 'cuda' if torch.cuda.is_available() else 'cpu'

for ckpt_path in checkpoint_folders:
    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(ckpt_path).to(device)
    model.eval()

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)

    args = Seq2SeqTrainingArguments(
        output_dir='./tmp_eval_output',
        per_device_eval_batch_size=BATCH_SIZE_PER_DEVICE,
        predict_with_generate=True,
        generation_max_length=MAX_SOURCE_LENGTH + MAX_TARGET_LENGTH,
        generation_num_beams=4,
        fp16=torch.cuda.is_available(),
        report_to='none'
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        data_collator=data_collator
    )

    # Предсказания
    pred_out = trainer.predict(test_dataset=tokenized_eval)
    raw_preds = pred_out.predictions  

    # Очищаем невалидные токены
    vocab_size = tokenizer.vocab_size
    pad_id = tokenizer.pad_token_id
    preds = np.where((raw_preds >= 0) & (raw_preds < vocab_size), raw_preds, pad_id).astype(int)

    # Считаем метрики
    labels = raw_eval['summary']
    m = compute_metrics(preds, labels)

    print(f"Метрики для {os.path.basename(ckpt_path)}:")
    for k, v in m.items():
        print(f"  {k}: {v}")

    result_entry = {'checkpoint': os.path.basename(ckpt_path)}
    result_entry.update(m)
    all_results2.append(result_entry)


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]



Метрики для checkpoint-5:
  gen_len: 17.3086
  rouge1_f: 28.3472
  rouge2_f: 13.6715
  rougel_f: 25.6117
  bert_score_f1: 77.106
  chrf++: 36.055
  bleu: 10.9312
  meteor: 28.2693




Метрики для checkpoint-10:
  gen_len: 17.9393
  rouge1_f: 28.5318
  rouge2_f: 13.7822
  rougel_f: 25.7967
  bert_score_f1: 77.0877
  chrf++: 37.1313
  bleu: 11.3196
  meteor: 28.8166




Метрики для checkpoint-20:
  gen_len: 23.2076
  rouge1_f: 27.0257
  rouge2_f: 12.5114
  rougel_f: 24.4839
  bert_score_f1: 76.175
  chrf++: 38.8347
  bleu: 10.7231
  meteor: 29.0458


In [None]:
results_df1 = pd.DataFrame(all_results1)
results_df1

Unnamed: 0,checkpoint,gen_len,rouge1_f,rouge2_f,rougel_f,bert_score_f1,chrf++,bleu,meteor
0,checkpoint-1,16.8448,27.6208,13.1655,24.7906,76.6617,34.5505,10.2944,27.2294
1,checkpoint-2,17.0465,27.5998,13.1305,24.8251,76.8588,35.2836,10.349,27.5693
2,checkpoint-3,17.3789,28.0156,13.5553,25.2542,76.9728,35.6485,10.6785,27.8397


In [None]:
results_df1.to_csv('metrics_rugpt3_medium1.csv', index=False)

In [None]:
results_df2 = pd.DataFrame(all_results2)
results_df2

Unnamed: 0,checkpoint,gen_len,rouge1_f,rouge2_f,rougel_f,bert_score_f1,chrf++,bleu,meteor
0,checkpoint-5,17.3086,28.3472,13.6715,25.6117,77.106,36.055,10.9312,28.2693
1,checkpoint-10,17.9393,28.5318,13.7822,25.7967,77.0877,37.1313,11.3196,28.8166
2,checkpoint-20,23.2076,27.0257,12.5114,24.4839,76.175,38.8347,10.7231,29.0458


In [None]:
results_df2.to_csv('metrics_rugpt3_medium2.csv', index=False)