In [None]:
import os
import nltk
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForLanguageModeling
import numpy as np
import torch
import pandas as pd

from bert_score import score as bert_score_compute
from rouge import Rouge
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.chrf_score import corpus_chrf

nltk.download('punkt_tab', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

True

# Загрузка и подготовка датасета

In [None]:
VALID_FILE = '/content/drive/MyDrive/final_data/test.csv'
MODEL_NAME = 'ai-forever/rugpt3small_based_on_gpt2'
OUTPUT_DIR = '/content/drive/MyDrive/checkpoints/ruGPT3_sum'

MAX_SOURCE_LENGTH = 512
MAX_TARGET_LENGTH = 64
BATCH_SIZE_PER_DEVICE = 2

In [11]:
rouge = Rouge()

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

In [12]:
def preprocess_function(examples):
    inputs = []
    for t in examples['text']:
        prompt = 'Текст: ' + str(t) + ' Краткое содержание:'

        ids = tokenizer(prompt, truncation=True, max_length=MAX_SOURCE_LENGTH, add_special_tokens=False)['input_ids']
        inputs.append(ids)
    return {'input_ids': inputs}

In [None]:
raw_eval = load_dataset('csv', data_files={'validation': VALID_FILE})['validation']
print(f"Количество примеров: {len(raw_eval)}")
tokenized_eval = raw_eval.map(preprocess_function, batched=True, remove_columns=raw_eval.column_names)

Generating validation split: 0 examples [00:00, ? examples/s]

Количество примеров: 3228


Map:   0%|          | 0/3228 [00:00<?, ? examples/s]

# Подсчёт метрик

In [13]:
def compute_metrics(preds, labels):
    # Декодируем в строки
    decoded = tokenizer.batch_decode(preds, skip_special_tokens=True)
    summaries = [d.split('Краткое содержание:')[-1].strip() for d in decoded]
    refs = [str(l).strip() for l in labels]

    filt_preds, filt_labels = [], []
    tok_preds, tok_refs = [], []
    for hyp, ref in zip(summaries, refs):
        if hyp and ref:
            filt_preds.append(hyp)
            filt_labels.append(ref)
            tok_preds.append(nltk.word_tokenize(hyp, language='russian'))
            tok_refs.append([nltk.word_tokenize(ref, language='russian')])

    results = {}
    # gen_len
    gen_lens = [len(h.split()) for h in filt_preds]
    results['gen_len'] = round(float(np.mean(gen_lens)) if gen_lens else 0.0, 4)

    # Rouge
    sc = rouge.get_scores(hyps=filt_preds, refs=filt_labels, avg=True)
    results['rouge1_f'] = round(sc['rouge-1']['f'] * 100, 4)
    results['rouge2_f'] = round(sc['rouge-2']['f'] * 100, 4)
    results['rougel_f'] = round(sc['rouge-l']['f'] * 100, 4)

    # BERTScore
    P, R, F1 = bert_score_compute(filt_preds, filt_labels, lang='ru', device=device)
    results['bert_score_f1'] = round(F1.mean().item() * 100, 4)

    # CHRF++
    results['chrf++'] = round(corpus_chrf(filt_labels, filt_preds) * 100, 4)

    # BLEU
    results['bleu'] = round(corpus_bleu(tok_refs, tok_preds) * 100, 4) if tok_preds else 0.0

    # METEOR
    meteor_scores = [meteor_score(r, p) for p, r in zip(tok_preds, tok_refs)]
    results['meteor'] = round(np.mean(meteor_scores) * 100, 4) if meteor_scores else 0.0

    return results

In [15]:
checkpoint_folders = sorted(
    [os.path.join(OUTPUT_DIR, d) for d in os.listdir(OUTPUT_DIR) if d.startswith('checkpoint-') and os.path.isdir(os.path.join(OUTPUT_DIR, d))],
    key=lambda x: int(x.split('-')[-1]))

checkpoint_folders

['/content/drive/MyDrive/checkpoints/ruGPT3_sum/checkpoint-1',
 '/content/drive/MyDrive/checkpoints/ruGPT3_sum/checkpoint-10',
 '/content/drive/MyDrive/checkpoints/ruGPT3_sum/checkpoint-20',
 '/content/drive/MyDrive/checkpoints/ruGPT3_sum/checkpoint-30',
 '/content/drive/MyDrive/checkpoints/ruGPT3_sum/checkpoint-40']

In [None]:
all_results = []
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for ckpt_path in checkpoint_folders:
    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(ckpt_path).to(device)
    model.eval()

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)

    args = Seq2SeqTrainingArguments(
        output_dir='./tmp_eval_output',
        per_device_eval_batch_size=BATCH_SIZE_PER_DEVICE,
        predict_with_generate=True,
        generation_max_length=MAX_SOURCE_LENGTH + MAX_TARGET_LENGTH,
        generation_num_beams=4,
        fp16=torch.cuda.is_available(),
        report_to='none'
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        data_collator=data_collator
    )

    # Предсказания
    pred_out = trainer.predict(test_dataset=tokenized_eval)
    raw_preds = pred_out.predictions  

    # Очищаем невалидные токены
    vocab_size = tokenizer.vocab_size
    pad_id = tokenizer.pad_token_id
    preds = np.where((raw_preds >= 0) & (raw_preds < vocab_size), raw_preds, pad_id).astype(int)

    # Считаем метрики
    labels = raw_eval['summary']
    m = compute_metrics(preds, labels)

    print(f"Метрики для {os.path.basename(ckpt_path)}:")
    for k, v in m.items():
        print(f"  {k}: {v}")

    result_entry = {'checkpoint': os.path.basename(ckpt_path)}
    result_entry.update(m)
    all_results.append(result_entry)


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Метрики для checkpoint-1:
  gen_len: 24.9628
  rouge1_f: 22.309
  rouge2_f: 9.3042
  rougel_f: 20.1036
  bert_score_f1: 73.3188
  chrf++: 29.3804
  bleu: 6.0272
  meteor: 21.5824


Метрики для checkpoint-10:
  gen_len: 22.3563
  rouge1_f: 24.6596
  rouge2_f: 11.2175
  rougel_f: 22.3052
  bert_score_f1: 74.7099
  chrf++: 32.5906
  bleu: 8.6051
  meteor: 24.6576


Метрики для checkpoint-20:
  gen_len: 19.0366
  rouge1_f: 25.4454
  rouge2_f: 11.6954
  rougel_f: 23.0045
  bert_score_f1: 75.4493
  chrf++: 32.9665
  bleu: 9.3725
  meteor: 25.2293


Метрики для checkpoint-30:
  gen_len: 19.3306
  rouge1_f: 25.7735
  rouge2_f: 11.9392
  rougel_f: 23.3381
  bert_score_f1: 75.5763
  chrf++: 33.2885
  bleu: 9.5952
  meteor: 25.5423


Метрики для checkpoint-40:
  gen_len: 18.2206
  rouge1_f: 25.7682
  rouge2_f: 11.9178
  rougel_f: 23.3153
  bert_score_f1: 75.645
  chrf++: 33.3899
  bleu: 9.5557
  meteor: 25.6184


In [17]:
results_df = pd.DataFrame(all_results)
results_df

Unnamed: 0,checkpoint,gen_len,rouge1_f,rouge2_f,rougel_f,bert_score_f1,chrf++,bleu,meteor
0,checkpoint-1,24.9628,22.309,9.3042,20.1036,73.3188,29.3804,6.0272,21.5824
1,checkpoint-10,22.3563,24.6596,11.2175,22.3052,74.7099,32.5906,8.6051,24.6576
2,checkpoint-20,19.0366,25.4454,11.6954,23.0045,75.4493,32.9665,9.3725,25.2293
3,checkpoint-30,19.3306,25.7735,11.9392,23.3381,75.5763,33.2885,9.5952,25.5423
4,checkpoint-40,18.2206,25.7682,11.9178,23.3153,75.645,33.3899,9.5557,25.6184


In [None]:
results_df.to_csv('metrics_rugpt3small.csv', index=False)