Lets see whether simple mt5 model overfits in small data samples

In [161]:
from transformers import (AutoTokenizer,
                          AutoModelForSeq2SeqLM,
                          Seq2SeqTrainer,
                          Seq2SeqTrainingArguments,
                          DataCollatorForSeq2Seq
                          )
from datasets import load_dataset
import evaluate
import numpy as np
import torch
import warnings
import wandb
warnings.filterwarnings("ignore")
import random
# Set all seeds for reproducibility
random.seed(100)
np.random.seed(100)
torch.manual_seed(100)
torch.cuda.manual_seed_all(100)
# Load aryal's dataset from hf
ds = load_dataset("sumitaryal/nepali_grammatical_error_correction")

In [162]:
ds

DatasetDict({
    train: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 7723971
    })
    valid: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 406525
    })
})

In [163]:
# select randomly few samples from train 
# split further into train and valid dataset
small_dataset = ds["train"].shuffle(seed=42).select(range(12500))
small_dataset = small_dataset.train_test_split(test_size=0.1, seed=42)
small_dataset["valid"] = small_dataset["test"] # Rename the split in the DatasetDict
del small_dataset["test"]
small_dataset

DatasetDict({
    train: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 11250
    })
    valid: Dataset({
        features: ['incorrect_sentence', 'correct_sentence'],
        num_rows: 1250
    })
})

In [164]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)


In [165]:
prefix = "рд╡рд╛рдХреНрдп рд╕рдЪреНрдпрд╛рдЙрдиреБрд╣реЛрд╕реН: "

def preprocess(batch):
    
    inputs = [prefix + inp for inp in batch["incorrect_sentence"]]

    # tokenize input (incorrect)
    input_encodings = tokenizer(
        inputs, 
        max_length=128,
        truncation=True 
    )
    # tokenize target (correct)
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(
            batch["correct_sentence"], 
            max_length=128,
            truncation=True
        )

    # set labels for seq2seq training                           # for seq2deq models, the "labels" are the token IDs of the target sequence
    input_encodings["labels"] = target_encodings["input_ids"]   

    return input_encodings

dataset_encoded = small_dataset.map(preprocess, batched=True) 


Map: 100%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИ| 11250/11250 [00:05<00:00, 2054.96 examples/s]
Map: 100%|тЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИтЦИ| 1250/1250 [00:00<00:00, 2456.12 examples/s]


In [166]:
#pytorch model expects in tensor format
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

In [167]:
import evaluate
import numpy as np
from collections import Counter
import numpy as np
import re

def tokenize_nepali(text):
    """Tokenizes Nepali text: splits on spaces and removes punctuation."""
    # Remove punctuation commonly used in Nepali
    text = re.sub(r"[ред,!?]", "", text)
    return text.strip().split()

def gleu_sentence(reference, prediction, max_n=4):
    """
    Compute sentence-level GEC-GLEU.
    Returns a score between 0 and 1.
    """
    ref_tokens = tokenize_nepali(reference)
    hyp_tokens = tokenize_nepali(prediction)
    
    # Adjust max_n for short sentences
    max_n = min(max_n, len(ref_tokens), len(hyp_tokens))
    if max_n == 0:
        return 0.0  # empty sentence
    
    scores = []
    for n in range(1, max_n+1):
        ref_ngrams = Counter([tuple(ref_tokens[i:i+n]) for i in range(len(ref_tokens)-n+1)])
        hyp_ngrams = Counter([tuple(hyp_tokens[i:i+n]) for i in range(len(hyp_tokens)-n+1)])
        overlap = sum((ref_ngrams & hyp_ngrams).values())
        precision = overlap / max(1, sum(hyp_ngrams.values()))
        recall = overlap / max(1, sum(ref_ngrams.values()))
        scores.append(min(precision, recall))
    return sum(scores) / max_n

def corpus_gec_gleu(references, predictions):
    """
    Compute corpus-level GEC-GLEU.
    `references` can be a list of strings or a list of single-item lists.
    """
    # Flatten single-reference lists
    refs_flat = [r[0] if isinstance(r, list) else r for r in references]
    
    scores = [gleu_sentence(r, p) for r, p in zip(refs_flat, predictions)]
    return float(np.mean(scores))

# Load metrics once
bleu_metric = evaluate.load("sacrebleu")
chrf_metric = evaluate.load("chrf")
bertscore_metric = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    """
    Compute BLEU, chrF, Correction Accuracy, and BERTScore for Nepali GEC.
    Handles both token IDs and plain text predictions.
    """
    predictions, labels = eval_pred

    # --- Handle tuple outputs (e.g., logits + labels) ---
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # --- If preds/labels are lists of strings, skip decoding ---
    if isinstance(predictions[0], str) and isinstance(labels[0], str):
        preds_clean = [p.strip() for p in predictions]
        refs_clean = [r.strip() for r in labels]
    else:
        # Convert to numpy arrays
        predictions = np.array(predictions)
        labels = np.array(labels)

        # Handle logits (vocab dimension)
        if predictions.ndim == 3:
            predictions = predictions.argmax(axis=-1)

        # Replace -100 with pad_token_id
        predictions = np.where(predictions == -100, tokenizer.pad_token_id, predictions)
        labels = np.where(labels == -100, tokenizer.pad_token_id, labels)

        # Decode
        preds = tokenizer.batch_decode(predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        refs = tokenizer.batch_decode(labels, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        preds_clean = [p.strip() for p in preds]
        refs_clean = [r.strip() for r in refs]

    # --- Format for metrics ---
    references = [[r] for r in refs_clean]
    metrics = {}

    # --- BLEU ---
    try:
        non_empty_indices = [i for i, (p, r) in enumerate(zip(preds_clean, refs_clean)) if p and r]
        if non_empty_indices:
            preds_bleu = [preds_clean[i] for i in non_empty_indices]
            refs_bleu = [[refs_clean[i]] for i in non_empty_indices]
            bleu_result = bleu_metric.compute(predictions=preds_bleu, references=refs_bleu)
            metrics["bleu"] = bleu_result["score"]
        else:
            metrics["bleu"] = 0.0
    except Exception as e:
        print(f"BLEU computation failed: {e}")
        metrics["bleu"] = 0.0

    # --- chrF ---
    try:
        chrf_result = chrf_metric.compute(predictions=preds_clean, references=refs_clean)
        metrics["chrf"] = chrf_result["score"]
    except Exception as e:
        print(f"chrF computation failed: {e}")
        metrics["chrf"] = 0.0

    # --- Correction Accuracy ---
    try:
        exact_matches = np.mean([p == r for p, r in zip(preds_clean, refs_clean)])
        metrics["correction_accuracy"] = exact_matches
    except Exception as e:
        print(f"Correction accuracy computation failed: {e}")
        metrics["correction_accuracy"] = 0.0

    # # --- BERTScore ---
    # try:
    #     non_empty_indices_bert = [i for i, (p, r) in enumerate(zip(preds_clean, refs_clean)) if p and r]
    #     if non_empty_indices_bert:
    #         preds_bert = [preds_clean[i] for i in non_empty_indices_bert]
    #         refs_bert = [refs_clean[i] for i in non_empty_indices_bert]
    #         bertscore_result = bertscore_metric.compute(
    #             predictions=preds_bert,
    #             references=refs_bert,
    #             lang="ne",
    #             model_type="microsoft/mdeberta-v3-base"
    #         )
    #         metrics["bertscore_f1"] = float(np.mean(bertscore_result["f1"]))
    #     else:
    #         metrics["bertscore_f1"] = 0.0
    # except Exception as e:
    #     print(f"BERTScore computation failed: {e}")
    #     metrics["bertscore_f1"] = 0.0
        
    # --- GLEU (SacreBLEU) ---
    try:


        gleu_score = corpus_gec_gleu(refs_clean, preds_clean)


        metrics["gleu"] = gleu_score
    except Exception as e:
        print("GLEU failed:", e)
        metrics["gleu"] = 0.0

    # --- Print one sample for sanity ---
    if len(preds_clean) > 0:
        print(f"ЁЯФН Sample - Pred: '{preds_clean[0][:50]}...' | Ref: '{refs_clean[0][:50]}...' | Match: {preds_clean[0] == refs_clean[0]}")

    return metrics

preds = ["рдореЗрд░реЛ рдирд╛рдо рд╕рдиреНрддреЛрд╖ рд╣реЛ ред", "рдо рд╕реНрдХреБрд▓ рдЬрд╛рдиреНрдЫреБ ред", "рдо рдЦрд╛рдирд╛ рдЦрд╛рдиреНрдЫреБ ред"]
refs  = ["рдореЗрд░реЛ рдирд╛рдо рд╕рдиреНрддреЛрд╖ рд╣реЛ ред", "рдо рд╕реНрдХреБрд▓ рдЬрд╛рдиреНрдЫреБ ред", "рдо рдЦрд╛рдирд╛ рдЦрд╛рдиреНрдЫреБ ред"]
compute_metrics((preds, refs))

ЁЯФН Sample - Pred: 'рдореЗрд░реЛ рдирд╛рдо рд╕рдиреНрддреЛрд╖ рд╣реЛ ред...' | Ref: 'рдореЗрд░реЛ рдирд╛рдо рд╕рдиреНрддреЛрд╖ рд╣реЛ ред...' | Match: True


{'bleu': 100.00000000000004,
 'chrf': 100.0,
 'correction_accuracy': np.float64(1.0),
 'gleu': 1.0}

In [168]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [169]:
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_8bit=True,
                                        llm_int8_threshold=6.0,  
                                        llm_int8_has_fp16_weight=False )
model = AutoModelForSeq2SeqLM.from_pretrained(model_id,
                                              quantization_config=quantization_config,
                                            #   torch_dtype=torch.float16, # disable if quantization used
                                              device_map="auto")

In [170]:
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
# from peft import unload
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q", "k", "v", "o", "wi", "wo"],
    lora_dropout=0.05, # disable for overfit test
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)
model = prepare_model_for_kbit_training(model)
# model.gradient_checkpointing_enable()
# model = unload(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
# model.config.use_cache = False  # Required for gradient checkpointing


trainable params: 1,769,472 || all params: 301,946,240 || trainable%: 0.5860


In [171]:
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer, 
    # model=model,
    pad_to_multiple_of=8,  # Optional: for better performance
    return_tensors="pt", 
    padding=True)


In [172]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback, TrainerCallback
from math import ceil
import os
import wandb

wandb.finish()
wandb.init(project="nepali-grammar-correction", name="mt5-nepali")
run_id = wandb.run.id

batch_size = 32
num_train_epochs = 5
gradient_accumulation_steps = 2
learning_rate = 3e-3
weight_decay = 0.01
lr_scheduler_type = "linear"
steps_per_epoch = ceil(len(dataset_encoded["train"]) // (batch_size * gradient_accumulation_steps))    # no. of steps per epoch # log once per epoch
# logging_steps = max(1, steps_per_epoch // 20)                                                     # Log 20 times per epoch
eval_steps = max(1, steps_per_epoch) // 2           # Log 2 times per epoch
num_training_steps = steps_per_epoch * num_train_epochs
warmup_steps = int(0.05 * num_training_steps)


model_name = f"{model_id}-finetuned-gec"

# Create directories
os.makedirs(f"../outputs/checkpoints/{model_name}", exist_ok=True)
os.makedirs("../outputs/best_model", exist_ok=True)
os.makedirs("../outputs/logs", exist_ok=True)

training_args = Seq2SeqTrainingArguments(output_dir=f"../outputs/checkpoints/{model_name}",
                                         num_train_epochs=num_train_epochs,

                                         # Memory Optimization:
                                         per_device_train_batch_size=batch_size,
                                         per_device_eval_batch_size=batch_size,
                                         gradient_accumulation_steps=gradient_accumulation_steps,  # Simulate larger batch size eg: 8 * 2 = 16
                                         fp16=False,                                                # Use mixed precision if GPU supports it
                                         dataloader_pin_memory=True,                        # тЬЕ Faster data loading
                                         dataloader_num_workers=4,                          # тЬЕ Parallel data loading

                                         gradient_checkpointing=False,                      # тЬЕ Disable for speed
                                         
                                         # Logging & Saving:
                                         logging_dir="../outputs/logs",
                                         logging_steps=1,    # log the training loss and metrics every X steps
                                         eval_strategy="epoch",          # performs evaluation per epoch
                                        #  eval_steps=eval_steps,
                                         save_strategy="epoch",          # saves model checkpoint per epoch
                                        #  save_steps=230000,
                                         save_total_limit=2,             # keep last 2 checkpoints for safety
                                         overwrite_output_dir=True,      # Overwrite previous runs

                                         # Best Model saving:
                                         load_best_model_at_end=True,        # Load the best model at the end
                                         metric_for_best_model="eval_loss",   # Use eval_loss to determine best model
                                         greater_is_better=False,            # Lower eval_loss is better

                                         # performance
                                         warmup_steps=warmup_steps,             # Gradually increases LR at start
                                         learning_rate=learning_rate,
                                         weight_decay=weight_decay,             # L2 regularization
                                         lr_scheduler_type=lr_scheduler_type,
                                         max_grad_norm=1.0,                     # Prevent exploding gradients
                                         optim="paged_adamw_8bit",              # Better optimizer for quantized models


                                         # Seq2seq specific:
                                         predict_with_generate=True,    # essential for seq2seq , If not set then metrics will be computed on meaningless logits
                                         generation_max_length=128,      # Max output length
                                         generation_num_beams=1,        # 1=greedy, 4=beam search (slower but better)

                                         report_to="wandb",          # This enables automatic logging
                                         run_name="mt5-nepali",
                                         push_to_hub=False,                       # save the model to HF
                                         seed=42,
                                         data_seed=42,
                                         )





[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
eval/bleu,тЦБтЦЕтЦДтЦЕтЦЗтЦИ
eval/chrf,тЦБтЦГтЦЕтЦЗтЦИтЦИ
eval/correction_accuracy,тЦБтЦБтЦБтЦБтЦБтЦБ
eval/gleu,тЦБтЦГтЦЕтЦЖтЦИтЦИ
eval/loss,тЦИтЦДтЦВтЦБтЦБтЦБ
eval/model_preparation_time,тЦБтЦБтЦБтЦБтЦБтЦБ
eval/runtime,тЦБтЦЕтЦИтЦИтЦЗтЦИ
eval/samples_per_second,тЦИтЦВтЦБтЦБтЦБтЦБ
eval/steps_per_second,тЦИтЦВтЦБтЦБтЦБтЦБ
train/epoch,тЦБтЦБтЦВтЦВтЦВтЦВтЦВтЦВтЦВтЦВтЦГтЦГтЦГтЦГтЦГтЦДтЦДтЦДтЦДтЦДтЦЕтЦЕтЦЕтЦЕтЦЕтЦЕтЦЕтЦЖтЦЖтЦЖтЦЗтЦЗтЦЗтЦЗтЦЗтЦЗтЦЗтЦИтЦИтЦИ

0,1
eval/bleu,9.83785
eval/chrf,44.45996
eval/correction_accuracy,0
eval/gleu,0.12065
eval/loss,1.41266
eval/model_preparation_time,0.0036
eval/runtime,239.1088
eval/samples_per_second,1.046
eval/steps_per_second,0.067
total_flos,317706831396864.0




In [173]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["valid"],  # same dataset for overfitting here
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
               EarlyStoppingCallback(early_stopping_patience=3)]
      
)



In [174]:
# Complete safety check
def safe_training_check(trainer):
    """Comprehensive pre-training safety check"""
    print(" Running pre-training safety checks...")

    # 1. Check model is on correct device
    print(f"Model device: {next(trainer.model.parameters()).device}")

    # 2. Check dataset sizes
    print(f"Train dataset size: {len(trainer.train_dataset)}")
    print(f"Eval dataset size: {len(trainer.eval_dataset)}")

    # 3. Test data loading
    try:
        sample_batch = next(iter(trainer.get_train_dataloader()))
        print(" Data loading works")
        # print(f"Batch keys: {sample_batch.keys()}")
    except Exception as e:
        print(f" Data loading failed: {e}")
        return False

    # 4. Test evaluation
    try:
        trainer.model.eval()    # Set to evaluation mode
        print(" Performing evaluation check...")
        eval_results = trainer.evaluate()
        print(" Evaluation successful")
        print(f"Initial metrics: {eval_results}")
        return True
    except Exception as e:
        print(f" Evaluation failed: {e}")
        return False

# Usage
if safe_training_check(trainer):
    print("\n" + "="*60)
    print("тЬЕ All checks passed! Starting training...")
    print("="*60)
    trainer.train()
    print("тЬЕ Training complete!")

else:
    print(" Fix issues before training!")

 Running pre-training safety checks...
Model device: cuda:0
Train dataset size: 11250
Eval dataset size: 1250
 Data loading works
 Performing evaluation check...


ЁЯФН Sample - Pred: '<extra_id_0> ред...' | Ref: 'рдпрддрд╛рдХрд╛ рджрд╢рдХрд╣рд░реВрдорд╛ рднрдПрдХрд╛ рдкреНрд░рддреНрдпреЗрдХ рдЬрдирдЖрдиреНрджреЛрд▓рдирдорд╛ рдиреЗрдкрд╛рд▓реА рдХрдо...' | Match: False
 Evaluation successful
Initial metrics: {'eval_loss': 14.842653274536133, 'eval_model_preparation_time': 0.0032, 'eval_bleu': 0.2091458306127584, 'eval_chrf': 1.1538016829513393, 'eval_correction_accuracy': 0.0, 'eval_gleu': 0.0062149767114550395, 'eval_runtime': 79.8617, 'eval_samples_per_second': 15.652, 'eval_steps_per_second': 0.501}

тЬЕ All checks passed! Starting training...


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.67 GiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 10.31 GiB is allocated by PyTorch, and 1007.83 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import gc
import torch

# del model       # or del comet_model
gc.collect()
torch.cuda.empty_cache()

Inference

In [None]:
def correct_grammar_simple(text):
    # Add task prefix (use the same format as during training)
    input_text = f"рд╡рд╛рдХреНрдп рд╕реБрдзрд╛рд░реНрдиреБрд╣реЛрд╕реН: {text}"
    
    # Tokenize
    inputs = tokenizer(
        input_text,
        return_tensors = "pt",
        truncation = True,
        padding=False
    ).to(device)
    
    # Generate correction
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            max_length=128,
            # num_beams=5,
            # repetition_penalty=2.5,
            # length_penalty=1.0,
            # temperature=0.8
        )
        
    # Decode output
    corrected_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected_text

# Test
test_sentence = "рдирдЧрд░рдкрд╛рд▓рд┐рдХрд╛ рдХрд╕реНрддреЛ рдХрд┐рд╕рд┐рдордХреЛ рдкрд░реНрдпрдЯрдХ рд▓реНрдпрд╛рдЙрди рд╕рдХреНрдЫреЗ "
corrected = correct_grammar_simple(test_sentence)
print(f"Original: {test_sentence}")
print(f"Corrected: {corrected}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


NameError: name 'device' is not defined

In [None]:
def correct_batch(texts, batch_size=8):
    """
    Correct grammar for multiple sentences
    """
    corrected_texts = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        
        # Add prefix to each text
        input_texts = [f"рд╡рд╛рдХреНрдп рд╕реБрдзрд╛рд░реНрдиреБрд╣реЛрд╕реН: {text}" for text in batch_texts]
        
        
    
        # Tokenize
        inputs = tokenizer(
            input_texts,
            return_tensors = "pt",
            truncation = True,
            padding=True
        ).to(device)
        
        # Generate correction
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                # attention_mask=inputs.attention_mask,
                max_length=128,
                num_beams=5,
                repetition_penalty=2.5
            )
            
        # Decode batch
        batch_corrected = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        corrected_texts.extend(batch_corrected)
        
    return corrected_texts
        
    
test_sentences = small_dataset["train"]["incorrect_sentence"][:]
labels = small_dataset["train"]["correct_sentence"][:]
corrected_sentences = correct_batch(test_sentences)
for orig, corr, lab in zip(test_sentences, corrected_sentences, labels):
    print(f"Original:  {orig}")
    print(f"Corrected: {corr}")
    print(f"label:     {lab}")
    print("---")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Original:  рдЦрд╛рдирдкрд┐рди рд╣реБрдиреНрде ред
Corrected: рдЦрд╛рдирдкрд┐рди рд╣реБрдиреНрдереНрдпреЛ ред
label:     рдЦрд╛рдирдкрд┐рди рд╣реБрдиреНрдереНрдпреЛ ред
---
Original:  рдЖрдлреНрдиреИ рдиреЗрддреГрддреНрд╡рдорд╛ рдРрддрд┐рд╣рд╛рд╕рд┐рдХ рд╕рдВрд╡рд┐рдзрд╛рди рдЬрд╛рд░реА рдЧрд░реНрджреИ рд╕рдВрд╡рд┐рдзрд╛рдирдХреЛ рд╕рдлрд▓ рдХрд╛рд░реНрдпрд╛рдиреНрд╡рдпрди рд░ рд╕рд░реНрд╡рд╕реНрд╡реАрдХрд╛рд░реНрдпрддрд╛рдХреЛ рджрд╛рдпрд░рд╛рд▓рд╛рдИ рдлрд░рд╛рдХрд┐рд▓реЛ рдмрдиреЗ рдХрд╛рдо рдХрдо рдЪреБрдиреМрддреАрдкреВрд░реНрдг рдерд┐рдПрди ред
Corrected: рдЖрдлреНрдиреИ рдиреЗрддреГрддреНрд╡рдорд╛ рдРрддрд┐рд╣рд╛рд╕рд┐рдХ рд╕рдВрд╡рд┐рдзрд╛рди рдЬрд╛рд░реА рдЧрд░реНрджреИ рд╕рдВрд╡рд┐рдзрд╛рдирдХреЛ рд╕рдлрд▓ рдХрд╛рд░реНрдпрд╛рдиреНрд╡рдпрди рд░ рд╕рд░реНрд╡рд╕реНрд╡реАрдХрд╛рд░реНрдпрддрд╛рдХреЛ рджрд╛рдпрд░рд╛рд▓рд╛рдИ рдлрд░рд╛рдХрд┐рд▓реЛ рдмрдирд╛рдЙрдиреЗ рдХрд╛рдо рдХрдо рдЪреБрдиреМрддреАрдкреВрд░реНрдг рдерд┐рдПрди ред
label:     рдЖрдлреНрдиреИ рдиреЗрддреГрддреНрд╡рдо

In [None]:
compute_metrics((corrected_sentences, labels))

NameError: name 'corrected_sentences' is not defined