In [None]:
%pip install transformers torch datasets sentencepiece
%pip install protobuf
%pip install -U "transformers[torch]"
%pip install -U "accelerate>=0.26.0"
%pip install sacrebleu
%pip install evaluate
%pip install sacrebleu
%pip install evaluate
%pip install rouge_score
%pip install bert_score
%pip install tensorboard
%pip install accelerate>=0.26.0

[0mINFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
[0m[31mERROR: Could not find a version that satisfies the requirement nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64" (from torch) (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64"[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
[0m

In [None]:
# # Load model and tokenizer
# model_name = "t5-small"
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)

# tokenizer.src_lang = "ta_IN"
# tokenizer.tgt_lang = "si_LK"

In [None]:
# Model Architecture Tweaks

# Load model and tokenizer.
# T5Tokenizer is imported here as well: this cell uses it, and relying on an
# import made in a later cell raises NameError on a fresh kernel.
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer

# Try a larger model if possible
model_name = "t5-small"  # or "t5-large" if you have resources
tokenizer = T5Tokenizer.from_pretrained(model_name)
# NOTE(review): confirm that this checkpoint's vocabulary covers Tamil and
# Sinhala script; if it does not, the text would be encoded as unknown tokens.

# Start from the checkpoint's own config and override selected fields.
# NOTE(review): d_ff presumably has to match the feed-forward width of the
# pretrained weights loaded below — confirm that 2048 is the checkpoint's value
# rather than an actual enlargement.
config = T5Config.from_pretrained(
    model_name,
    dropout_rate=0.1,
    layer_norm_epsilon=1e-6,
    d_ff=2048,
)

model = T5ForConditionalGeneration.from_pretrained(model_name, config=config)

# NOTE(review): nothing else in this notebook reads these two attributes;
# confirm that T5Tokenizer actually uses them before relying on them.
tokenizer.src_lang = "ta_IN"
tokenizer.tgt_lang = "si_LK"

In [None]:
from datasets import load_dataset

# Tab-separated files for each split; the files are read without a header row,
# so the two columns are named explicitly.
split_files = {
    "train": "/kaggle/input/t5-model-03/train.tsv",
    "validation": "/kaggle/input/t5-model-03/val.tsv",
    "test": "/kaggle/input/t5-model-03/test.tsv",
}
dataset = load_dataset(
    "csv",
    data_files=split_files,
    delimiter="\t",
    column_names=["source", "target"],
)

# Show the second row of every split as a quick sanity check
for label, split_name in (
    ("Train Sample:", "train"),
    ("Validation Sample:", "validation"),
    ("Test Sample:", "test"),
):
    print(label, dataset[split_name][1])

In [None]:
# Drop rows where either side is missing or blank. A missing field can come
# through as None rather than "", so fall back to "" before stripping instead
# of calling .strip() on None.
dataset = dataset.filter(
    lambda x: (x["source"] or "").strip() != "" and (x["target"] or "").strip() != ""
)

In [None]:
# Collect the training examples whose source or target is blank after stripping
empty_source, empty_target = [], []
for ex in dataset["train"]:
    if not ex["source"].strip():
        empty_source.append(ex)
    if not ex["target"].strip():
        empty_target.append(ex)

print(f"Empty source samples: {len(empty_source)}")
print(f"Empty target samples: {len(empty_target)}")

In [None]:
def analyze_empty_values(split, data=None):
    """Print every example of a split whose source or target is blank.

    Args:
        split: Key of the split to inspect.
        data: Optional mapping from split name to an iterable of examples
            (dicts with "source" and "target" entries). When omitted, the
            notebook-level ``dataset`` is used.

    A field counts as empty when it is None, "" or whitespace-only. The second
    counter reports the examples in which at least one field is a non-empty
    string consisting solely of whitespace.
    """
    if data is None:
        data = dataset

    def _is_blank(text):
        # None (missing value) is treated like an empty string.
        return text is None or not text.strip()

    def _is_whitespace_only(text):
        # Non-empty, yet nothing is left once whitespace is stripped.
        return bool(text) and not text.strip()

    print(f"\nAnalyzing {split} split:")
    empty_count = 0
    whitespace_count = 0

    for i, example in enumerate(data[split]):
        source, target = example["source"], example["target"]
        source_empty = _is_blank(source)
        target_empty = _is_blank(target)

        if source_empty or target_empty:
            empty_count += 1
            print(f"Index {i}:")
            print(f"  Source: {'EMPTY' if source_empty else repr(source)}")
            print(f"  Target: {'EMPTY' if target_empty else repr(target)}")

        # Count only fields that are purely whitespace, not every sentence
        # that happens to contain a space.
        if _is_whitespace_only(source) or _is_whitespace_only(target):
            whitespace_count += 1

    print(f"\nTotal empty examples: {empty_count}")
    print(f"Examples with only whitespace: {whitespace_count}")

# Report blank examples for each of the three splits in turn
for split in ("train", "validation", "test"):
    analyze_empty_values(split)

In [None]:
import pandas as pd

def check_with_pandas(split):
    """Report blank source/target rows of one split using a pandas DataFrame.

    Prints the number of rows whose "source" (respectively "target") is an
    empty string after stripping, followed by the first few offending rows
    when there are any.
    """
    frame = pd.DataFrame(dataset[split])

    # Rows whose text is empty once surrounding whitespace is removed
    blank = {
        column: frame[frame[column].str.strip() == ""]
        for column in ("source", "target")
    }

    print(f"\n{split} split empty sources: {len(blank['source'])}")
    print(f"{split} split empty targets: {len(blank['target'])}")

    # Preview the offending rows, sources first
    for heading, rows in (
        ("\nEmpty source examples:", blank["source"]),
        ("\nEmpty target examples:", blank["target"]),
    ):
        if rows.empty:
            continue
        print(heading)
        print(rows.head())

# Run the pandas-based check on each of the three splits
for split in ("train", "validation", "test"):
    check_with_pandas(split)

In [None]:
import matplotlib.pyplot as plt

def visualize_empty_values():
    """Draw a grouped bar chart of blank sources and targets for each split."""
    split_names = ["train", "validation", "test"]
    bar_width = 0.35

    def count_blank(split_name, column):
        # Number of rows whose text is empty after stripping whitespace
        return sum(not row[column].strip() for row in dataset[split_name])

    source_counts = [count_blank(name, "source") for name in split_names]
    target_counts = [count_blank(name, "target") for name in split_names]

    # Two bars per split: sources at the base position, targets shifted right
    positions = range(len(split_names))
    shifted = [pos + bar_width for pos in positions]

    fig, ax = plt.subplots()
    ax.bar(positions, source_counts, bar_width, label='Empty Sources')
    ax.bar(shifted, target_counts, bar_width, label='Empty Targets')

    ax.set_ylabel('Count')
    ax.set_title('Empty Values by Dataset Split')
    # Centre each tick between the two bars of its group
    ax.set_xticks([pos + bar_width/2 for pos in positions])
    ax.set_xticklabels(split_names)
    ax.legend()

    plt.show()

visualize_empty_values()

In [None]:
def clean_dataset(examples):
    """Strip surrounding whitespace from every source and target string.

    Args:
        examples: A batch in column form — a dict whose "source" and "target"
            entries are lists of strings (or None for missing values).

    Returns:
        A dict with the same two keys holding the stripped strings. None is
        mapped to "" instead of raising, so a length-based filter applied
        afterwards can drop such rows.
    """
    def _strip(text):
        return "" if text is None else text.strip()

    return {
        "source": [_strip(s) for s in examples["source"]],
        "target": [_strip(t) for t in examples["target"]],
    }

# Strip every row in batches, then keep only rows where both sides still have text
dataset = dataset.map(clean_dataset, batched=True).filter(
    lambda row: len(row["source"]) > 0 and len(row["target"]) > 0
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: A batch in column form with "source" and "target" lists of
            strings.

    Returns:
        The tokenizer output for the prefixed sources, with an added "labels"
        entry holding the target token ids in which every padding id has been
        replaced by -100.
    """
    # Add the task prefix to every source sentence
    inputs = ["translate Tamil to Sinhala: " + ex for ex in examples["source"]]
    targets = list(examples["target"])

    model_inputs = tokenizer(
        inputs,
        max_length=128,
        truncation=True,
        padding="max_length",  # Ensure consistent length
    )

    # `text_target` replaces the deprecated `as_target_tokenizer()` context
    # manager for encoding the target side.
    labels = tokenizer(
        text_target=targets,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Replace padding token id with -100 for loss calculation
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs

In [None]:
from transformers import TrainerCallback
import os

class SavePerEpochCallback(TrainerCallback):
    """Trainer callback that saves the model and tokenizer after every epoch.

    Each epoch gets its own ``epoch_<n>_model`` folder inside
    ``args.output_dir``.
    """

    def __init__(self, tokenizer):
        # Tokenizer stored next to the model weights in each epoch folder
        self.tokenizer = tokenizer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Write the current model and tokenizer to this epoch's folder."""
        # state.epoch is a float; round it so that a value just below a whole
        # number (e.g. 0.99) is labelled as that epoch instead of being
        # truncated to the previous one and overwriting its folder.
        epoch_number = round(state.epoch)
        epoch_dir = os.path.join(args.output_dir, f"epoch_{epoch_number}_model")
        os.makedirs(epoch_dir, exist_ok=True)
        kwargs["model"].save_pretrained(epoch_dir)
        self.tokenizer.save_pretrained(epoch_dir)
        return control


In [None]:
from evaluate import load
import numpy as np
import torch

# Load each evaluation metric once, in the same order as before.
# The fourth one is BERTScore despite its variable name: there is no direct
# "bartscore" metric, so use bertscore or integrate an external library.
bleu, rouge, chrf, bart_score = (
    load(metric_id) for metric_id in ("sacrebleu", "rouge", "chrf", "bertscore")
)

def postprocess_text(preds, labels):
    """Strip whitespace from predictions and labels.

    Returns a pair: the stripped predictions as a flat list, and the stripped
    labels with each one wrapped in its own single-element list.
    """
    cleaned_preds = []
    for prediction in preds:
        cleaned_preds.append(prediction.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels
    
def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with several metrics.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. The
            predictions may also arrive as a tuple whose first item holds the
            token ids.

    Returns:
        A dict with "bleu", "rougeL", "chrf", "exact_match", "token_accuracy"
        and "bertscore_f1".
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a masking sentinel, not a vocabulary id, so swap it for the pad
    # id before decoding. Labels carry it from preprocessing.
    # NOTE(review): predictions can reportedly contain it as well when batches
    # of generated ids are padded together — confirm for the installed version.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Substitute a placeholder for blank strings before scoring
    decoded_preds = [pred if pred.strip() != "" else "[EMPTY]" for pred in decoded_preds]
    decoded_labels = [label if label.strip() != "" else "[EMPTY]" for label in decoded_labels]

    # After this call every label is a one-element list of references
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    flat_labels = [refs[0] for refs in decoded_labels]

    bleu_result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_result = rouge.compute(predictions=decoded_preds, references=flat_labels)
    chrf_result = chrf.compute(predictions=decoded_preds, references=decoded_labels)
    bertscore_result = bart_score.compute(predictions=decoded_preds, references=flat_labels, lang="si")

    # Exact Match: share of predictions identical to their reference
    em = np.mean([pred == ref for pred, ref in zip(decoded_preds, flat_labels)])

    # Token Accuracy: position-wise match of whitespace-separated tokens,
    # normalised by the number of reference tokens
    total = correct = 0
    for pred, ref in zip(decoded_preds, flat_labels):
        pred_tokens = pred.split()
        label_tokens = ref.split()
        total += len(label_tokens)
        correct += sum(p == l for p, l in zip(pred_tokens, label_tokens))
    token_acc = correct / total if total > 0 else 0

    return {
        "bleu": bleu_result["score"],
        "rougeL": rouge_result["rougeL"],
        "chrf": chrf_result["score"],
        "exact_match": em,
        "token_accuracy": token_acc,
        "bertscore_f1": np.mean(bertscore_result["f1"])
    }


In [None]:
# Imported here because this cell runs before the cell that imports the other
# transformers training classes; without it a fresh kernel raises NameError.
import inspect

from transformers import Seq2SeqTrainingArguments

# The evaluation-strategy argument was renamed between transformers releases;
# use whichever name the installed version accepts.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# NOTE(review): output_dir and logging_dir are relative paths that start with
# "./kaggle/working"; the commented-out resume path at the end of the notebook
# ("/kaggle/working/kaggle/working/...") suggests they end up nested inside
# the working directory — confirm this is intended.
training_args = Seq2SeqTrainingArguments(
    output_dir="./kaggle/working/T5-results_ta_si",
    eval_steps=100,  # More frequent evaluation
    save_strategy="steps",
    # load_best_model_at_end needs checkpoints that line up with evaluations,
    # so save on the same schedule as eval_steps (was 10).
    save_steps=100,
    save_total_limit=3,
    logging_dir="./kaggle/working/logs",
    logging_steps=10,
    learning_rate=3e-4,  # T5 typically uses higher LR
    per_device_train_batch_size=16,  # Reduce if OOM occurs
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=30,
    predict_with_generate=True,
    # NOTE(review): T5 checkpoints are reported to be numerically unstable
    # under fp16; watch for NaN losses and disable this if they appear.
    fp16=True,
    warmup_steps=1000,
    gradient_accumulation_steps=4,  # Effective larger batch size
    optim="adafactor",  # T5's recommended optimizer
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    **{_eval_strategy_arg: "steps"},
)

In [None]:
import inspect

from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Tokenize every split. No other cell in this notebook creates
# `tokenized_datasets`, so it is built here from the cleaned `dataset`; the raw
# text columns are dropped because only the tokenized fields are needed.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

#  Data collator for padding and batching
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    max_length=128,
    return_tensors="pt"
)

# The keyword that hands the tokenizer to the trainer was renamed between
# transformers releases; use whichever name the installed version accepts.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Seq2SeqTrainer.__init__).parameters
    else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[SavePerEpochCallback(tokenizer)],
    **{_tokenizer_arg: tokenizer},
)

In [None]:
# Start fine-tuning (the original call was misspelled as `tran`)
trainer.train()

In [None]:
# trainer.train(resume_from_checkpoint="/kaggle/working/kaggle/working/T5-results_ta_si/checkpoint-10280")