In [3]:
import argparse
import csv
from datasets import load_dataset, Dataset
import io
import os
import random
import sys
from tokenizers import Tokenizer
import torch
from transformers import AutoTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

2025-09-09 15:13:54.285035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757430834.628363      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757430834.719265      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
#configuration.py
# Central configuration for the NLLB fine-tuning cell: dataset paths, model
# identifiers, preprocessing limits and training hyper-parameters.

# --- Dataset locations (TSV splits inside the Kaggle input dataset) ---
base_path = "/kaggle/input/cac-da-machine-translation-corpus-eng-amh"
whole_tsv = f"{base_path}/CAC_DA_Amh_Eng_whole_data.tsv"
train_data  = f"{base_path}/CAC_DA_Amh_Eng_train.tsv"
val_data   = f"{base_path}/CAC_DA_Amh_Eng_val.tsv"
test_data   = f"{base_path}/CAC_DA_Amh_Eng_test.tsv"

# --- Outputs and pretrained checkpoint ---
log_file   = "/kaggle/working/output/malformed_lines.log"
pretrained_tokenizer_name = 'facebook/nllb-200-distilled-600M'
# NOTE(review): there is no "/" between base_path and the directory name, so
# this resolves to a sibling of the dataset directory rather than a child.
# Left unchanged here; confirm the intended location before using it.
pretrained_tokenizer = f'{base_path}nllb_200_distilled_600M_tokenizer'
output_dir = '/kaggle/working/output/nllb_finetuned'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Preprocessing ---
split_ratio = (0.84, 0.08, 0.08)
min_len = 1  # adjust to your preference
max_len = 128  # Maximum length for tokenization

# --- Training ---
batch_size = 8
num_epochs = 100

tag_am, tag_en = ">>amh<<", ">>eng<<"
pairs_written, malformed = 0, 0
# Make sure the directory that will hold log_file exists. Calling
# os.makedirs on log_file itself would create a *directory* named
# "malformed_lines.log", which would then block opening the log as a file.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
def is_ok(src, tgt, min_words=3):
    """Return True when both sides of a sentence pair are usable for training.

    A pair is rejected when either side is missing, is not a string (for
    example a cell that the CSV reader parsed as a number), or contains fewer
    than ``min_words`` whitespace-separated tokens.

    Args:
        src: Source-side text, or None.
        tgt: Target-side text, or None.
        min_words: Minimum number of whitespace-separated tokens required on
            each side. Defaults to 3, the threshold previously hard-coded.

    Returns:
        bool: True if the pair passes the filter, False otherwise.
    """
    for text in (src, tgt):
        # Covers None as well as non-text values, which have no .split().
        if not isinstance(text, str):
            return False
        if len(text.split()) < min_words:
            return False
    return True

class TSVStream:
    """Iterable that streams a tagged TSV file and yields tokenized examples.

    The file is read as three tab-separated columns named ``tag``, ``src`` and
    ``tgt``. Rows that fail ``is_ok`` or carry an unknown tag are skipped.
    Every yielded item is a dict of 1-D tensors (the tokenizer outputs plus
    ``labels``) and an integer ``forced_bos_token_id``.

    Args:
        path: Path of the TSV file to stream.
        tokenizer: Tokenizer used to encode source and target text; it must
            support ``tgt_lang``, ``text_target`` and ``convert_tokens_to_ids``.
    """

    # Corpus tag -> language code token looked up in the tokenizer.
    LANG_TOKEN_MAP = {">>eng<<": "eng_Latn", ">>amh<<": "amh_Ethi"}
    # Label value used for padding positions so they are skipped by the loss
    # (the default ignore_index of PyTorch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, path, tokenizer):
        self.path = path
        self.tok = tokenizer

    @staticmethod
    def _mask_padding(label_ids, pad_token_id, ignore_index=-100):
        """Return a copy of ``label_ids`` with pad positions set to ``ignore_index``.

        If ``pad_token_id`` is None the tensor is returned unchanged.
        """
        if pad_token_id is None:
            return label_ids
        return label_ids.masked_fill(label_ids == pad_token_id, ignore_index)

    def __iter__(self):
        ds = load_dataset("csv", data_files=self.path, split="train", delimiter="\t",
                          column_names=["tag", "src", "tgt"], quoting=csv.QUOTE_MINIMAL, streaming=True, )

        for ex in ds:
            if not is_ok(ex["src"], ex["tgt"]):
                continue

            tag = ex["tag"]
            lang_code = self.LANG_TOKEN_MAP.get(tag)
            if lang_code is None:
                continue

            forced_bos_token_id = self.tok.convert_tokens_to_ids(lang_code)
            if forced_bos_token_id is None:
                continue

            # The tag selects the target language used when encoding labels.
            # NOTE(review): only tgt_lang is set; src_lang stays at the
            # tokenizer's default. Confirm that is intended for rows whose
            # source text is not in that default language.
            self.tok.tgt_lang = lang_code

            source = f"{tag} {ex['src']}"  # Prefix source with language tag

            encoded = self.tok(source, max_length=max_len, truncation=True, padding="max_length",
                               return_tensors="pt", )
            target = self.tok(text_target=ex["tgt"], max_length=max_len, truncation=True,
                              padding="max_length", return_tensors="pt", )

            # Drop the batch dimension added by return_tensors="pt".
            model_inputs = {k: v.squeeze(0) for k, v in encoded.items()}

            # Labels are padded to max_len with the pad token; without masking,
            # the loss would also be computed on those padding positions.
            model_inputs["labels"] = self._mask_padding(
                target["input_ids"].squeeze(0), self.tok.pad_token_id, self.IGNORE_INDEX
            )
            model_inputs["forced_bos_token_id"] = forced_bos_token_id

            yield {k: torch.tensor(v) if isinstance(v, list) else v for k, v in model_inputs.items()}

def main():
    """Sanity-check the streaming pipeline by printing the first five samples.

    Reads an optional ``--split`` argument (defaults to ``train_data``), loads
    the pretrained tokenizer, streams the chosen TSV through ``TSVStream`` and
    prints, for each of the first five examples, the shape of every tensor
    field and the value of every non-tensor field.
    """
    from itertools import islice

    parser = argparse.ArgumentParser()
    parser.add_argument("--split", type=str, default=train_data,  # required=True,
                        help="TSV split file path")
    # parse_known_args instead of parse_args: inside a notebook kernel
    # sys.argv carries launcher arguments (e.g. "-f <connection file>") that
    # would otherwise make argparse exit with "unrecognized arguments".
    args, _unknown = parser.parse_known_args()

    print(f"📂 Using split file: {args.split}")

    # Load the tokenizer once, using the default cache. The previous second
    # load pointed cache_dir at a path built without a separator after
    # base_path, i.e. a non-existent sibling of the input dataset directory.
    tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name)

    stream = TSVStream(args.split, tokenizer)

    print("🔍 Sanity‑checking first few samples …")
    for i, ex in enumerate(islice(stream, 5)):
        print(f"\nSample {i + 1}:")
        for k, v in ex.items():
            if torch.is_tensor(v):
                print(f"  {k}: {tuple(v.shape)}")
            else:
                print(f"  {k}: {v}")

# if __name__ == "__main__":
#     main()
# ─── Tokenizer and model (same pretrained checkpoint for both) ──
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name)
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_tokenizer_name)
model = model.to(device)

# ─── Materialise the streamed TSV examples into in-memory datasets ─
train_tsvstream = list(TSVStream(train_data, tokenizer))
val_tsvstream = list(TSVStream(val_data, tokenizer))
train_ds, eval_ds = Dataset.from_list(train_tsvstream), Dataset.from_list(val_tsvstream)

print("✅ Dataset sizes:")
print("   ➤ train:", len(train_ds))
print("   ➤ eval :", len(eval_ds))

# ─── Batch collation for seq2seq training ───────────────────────
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# The checkpoint directory must exist before training starts.
os.makedirs(output_dir, exist_ok=True)

# ─── Training arguments: evaluate and checkpoint once per epoch, ─
# ─── keep three checkpoints, restore the lowest-eval-loss one ────
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # disables W&B
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

# ─── Trainer ────────────────────────────────────────────────────
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ─── Fine-tune, then persist model and tokenizer side by side ───
trainer.train()
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"\n✅ Fine-tuned model saved to: {output_dir}")


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

✅ Dataset sizes:
   ➤ train: 122579
   ➤ eval : 11681


  trainer = Seq2SeqTrainer(model=model, args=training_args, train_dataset=train_ds, eval_dataset=eval_ds,
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


In [None]:
!pip install pandas torch logging datasets transdormers evaluate nltk 
# 1. Import Modules
import pandas as pd
import torch
import logging
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration, T5Tokenizer,
    Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
)
import evaluate
import nltk
from nltk.translate.bleu_score import SmoothingFunction
from google.colab import files

print("Upload CAC_DA_Amh_Eng_train.tsv file")
#uploaded = files.upload()
#file_name = list(uploaded.keys())[0]

nltk.download('punkt')
logging.basicConfig(level=logging.INFO)

# 2. Load and Clean TSV Dataset
# The first row is consumed as a header and the first two columns are renamed
# to the Amharic / English keys used throughout this cell.
# NOTE(review): the NLLB cell reads the same corpus as three header-less
# columns (tag, src, tgt); confirm which layout this file actually has.
df = pd.read_csv(file_name, sep='\t', header=0).pipe(
    lambda frame: frame.rename(
        columns={frame.columns[0]: ">>amh<<", frame.columns[1]: ">>eng<<"}
    )
)
# Keep only rows where both sides are present and non-blank after stripping.
df_clean = df.dropna(subset=[">>amh<<", ">>eng<<"]).loc[
    lambda frame: frame[">>amh<<"].str.strip().astype(bool)
    & frame[">>eng<<"].str.strip().astype(bool)
]

# 3. Convert to Hugging Face Dataset and Split
# 80/20 train/test split with a fixed seed.
dataset = Dataset.from_pandas(
    df_clean.reset_index(drop=True)
).train_test_split(test_size=0.2, seed=42)

# 4. Tokenization Setup
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Task prefix prepended to every source sentence before tokenization.
prefix = "translate Amharic to English: "

def preprocess_function(example):
    """Tokenize one Amharic/English pair into model inputs with masked labels.

    The Amharic text (with the task prefix) becomes the encoder input and the
    English text becomes the labels. Both are padded/truncated to 128 tokens,
    and every pad position in the labels is replaced by -100.

    Args:
        example: Mapping with ">>amh<<" and ">>eng<<" string entries.

    Returns:
        The tokenizer output for the source text with an added "labels" list.
    """
    encode_options = dict(max_length=128, padding="max_length", truncation=True)

    encoded = tokenizer(prefix + example[">>amh<<"], **encode_options)
    target_ids = tokenizer(example[">>eng<<"], **encode_options)["input_ids"]

    # Swap padding for -100 so those positions are not scored.
    encoded["labels"] = [
        -100 if tid == tokenizer.pad_token_id else tid for tid in target_ids
    ]
    return encoded

# Tokenize both splits one example at a time.
tokenized_ds = dataset.map(function=preprocess_function, batched=False)

# 5. Load Model & Setup Data Collator
model = T5ForConditionalGeneration.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# 6. BLEU Metric Load
smoother = SmoothingFunction().method4
bleu = evaluate.load("bleu")

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with BLEU.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays in which -100
            marks positions to ignore. ``preds`` may also be a tuple whose
            first element holds the token ids.

    Returns:
        The result of ``bleu.compute`` for the decoded predictions against the
        decoded labels (one reference per prediction).
    """
    import numpy as np

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id, so map it to the pad token before
    # decoding. np.where builds new arrays: the caller's arrays are left
    # untouched and no per-token Python loop is needed.
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU expects a list of references per prediction.
    references = [[lab] for lab in decoded_labels]
    return bleu.compute(predictions=decoded_preds, references=references)

# 7. Training Configuration
# Evaluate and checkpoint once per epoch, keeping the two most recent
# checkpoints. Evaluation generates text so BLEU can be computed.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-amharic-eng-final",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    # Labels are tokenized up to 128 tokens. Without an explicit limit,
    # evaluation falls back to the model's default generation length, which
    # can cut predictions short and depress BLEU.
    generation_max_length=128,
)

# 8. Trainer Setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 9. Train the Model
trainer.train()

# 10. Save Final Checkpoint
# Model and tokenizer go to the same directory so they can be reloaded together.
model.save_pretrained("t5-amharic-eng-final")
tokenizer.save_pretrained("t5-amharic-eng-final")

# 11. Run Sample Inference
# Translate ten randomly sampled (seeded) Amharic sentences and save them
# next to the model's output for a quick qualitative check.
sample_inputs = df_clean[">>amh<<"].sample(10, random_state=42).tolist()
# Truncate to the 128-token limit used during preprocessing so inference
# inputs match what the model saw in training.
inputs = tokenizer(
    [prefix + s for s in sample_inputs],
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=128,
).to(model.device)
# Set an explicit output budget; otherwise generation stops at the model's
# default length and longer translations are cut off.
outputs = model.generate(**inputs, max_new_tokens=128)
translations = tokenizer.batch_decode(outputs, skip_special_tokens=True)

pd.DataFrame({"Amharic": sample_inputs, "Predicted English": translations}).to_csv(
    "sample_predictions.csv", index=False
)
print("✅ Sample predictions saved to sample_predictions.csv")