In [None]:
import torch
import pandas as pd
from tqdm.auto import tqdm
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

In [None]:
# Shell command: report the NVIDIA GPU(s) visible to this runtime.
!nvidia-smi

In [None]:
!apt-get install -y git-lfs
!git lfs install
!git clone https://huggingface.co/datasets/masakhane/afrixnli /content/masakhane/afrixnli

In [None]:
# English AfriXNLI split files. Note that the frame called "train" in this
# notebook is read from dev.tsv.
train_path, test_path = (
    "./masakhane/afrixnli/data/eng/dev.tsv",
    "./masakhane/afrixnli/data/eng/test.tsv",
)

# Both files are tab-separated with a header row; read them in the same way.
train_df, test_df = (
    pd.read_csv(split_path, sep="\t", header=0)
    for split_path in (train_path, test_path)
)

# Report (rows, columns) of each frame.
print(f"Train DataFrame shape: {train_df.shape}")
print(f"Test  DataFrame shape: {test_df.shape}")

In [None]:
# Preview the first rows of the frame loaded from dev.tsv.
train_df.head()

In [None]:
# Preview the first rows of the frame loaded from test.tsv.
test_df.head()

# [HelpMumHQ/AI-translator-eng-to-9ja](https://huggingface.co/HelpMumHQ/AI-translator-eng-to-9ja)

In [None]:
# Hugging Face Hub id of the fine-tuned M2M-100 translation checkpoint.
MODEL_NAME_M2M100 = "HelpMumHQ/AI-translator-eng-to-9ja"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer & model. `.to(device)` is chained onto the assignment so the
# cell does not end on a bare expression that would print the entire module
# structure into the notebook output.
translator_tokenizer_m2m100 = M2M100Tokenizer.from_pretrained(MODEL_NAME_M2M100)
translator_model_m2m100 = M2M100ForConditionalGeneration.from_pretrained(
    MODEL_NAME_M2M100
).to(device)

In [None]:
def translate_texts_m2m100(
    texts,
    src_lang="en",
    tgt_lang="yo",
    batch_size=16,
    max_length=128,
    num_beams=4
):
    """
    Translate `texts` from `src_lang` to `tgt_lang` using the globally-loaded
    M2M-100 tokenizer and model (`translator_tokenizer_m2m100`,
    `translator_model_m2m100`).

    Parameters
    ----------
    texts : str or iterable of str
        Texts to translate. A single string is treated as one text; any other
        iterable (list, tuple, pandas Series, ...) is materialised into a list.
    src_lang, tgt_lang : str
        Language codes passed to the tokenizer (`src_lang` attribute and
        `get_lang_id` respectively).
    batch_size : int
        Number of texts tokenized and generated per step; must be >= 1.
    max_length : int
        Token limit applied both when truncating inputs and when generating.
    num_beams : int
        Beam width passed to `generate`.

    Returns
    -------
    list[str]
        One decoded translation per input text, in input order.

    Raises
    ------
    ValueError
        If `batch_size` is smaller than 1.

    Notes
    -----
    Sets `src_lang` on the shared tokenizer object as a side effect.
    """
    if batch_size < 1:
        # A negative step would make range() empty and silently return [].
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Normalise the input so slicing always yields a plain list of strings,
    # which is what the tokenizer call below is given.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    tok = translator_tokenizer_m2m100
    mdl = translator_model_m2m100
    tok.src_lang = src_lang

    # Loop-invariant: resolve the target-language token id once, and send the
    # inputs to whatever device the model actually lives on.
    forced_bos_token_id = tok.get_lang_id(tgt_lang)
    model_device = mdl.device

    decoded_all = []

    for i in tqdm(range(0, len(texts), batch_size), desc=f"M2M100 {src_lang}→{tgt_lang}"):
        batch = texts[i : i + batch_size]
        # Pad to the longest item in the batch; inputs longer than
        # `max_length` tokens are truncated.
        enc = tok(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length
        ).to(model_device)

        # Force the first generated token to be the target-language id.
        gen = mdl.generate(
            **enc,
            forced_bos_token_id=forced_bos_token_id,
            max_length=max_length,
            num_beams=num_beams
        )
        decoded_all.extend(tok.batch_decode(gen, skip_special_tokens=True))

    return decoded_all


In [None]:
# Columns whose English text gets a Yoruba counterpart.
TEXT_COLS = ["premise", "hypothesis"]

# For every text column, add a "<column>_yoruba" column to both frames.
# The frames are modified in place; the cells that follow rely on that.
for col in TEXT_COLS:
    new_col = f"{col}_yoruba"
    for df_type, df in (("train", train_df), ("test", test_df)):
        print(f"Translating `{col}` → `{new_col}` in {df_type}…")
        # Values are cast to str before translation, so the function always
        # receives a list of strings.
        df[new_col] = translate_texts_m2m100(
            texts=df[col].astype(str).tolist(),
            src_lang="en",
            tgt_lang="yo",
            batch_size=16,
        )

In [None]:
# Write both frames (including the added translation columns) as
# tab-separated files without the index column.
for frame, out_name in (
    (train_df, "train_translated_m2m100.tsv"),
    (test_df, "test_translated_m2m100.tsv"),
):
    frame.to_csv(out_name, sep="\t", index=False)
print("Files written: train_translated_m2m100.tsv, test_translated_m2m100.tsv")

In [None]:
# Read the saved train TSV back from disk into a separate frame.
train_dfe = pd.read_csv('./train_translated_m2m100.tsv', sep="\t", header=0)

In [None]:
# Preview the first rows of the reloaded train file.
train_dfe.head()

In [None]:
# Read the saved test TSV back from disk into a separate frame.
test_dfe = pd.read_csv('./test_translated_m2m100.tsv', sep="\t", header=0)