In [1]:
# Attach Google Drive to the Colab filesystem at /content/drive
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Import libraries and frameworks
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback
)



In [3]:
# inference example on how to translate a Kankanaey word/phrase/sentence using the trained model

## saved model directory, shared by the tokenizer and the model
MODEL_DIR = "/content/drive/MyDrive/nllb_kke_model"

## tokenizer comes from the same directory as the model weights
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

## model is loaded with default options; the memory-saving alternatives
## (low_cpu_mem_usage, offload_folder / offload_state_dict, float16 weights)
## are intentionally not enabled here
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR)

## translation function
def translate_text(text, max_length=128, num_beams=4, max_input_length=200):
    """Translate one source string with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Source word, phrase, or sentence to translate.
        max_length: Upper bound on the generated sequence length, forwarded
            to ``model.generate``.
        num_beams: Beam width forwarded to ``model.generate``.
        max_input_length: Token limit applied while tokenizing ``text``;
            longer inputs are truncated. Defaults to 200, the value that was
            previously hard-coded.

    Returns:
        The first decoded sequence with special tokens removed and
        surrounding whitespace stripped. A blank ``text`` yields ``""``
        without invoking the tokenizer or the model.
    """
    # Nothing to translate: skip the tokenizer/model round trip entirely.
    if isinstance(text, str) and not text.strip():
        return ""

    # NOTE(review): no src_lang is set here; the tokenizer is used with
    # whatever defaults were saved alongside the model -- confirm that this
    # matches how the model was trained.
    inputs = tokenizer(
        [text],
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_input_length,
    )
    # Keep the input tensors on the same device as the model so the call
    # still works if the model is moved to an accelerator elsewhere.
    inputs = inputs.to(model.device)

    generate_kwargs = {
        "max_length": max_length,
        "num_beams": num_beams,
        "early_stopping": True,
    }
    # Forward the target-language BOS id only when the config defines one;
    # otherwise an explicit None would be passed to generate() instead of
    # leaving its own default in effect.
    forced_bos_token_id = getattr(model.config, "forced_bos_token_id", None)
    if forced_bos_token_id is not None:
        generate_kwargs["forced_bos_token_id"] = forced_bos_token_id

    gen = model.generate(**inputs, **generate_kwargs)
    out = tokenizer.batch_decode(gen, skip_special_tokens=True)
    return out[0].strip()

# Sample Kankanaey input together with its reference English translation
example_src = "Entako ed bilig"
exam_trans_tgt = "Let us go to the forest."

# Show the source, the reference, and the model's output one after another
print(f"Example input (raw): {example_src}")
print(f"Example target (raw): {exam_trans_tgt}")
print(f"Translation (model): {translate_text(example_src)}")

Example input (raw): Entako ed bilig
Example target (raw): Let us go to the forest.
Translation (model): Let's go to the forest
