In [None]:
!pip install evaluate
!pip install sacrebleu



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
from datasets import Dataset  # Corrected from Datasets
import torch
import numpy as np
import evaluate
import os
from glob import glob
from huggingface_hub import login, upload_folder

In [None]:
# Disable wandb to avoid unnecessary logging
os.environ["WANDB_DISABLED"] = "true"

# 1. Load the data
# Location of the parallel corpus file that is passed to load_data_from_file below.
data_path = '/content/clean_vie.txt'

def load_data_from_file(file_path):
    """Read a tab-separated English/Vietnamese parallel corpus.

    Every usable line holds an English sentence and its Vietnamese
    translation separated by a tab. Only the first two columns are used, so
    any further tab-separated columns on a line are ignored. Blank lines and
    lines that do not provide both sentences are skipped instead of aborting
    the whole load.

    Args:
        file_path: Path to the UTF-8 encoded text file.

    Returns:
        A dict with the keys "en" and "vi", each mapping to a list of
        sentences; the two lists are aligned index by index. Both lists are
        empty when the file contains no usable line.
    """
    en_texts, vi_texts = [], []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # Drop only the line terminator so that an empty first column
            # is still recognised as a column.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                continue
            en_text, vi_text = fields[0].strip(), fields[1].strip()
            if not en_text or not vi_text:
                continue
            en_texts.append(en_text)
            vi_texts.append(vi_text)
    return {"en": en_texts, "vi": vi_texts}

# Parse the corpus file and wrap the sentence pairs in a Hugging Face Dataset.
data = load_data_from_file(data_path)
ds = Dataset.from_dict(data)

# Hold out 10% of the pairs for validation; the fixed seed keeps the split reproducible.
train_test_split = ds.train_test_split(seed=42, test_size=0.1)
train_ds, val_ds = train_test_split["train"], train_test_split["test"]

In [None]:
# 2. Tokenizer
# Hub identifier of the pretrained checkpoint; the same name is used further
# down to load the model weights.
model_name = "facebook/mbart-large-50-many-to-many-mmt"
# Load the tokenizer that belongs to that checkpoint.
# NOTE(review): no source/target language is passed to this call — confirm the
# tokenizer's language settings match the English -> Vietnamese direction.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [None]:
# 3. Preprocessing function
MAX_LEN = 75

# mBART-50 prefixes every sequence with a language code. Declare the
# translation direction explicitly so that source and target text each
# receive the matching code.
SRC_LANG = "en_XX"
TGT_LANG = "vi_VN"
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG

def preprocess_function(examples):
    """Tokenize a batch of English/Vietnamese pairs for seq2seq training.

    Args:
        examples: Batch mapping with the keys "en" and "vi", each holding a
            list of sentences (the batched form that Dataset.map passes in).

    Returns:
        A dict with "input_ids" and "attention_mask" for the English source
        and "labels" for the Vietnamese target. All sequences are padded or
        truncated to MAX_LEN; padding positions in the labels are set to
        -100 so that they are ignored by the loss.
    """
    inputs = tokenizer(examples["en"], padding="max_length", truncation=True, max_length=MAX_LEN)
    # text_target encodes in target mode, so the labels start with the
    # Vietnamese language code rather than the source-language one.
    targets = tokenizer(text_target=examples["vi"], padding="max_length", truncation=True, max_length=MAX_LEN)

    # Replace pad token with -100 for loss calculation
    pad_id = tokenizer.pad_token_id
    labels = [[-100 if token == pad_id else token for token in label]
              for label in targets["input_ids"]]

    return {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "labels": labels
    }

# Apply preprocessing
preprocessed_train_ds = train_ds.map(preprocess_function, batched=True)
preprocessed_val_ds = val_ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/6650 [00:00<?, ? examples/s]

Map:   0%|          | 0/739 [00:00<?, ? examples/s]

In [None]:
# 4. Load model
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Fetch the pretrained weights and place the model on the selected device.
# NOTE(review): generation_config.forced_bos_token_id is not set anywhere in
# this notebook; the mBART-50 documentation recommends forcing the target
# language code when generating — confirm whether that is needed here.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [None]:
# 5. Define evaluation metric (SacreBLEU)
# Loaded once here; compute_metrics() below calls metric.compute() on the
# decoded predictions and references.
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the references for SacreBLEU.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings, one per prediction.

    Returns:
        A tuple ``(preds, labels)``: the predictions as a flat list of
        stripped strings, and the references as a list in which every
        stripped string is wrapped in its own single-element list.
    """
    stripped_preds = list(map(str.strip, preds))
    wrapped_refs = [[text.strip()] for text in labels]
    return stripped_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for one evaluation pass.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays. When
            the predictions arrive as a tuple, only its first element is used.

    Returns:
        A dict with the single key "bleu" holding the "score" entry of the
        SacreBLEU result.
    """
    predictions, references = eval_preds
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and is not a real token id, so swap it
    # for the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    references = np.where(references != -100, references, pad_id)

    def _decode(token_ids):
        # Turn token ids back into plain text without special tokens.
        return tokenizer.batch_decode(
            token_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

    hyp_texts = _decode(predictions)
    ref_texts = _decode(references)
    hyp_texts, ref_texts = postprocess_text(hyp_texts, ref_texts)

    bleu = metric.compute(predictions=hyp_texts, references=ref_texts)
    return {"bleu": bleu["score"]}

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
# 6. Training arguments
# NOTE(review): the recorded run ended after 1248 steps, so the 1000-step
# interval below produces a single evaluation and a single checkpoint;
# consider a smaller interval if best-model selection should be meaningful.
training_args = Seq2SeqTrainingArguments(
    output_dir="./en-vi-mbart50",  # Checkpoints, relative to the working directory (not the filesystem root)
    logging_dir="./logs",          # Log files, relative to the working directory
    logging_steps=1000,
    predict_with_generate=True,    # Evaluate on generated text so that BLEU can be computed
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",         # Must match eval_strategy for load_best_model_at_end
    save_steps=1000,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=1,
    num_train_epochs=3,
    load_best_model_at_end=True,
    fp16=torch.cuda.is_available(),  # Mixed precision needs a GPU; fall back to full precision on CPU
    report_to="none",  # Disable external reporting (e.g., wandb)
)

In [None]:
# 7. Data collator
# Batching helper passed to the trainer below; it is given both the tokenizer
# and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# 8. Initialize trainer
# Wire together the model, the training configuration, the tokenized
# train/validation splits, the collator and the BLEU metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=preprocessed_train_ds,
    eval_dataset=preprocessed_val_ds,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
# 9. Train the model
# Left as the last expression so that the returned training summary is shown
# as the cell output.
trainer.train()

Step,Training Loss,Validation Loss,Bleu
1000,0.7562,1.05717,45.238247


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1248, training_loss=0.6633907159169515, metrics={'train_runtime': 975.389, 'train_samples_per_second': 20.453, 'train_steps_per_second': 1.279, 'total_flos': 3166570229760000.0, 'train_loss': 0.6633907159169515, 'epoch': 3.0})

In [None]:
# 10. Save the final model locally
# Relative path: the directory is resolved against the current working directory.
final_model_path = "./en-vi-mbart50-final"
trainer.save_model(final_model_path)
# Write the tokenizer files into the same directory as the model so that the
# folder can be loaded or uploaded as one unit.
tokenizer.save_pretrained(final_model_path)


('./en-vi-mbart50-final/tokenizer_config.json',
 './en-vi-mbart50-final/special_tokens_map.json',
 './en-vi-mbart50-final/sentencepiece.bpe.model',
 './en-vi-mbart50-final/added_tokens.json',
 './en-vi-mbart50-final/tokenizer.json')

In [None]:
# 11. Upload to Hugging Face
# SECURITY: an access token used to be hard-coded in this cell. Treat that
# token as leaked and revoke it in the Hugging Face account settings.
# The token is now read from the HF_TOKEN environment variable; when the
# variable is unset, login() falls back to prompting for the token.
login(token=os.environ.get("HF_TOKEN"))

HF_REPO_ID = "VyDat/NMT_FT"

# Only the saved final model is uploaded. The earlier "latest checkpoint"
# lookup matched this same directory, so it merely uploaded it a second time.
upload_succeeded = False
if not os.path.isdir(final_model_path):
    print(f"Model directory not found, nothing to upload: {final_model_path}")
else:
    try:
        upload_folder(
            folder_path=final_model_path,
            repo_id=HF_REPO_ID,
            commit_message="Upload final model and tokenizer",
        )
    except Exception as e:
        # Best effort: report the failure without discarding the trained
        # model that is still held in this session.
        print(f"Error uploading to Hugging Face: {e}")
    else:
        upload_succeeded = True
        print(f"Model successfully uploaded to Hugging Face: {HF_REPO_ID}")

# Print final message only when the upload really happened
if upload_succeeded:
    print("Training and upload process completed!")

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Model successfully uploaded to Hugging Face: VyDat/NMT_FT
Training and upload process completed!
