In [2]:
from transformers import MBartForConditionalGeneration, MBart50Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Quick data prep
df = pd.read_csv('AugmentedHuggingFaceDataSet.csv')
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
train_df.to_json('train.jsonl', orient='records', lines=True)
val_df.to_json('val.jsonl', orient='records', lines=True)

# 2. Load model and tokenizer (smaller model for speed)
model_name = "facebook/mbart-large-50"
tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="ne_NP", tgt_lang="ne_NP")
model = MBartForConditionalGeneration.from_pretrained(model_name)

# 3. Load dataset
dataset = load_dataset('json', data_files={'train': 'train.jsonl', 'validation': 'val.jsonl'})

# 4. Simple preprocessing
def preprocess(examples):
    inputs = tokenizer(examples["text"], max_length=256, truncation=True, padding="max_length")
    labels = tokenizer(examples["summary"], max_length=64, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess, batched=True)

# 5. Training setup with 4 epochs and epoch-based evaluation
training_args = TrainingArguments(
    output_dir="./mbart-nepali",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    save_total_limit=None,
    save_strategy="epoch"
)

# 6. Train and save
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"]
)

trainer.train()

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/5647 [00:00<?, ? examples/s]

Map:   0%|          | 0/1412 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,1.5104,1.337333
2,0.9397,1.220384
3,2.8206,1.670237




TrainOutput(global_step=4236, training_loss=1.3624807543299804, metrics={'train_runtime': 28321.8576, 'train_samples_per_second': 0.598, 'train_steps_per_second': 0.15, 'total_flos': 9178336349650944.0, 'train_loss': 1.3624807543299804, 'epoch': 3.0})