# Task 3 - Training M-bart

- *Setup Notebook on Google Colab*
- *Load the MBart Model and Tokenizer from Google Drive*
- *Tokenize the Multilingual Dataset*
- *Split the Dataset into 80% Train set and 20% Test set*
- *Setup Training Arguements and Trainer for MBart*
- *Train MBart and Save the model*

In [1]:
# Ensuring GPU is detected
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

True
0
NVIDIA A100-SXM4-40GB


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from datasets import load_dataset

# Load the tokenizer and model
model_path = "drive/MyDrive/Multi-lingual Customer Service Chatbot - Colab/models/mbart-large-50"
tokenizer = MBart50TokenizerFast.from_pretrained(model_path, padding_side='right')
model = MBartForConditionalGeneration.from_pretrained(model_path)



In [5]:
# Define tokenization function without static padding (dynamic padding will be done later)
def tokenize_function(examples):
    inputs = tokenizer(examples['instruction'], truncation=True, padding=False, max_length=20)  # 95th Percentile of instruction_length
    targets = tokenizer(examples['response'], truncation=True, padding=False, max_length=440)   # 99th Percentile of response_length
    inputs["labels"] = targets["input_ids"]
    return inputs

# Load and tokenize dataset
dataset = load_dataset('csv', data_files='drive/MyDrive/Multi-lingual Customer Service Chatbot - Colab/data/Multilingual_Customer_Support_Training_Dataset.csv')
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [8]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorForSeq2Seq
from datasets import concatenate_datasets
import random
import numpy as np
import os

model_dir = "drive/MyDrive/Multi-lingual Customer Service Chatbot - Colab/models/final_mbart_model_new"

if not os.path.exists(model_dir):
    # Set seeds for reproducibility
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    # Corrected number of samples per language (since each language has 26,872 rows)
    sample_size_per_language = 26872

    # Filter dataset by language and take samples for each language
    en_samples = tokenized_datasets["train"].filter(lambda example: example['language'] == 'en').shuffle(seed=42).select(range(sample_size_per_language))
    fr_samples = tokenized_datasets["train"].filter(lambda example: example['language'] == 'fr').shuffle(seed=42).select(range(sample_size_per_language))
    es_samples = tokenized_datasets["train"].filter(lambda example: example['language'] == 'es').shuffle(seed=42).select(range(sample_size_per_language))

    # Combine samples from all languages to form the full train dataset
    full_dataset = concatenate_datasets([en_samples, fr_samples, es_samples])

    # Split the dataset into training (80%) and evaluation (20%)
    train_size = int(0.80 * len(full_dataset))
    eval_size = len(full_dataset) - train_size

    train_dataset = full_dataset.select(range(train_size))
    eval_dataset = full_dataset.select(range(train_size, len(full_dataset)))

    # Create a data collator for dynamic padding
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)  # Padding=True enables dynamic padding

    # Modified training arguments
    training_args = TrainingArguments(
        output_dir="drive/MyDrive/Multi-lingual Customer Service Chatbot - Colab/models",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,  # Slightly higher learning rate to start
        per_device_train_batch_size=12,  # Batch size for efficient memory usage
        per_device_eval_batch_size=12,   # Same batch size for evaluation
        num_train_epochs=7,  # Increased epochs for better generalization
        save_total_limit=3,  # Save checkpoints to prevent excessive memory use
        fp16=True,  # Mixed precision training for performance improvements
        weight_decay=0.01,  # Helps with regularization and avoiding overfitting
        logging_steps=200,
        load_best_model_at_end=True,
        gradient_accumulation_steps=5,  # Accumulating gradients to reduce memory footprint
        warmup_steps=500,  # Reduced warmup steps for faster convergence
        report_to="none",  # Reporting disabled to avoid logging overhead
        gradient_checkpointing=True,  # Save memory during gradient computation
        lr_scheduler_type="cosine",  # Cosine learning rate scheduler for smoother training
        eval_accumulation_steps=10,  # Accumulate evaluation steps to avoid memory issues
    )

    # Trainer setup with dynamic padding and BLEU score computation
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,  # Enable dynamic padding
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Early stopping if validation loss doesn't improve
    )
    print(f"Dataset and Trainer initialized for Model Training")
else:
    print(f"Model and tokenizer already exist in {model_dir}. Skipping initialization.")

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Dataset and Trainer initialized for Model Training


In [9]:
!PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [10]:
# Check if the model already exists
if not os.path.exists(model_dir):
    # Train the model
    trainer.train()

    # Save the final model and tokenizer if it doesn't exist
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)
    print(f"Model and tokenizer saved to {model_dir}")
else:
    print(f"Model and tokenizer already exist in {model_dir}. Skipping training.")

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
1,1.2804,1.306809
2,0.7382,0.787249
3,0.6547,0.748505
4,0.5909,0.693028
5,0.563,0.676866
6,0.5393,0.67043
7,0.5296,0.670399


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
  return fn(*args, **kwargs)
  with tor

Model and tokenizer saved to drive/MyDrive/Multi-lingual Customer Service Chatbot - Colab/models/final_mbart_model_new
