In [None]:
# Execute this cell only if you are using Google Colab
try:
    from google.colab import drive
    drive.mount('/content/drive')
    %cd "/content/drive/MyDrive/Project"
    !apt install python3.10-venv
    !python -m venv "myenv" # To set up the virtual env
    !source /content/drive/MyDrive/Project/myenv/bin/activate; pip install -r requirements.txt
except:
    pass

In [None]:
# Import the relevant files and packages
from utils.helpers import compute_accuracy, optimizer_and_loss_function_loader, compute_micro_f1
from utils.tokenizer import tokenize_and_align_labels
from utils.dataset_exporter import DataSetLoader
from models.NERModel import XLMRoBERTaNERTagger
from transformers import XLMRobertaTokenizerFast, DataCollatorWithPadding
import torch
import time
import wandb
from config import read_config

# Module-level project configuration, read once for the whole notebook.
# CodeSwitchedNERTagger looks up the keys "model_name", "epochs",
# "learning_rate", "project", "capture_logs_for_wandb", "batch_size" and
# "labels" in it.
config = read_config()

In [None]:
class CodeSwitchedNERTagger():
    """
    Base class responsible for wiring together the components of the application.

    It loads the datasets, tokenizes them into batched dataloaders, instantiates
    the model, runs the training/validation loop and (optionally) plots the
    collected metrics to Weights & Biases.

    Expected call order: build the dataloaders, then ``instantiate_model()``,
    then ``load_optimizer_and_loss_function()`` (it reads ``self.model``), then
    ``train_and_evalutate(...)`` and finally ``plot(...)``.
    """

    def __init__(self):
        """Reads the hyperparameters from the module-level ``config`` and builds the tokenizer and collator."""
        self.dataset_loader = DataSetLoader()
        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained(config["model_name"])
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)  # For dynamic padding
        self.epochs = config["epochs"]
        self.learning_rate = config["learning_rate"]
        self.project_name = config["project"]
        self.capture_logs_for_wandb = config["capture_logs_for_wandb"]
        self.batch_size = config["batch_size"]

    def load_train_dataset(self):
        """
        Loads the original train dataset.

        Returns:
            original_train_dataset: Whatever ``DataSetLoader.load_train_dataset()`` returns.
        """
        return self.dataset_loader.load_train_dataset()

    def load_augmented_train_dataset(self):
        """
        Loads the augmented train dataset.

        Returns:
            augmented_train_dataset: Whatever
                ``DataSetLoader.load_train_dataset_with_data_augmentation()`` returns.
        """
        return self.dataset_loader.load_train_dataset_with_data_augmentation()

    def load_validation_dataset(self):
        """
        Loads the validation dataset.

        Returns:
            validation_dataset: Whatever ``DataSetLoader.load_validation_dataset()`` returns.
        """
        return self.dataset_loader.load_validation_dataset()

    def tokenize_and_create_dataloader(self, dataset, shuffle=False):
        """
        Tokenizes a dataset and wraps it in a batched dataloader.

        Args:
            dataset: Any dataset that needs to be tokenized. It must support
                ``.map(..., batched=True)`` and ``.set_format(...)``.
            shuffle (bool): Forwarded to ``torch.utils.data.DataLoader``. Defaults
                to ``False`` (the previous, fixed behaviour); pass ``True`` for
                training data if a fresh batch order per epoch is wanted.

        Returns:
            dataloader: ``torch.utils.data.DataLoader`` yielding batches of
                ``self.batch_size`` examples collated by ``self.data_collator``.
        """
        tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)  # Tokenize
        # Expose only the columns the training loop consumes, as torch tensors.
        tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        # NOTE(review): DataCollatorWithPadding pads the tokenizer's own fields;
        # confirm that tokenize_and_align_labels already emits 'labels' with a
        # uniform length, otherwise DataCollatorForTokenClassification is the
        # collator intended for per-token labels.
        return torch.utils.data.DataLoader(
            tokenized_dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            collate_fn=self.data_collator,
        )

    def instantiate_model(self):
        """
        Instantiates the neural model for training and evaluation.

        Sets ``self.device`` (CUDA when available, CPU otherwise) and
        ``self.model`` (an ``XLMRoBERTaNERTagger`` moved to that device).
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"The model will be trained on a {self.device} hardware")
        self.model = XLMRoBERTaNERTagger().to(self.device)

    def load_optimizer_and_loss_function(self):
        """
        Loads the optimizer and the loss function to be used during training.

        Reads ``self.model``, so ``instantiate_model()`` must have been called first.
        """
        self.optimizer, self.loss = optimizer_and_loss_function_loader(self.model)

    def _train_one_epoch(self, train_dataloader):
        """Runs one optimisation pass over the batches; returns (mean loss, micro F1, mean accuracy)."""
        self.model.train()
        true_labels, predicted_labels = [], []
        # Running sums over the batches of this epoch.
        epoch_total_loss = 0.0
        epoch_total_accuracy = 0.0
        num_labels = len(config["labels"])

        for batch in train_dataloader:
            batch = {k: v.to(self.device) for k, v in batch.items()}
            gold_outputs = batch["labels"]
            logits = self.model(**batch)
            # Flatten to one row per token: logits -> (N, num_labels), gold -> (N,).
            # Assumes self.loss follows the CrossEntropyLoss calling convention.
            outcome = self.loss(logits.view(-1, num_labels), gold_outputs.view(-1))

            self.optimizer.zero_grad()  # Zero out the gradient
            outcome.backward()
            self.optimizer.step()

            # Predictions come from the logits of the forward pass above,
            # i.e. before this batch's parameter update.
            training_predictions = torch.argmax(logits, dim=-1)
            epoch_total_loss += outcome.item()
            epoch_total_accuracy += compute_accuracy(training_predictions, gold_outputs)

            # Keep the labels for the epoch-level micro F1.
            true_labels.append(gold_outputs)
            predicted_labels.append(training_predictions)

        batch_count = len(train_dataloader)
        return (
            epoch_total_loss / batch_count,
            compute_micro_f1(true_labels, predicted_labels),
            epoch_total_accuracy / batch_count,
        )

    def _evaluate(self, eval_dataloader):
        """Scores the current model on the batches without gradients; returns (micro F1, mean accuracy)."""
        self.model.eval()
        true_labels_eval, predicted_labels_eval = [], []
        eval_total_accuracy = 0.0

        with torch.no_grad():
            for batch in eval_dataloader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                gold_outputs = batch["labels"]
                logits = self.model(**batch)
                eval_predictions = torch.argmax(logits, dim=-1)

                true_labels_eval.append(gold_outputs)
                predicted_labels_eval.append(eval_predictions)
                eval_total_accuracy += compute_accuracy(eval_predictions, gold_outputs)

        return (
            compute_micro_f1(true_labels_eval, predicted_labels_eval),
            eval_total_accuracy / len(eval_dataloader),
        )

    def train_and_evalutate(self, train_dataloader, eval_dataloader):
        """
        Trains the model and evaluates it after every epoch.

        Parameters:
            - train_dataloader: Batches of training data. Each batch is a mapping
              of tensors that includes a "labels" entry and is passed to the model
              as keyword arguments.
            - eval_dataloader: Batches of evaluation data, same format.

        Returns:
            dict with the per-epoch metrics::

                {"loss": [...],
                 "accuracy": {"training": [...], "validation": [...]},
                 "micro_f1": {"training": [...], "validation": [...]}}

            All lists have one entry per epoch that actually ran, so they are
            shorter than ``self.epochs`` when early stopping triggers.

        Raises:
            ValueError: If either dataloader has no batches (the per-epoch
                averages would otherwise divide by zero).

        Implementation:
            For at most ``self.epochs`` epochs the model processes the training
            mini batches, computes the loss, backpropagates and updates the
            parameters; the evaluation data is then scored with the updated
            parameters. Early stopping is driven by the *training* accuracy: an
            epoch counts as an improvement only if it beats the best accuracy so
            far by more than ``threshold``, and the loop ends once more than
            ``patience`` consecutive epochs fail to do so.
        """
        if len(train_dataloader) == 0 or len(eval_dataloader) == 0:
            raise ValueError("train_dataloader and eval_dataloader must each contain at least one batch.")

        training_losses = []
        validation_f1, training_f1, training_accuracy, validation_accuracy = [], [], [], []

        threshold = 0.01  # Minimum accuracy gain that counts as an improvement
        patience = 3  # Number of epochs to wait before stopping
        best_accuracy = 0
        epochs_without_improvement = 0

        for epoch in range(self.epochs):
            if epochs_without_improvement > patience:
                # Actually leave the loop: the model no longer changes, so the
                # remaining epochs would only repeat the same evaluation.
                print("Stopping training due to negligible improvement.")
                break

            start = time.perf_counter()

            avg_loss_per_epoch, training_epoch_micro_f1, training_epoch_accuracy = (
                self._train_one_epoch(train_dataloader)
            )
            training_losses.append(avg_loss_per_epoch)
            training_f1.append(training_epoch_micro_f1)
            training_accuracy.append(training_epoch_accuracy)

            # Check if there's an improvement in accuracy
            if training_epoch_accuracy - best_accuracy > threshold:
                best_accuracy = training_epoch_accuracy
                epochs_without_improvement = 0
            else:
                epochs_without_improvement += 1

            validation_epoch_micro_f1, validation_epoch_accuracy = self._evaluate(eval_dataloader)
            validation_f1.append(validation_epoch_micro_f1)
            validation_accuracy.append(validation_epoch_accuracy)

            elapsed = time.perf_counter() - start
            print("\n---------------------------------")
            print(f">>> Epoch {epoch + 1} completed in {elapsed} seconds.")
            print(f">>> Epoch {epoch + 1}: Micro F1 on training data: {training_epoch_micro_f1}, Micro F1 on validation: {validation_epoch_micro_f1}")
            print(f">>> Epoch {epoch + 1}: Accuracy on training data: {training_epoch_accuracy}, Accuracy on validation: {validation_epoch_accuracy}")
            print("---------------------------------\n")

        return {
            "loss": training_losses,
            "accuracy": {
                "training": training_accuracy,
                "validation": validation_accuracy
            },
            "micro_f1": {
                "training": training_f1,
                "validation": validation_f1
            }
        }

    def plot(self, result):
        """
        Plots the collected metrics as line charts in Weights & Biases.

        Args:
            result (Dict): Result dict returned by ``train_and_evalutate``, holding
                the per-epoch loss, accuracy and micro F1 lists.

        Implementation:
            Does nothing unless ``self.capture_logs_for_wandb`` is truthy.
            Otherwise logs in, opens a run in ``self.project_name``, builds one
            W&B table per metric (epoch number vs. value) and logs a line chart
            for each. The run is always finished afterwards, so calling ``plot``
            for several taggers in one kernel produces separate, closed runs.
        """
        if not self.capture_logs_for_wandb:
            return

        wandb.login()
        run = wandb.init(
            # Set the project where this run will be logged
            project=self.project_name,
            # Track hyperparameters and run metadata
            config={
                "learning_rate": self.learning_rate,
                "epochs": self.epochs,
            }
        )

        # (chart name, y-axis column, values) in the order the charts are logged.
        charts = (
            ("Training Micro F1", "Micro F1", result['micro_f1']['training']),
            ("Validation Micro F1", "Micro F1", result['micro_f1']['validation']),
            ("Training Loss", "Loss", result['loss']),
            ("Training Accuracy", "Accuracy", result['accuracy']['training']),
            ("Validation Accuracy", "Accuracy", result['accuracy']['validation']),
        )
        try:
            for chart_name, y_column, values in charts:
                title = f'{self.project_name} Model {chart_name}'
                # Epochs are numbered from 1; zip stops at the shorter sequence,
                # which covers runs that ended early.
                rows = [[epoch_number, value] for epoch_number, value in zip(range(1, self.epochs + 1), values)]
                table = wandb.Table(data=rows, columns=["Epochs", y_column])
                wandb.log({title: wandb.plot.line(table, "Epochs", y_column, title=title)})
        finally:
            run.finish()



In [None]:
# Experiment 1: fine-tune on the augmented training data.
tagger = CodeSwitchedNERTagger()

# Load the augmented train split and the validation split.
augmented_train_dataset, validation_dataset = (
    tagger.load_augmented_train_dataset(),
    tagger.load_validation_dataset(),
)

# Tokenize both splits (train first, then validation) and wrap them in dataloaders.
train_dataloader, eval_dataloader = map(
    tagger.tokenize_and_create_dataloader,
    (augmented_train_dataset, validation_dataset),
)

# Build the neural model, then the optimizer and loss that depend on it.
tagger.instantiate_model()
tagger.load_optimizer_and_loss_function()

# Train, evaluate after every epoch, and plot the collected metrics.
result = tagger.train_and_evalutate(train_dataloader, eval_dataloader)
tagger.plot(result)

In [None]:
# Experiment 2: a second tagger instance, fine-tuned on the original
# (non-augmented) training data.
tagger_two = CodeSwitchedNERTagger()

# Load the original train split and the validation split.
original_train_dataset, validation_dataset = (
    tagger_two.load_train_dataset(),
    tagger_two.load_validation_dataset(),
)
train_dataloader = tagger_two.tokenize_and_create_dataloader(original_train_dataset)

# Build the neural model, then the optimizer and loss that depend on it.
tagger_two.instantiate_model()
tagger_two.load_optimizer_and_loss_function()

# Tokenize the validation split, then train and evaluate.
eval_dataloader = tagger_two.tokenize_and_create_dataloader(validation_dataset)
result = tagger_two.train_and_evalutate(train_dataloader, eval_dataloader)

# Plot the metrics to W&B.
tagger_two.plot(result)