In [None]:
pip install wandb datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install git+https://github.com/huggingface/accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/huggingface/accelerate
  Cloning https://github.com/huggingface/accelerate to /tmp/pip-req-build-rbrm6vv7
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-req-build-rbrm6vv7
  Resolved https://github.com/huggingface/accelerate to commit bfa74e51d2af08221f5787d281d681ca9bceddd2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
pip install transformers==4.28.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    MBartForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from tqdm import tqdm

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# One seed for every sampling step so the three splits are identical on each run.
SPLIT_SEED = 42

# Read the CSV dataframe
df = pd.read_csv("Data/corrected_data.csv")

# Shuffle the rows once before splitting.
df = df.sample(frac=1, random_state=SPLIT_SEED)

# Split the dataframe into train (80%), val (10%) and test (10%).
# The 20% hold-out is divided into two DISJOINT halves: test rows are removed
# from the validation frame so no example is used for both model selection
# and final evaluation.
train_df = df.sample(frac=0.8, random_state=SPLIT_SEED)
holdout_df = df.drop(train_df.index)
test_df = holdout_df.sample(frac=0.5, random_state=SPLIT_SEED)
val_df = holdout_df.drop(test_df.index)

# Guard against leakage between splits (relies on the default unique row index).
assert len(train_df) + len(val_df) + len(test_df) == len(df)
assert train_df.index.intersection(holdout_df.index).empty
assert val_df.index.intersection(test_df.index).empty

# Create the hugging face dataset
dataset = DatasetDict()
dataset["train"] = Dataset.from_pandas(train_df)
dataset["val"] = Dataset.from_pandas(val_df)
dataset["test"] = Dataset.from_pandas(test_df)

In [None]:
# Checkpoint used for both the tokenizer and the classifier.
model_name = "ai4bharat/IndicBART"

# IndicBART ships an ALBERT-style SentencePiece tokenizer. Its default settings
# lower-case the text and strip accents, which can drop combining marks that
# Indic scripts rely on, so load it with the flags given on the model card.
# NOTE(review): the model card also describes language tags and script
# conversion for its inputs; confirm whether this task needs them.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    do_lower_case=False,
    use_fast=False,
    keep_accents=True,
)

In [None]:
def preprocess(examples):
    """Tokenize the ``Content`` field of a batch of examples.

    Each text is truncated or padded to exactly 128 tokens using the
    module-level ``tokenizer``; the tokenizer's output is returned unchanged.
    """
    texts = examples["Content"]
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(texts, **tokenizer_options)

In [None]:
# Tokenize every split in batches; the tokenizer outputs are added as new columns
# next to the original ones. Note that this rebinds `dataset`, so re-running the
# cell tokenizes the already-tokenized splits again.
dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/30428 [00:00<?, ? examples/s]

Map:   0%|          | 0/7607 [00:00<?, ? examples/s]

Map:   0%|          | 0/3804 [00:00<?, ? examples/s]

In [None]:
import wandb
import numpy as np
import torch

In [None]:
# Show each split with its columns and row count as a quick sanity check.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Content', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30428
    })
    val: Dataset({
        features: ['Content', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 7607
    })
    test: Dataset({
        features: ['Content', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3804
    })
})


In [None]:
# The Auto class picks the concrete architecture from the checkpoint's config
# (MBartForSequenceClassification for IndicBART).
from transformers import AutoModelForSequenceClassification

In [None]:
# Load the pretrained IndicBART weights into a sequence-classification model.
# The classification head is not part of the checkpoint, so it is newly
# initialized and has to be trained before the model is useful.
# NOTE(review): the number of output labels comes from the checkpoint config —
# confirm it matches the label set in the `labels` column.
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at ai4bharat/IndicBART were not used when initializing MBartForSequenceClassification: ['lm_head.weight', 'final_logits_bias']
- This IS expected if you are initializing MBartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MBartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MBartForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBART and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-str

In [None]:
from sklearn.metrics import precision_recall_fscore_support

def objective(model, weight_decay, num_train_epochs, train_batch, eval_batch, optimizer, lr_scheduler):
    """Fine-tune ``model`` on ``dataset["train"]`` and report metrics to Weights & Biases.

    The Trainer evaluates on ``dataset["val"]`` at the end of every epoch
    (loss, accuracy, weighted precision/recall/F1) and reports those values
    itself. After training, the final model is scored once on the training
    split and once on the validation split; that summary is printed and logged
    under the keys "Epoch", "Train Loss", "Val Loss", "Train Accuracy",
    "Val Accuracy", "Precision", "Recall" and "F1".

    Args:
        model: Sequence-classification model to fine-tune; it is trained in place.
        weight_decay: Forwarded to ``TrainingArguments``.
            NOTE(review): because a ready-made optimizer is passed in, this value
            is probably not applied by the Trainer — confirm the optimizer
            itself carries the intended decay.
        num_train_epochs: Number of training epochs.
        train_batch: Per-device training batch size.
        eval_batch: Per-device evaluation batch size.
        optimizer: Optimizer handed to the Trainer unchanged.
        lr_scheduler: Learning-rate scheduler handed to the Trainer unchanged.

    Returns:
        None. Results are printed, logged to wandb, and checkpoints are written
        under ``indicbart-exp1/``.
    """
    # Fall back to CPU instead of crashing on machines without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Only the logits are needed as "predictions". Without this, a seq2seq
    # classifier also returns its encoder hidden states, which turns the
    # predictions into a tuple and keeps a very large array for the whole split.
    ignored_outputs = ["past_key_values", "encoder_last_hidden_state"]

    def compute_metrics(eval_prediction):
        """Return accuracy and weighted precision/recall/F1 for one evaluation pass."""
        logits = eval_prediction.predictions
        if isinstance(logits, tuple):
            # Defensive: some models still return (logits, extra outputs...).
            logits = logits[0]
        labels = eval_prediction.label_ids
        predicted = logits.argmax(axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, predicted, average="weighted"
        )
        return {
            "accuracy": float((predicted == labels).mean()),
            "precision": float(precision),
            "recall": float(recall),
            "f1": float(f1),
        }

    training_args = TrainingArguments(
        output_dir="indicbart-exp1/",
        weight_decay=weight_decay,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=train_batch,
        per_device_eval_batch_size=eval_batch,
        # Evaluate and log once per epoch while training is running, rather
        # than re-scoring the finished model several times afterwards.
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        report_to="wandb",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["val"],
        compute_metrics=compute_metrics,
        optimizers=(optimizer, lr_scheduler),
    )

    wandb.init(project="Unmasking-Hate", name="indicBart-tel-run-custom-exp1")
    try:
        trainer.train(ignore_keys_for_eval=ignored_outputs)

        # Final scores of the trained model. Training metrics are computed on
        # the training split itself (in eval mode, without gradients).
        train_metrics = trainer.evaluate(
            eval_dataset=dataset["train"],
            ignore_keys=ignored_outputs,
            metric_key_prefix="train",
        )
        val_metrics = trainer.evaluate(
            ignore_keys=ignored_outputs,
            metric_key_prefix="eval",
        )

        summary = {
            "Epoch": num_train_epochs,
            "Train Loss": train_metrics["train_loss"],
            "Val Loss": val_metrics["eval_loss"],
            "Train Accuracy": train_metrics["train_accuracy"],
            "Val Accuracy": val_metrics["eval_accuracy"],
            "Precision": val_metrics["eval_precision"],
            "Recall": val_metrics["eval_recall"],
            "F1": val_metrics["eval_f1"],
        }
        print(*(f"{name}: {value}" for name, value in summary.items()))
        wandb.log(summary)
    finally:
        # Always close the run, even if training or evaluation fails.
        wandb.finish()

    return


In [None]:
# Release cached GPU memory that PyTorch is no longer using before the training run below.
torch.cuda.empty_cache()

In [None]:
import math

from transformers import get_cosine_schedule_with_warmup

# Hyper-parameters of this run, named once so the optimizer, the scheduler and
# the training call all use the same values.
LEARNING_RATE = 1e-3  # NOTE(review): unusually high for fine-tuning a pretrained transformer — confirm
WEIGHT_DECAY = 0.0001
NUM_TRAIN_EPOCHS = 3
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
WARMUP_RATIO = 0.1

# torch.optim.AdamW replaces the deprecated transformers.AdamW. The weight decay
# is set here because the Trainer uses a supplied optimizer as-is.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY
)

# Size the schedule to the actual number of optimizer steps so that warm-up and
# cosine decay span the whole run.
examples_per_step = TRAIN_BATCH_SIZE * max(1, torch.cuda.device_count())
steps_per_epoch = math.ceil(len(dataset["train"]) / examples_per_step)
num_training_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(WARMUP_RATIO * num_training_steps),
    num_training_steps=num_training_steps,
)

objective(
    model,
    WEIGHT_DECAY,
    NUM_TRAIN_EPOCHS,
    TRAIN_BATCH_SIZE,
    EVAL_BATCH_SIZE,
    optimizer,
    lr_scheduler,
)

In [None]:
# Export the fine-tuned model in Hugging Face format so it can be reloaded with from_pretrained().
# NOTE(review): the tokenizer is not saved here — confirm whether it should be exported too.
model.save_pretrained('indicBart_exp1/hf/')

In [None]:
# Save the model weights as a plain PyTorch state_dict checkpoint.
# NOTE(review): this presumably relies on the "indicBart_exp1/" directory already
# existing (e.g. created by the save_pretrained cell) — confirm.
torch.save(model.state_dict(), "indicBart_exp1/indicBart-model.pt")