In [1]:
from datasets import load_dataset
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from tqdm import tqdm

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Read the CSV dataframe
df = pd.read_csv("Data/corrected_data.csv")



df = df.sample(frac=1, random_state=42)

# df = df.sample(frac=0.1)
# Split the dataframe into train, val, and test splits
train_df = df.sample(frac=0.8)
val_df = df.drop(train_df.index)
test_df = val_df.sample(frac=0.5)

# Create the hugging face dataset
dataset = DatasetDict()
dataset["train"] = Dataset.from_pandas(train_df)
dataset["val"] = Dataset.from_pandas(val_df)
dataset["test"] = Dataset.from_pandas(test_df)

In [3]:
model_name = "distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

In [4]:
def preprocess(examples):
    max_length = 128
    return tokenizer(
        examples["Content"], truncation=True, padding="max_length", max_length=max_length
    )

In [5]:
dataset = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/30428 [00:00<?, ? examples/s]

Map:   0%|          | 0/7607 [00:00<?, ? examples/s]

Map:   0%|          | 0/3804 [00:00<?, ? examples/s]

In [6]:
import wandb
import numpy as np
import torch

In [7]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['Content', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 30428
    })
    val: Dataset({
        features: ['Content', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 7607
    })
    test: Dataset({
        features: ['Content', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 3804
    })
})


In [8]:
model = DistilBertForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'classif

In [9]:
from sklearn.metrics import precision_recall_fscore_support

def objective(model, weight_decay, num_train_epochs, train_batch, eval_batch, optimizer, lr_scheduler):
    
    device = torch.device('cuda')
    model.to(device)
    training_args = TrainingArguments(
        output_dir="Distil-test-exp4/",
        weight_decay=weight_decay,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=train_batch,
        per_device_eval_batch_size=eval_batch,
#         disable_tqdm=True,
        report_to = 'wandb'
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["val"],
        optimizers = (optimizer,lr_scheduler),
    )

    wandb.init(project="Unmasking-Hate", name="distilbert-tel-run-custom-4")

    trainer.train()
    
    # Evaluate the model after each epoch.
    progress_bar = tqdm(range(training_args.num_train_epochs), desc="Epochs", position=0)
    for epoch in progress_bar:
        train_loss = 0.0
        train_predictions = []
        train_targets = []
        val_predictions = []
        val_targets = []
        for batch in trainer.get_eval_dataloader():
            model_input = batch["input_ids"].to(trainer.model.device)
            targets = batch["labels"].to(trainer.model.device)
            
            outputs = trainer.model(model_input, labels=targets)
            loss = outputs.loss
            logits = outputs.logits
            
            train_loss += loss.item()
            train_predictions.append(logits.detach().cpu().numpy())
            train_targets.append(targets.detach().cpu().numpy())
            
        train_loss /= len(trainer.get_eval_dataloader())
        train_predictions = np.concatenate(train_predictions, axis=0)
        train_targets = np.concatenate(train_targets, axis=0)
        train_accuracy = (train_predictions.argmax(axis=1) == train_targets).mean()
        
        val_loss = trainer.evaluate().get("eval_loss", None)
        
        # Call trainer.predict() on the val dataset
        val_outputs = trainer.predict(dataset["val"])
        val_predictions = val_outputs.predictions
        val_targets = val_outputs.label_ids
            
        val_accuracy = (val_predictions.argmax(axis=1) == val_targets).mean()
        
        precision, recall, f1, _ = precision_recall_fscore_support(
            val_targets, val_predictions.argmax(axis=1), average="weighted"
        )
    
        print(f"Epoch: {epoch + 1}",f"Train Loss: {train_loss}",f"Val Loss: {val_loss}",f"Train Accuracy: {train_accuracy}",f"Val Accuracy: {val_accuracy}",f"Precision: {precision}",f"Recall: {recall}",f"F1: {f1}")

        wandb.log(
            {
                "Epoch": epoch + 1,
                "Train Loss": train_loss,
                "Val Loss": val_loss,
                "Train Accuracy": train_accuracy,
                "Val Accuracy": val_accuracy,
                "Precision": precision,
                "Recall": recall,
                "F1": f1,
            }
        )

    wandb.finish()

    return 


In [10]:
torch.cuda.empty_cache()

In [11]:
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
optimizer = AdamW(model.parameters(), lr=0.0001)
num_training_steps = 31.25
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.1*num_training_steps, num_training_steps=num_training_steps)
objective(model,0.0001,6,8,8, optimizer, lr_scheduler)

[34m[1mwandb[0m: Currently logged in as: [33msiddarthakoppaka[0m ([33munmask-hate[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01693333333338766, max=1.0)…

Step,Training Loss
500,0.6634
1000,0.6647
1500,0.6609
2000,0.6627
2500,0.6605
3000,0.6606
3500,0.6611
4000,0.6615
4500,0.6603
5000,0.6603


Epochs:   0%|                                                                                    | 0/6 [00:00<?, ?it/s]

Epochs:  17%|████████████▌                                                              | 1/6 [02:50<14:13, 170.63s/it]

Epoch: 1 Train Loss: 0.6853013901680426 Val Loss: 0.654646635055542 Train Accuracy: 0.5370053897725778 Val Accuracy: 0.6696463783357434 Precision: 0.7457811503630998 Recall: 0.6696463783357434 F1: 0.6337381181427203


Epochs:  33%|█████████████████████████                                                  | 2/6 [05:39<11:19, 169.82s/it]

Epoch: 2 Train Loss: 0.683364329528608 Val Loss: 0.654646635055542 Train Accuracy: 0.5253056395425266 Val Accuracy: 0.6696463783357434 Precision: 0.7457811503630998 Recall: 0.6696463783357434 F1: 0.6337381181427203


Epochs:  50%|█████████████████████████████████████▌                                     | 3/6 [08:29<08:29, 169.70s/it]

Epoch: 3 Train Loss: 0.683364329528608 Val Loss: 0.654646635055542 Train Accuracy: 0.5253056395425266 Val Accuracy: 0.6696463783357434 Precision: 0.7457811503630998 Recall: 0.6696463783357434 F1: 0.6337381181427203


Epochs:  67%|██████████████████████████████████████████████████                         | 4/6 [11:18<05:39, 169.52s/it]

Epoch: 4 Train Loss: 0.683364329528608 Val Loss: 0.654646635055542 Train Accuracy: 0.5253056395425266 Val Accuracy: 0.6696463783357434 Precision: 0.7457811503630998 Recall: 0.6696463783357434 F1: 0.6337381181427203


Epochs:  83%|██████████████████████████████████████████████████████████████▌            | 5/6 [14:08<02:49, 169.50s/it]

Epoch: 5 Train Loss: 0.683364329528608 Val Loss: 0.654646635055542 Train Accuracy: 0.5253056395425266 Val Accuracy: 0.6696463783357434 Precision: 0.7457811503630998 Recall: 0.6696463783357434 F1: 0.6337381181427203


Epochs: 100%|███████████████████████████████████████████████████████████████████████████| 6/6 [16:57<00:00, 169.60s/it]

Epoch: 6 Train Loss: 0.683364329528608 Val Loss: 0.654646635055542 Train Accuracy: 0.5253056395425266 Val Accuracy: 0.6696463783357434 Precision: 0.7457811503630998 Recall: 0.6696463783357434 F1: 0.6337381181427203





0,1
Epoch,▁▂▄▅▇█
F1,▁▁▁▁▁▁
Precision,▁▁▁▁▁▁
Recall,▁▁▁▁▁▁
Train Accuracy,█▁▁▁▁▁
Train Loss,█▁▁▁▁▁
Val Accuracy,▁▁▁▁▁▁
Val Loss,▁▁▁▁▁▁
eval/loss,▁▁▁▁▁▁
eval/runtime,▂▄▅▁▆█

0,1
Epoch,6.0
F1,0.63374
Precision,0.74578
Recall,0.66965
Train Accuracy,0.52531
Train Loss,0.68336
Val Accuracy,0.66965
Val Loss,0.65465
eval/loss,0.65465
eval/runtime,53.6696


In [12]:
model.save_pretrained('Distil_exp4/Distil_hf/')

In [13]:
import transformers

In [14]:
# Save the model weights
torch.save(model.state_dict(), "Distil_exp4/distil-model.pt")