<a href="https://colab.research.google.com/github/RobyRoshna/Insensitive-Lang-Detection/blob/main/BERTtraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import login
login(token='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')


# Split to test, train, and validation (80:10:10)

In [None]:

# The annotated dataset
file_path = '/content/drive/MyDrive/Honours MiscData(Roshna)/Abstract_annotations.xlsx'  # Update with your path
data = pd.read_excel(file_path)

# cleaning data
data = data[['Sentence', 'Manual_Annotation']]
data = data.dropna()

# 1 for insensitive and 0 for notInsensitive
data['Manual_Annotation'] = data['Manual_Annotation'].apply(lambda x: 1 if x.lower() == 'insensitive' else 0)

# Split the data into train, validation, and test sets
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['Manual_Annotation'])
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['Manual_Annotation'])

print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")


Train size: 870, Validation size: 109, Test size: 109


# Tokenizer

In [None]:
from transformers import BertTokenizer
import pandas as pd

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize data
def tokenize_data(data, tokenizer, max_length=109):
    return tokenizer(
        list(data['Sentence']),  # Tokenize sentences
        padding=True,            # Pad shorter sentences
        truncation=True,         # Truncate longer sentences
        max_length=max_length,   # Max token length
        return_tensors='pt'      # Return PyTorch tensors
    )

train_labels = list(train_data['Manual_Annotation'])
val_labels = list(val_data['Manual_Annotation'])
test_labels = list(test_data['Manual_Annotation'])

# Tokenize the data
train_encodings = tokenize_data(train_data, tokenizer)
val_encodings = tokenize_data(val_data, tokenizer)
test_encodings = tokenize_data(test_data, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch

# Custom Dataset Class for Tokenized Data
class SentenceDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        """
        Initializes the dataset.

        Args:
            encodings: Dictionary containing tokenized input IDs, attention masks, etc.
            labels: List of labels corresponding to the sentences (e.g., 0 for NotInsensitive, 1 for Insensitive).
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """
        Returns the total number of samples in the dataset.
        """
        return len(self.labels)

    def __getitem__(self, idx):
        """
        Retrieves the tokenized inputs and the corresponding label for the given index.

        Args:
            idx: Index of the data sample.

        Returns:
            A dictionary containing the tokenized inputs (input IDs, attention masks, etc.)
            and the label for the specified index.
        """
        # Convert tokenized data for the index to PyTorch tensors
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # Add the corresponding label
        return item
# Create datasets for train, validation, and test sets
train_dataset = SentenceDataset(train_encodings, train_labels)
val_dataset = SentenceDataset(val_encodings, val_labels)
test_dataset = SentenceDataset(test_encodings, test_labels)


In [None]:
sentence_lengths = [len(tokenizer.tokenize(sent)) for sent in train_data['Sentence']]
print(f"Max length: {max(sentence_lengths)}")
print(f"Average length: {sum(sentence_lengths)/len(sentence_lengths)}")


Max length: 109
Average length: 32.96206896551724


In [None]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


In [None]:
#Examples from the training dataset
for i in range(5):
    item = train_dataset[i]
    print("Input IDs:", item['input_ids'])
    print("Attention Mask:", item['attention_mask'])
    print("Label:", item['labels'])  # 0 for Not Insensitive, 1 for Insensitive


Input IDs: tensor([  101,  2122,  2913,  1998,  3141,  3906,  6592,  4022,  2005,  2925,
        27758,  2015,  1997,  1996,  2291,  2000,  5770,  6397,  1998, 17453,
        18234,  5198,  1999,  4547,  1010,  2658,  1010,  1998, 10517, 18046,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
from transformers import BertForSequenceClassification

# Load pre-trained BERT for binary classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Function to compute metrics
def compute_metrics(pred):
    predictions, labels = pred
    preds = predictions.argmax(axis=1)
    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [22]:
import wandb
from transformers import Trainer, TrainingArguments

# Close any previous WandB session
wandb.finish()

# Initialize WandB with a specific run name
wandb.init(project="Insensitive Lang Detecton", entity="Roshna", name="Bert_base")

# TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=10,
    report_to=["wandb"],  # Ensure WandB is used for logging
    run_name="Bert_base"  # Specify the run name for the Trainer
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate and log test results in a separate WandB session
wandb.finish()  # Close the previous WandB session
wandb.init(project="Insensitive Lang Detecton", entity="Roshna", name="Bert_base_evaluation", job_type="evaluation")

results = trainer.evaluate(test_dataset, metric_key_prefix="test")
wandb.log(results)  # Log evaluation results

wandb.finish()  # Close the evaluation session


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0,0.220007,0.972477,0.947368,1.0,0.972973
2,0.0,0.234221,0.972477,0.947368,1.0,0.972973
3,0.0,0.293321,0.972477,0.947368,1.0,0.972973


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


0,1
eval/accuracy,▁▁▁
eval/f1,▁▁▁
eval/loss,▁▂█
eval/precision,▁▁▁
eval/recall,▁▁▁
eval/runtime,█▁▄
eval/samples_per_second,▁█▅
eval/steps_per_second,▁█▅
train/epoch,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▆▇▇███

0,1
eval/accuracy,0.97248
eval/f1,0.97297
eval/loss,0.29332
eval/precision,0.94737
eval/recall,1.0
eval/runtime,0.5824
eval/samples_per_second,187.156
eval/steps_per_second,12.019
total_flos,146196219022200.0
train/epoch,3.0


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


0,1
epoch,▁
test/accuracy,▁
test/f1,▁
test/loss,▁
test/precision,▁
test/recall,▁
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
test_accuracy,▁

0,1
epoch,3.0
test/accuracy,0.94495
test/f1,0.9434
test/loss,0.64825
test/precision,0.96154
test/recall,0.92593
test/runtime,0.5163
test/samples_per_second,211.114
test/steps_per_second,13.558
test_accuracy,0.94495
