<a href="https://colab.research.google.com/github/RobyRoshna/Insensitive-Lang-Detection/blob/main/BERTtraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding a secret in the notebook.
# The token is read from the HF_TOKEN environment variable; when the variable is
# unset, login() receives None and asks for the token interactively instead.
# SECURITY: an access token used to be hardcoded on this line and was saved with
# the notebook in plain text. Treat that token as compromised and revoke it.
login(token=os.environ.get('HF_TOKEN'))


# Split into train, validation, and test sets (80:10:10)

In [None]:

# The annotated dataset
file_path = '/content/drive/MyDrive/Honours MiscData(Roshna)/Abstract_annotations.xlsx'  # Update with your path
data = pd.read_excel(file_path)

# Cleaning: keep only the sentence text and its manual label, and drop incomplete rows.
# .copy() turns the selection into an independent DataFrame, so the column assignment
# below writes to a real frame instead of a slice (this is what raised
# pandas' SettingWithCopyWarning before).
data = data[['Sentence', 'Manual_Annotation']].dropna().copy()

# 1 for insensitive and 0 for notInsensitive
# Vectorised, case-insensitive comparison. astype(str) keeps a non-string cell
# (e.g. a number typed into the sheet) from raising on .lower(); such cells map to 0.
data['Manual_Annotation'] = (
    data['Manual_Annotation']
    .astype(str)
    .str.lower()
    .eq('insensitive')
    .astype('int64')
)

# Split the data into train, validation, and test sets
# First hold out 20% of the rows, then halve that hold-out into validation and test.
# Both splits are stratified on the label and use a fixed random_state for repeatability.
train_data, temp_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['Manual_Annotation'])
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['Manual_Annotation'])

print(f"Train size: {len(train_data)}, Validation size: {len(val_data)}, Test size: {len(test_data)}")


Train size: 870, Validation size: 109, Test size: 109


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Manual_Annotation'] = data['Manual_Annotation'].apply(lambda x: 1 if x.lower() == 'insensitive' else 0)


# Tokenizer

In [None]:
from transformers import BertTokenizer
import pandas as pd

# Load the BERT tokenizer for the 'bert-base-uncased' checkpoint
# (the same checkpoint name is used later when the classification model is loaded)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to tokenize data
def tokenize_data(data, tokenizer, max_length=109):
    """
    Encode the 'Sentence' column of a split with the given tokenizer.

    Args:
        data: Object indexable by 'Sentence' that yields the sentences to encode.
        tokenizer: Callable tokenizer; it is invoked once on the full list of sentences.
        max_length: Value forwarded to the tokenizer as its maximum token length.

    Returns:
        Whatever the tokenizer returns for the batch (PyTorch tensors are requested).

    NOTE(review): the default of 109 appears to match the longest training sentence
    measured with tokenizer.tokenize() elsewhere in this notebook -- confirm whether
    that count leaves room for any special tokens the tokenizer adds.
    """
    sentences = list(data['Sentence'])
    encode_options = {
        'padding': True,             # pad shorter sentences
        'truncation': True,          # cut sentences that exceed max_length
        'max_length': max_length,
        'return_tensors': 'pt',      # ask for PyTorch tensors
    }
    return tokenizer(sentences, **encode_options)

# Pull the integer labels out of each split, in the same row order as its sentences
train_labels, val_labels, test_labels = (
    list(frame['Manual_Annotation']) for frame in (train_data, val_data, test_data)
)

# Tokenize the data
# Each split is encoded separately, in train -> validation -> test order
train_encodings, val_encodings, test_encodings = (
    tokenize_data(frame, tokenizer) for frame in (train_data, val_data, test_data)
)


In [None]:
import torch

# Custom Dataset Class for Tokenized Data
class SentenceDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that pairs tokenizer output with one label per sentence.
    """

    def __init__(self, encodings, labels):
        """
        Initializes the dataset.

        Args:
            encodings: Mapping (anything exposing .items()) from field name, such as
                input IDs or attention masks, to a per-sentence indexable value.
            labels: List of labels corresponding to the sentences (e.g., 0 for NotInsensitive, 1 for Insensitive).
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """
        Returns the total number of samples in the dataset.
        """
        return len(self.labels)

    @staticmethod
    def _to_tensor(value):
        """
        Returns an independent tensor holding `value`.

        Existing tensors are copied with clone().detach(); calling torch.tensor()
        on a tensor emits a copy-construct UserWarning on every sample access.
        Anything else (lists, numbers, NumPy scalars) goes through torch.tensor().
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """
        Retrieves the tokenized inputs and the corresponding label for the given index.

        Args:
            idx: Index of the data sample.

        Returns:
            A dictionary with one tensor per encoding field (input IDs, attention
            masks, etc.) plus the sample's label under the 'labels' key.
        """
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])  # Add the corresponding label
        return item
# Create datasets for train, validation, and test sets
# Each split's encodings are wrapped together with that split's labels
train_dataset, val_dataset, test_dataset = (
    SentenceDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)


In [None]:
# Token count of every training sentence, as produced by tokenizer.tokenize()
# NOTE(review): confirm whether these counts include special tokens before
# reusing the maximum as a max_length setting.
sentence_lengths = list(map(lambda text: len(tokenizer.tokenize(text)), train_data['Sentence']))
print(f"Max length: {max(sentence_lengths)}")
print(f"Average length: {sum(sentence_lengths)/len(sentence_lengths)}")


Max length: 109
Average length: 32.96206896551724


In [None]:
from torch.utils.data import DataLoader

# Batches of 16 for every split; shuffling is requested for the training loader only
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)


In [None]:
# Print a few examples from the training dataset
# (the first five samples: encoded inputs, attention mask, and label)
for i in range(5):
    item = train_dataset[i]
    for caption, field in (
        ("Input IDs:", 'input_ids'),
        ("Attention Mask:", 'attention_mask'),
        ("Label:", 'labels'),  # 0 for Not Insensitive, 1 for Insensitive
    ):
        print(caption, item[field])


Input IDs: tensor([  101,  2122,  2913,  1998,  3141,  3906,  6592,  4022,  2005,  2925,
        27758,  2015,  1997,  1996,  2291,  2000,  5770,  6397,  1998, 17453,
        18234,  5198,  1999,  4547,  1010,  2658,  1010,  1998, 10517, 18046,
         1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [None]:
from transformers import BertForSequenceClassification

# Load pre-trained BERT for binary classification
# (num_labels=2: label 0 = not insensitive, label 1 = insensitive).
# The load-time message reports classifier.weight and classifier.bias as newly
# initialized, so the model must be fine-tuned before its predictions are used.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Function to compute metrics
def compute_metrics(pred):
    """
    Turn an evaluation result into accuracy, precision, recall and F1.

    Args:
        pred: A two-element value that unpacks into (predictions, labels).
            `predictions` must offer .argmax(axis=1); the per-row argmax is
            taken as the predicted class.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
        Precision, recall and F1 are computed with average='binary'.
    """
    scores, gold = pred
    predicted = scores.argmax(axis=1)  # most likely class per row

    accuracy = accuracy_score(gold, predicted)
    precision, recall, f1, _support = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )

    metric_names = ('accuracy', 'precision', 'recall', 'f1')
    return dict(zip(metric_names, (accuracy, precision, recall, f1)))

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: three epochs with batches of 16,
# evaluating and checkpointing at the end of every epoch
training_args = TrainingArguments(
    # output locations and logging cadence
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # schedule
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and save once per epoch
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

# The validation split is used for per-epoch evaluation; compute_metrics supplies
# the accuracy / precision / recall / F1 columns
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Kept as the last expression of the cell so the returned training summary is displayed
trainer.train()




<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2148,0.115072,0.944954,0.961538,0.925926,0.943396
2,0.1332,0.03358,0.990826,1.0,0.981481,0.990654
3,0.0561,0.046235,0.990826,0.981818,1.0,0.990826


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=165, training_loss=0.2229250180901903, metrics={'train_runtime': 130.816, 'train_samples_per_second': 19.952, 'train_steps_per_second': 1.261, 'total_flos': 146196219022200.0, 'train_loss': 0.2229250180901903, 'epoch': 3.0})

In [None]:


# Score the held-out test split; the reported metric names carry a "test" prefix
results = trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")
print(results)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'test_loss': 0.1727628856897354, 'test_accuracy': 0.963302752293578, 'test_precision': 1.0, 'test_recall': 0.9259259259259259, 'test_f1': 0.9615384615384616, 'test_runtime': 0.5033, 'test_samples_per_second': 216.569, 'test_steps_per_second': 13.908, 'epoch': 3.0}
