---
🔠 Natural Language Processing (CS60075) Autumn 2024, IIT Kharagpur

📃 Assignment 3 Part 2: [Hate Speech Classification using Few-shot Prompting](https://sites.google.com/view/nlp-cs-iit-kgp/assignments)

👦🏻 Author: [Prasanna Paithankar (21CS30065)](https://cse.iitkgp.ac.in/~prasannabp/)

---

##### 📚 Import Libraries and Resources

In [1]:
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from dotenv import load_dotenv
from huggingface_hub import HfFolder, login
from sklearn.metrics import (classification_report,
                             precision_recall_fscore_support)
from transformers import (AutoConfig, AutoModelForSequenceClassification,
                          AutoTokenizer, Trainer, TrainingArguments)

  from .autonotebook import tqdm as notebook_tqdm





##### 🤗 Load Hugging Face Environment

In [2]:
load_dotenv()
HUGGING_FACE_TOKEN = os.getenv("HUGGING_FACE_TOKEN")
login(token=HUGGING_FACE_TOKEN)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\paith\.cache\huggingface\token
Login successful


##### 🧪 Load & Preprocess Dataset

In [3]:
iddict = {"normal": 0, "hatespeech": 1, "offensive": 2}
labeldict = {0: "normal", 1: "hatespeech", 2: "offensive"}

train_data = pd.read_csv('dataset/NLP_ass_train.tsv',
                         sep='\t', header=None,
                         names=['text', 'label'], dtype={'text': str, 'label': str})
train_data["label"] = train_data["label"].map(iddict)
train_dataset = Dataset.from_pandas(train_data)

val_data = pd.read_csv('dataset/NLP_ass_valid.tsv', sep='\t', header=None,
                       names=['text', 'label'], dtype={'text': str, 'label': str})
val_data["label"] = val_data["label"].map(iddict)
val_dataset = Dataset.from_pandas(val_data)

test_data = pd.read_csv('dataset/NLP_ass_test.tsv',
                        sep='\t', header=None,
                        names=['text', 'label'], dtype={'text': str, 'label': str})
test_data["label"] = test_data["label"].map(iddict)
test_dataset = Dataset.from_pandas(test_data)

##### 🌲Load Environment Variables

In [4]:
MODEL_NAME = os.getenv("MODEL_NAME")
REPOSITORY_NAME = os.getenv("REPOSITORY_NAME")
cache_dir = "./cache/"

##### 🎶 Fine-tune Setup

In [None]:
config = AutoConfig.from_pretrained(
    MODEL_NAME, num_labels=len(labeldict), iddict=iddict, labeldict=labeldict
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, config=config, cache_dir=cache_dir)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

training_args = TrainingArguments(
    num_train_epochs=30,
    output_dir=REPOSITORY_NAME,
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False,  # Overflows with fp16
    learning_rate=3e-4,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=REPOSITORY_NAME,
    hub_token=HfFolder.get_token(),
)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-small and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def tkf(sample):
    return tokenizer(sample["text"], padding="max_length", truncation=True)


train_dataset_tokenized = train_dataset.map(tkf, batched=True)
val_dataset_tokenized = val_dataset.map(tkf, batched=True)
test_dataset_tokenized = test_dataset.map(tkf, batched=True)

Map: 100%|██████████| 15383/15383 [00:06<00:00, 2303.78 examples/s]
Map: 100%|██████████| 1922/1922 [00:00<00:00, 2281.36 examples/s]
Map: 100%|██████████| 1924/1924 [00:00<00:00, 2297.99 examples/s]


##### 🏃🏻‍♂️‍➡️ Training

In [11]:
def metric(prediction):
    logits, labels = prediction
    predictions = np.argmax(logits[0], axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary"
    )
    return {"precision": precision, "recall": recall, "f1": f1}


trainer = Trainer(
    model=model,
    args=training_args,

    train_dataset=train_dataset_tokenized,

    eval_dataset=val_dataset_tokenized,
    compute_metrics=metric,
)

trainer.train()


tokenizer.save_pretrained(REPOSITORY_NAME)

trainer.create_model_card()

trainer.push_to_hub()

##### 🏗️ Load the Model

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to("cuda") if torch.cuda.is_available() else model.to("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

##### 📊 Classify

In [13]:
def classify(texts):
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )
    inputs = inputs.to("cuda" if torch.cuda.is_available() else "cpu")

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    confidences, predicted_classes = torch.max(probs, dim=1)
    predicted_classes = predicted_classes.cpu().numpy()
    confidences = confidences.cpu().numpy()

    predicted_labels = [labeldict[class_id] for class_id in predicted_classes]

    return list(zip(predicted_labels, confidences))

##### 🏋🏻‍♂️ Evaluate

In [17]:
def evalModel(batch_size=16):
    pred = []
    labels = []

    for i in range(0, len(test_dataset), batch_size):
        batch_predictions = classify(test_dataset["text"][i : i + batch_size])

        pred.extend(batch_predictions)
        labels.extend([labeldict[label_id] for label_id in test_dataset["label"][i : i + batch_size]])

    report = classification_report(labels, [pair[0] for pair in pred])
    
    return report

result = evalModel()

print()
print("Classification report:")
print(result)


Classification report:
              precision    recall  f1-score   support

  hatespeech       0.71      0.78      0.79       594
      normal       0.75      0.71      0.77       782
   offensive       0.56      0.46      0.52       548

    accuracy                           0.72      1924
   macro avg       0.68      0.68      0.70      1924
weighted avg       0.72      0.71      0.72      1924



##### 🤼 Dataset Intersection

In [19]:
train_set = set(train_data['text'])
val_set = set(val_data['text'])
test_set = set(test_data['text'])

def calculate_intersections(train_set, val_set, test_set):
    return len(test_set.intersection(train_set)), len(test_set.intersection(val_set))

train_test, validation_test = calculate_intersections(train_set, val_set, test_set)

print("Number of common samples between train and test set: ", train_test)
print("Intersection percentage: ", train_test / len(test_set))
print()
print("Number of common samples between validation and test set: ", validation_test)
print("Intersection percentage: ", validation_test / len(test_set))

Number of common samples between train and test set:  5
Intersection percentage:  0.0026001040041601664

Number of common samples between validation and test set:  1
Intersection percentage:  0.0005200208008320333


***
Prasanna Paithankar (21CS30065)