### TASK 1

In [44]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

In [45]:
# Load dataset
url = "https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv"
df = pd.read_csv(url)
df



Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...,...
24778,25291,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,25292,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,25294,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,25295,6,0,6,0,1,youu got wild bitches tellin you lies


In [46]:
# Clean + relabel
df['label'] = df['class'].apply(lambda x: 1 if x in [0, 1] else 0)
df = df[['tweet', 'label']]

def clean_text(text):
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    return text.lower().strip()

df['tweet'] = df['tweet'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweet'] = df['tweet'].apply(clean_text)


In [47]:
# Split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['tweet'], df['label'], test_size=0.2, random_state=42
)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [48]:
# Dataset class
class ToxicCommentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create DataLoaders
train_dataset = ToxicCommentDataset(train_texts, train_labels, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataset = ToxicCommentDataset(test_texts, test_labels, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=16)

### TASK 2: Odd Layer vs Even Layer Student Training

In [49]:
from transformers import BertModel, BertConfig

# Load the teacher (12-layer BERT)
teacher = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True)
teacher.eval()

# Define a 6-layer student using the same config
student_config = BertConfig.from_pretrained("bert-base-uncased", num_hidden_layers=6)
student = BertModel(student_config)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
teacher.to(device)
student.to(device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)


In [50]:
import torch.nn as nn
from torch.optim import Adam

loss_fn = nn.MSELoss()

def distillation_step(input_ids, attention_mask, layer_ids, teacher, student):
    with torch.no_grad():
        teacher_outputs = teacher(input_ids=input_ids, attention_mask=attention_mask)
        teacher_hidden = teacher_outputs.hidden_states

    student_outputs = student(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
    student_hidden = student_outputs.hidden_states

    loss = 0.0
    for student_layer_idx, teacher_layer_idx in enumerate(layer_ids):
        teacher_rep = teacher_hidden[teacher_layer_idx]
        student_rep = student_hidden[student_layer_idx + 1]  # skip embeddings
        loss += loss_fn(student_rep, teacher_rep)

    return loss / len(layer_ids)


In [51]:
def train_student(layer_ids, student, teacher, name="student_model", num_epochs=5):
    optimizer = Adam(student.parameters(), lr=5e-5)
    student.train()

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in train_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()
            loss = distillation_step(input_ids, attention_mask, layer_ids, teacher, student)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f}")

    # Save the trained model
    torch.save(student.state_dict(), f"{name}.pt")
    print(f"✅ Saved: {name}.pt")


In [52]:
# Train using ODD teacher layers
odd_layer_ids = [1, 3, 5, 7, 9, 11]
student_odd = BertModel(student_config).to(device)
train_student(odd_layer_ids, student_odd, teacher, name="student_odd")

# Train using EVEN teacher layers
even_layer_ids = [2, 4, 6, 8, 10, 12]
student_even = BertModel(student_config).to(device)
train_student(even_layer_ids, student_even, teacher, name="student_even")


Epoch 1/5 | Loss: 0.3533
Epoch 2/5 | Loss: 0.2361
Epoch 3/5 | Loss: 0.1856
Epoch 4/5 | Loss: 0.1510
Epoch 5/5 | Loss: 0.1279
✅ Saved: student_odd.pt
Epoch 1/5 | Loss: 0.3733
Epoch 2/5 | Loss: 0.2494
Epoch 3/5 | Loss: 0.1924
Epoch 4/5 | Loss: 0.1522
Epoch 5/5 | Loss: 0.1242
✅ Saved: student_even.pt


### TASK 3: LoRA Fine-Tuning for Student Model

# pip install peft accelerate bitsandbytes

In [53]:
from transformers import BertForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType

# Load 6-layer BERT student model for classification
student_config.num_hidden_layers = 6
student_model = BertForSequenceClassification(student_config).to(device)

In [54]:
# Configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

# Inject LoRA into the student model
lora_student = get_peft_model(student_model, lora_config)
lora_student.print_trainable_parameters()  # optional: verify only adapters are trainable

Could not load bitsandbytes native library: /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /home/pk_124960/Desktop/PK_ait/.venv/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so)
Traceback (most recent call last):
  File "/home/pk_124960/Desktop/PK_ait/.venv/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 85, in <module>
    lib = get_native_library()
  File "/home/pk_124960/Desktop/PK_ait/.venv/lib/python3.10/site-packages/bitsandbytes/cextension.py", line 72, in get_native_library
    dll = ct.cdll.LoadLibrary(str(binary_path))
  File "/usr/lib/python3.10/ctypes/__init__.py", line 452, in LoadLibrary
    return self._dlltype(name)
  File "/usr/lib/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.32' not found (required by /home/pk_124960/Desktop/PK_ait/.venv/lib/python3.10/site-packages/bitsandbytes/

trainable params: 148,994 || all params: 67,105,540 || trainable%: 0.2220


In [55]:
from transformers import AdamW
from tqdm import tqdm

optimizer = AdamW(lora_student.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()
lora_student.train()

for epoch in range(3):
    total_loss = 0
    for batch in tqdm(train_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = lora_student(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} | LoRA Fine-tuning Loss: {avg_loss:.4f}")

100%|██████████| 1240/1240 [17:27<00:00,  1.18it/s]


Epoch 1 | LoRA Fine-tuning Loss: 0.3210


100%|██████████| 1240/1240 [17:28<00:00,  1.18it/s]


Epoch 2 | LoRA Fine-tuning Loss: 0.1814


100%|██████████| 1240/1240 [17:29<00:00,  1.18it/s]

Epoch 3 | LoRA Fine-tuning Loss: 0.1554





In [56]:
lora_student.save_pretrained("student_lora")
print("✅ LoRA model saved.")

✅ LoRA model saved.




### TASK 4

In [57]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertForSequenceClassification


In [58]:
def evaluate_model(model, dataloader):
    model.eval()
    preds, labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            true_labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)

            preds.extend(predictions.cpu().numpy())
            labels.extend(true_labels.cpu().numpy())

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


In [59]:
# Odd Student Model
student_odd_model = BertForSequenceClassification(student_config)
student_odd_model.load_state_dict(torch.load("student_odd.pt", map_location=device), strict=False)
student_odd_model.to(device)

results_odd = evaluate_model(student_odd_model, test_dataloader)
print("Odd Layer Student:", results_odd)


  student_odd_model.load_state_dict(torch.load("student_odd.pt", map_location=device), strict=False)


Odd Layer Student: {'accuracy': 0.1684486584627799, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [60]:
# Even Student Model
student_even_model = BertForSequenceClassification(student_config)
student_even_model.load_state_dict(torch.load("student_even.pt", map_location=device), strict=False)
student_even_model.to(device)

results_even = evaluate_model(student_even_model, test_dataloader)
print("Even Layer Student:", results_even)


  student_even_model.load_state_dict(torch.load("student_even.pt", map_location=device), strict=False)


Even Layer Student: {'accuracy': 0.8315513415372201, 'precision': 0.8315513415372201, 'recall': 1.0, 'f1': 0.908029518669457}


In [62]:
# LoRA Model
from peft import PeftModel, PeftConfig

lora_config = PeftConfig.from_pretrained("student_lora")
base_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_hidden_layers=6)
lora_model = PeftModel.from_pretrained(base_model, "student_lora")
lora_model.to(device)

results_lora = evaluate_model(lora_model, test_dataloader)
print("LoRA Model:", results_lora)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LoRA Model: {'accuracy': 0.8234819447246319, 'precision': 0.8309887869520897, 'recall': 0.9888403687530325, 'f1': 0.903068572061593}


In [63]:
import pandas as pd

comparison_df = pd.DataFrame([
    {"Model": "Odd Layer", **results_odd},
    {"Model": "Even Layer", **results_even},
    {"Model": "LoRA", **results_lora}
])

print("\n Performance Comparison:")
display(comparison_df)



 Performance Comparison:


Unnamed: 0,Model,accuracy,precision,recall,f1
0,Odd Layer,0.168449,0.0,0.0,0.0
1,Even Layer,0.831551,0.831551,1.0,0.90803
2,LoRA,0.823482,0.830989,0.98884,0.903069
