# Phase 0: Import Necessary Libraries

In [66]:
# System Info
import sys         # Access system-specific parameters and functions
import os          # Interface with the operating system
import platform    # Retrieve underlying platform and hardware information

# Data Processing
import numpy as np         # Numerical operations and arrays
import pandas as pd        # Data manipulation with DataFrames

# Visualization
import matplotlib.pyplot as plt  # Plotting
import seaborn as sns            # Statistical data visualization

# Machine Learning & NLP
import sklearn              # Scikit-learn for ML tools (baseline models, metrics)
import transformers         # Hugging Face Transformers (e.g., BERT)
import datasets             # Hugging Face Datasets for loading NLP corpora

# Deep Learning
import torch                # PyTorch for tensor ops and training

# Progress Bars
import tqdm             # Progress bars for loops and training

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# Check versions
print("✅ Library Versions\n" + "-"*30)
print(f"Python version      : {sys.version}")
print(f"NumPy version       : {np.__version__}")
print(f"Pandas version      : {pd.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")
print(f"Matplotlib version  : {plt.matplotlib.__version__}")
print(f"Seaborn version     : {sns.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Huggingface Datasets: {datasets.__version__}")
print(f"PyTorch version     : {torch.__version__}")
print(f"TQDM version        : {tqdm.__version__}")
print(f"CPU architecture    : {platform.processor() or platform.machine()}")


✅ Library Versions
------------------------------
Python version      : 3.10.18 (main, Jun  5 2025, 08:37:47) [Clang 14.0.6 ]
NumPy version       : 1.23.5
Pandas version      : 2.3.1
Scikit-learn version: 1.7.1
Matplotlib version  : 3.10.5
Seaborn version     : 0.13.2
Transformers version: 4.30.2
Huggingface Datasets: 4.0.0
PyTorch version     : 2.7.1
TQDM version        : 4.67.1
CPU architecture    : arm


In [67]:
import json

# Path to SemEval2024 Task 8 monolingual data
data_path = "../data/semeval/subtaskA_monolingual.jsonl"

# Load data
with open(data_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# Preview first example
print(f"Loaded {len(data)} examples")
print(data[0])


Loaded 34272 examples
{'text': "Today, many adults or teenage drivers are hooked onto their phones. While driving, they can be prompted to use their phones for text messaging. It may cause many accidents, death, serious injuries and more. I honestly think that drivers should use phones while driving  because they are taking risks that could kill or injure others and also yourself. There are also laws against using a phone while operating a moving vehicle but people still disobey them. I think that there should be more consequences when it comes down to texting and driving.  Using your cell phones causes many distractions. It only takes a blink of an eye to cause an accident. Yeah, resisting the urge to text while driving may be hard but it can and will save lives including yours. When driving, your eyes and mind are programmed to be focused on the road at all times. Having a cellphone on your person is a hazard in my opinion. Just think about it, anyone could be crossing a busy street.

# Phase 1: Preprocessing

In [68]:
# Convert JSON list to DataFrame first
df = pd.DataFrame(data)

# Label mapping (from integers to string labels)
label_map = {
    0: "support",
    1: "refute",
    2: "no_relation"
}

# Create new column with text labels
df["label_text"] = df["label"].map(label_map)

# Preview
df[["text", "label", "label_text"]].head()

Unnamed: 0,text,label,label_text
0,"Today, many adults or teenage drivers are hook...",0,support
1,"The automobile, since its advent, has revoluti...",1,refute
2,One policy that could potentially improve aca...,1,refute
3,Title: Navigating the Road Ahead: The Case for...,1,refute
4,Have you ever woken up in the morning and wish...,0,support


In [69]:
from sklearn.model_selection import train_test_split

# Split data (stratify for balanced labels)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
train_df["label"].value_counts(normalize=True), test_df["label"].value_counts(normalize=True)


Train size: 27417, Test size: 6855


(label
 1    0.525222
 0    0.474778
 Name: proportion, dtype: float64,
 label
 1    0.525164
 0    0.474836
 Name: proportion, dtype: float64)

# Phase 2: Tokenization + BERT Input Format

In [70]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize train and test texts
train_encodings = tokenizer(list(train_df["text"]), padding="max_length", truncation=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(list(test_df["text"]), padding="max_length", truncation=True, max_length=128, return_tensors="pt")

# Convert labels
import torch
train_labels = torch.tensor(train_df["label"].values)
test_labels = torch.tensor(test_df["label"].values)

In [71]:
print("🔢 Train shape:", train_encodings["input_ids"].shape)
print("📝 Sample decode:", tokenizer.decode(train_encodings["input_ids"][0]))

🔢 Train shape: torch.Size([27417, 128])
📝 Sample decode: [CLS] it would be make out community better if the principle would make everyone do community service. people should be required to do it because it would make our environment cleaner. people would take more respect in our community. people would have a responsibility to clean up after themselves and others. most people don't think about our community before they do something wrong to it. it needs to be kept clean and not have trash everywhere. since they are the people to clean it they wouldn't destroy our community. also many people grafiti on things. things are ruined from spray paint and markings all over walls or buildings. people need to clean our [SEP]


# Phase 3: Train BERT Model

In [72]:
from torch.utils.data import Dataset

class TextClassificationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  # dict with input_ids, attention_mask
        self.labels = labels        # list or tensor of labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Each item is a dict of {input_ids, attention_mask, label}
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

In [73]:
# Tokenization
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize train/test
train_encodings = tokenizer(list(train_df["text"]), truncation=True, padding=True, max_length=256)
test_encodings = tokenizer(list(test_df["text"]), truncation=True, padding=True, max_length=256)

# Prepare labels
train_labels = list(train_df["label"])
test_labels = list(test_df["label"])

# Create Dataset objects
train_dataset = TextClassificationDataset(train_encodings, train_labels)
test_dataset = TextClassificationDataset(test_encodings, test_labels)

In [74]:
# Preview first training example
sample = train_dataset[0]
print("🔍 Sample from train_dataset[0]:")
for key, value in sample.items():
    print(f"{key}: {value.shape} → {value}")

🔍 Sample from train_dataset[0]:
input_ids: torch.Size([256]) → tensor([  101,  2009,  2052,  2022,  2191,  2041,  2451,  2488,  2065,  1996,
         6958,  2052,  2191,  3071,  2079,  2451,  2326,  1012,  2111,  2323,
         2022,  3223,  2000,  2079,  2009,  2138,  2009,  2052,  2191,  2256,
         4044, 20133,  1012,  2111,  2052,  2202,  2062,  4847,  1999,  2256,
         2451,  1012,  2111,  2052,  2031,  1037,  5368,  2000,  4550,  2039,
         2044,  3209,  1998,  2500,  1012,  2087,  2111,  2123,  1005,  1056,
         2228,  2055,  2256,  2451,  2077,  2027,  2079,  2242,  3308,  2000,
         2009,  1012,  2009,  3791,  2000,  2022,  2921,  4550,  1998,  2025,
         2031, 11669,  7249,  1012,  2144,  2027,  2024,  1996,  2111,  2000,
         4550,  2009,  2027,  2876,  1005,  1056,  6033,  2256,  2451,  1012,
         2036,  2116,  2111, 22160, 25090,  2006,  2477,  1012,  2477,  2024,
         9868,  2013, 12509,  6773,  1998, 13967,  2035,  2058,  3681,  2030,
 

In [75]:
tokenizer.decode(sample["input_ids"], skip_special_tokens=True)

"it would be make out community better if the principle would make everyone do community service. people should be required to do it because it would make our environment cleaner. people would take more respect in our community. people would have a responsibility to clean up after themselves and others. most people don't think about our community before they do something wrong to it. it needs to be kept clean and not have trash everywhere. since they are the people to clean it they wouldn't destroy our community. also many people grafiti on things. things are ruined from spray paint and markings all over walls or buildings. people need to clean our environment so it can last longer and be much cleaner for us all. many things are just getting done and no one is having respect to the people that actually go out there and clean it all up and try. if more people would try to not litter and do other things to our community than it wouldn't be half as bad as it is. people don't know what it 

In [76]:
from torch.utils.data import DataLoader

# Set batch size
batch_size = 16

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)


In [77]:
# Preview one batch
batch = next(iter(train_loader))

print("📦 A Batch from train_loader")
print(f"input_ids shape     : {batch['input_ids'].shape}")
print(f"attention_mask shape: {batch['attention_mask'].shape}")
print(f"labels shape        : {batch['labels'].shape}")


📦 A Batch from train_loader
input_ids shape     : torch.Size([16, 256])
attention_mask shape: torch.Size([16, 256])
labels shape        : torch.Size([16])


In [84]:
from torch.optim import AdamW
from transformers import get_scheduler

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Scheduler (optional but helpful for training stability)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


In [85]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
from tqdm import tqdm
import os
import torch

# 🔧 Disable tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set number of epochs (run 3 times)
num_epochs = 3

# Training loop
model.train()
for epoch in range(num_epochs):
    print(f"\n Epoch {epoch+1}/{num_epochs}")
    loop = tqdm(train_loader, leave=True)

    for batch in loop:
        # Move batch to device (CPU or MPS)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Remove token_type_ids if present (DistilBERT doesn't use them)
        batch.pop("token_type_ids", None)

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        # Backward + optimize
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Log loss
        loop.set_description(f"Epoch {epoch+1}")
        loop.set_postfix(loss=loss.item())

# Save model weights after training (PyTorch format)
torch.save(model.state_dict(), "distilbert_trained_model.pt")
print(" Model saved to distilbert_trained_model.pt")


# Phase 4: Evaluation & Comparison

In [None]:
# Evaluation mode
model.eval()

# Store predictions and true labels
all_preds = []
all_labels = []

# No gradient needed
with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        batch.pop("token_type_ids", None)  # Remove if exists
        
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

### Evaluate with Accuracy, F1-score

In [None]:
from sklearn.metrics import classification_report, accuracy_score
import warnings
from sklearn.exceptions import UndefinedMetricWarning
import pandas as pd

# Ignore undefined metric warnings
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Accuracy
acc = accuracy_score(all_labels, all_preds)
acc_percent = acc * 100

# Classification Report as Dict
report_dict = classification_report(
    all_labels,
    all_preds,
    labels=[0, 1],
    target_names=["Support", "Refute"],
    output_dict=True,
    zero_division=0
)

# Convert to DataFrame
df = pd.DataFrame(report_dict).T

# Select relevant rows
df_clean = df.loc[["Support", "Refute", "macro avg", "weighted avg"], ["precision", "recall", "f1-score", "support"]]

# Rename index
df_clean.index = ["Support", "Refute", "Macro Avg", "Weighted Avg"]
df_clean.columns = ["Precision", "Recall", "F1-score", "Support (samples)"]

# Round scores
df_clean = df_clean.round(4)

# Display table
print("📊 Classification Report Summary (DistilBERT - Test Set)")
display(df_clean)

# Print Accuracy
print(f"\n Overall Accuracy: {acc:.4f} ({acc_percent:.2f}%)")