# Main imports and code

In [2]:
# check which gpu we're using
!nvidia-smi

Mon Feb 16 13:55:15 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 591.86                 Driver Version: 591.86         CUDA Version: 13.1     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   49C    P0            590W /  140W |       0MiB /   8188MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [3]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel, MultiLabelClassificationArgs
from urllib import request
import pandas as pd
import logging
import torch
from collections import Counter
from ast import literal_eval

In [4]:
# prepare logger
logging.basicConfig(level=logging.INFO)

transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# check gpu
cuda_available = torch.cuda.is_available()

print('Cuda available? ',cuda_available)

cuda_available = torch.cuda.is_available()

if cuda_available:
    print("GPU")
    print(torch.cuda.get_device_name(0))
else:
    print("CPU")


Cuda available?  True
GPU
NVIDIA GeForce RTX 4060 Laptop GPU


# Fetch Don't Patronize Me! data manager module

In [8]:
module_url = f"https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
#with open("file_1.txt") as f1, open("file_2.txt") as f2
with request.urlopen(module_url) as f, open(module_name,'w') as outf:
  a = f.read()
  outf.write(a.decode('utf-8'))
# helper function to save predictions to an output file
def labels2file(p, outf_path):
	with open(outf_path,'w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')
from dont_patronize_me import DontPatronizeMe
dpm = DontPatronizeMe('.', '.')
dpm.load_task1()
dpm.load_task2(return_one_hot=True)

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py
Map of label to numerical label:
{'Unbalanced_power_relations': 0, 'Shallow_solution': 1, 'Presupposition': 2, 'Authority_voice': 3, 'Metaphors': 4, 'Compassion': 5, 'The_poorer_the_merrier': 6}


# Load paragraph IDs

In [11]:
trids = pd.read_csv('train_semeval_parids-labels.csv')
teids = pd.read_csv('dev_semeval_parids-labels.csv')
trids.par_id = trids.par_id.astype(str)
teids.par_id = teids.par_id.astype(str)
data=dpm.train_task1_df
data

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""White House press secretary Sean Spicer said ...",0,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" Just like we received migrants fleeing El ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""Sri Lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,He added that the AFP will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" She has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" Anja Ringgren Loven I ca n't find a word t...",1,4




# Rebuild training set

In [12]:
rows = [] # will contain par_id, label and text
for idx in range(len(trids)):
  parid = trids.par_id[idx]
  #print(parid)
  # select row from original dataset to retrieve `text` and binary label
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })

trdf1 = pd.DataFrame(rows)
trdf1

Unnamed: 0,par_id,community,text,label
0,4341,poor-families,"The scheme saw an estimated 150,000 children f...",1
1,4136,homeless,Durban 's homeless communities reconciliation ...,1
2,10352,poor-families,The next immediate problem that cropped up was...,1
3,8279,vulnerable,Far more important than the implications for t...,1
4,1164,poor-families,To strengthen child-sensitive social protectio...,1
...,...,...,...,...
8370,8380,refugee,Rescue teams search for survivors on the rubbl...,0
8371,8381,hopeless,The launch of ' Happy Birthday ' took place la...,0
8372,8382,homeless,"The unrest has left at least 20,000 people dea...",0
8373,8383,hopeless,You have to see it from my perspective . I may...,0


# Rebuild test set

In [15]:
rows = [] # will contain par_id, label and text
for idx in range(len(teids)):
  parid = teids.par_id[idx]
  #print(parid)
  # select row from original dataset
  keyword = data.loc[data.par_id == parid].keyword.values[0]
  text = data.loc[data.par_id == parid].text.values[0]
  label = data.loc[data.par_id == parid].label.values[0]
  rows.append({
      'par_id':parid,
      'community':keyword,
      'text':text,
      'label':label
  })
print(len(rows))
tedf1 = pd.DataFrame(rows)
print(type(tedf1))
print(tedf1.head())
tedf1 = tedf1.sample(frac=1).reset_index(drop=True)

2094
<class 'pandas.core.frame.DataFrame'>
  par_id community                                               text  label
0   4046  hopeless  We also know that they can benefit by receivin...      1
1   1279   refugee  Pope Francis washed and kissed the feet of Mus...      1
2   8330   refugee  Many refugees do n't want to be resettled anyw...      1
3   4063   in-need  "Budding chefs , like "" Fred "" , "" Winston ...      1
4   4089  homeless  "In a 90-degree view of his constituency , one...      1


# RoBERTa with Proposed Enhancements for Task 1
# Improvements: Class-weighted loss, optimized input length (100), lower learning rate, extended training, early stopping on F1

In [17]:
# PROPOSED APPROACH: Enhanced resampling and class weighting
# Use MUCH more balanced data: ALL No-PCL samples + full PCL samples (1:3 ratio)
pcldf = trdf1[trdf1.label==1]
nocldf = trdf1[trdf1.label==0]

# Strategy: Use ALL No-PCL data (2382 samples) to prevent overpredicting PCL
# This creates a 1:3 ratio (794 PCL : 2382 No-PCL)
training_set1 = pd.concat([pcldf, nocldf], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Enhanced training set class distribution (1:3 ratio):")
print(f"PCL (1): {sum(training_set1.label==1)}")
print(f"No-PCL (0): {sum(training_set1.label==0)}")

# Calculate class weights - use inverse frequency weighting
# weight_i = N / (c * N_i)
n_samples = len(training_set1)
n_classes = 2
n_class_0 = sum(training_set1.label == 0)
n_class_1 = sum(training_set1.label == 1)

# Base weights
weight_class_0 = n_samples / (n_classes * n_class_0)
weight_class_1 = n_samples / (n_classes * n_class_1)

print(f"\nCalculated class weights:")
print(f"Weight for class 0 (No-PCL): {weight_class_0:.4f}")
print(f"Weight for class 1 (PCL): {weight_class_1:.4f}")

Enhanced training set class distribution (1:3 ratio):
PCL (1): 794
No-PCL (0): 7581

Calculated class weights:
Weight for class 0 (No-PCL): 0.5524
Weight for class 1 (PCL): 5.2739


In [18]:
training_set1

Unnamed: 0,par_id,community,text,label
0,4355,in-need,The Kindness Institute came about through Cavi...,0
1,8147,refugee,Minister Swaminathan forwarded a Cabinet paper...,0
2,1728,migrant,"""Human Rights Watch last month released a repo...",0
3,3781,migrant,"""The book is """" Never Look an American in the ...",0
4,5083,immigrant,Born to immigrant Indian parents in the small ...,0
...,...,...,...,...
8370,5450,in-need,Does cricket brain need a matric degree ? ? ? ?,0
8371,4854,refugee,"Meanwhile , the State government of Tamil Nadu...",0
8372,5067,in-need,Q : What technologies are you talking about ? ...,0
8373,70,migrant,Italy sharply criticised new U.N . human right...,0


In [None]:
import numpy as np
import os
import shutil
import torch

# Clean up old checkpoint files and GPU memory
if os.path.exists('./outputs'):
    shutil.rmtree('./outputs')
os.makedirs('./BestModel', exist_ok=True)
torch.cuda.empty_cache()

# ADVANCED APPROACH V3-FIX: Reduce memory usage to prevent OOM errors
# Use 1:3 ratio (794 PCL : 2382 No-PCL) with SMALLER batch sizes

task1_model_args = ClassificationArgs(
    num_train_epochs=20,  # Reduced from 25 to 20 (still sufficient)
    learning_rate=2e-5,
    max_seq_length=128,
    train_batch_size=8,   # REDUCED from 16 to 8 to prevent OOM
    eval_batch_size=16,   # REDUCED from 32 to 16
    gradient_accumulation_steps=1,
    no_cache=True,
    overwrite_output_dir=True,
    use_multiprocessing=False,
    silent=False,
    weight_decay=0.01,
    warmup_ratio=0.2,
    output_dir='./outputs',
    save_best_model=True,
    fp16=True,  # Keep fp16 for memory efficiency
    evaluate_during_training=False,  # DISABLED to save GPU memory
    save_steps=1000,  # Only save every 1000 steps
    logging_steps=50,
    best_model_dir='./outputs/best_model'
)

# Create model with standard class weights
task1_model = ClassificationModel("roberta",
                                  'roberta-base',
                                  args=task1_model_args,
                                  num_labels=2,
                                  use_cuda=cuda_available,
                                  weight=[weight_class_0, weight_class_1])

print("=" * 80)
print("ADVANCED TRAINING V3-FIXED: RoBERTa with Balanced Data & Memory Optimization")
print("=" * 80)
print(f"\nMemory Fixes:")
print(f"  ✓ Batch size: 16 → 8 (prevent OOM)")
print(f"  ✓ Eval batch: 32 → 16")
print(f"  ✓ Evaluation during training: DISABLED (save GPU memory)")
print(f"  ✓ Epochs: 25 → 20 (still sufficient)\n")
print(f"Training Configuration:")
print(f"  - Learning rate: 2e-5")
print(f"  - Batch size: 8")
print(f"  - Max sequence length: 128")
print(f"  - Data ratio: 1:3 (794 PCL : 2382 No-PCL)")
print(f"  - Class weights: [{weight_class_0:.4f}, {weight_class_1:.4f}]\n")
print("Training will display progress below...\n")

# Train model (no eval_df to save memory)
task1_model.train_model(training_set1[['text', 'label']])

torch.cuda.empty_cache()
print("\n✓ Training completed!")

# Save best model
best_model_checkpoint = './outputs/best_model'
if os.path.exists(best_model_checkpoint):
    final_model_path = './BestModel/pytorch_model_roberta'
    if os.path.exists(final_model_path):
        shutil.rmtree(final_model_path)
    shutil.copytree(best_model_checkpoint, final_model_path)
    print(f"✓ Best model copied to: {os.path.abspath(final_model_path)}")
else:
    final_model_path = './BestModel/pytorch_model_roberta'
    task1_model.save_model(final_model_path)
    print(f"✓ Final model saved to: {os.path.abspath(final_model_path)}")

# Cleanup and reload
del task1_model
torch.cuda.empty_cache()



In [24]:

# Load best model for final evaluation
print("\nLoading best model for final evaluation...")
best_model_path = './BestModel/pytorch_model_roberta'
final_eval_model = ClassificationModel("roberta",
                                        best_model_path,
                                        args=ClassificationArgs(max_seq_length=128),
                                        use_cuda=cuda_available)

print("Running predictions on dev set...")
preds_task1, _ = final_eval_model.predict(tedf1.text.tolist())
print("✓ Predictions completed!")





Loading best model for final evaluation...
Running predictions on dev set...


INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/4 [00:00<?, ?it/s]

Predicting:   0%|          | 0/21 [00:00<?, ?it/s]

✓ Predictions completed!


In [22]:
# Verify model was saved correctly
import os
best_model_path = './BestModel/pytorch_model_roberta'
model_files = os.listdir(best_model_path) if os.path.exists(best_model_path) else []

print(f"✓ Model files saved in {os.path.abspath(best_model_path)}:")
for f in sorted(model_files):
    file_size = os.path.getsize(os.path.join(best_model_path, f))
    print(f"  - {f} ({file_size/1024/1024:.1f} MB)" if file_size > 1024*1024 else f"  - {f} ({file_size/1024:.1f} KB)")
print(f"\n✓ Best Model Successfully Saved")

✓ Model files saved in d:\imperial_homework\third_year\NLP\CW_NLP\BestModel\pytorch_model_roberta:
  - config.json (0.7 KB)
  - eval_results.txt (0.2 KB)
  - merges.txt (445.6 KB)
  - model.safetensors (475.5 MB)
  - model_args.json (2.7 KB)
  - optimizer.pt (951.1 MB)
  - scheduler.pt (1.0 KB)
  - special_tokens_map.json (0.3 KB)
  - tokenizer.json (3.4 MB)
  - tokenizer_config.json (1.3 KB)
  - training_args.bin (3.7 KB)
  - vocab.json (779.6 KB)

✓ Best Model Successfully Saved


# Model Evaluation and Performance Analysis

In [29]:
# MODEL EVALUATION: Comprehensive metrics and threshold optimization
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score
import numpy as np
print("=" * 80)
print("MODEL EVALUATION - DEVELOPMENT SET")
print("=" * 80)

# Get true labels from dev set
true_labels = tedf1['label'].values

# Get predictions from the model
dev_predictions, dev_raw = final_eval_model.predict(tedf1['text'].tolist())

# Extract probability of class 1
if isinstance(dev_raw, np.ndarray):
    if len(dev_raw.shape) > 1:
        dev_probs = dev_raw[:, 1] if dev_raw.shape[1] == 2 else dev_raw
    else:
        dev_probs = dev_raw
else:
    dev_probs = dev_raw
# Try different thresholds
thresholds = np.arange(0.1, 1.0, 0.05)
results = []

for thresh in thresholds:
    preds_thresh = (dev_probs >= thresh).astype(int)
    
    acc = accuracy_score(true_labels, preds_thresh)
    prec = precision_score(true_labels, preds_thresh, zero_division=0)
    rec = recall_score(true_labels, preds_thresh, zero_division=0)
    f1_t = f1_score(true_labels, preds_thresh, zero_division=0)
    
    results.append({
        'threshold': thresh,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1_t
    })

results_df = pd.DataFrame(results)

# Find best threshold by F1
best_idx = results_df['f1'].idxmax()
best_threshold = results_df.loc[best_idx, 'threshold']
best_f1 = results_df.loc[best_idx, 'f1']
best_precision = results_df.loc[best_idx, 'precision']
best_recall = results_df.loc[best_idx, 'recall']
best_accuracy = results_df.loc[best_idx, 'accuracy']

print(f"Best Threshold: {best_threshold:.2f}")
print(f"  Accuracy:  {best_accuracy:.4f}")
print(f"  Precision: {best_precision:.4f}")
print(f"  Recall:    {best_recall:.4f}")
print(f"  F1-Score:  {best_f1:.4f}")

# Apply best threshold
final_predictions = (dev_probs >= best_threshold).astype(int)

print(f"Predicted distribution:")
print(f"  Class 0 (No-PCL): {sum(final_predictions == 0)}")
print(f"  Class 1 (PCL):    {sum(final_predictions == 1)}")

final_cm = confusion_matrix(true_labels, final_predictions)
final_tn, final_fp, final_fn, final_tp = final_cm.ravel()
print(f"\nConfusion Matrix:")
print(f"  True Negatives:  {final_tn}")
print(f"  False Positives: {final_fp}")
print(f"  False Negatives: {final_fn}")
print(f"  True Positives:  {final_tp}")

# Classification report
print(f"\nDetailed Classification Report:")
print(classification_report(true_labels, final_predictions, 
                          target_names=['No-PCL', 'PCL']))

# Save predictions to file
output_file = './dev.txt'
with open(output_file, 'w') as f:
    for pred in final_predictions:
        f.write(f"{pred}\n")
print(f"\n✓ Predictions saved to: {os.path.abspath(output_file)}")

# Summary
print(f"\n\n4. SUMMARY")
print("=" * 80)
print(f"Dataset Size: {len(tedf1)}")
print(f"Ground Truth Distribution:")
print(f"  - No-PCL: {sum(true_labels == 0)} ({100*sum(true_labels == 0)/len(true_labels):.1f}%)")
print(f"  - PCL:    {sum(true_labels == 1)} ({100*sum(true_labels == 1)/len(true_labels):.1f}%)")
print(f"\n✓ Best Performance (Threshold={best_threshold:.2f}):")
print(f"  - F1-Score:   {best_f1:.4f}")
print(f"  - Precision:  {best_precision:.4f}")
print(f"  - Recall:     {best_recall:.4f}")
print(f"  - Accuracy:   {best_accuracy:.4f}")
print(f"\n✓ Model evaluation completed!")

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


MODEL EVALUATION - DEVELOPMENT SET


  0%|          | 0/4 [00:00<?, ?it/s]

Predicting:   0%|          | 0/21 [00:00<?, ?it/s]

Best Threshold: 0.85
  Accuracy:  0.8973
  Precision: 0.4708
  Recall:    0.6482
  F1-Score:  0.5455
Predicted distribution:
  Class 0 (No-PCL): 1820
  Class 1 (PCL):    274

Confusion Matrix:
  True Negatives:  1750
  False Positives: 145
  False Negatives: 70
  True Positives:  129

Detailed Classification Report:
              precision    recall  f1-score   support

      No-PCL       0.96      0.92      0.94      1895
         PCL       0.47      0.65      0.55       199

    accuracy                           0.90      2094
   macro avg       0.72      0.79      0.74      2094
weighted avg       0.91      0.90      0.90      2094


✓ Predictions saved to: d:\imperial_homework\third_year\NLP\CW_NLP\dev.txt


4. SUMMARY
Dataset Size: 2094
Ground Truth Distribution:
  - No-PCL: 1895 (90.5%)
  - PCL:    199 (9.5%)

✓ Best Performance (Threshold=0.85):
  - F1-Score:   0.5455
  - Precision:  0.4708
  - Recall:     0.6482
  - Accuracy:   0.8973

✓ Model evaluation completed!


In [28]:
# Predict on official test set (task4_test.tsv) using the best model
import pandas as pd
import numpy as np
import os

test_path = r"D:\imperial_homework\third_year\NLP\CW_NLP\TEST\task4_test.tsv"
test_df = pd.read_csv(
    test_path,
    sep="\t",
    header=None,
    names=["id", "par_id", "community", "country", "text"],
)
print(f"Loaded test set: {len(test_df)} rows")

test_preds, test_raw = final_eval_model.predict(test_df["text"].tolist())

# Extract class-1 score consistent with dev evaluation
if isinstance(test_raw, np.ndarray):
    if len(test_raw.shape) > 1:
        test_scores = test_raw[:, 1] if test_raw.shape[1] == 2 else test_raw
    else:
        test_scores = test_raw
else:
    test_scores = test_raw

threshold = best_threshold if "best_threshold" in globals() else 0.5
test_labels = (test_scores >= threshold).astype(int)

out_path = "./test.txt"
with open(out_path, "w") as f:
    for pred in test_labels:
        f.write(f"{pred}\n")

print(f"Saved predictions to: {os.path.abspath(out_path)}")
print(f"Threshold used: {threshold:.2f}")

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Loaded test set: 3832 rows


  0%|          | 0/7 [00:00<?, ?it/s]

Predicting:   0%|          | 0/39 [00:00<?, ?it/s]

  with amp.autocast():


Saved predictions to: d:\imperial_homework\third_year\NLP\CW_NLP\test.txt
Threshold used: 0.85
