# Main imports and code

In [238]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
import torch.nn as nn

import logging
import torch
import random

from transformers import AutoTokenizer, RobertaModel, RobertaForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils.data import Dataset, DataLoader


In [239]:
# Check for GPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS")
elif torch.cuda.is_available():
    num_devices = torch.cuda.device_count()
    for i in range(num_devices):
        device = torch.cuda.device(i)
        total_mem = torch.cuda.get_device_properties(i).total_memory / 1024**3  # Convert to GB
        allocated_mem = torch.cuda.memory_allocated(i) / 1024**3  # Convert to GB
        free_mem = total_mem - allocated_mem
        
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"Total Memory: {total_mem:.1f}GB")
        print(f"Allocated Memory: {allocated_mem:.1f}GB")
        print(f"Free Memory: {free_mem:.1f}GB")
        
        if free_mem < 8:
            print(f"Warning: GPU {i} has less than 8GB of free VRAM!")
        else:
            print(f"Using GPU {i} with {free_mem:.1f}GB free VRAM")
            break 
    device = torch.device(f"cuda:{i}")
else:
    print("Warning: No GPU devices available - running on CPU only")
    device = torch.device("cpu")


Using MPS


In [240]:
# Prepare logger
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Loading and preprocessing data

In [None]:
# Helper function to save predictions to an output file
def labels2file(p, file_name):
	with open(f'./logs/{file_name}','w') as outf:
		for pi in p:
			outf.write(','.join([str(k) for k in pi])+'\n')

def convert_label_to_binary(x):
    return 0 if x in [0, 1] else 1

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

# Data loader class
class PCLDataset(Dataset):
    def __init__(self, text_encodings, labels = np.array([]), country_encodings = []):
        self.text_encodings = text_encodings
        self.labels = labels

        temp, one_hot_encoding = [], ['au','bd','ca','gb','gh','hk','ie','in','jm','ke','lk','my','ng','nz','ph','pk','sg','tz','us','za']
        for c in country_encodings:
            temp.append([1.0 if (one_hot_encoding.index(c) == i) else 0.0 for i in range(len(one_hot_encoding))])
        self.country_encodings = temp

    def __len__(self):
        return len(self.text_encodings)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.text_encodings.items()}
        
        if len(self.country_encodings) != 0: 
            item['country'] = torch.tensor(self.country_encodings[idx])

        if self.labels.size != 0:
            item['labels'] = torch.tensor(self.labels[idx])

        return item

INFO:httpx:HTTP Request: HEAD https://huggingface.co/FacebookAI/roberta-base/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/FacebookAI/roberta-base/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/FacebookAI/roberta-base/resolve/main/tokenizer_config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/FacebookAI/roberta-base/e2da8e2f811d1448a5b465c236feacd80ffbac7b/tokenizer_config.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://huggingface.co/api/models/FacebookAI/roberta-base/tree/main/additional_chat_templates?recursive=false&expand=false "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET https://huggingface.co/api/models/FacebookAI/roberta-base/tree/main?recursive=true&expand=false "HTTP/1.1 200 OK"


In [254]:
# Loading dataframe
dataset_path = "Dont_Patronize_Me_Trainingset"
split_dataset_path = f'{dataset_path}/data_splits'

df = pd.read_csv(
    f'{dataset_path}/dontpatronizeme_pcl.tsv', 
    names=["par_id", "art_id", "keyword", "country_code", "text", "label"],
    sep='\t', 
    skiprows=4
)

train_csv = pd.read_csv(f'{split_dataset_path}/train_semeval_parids-labels.csv')
dev_csv = pd.read_csv(f'{split_dataset_path}/dev_semeval_parids-labels.csv')
test_csv = pd.read_csv(f'{split_dataset_path}/task4_test.tsv', 
    names=["t_id", "art_id", "keyword", "country_code", "text"],
    sep='\t', 
    skiprows=4
)

df.dropna(inplace=True)

# Clean HTML tokens
texts = df["text"].to_numpy()
rules = [("&amp;", "&"), ("More&gt;&gt;", ""), ("&gt;", ""), ("<h>", ""), (". .", "."), ("  ", " "), ("  ", " ")]
counter = 0
for (substring, replacement) in rules:
    df.loc[:, "text"] = df.loc[:, "text"].str.replace(substring, replacement) 

# Remove records with abnormally long sentence length
max_tokens_threshold = 200
texts = df["text"].to_numpy()
good_indices = list(map(lambda a: len(a.split()) < max_tokens_threshold, texts))
df = df[good_indices]
print(f'{len(texts) - sum(good_indices)} samples removed which had text length greater than threshold of {max_tokens_threshold}')

11 samples removed which had text length greater than threshold of 200


In [255]:
def dataset_rebuild(data_csv):
    rows = [] # will contain par_id, country_code, text and label
    for idx in range(len(data_csv)):  
        parid = data_csv["par_id"][idx]

        # If dataframe is empty, then sample was likely cleaned out of df
        if df.loc[df["par_id"] == parid].empty: continue

        # select row from original dataset to retrieve data
        keyword = df.loc[df["par_id"] == parid].keyword.values[0]
        country = df.loc[df["par_id"] == parid].country_code.values[0]
        text = df.loc[df["par_id"]  == parid].text.values[0]
        label = df.loc[df["par_id"]  == parid].label.values[0]
        rows.append({
            'par_id':parid,
            'country_code': country,
            'community':keyword,
            'text':text,
            'label':label
        })

    return pd.DataFrame(rows)

train_set = dataset_rebuild(train_csv)
train_set = train_set.sample(frac=1)
dev_set = dataset_rebuild(dev_csv)
test_set = test_csv

# Create datasets
train_text_encodings = tokenizer(list(train_set["text"].values), truncation=True, padding=True, max_length=256)
dev_text_encodings = tokenizer(list(dev_set["text"].values), truncation=True, padding=True, max_length=256)
test_text_encodings = tokenizer(list(test_set["text"].values), truncation=True, padding=True, max_length=256)

train_dataset = PCLDataset(train_text_encodings, train_set["label"].to_numpy(), train_set["country_code"].to_numpy())
dev_dataset = PCLDataset(dev_text_encodings, dev_set["label"].to_numpy(), dev_set["country_code"].to_numpy())
test_dataset = PCLDataset(test_text_encodings, None, test_set["country_code"].to_numpy())

# Create dataloaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)


# Training baseline model

In [None]:
# Downsample negative instances
pcldf = train_set[train_set.label==1]
npos = len(pcldf)
baseline_train_set = pd.concat([pcldf,train_set[train_set.label==0][:npos*2]])

train_text_encodings = tokenizer(list(baseline_train_set["text"].values), truncation=True, padding=True)
print(train_text_encodings)
dev_text_encodings = tokenizer(list(dev_set["text"].values), truncation=True, padding=True)

# Create downsampled training datasets
baseline_train_dataset = PCLDataset(train_text_encodings, baseline_train_set["label"].to_numpy())

# Create dataloaders
baseline_train_loader = DataLoader(baseline_train_dataset, batch_size=batch_size, shuffle=True)

torch.manual_seed(0)

model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2).to(device)
optimiser = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
criterion = torch.nn.CrossEntropyLoss()
softmax = torch.nn.Softmax(dim=0)
epoch = 10
num_warmup_steps = epoch * len(baseline_train_loader) * 0.1 
scheduler = get_linear_schedule_with_warmup(
    optimiser, num_warmup_steps=num_warmup_steps, num_training_steps= (epoch * len(baseline_train_loader))
)

model.train()
for e in range(epoch):
    for i, batch in enumerate(baseline_train_loader):
        torch.mps.empty_cache() # Not necessary if not using MPS
        optimiser.zero_grad()

        input_ids = batch['input_ids'].to(device)
        print(input_ids.shape)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = criterion(softmax(logits), labels)

        if ((i + 1) % 6 == 0):
            print(f'epoch: {e}, batch: {i + 1}, loss: {loss:.5f}, lr: {optimiser.param_groups[0]['lr']:.7f}')

        loss.backward()
        optimiser.step()
        scheduler.step()

# Save checkpoint
state = {
    'model_state_dict': model.state_dict()
}
torch.save(state, f'./baseline_models/model.pth')

{'input_ids': [[0, 21426, 19302, 4834, 26, 4175, 22, 31, 444, 4384, 8952, 22, 56, 22, 1147, 146, 1221, 22, 8, 427, 211, 18589, 56, 22, 20075, 12, 29827, 6732, 61, 109, 295, 75, 4227, 5, 4139, 9, 39, 308, 1494, 22, 479, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 1711, 2000, 128, 29, 1789, 467, 16, 19223, 3187, 8, 30599, 16, 1537, 2655, 479, 15591, 8039, 9, 15958, 6363, 697, 11, 19223, 2849, 12, 19003, 1274, 14, 31343, 617, 1880, 15, 49, 36044, 87, 77, 51, 58, 2641, 479, 20, 4590, 32, 2

INFO:httpx:HTTP Request: HEAD https://huggingface.co/FacebookAI/roberta-base/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/FacebookAI/roberta-base/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/FacebookAI/roberta-base/resolve/main/model.safetensors "HTTP/1.1 302 Found"
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 1034.49it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]             
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTE

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

In [None]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base").to(device)
saved_state = torch.load("./baseline_models/model.pth")
model.load_state_dict(saved_state['model_state_dict'])

predictions, labels = [], []

model.eval()
with torch.no_grad():
    for index, row in dev_set.iterrows():
        labels.append(convert_label_to_binary(row.label))
        inputs = tokenizer(row.text, return_tensors="pt").to(device)
        logits = model(**inputs).logits
        predictions.append(logits.argmax(dim=1)[0].cpu())

        if (index % 100 == 0):
            print(f'completed iteration {index}')
print("Evaluation finished!\n")

# Reformat data
predictions = np.array(predictions)
labels = np.array(labels)
cm = confusion_matrix(labels, predictions)

# Print confustion matrix and metrics
print(cm)
print(classification_report(labels, predictions))

# Plot confustion matrix
df_cm = pd.DataFrame(cm, index = [i for i in "01"],
                  columns = [i for i in "01"])
plt.figure(figsize = (10,7))
sn.heatmap(df_cm, annot=True).set(xlabel = 'predictions', ylabel = 'ground truth')
plt.show()


# Implementing my approach

In [219]:
class MyModel(nn.Module):
    def __init__(self, latent_encoding_size = 256):
        super().__init__()

        self.encoder = RobertaModel.from_pretrained("FacebookAI/roberta-base")

        self.sequence_encoding_mlp = nn.Sequential(
            nn.Linear(768, 512),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(512, latent_encoding_size)
        )

        self.geo_embedding_mlp = nn.Sequential(
            nn.Linear(20, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64, latent_encoding_size)
        )

        self.layer1 = nn.Sequential(
            nn.Linear(256, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
        )
        self.output = nn.Sequential(
            nn.Linear(256, 5),
            nn.BatchNorm1d(5),
            nn.Softmax(),
        )

    def forward(self, input_ids, attention_mask, geo_tokens):
        # Get geographical 
        geo_embedding = self.geo_embedding_mlp(geo_tokens)

        # Encode text tokens
        encoding = self.encoder(input_ids, attention_mask).last_hidden_state[:, 0:]
        encoding = self.sequence_encoding_mlp(encoding)

        # MLP layer
        x = encoding + geo_embedding
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.output(x)

        return x
        
        

In [220]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")

new_model = MyModel().to(device)
optimiser = torch.optim.AdamW(new_model.parameters(), lr=2e-4, weight_decay=0.01)
criterion = torch.nn.CrossEntropyLoss()
epoch = 10
num_warmup_steps = epoch * len(train_loader) * 0.1 
scheduler = get_linear_schedule_with_warmup(
    optimiser, num_warmup_steps=num_warmup_steps, num_training_steps= (epoch * len(train_loader))
)

torch.manual_seed(0)

new_model.train()
for e in range(epoch):
    for i, batch in enumerate(train_loader):
        torch.mps.empty_cache() # Not necessary if not using MPS
        optimiser.zero_grad()

        input_ids = batch['input_ids'].to(device)
        print(input_ids.shape)
        attention_mask = batch['attention_mask'].to(device)
        country_one_hot = torch.tensor(batch['country']).to(device)
        outputs = new_model(input_ids, attention_mask, country_one_hot)
        input_embeddings = outputs.hidden_states[0]
        
        labels = batch['labels'].to(device)
        loss = criterion(softmax(logits), labels)

        if ((i + 1) % 6 == 0):
            print(f'epoch: {e}, batch: {i + 1}, loss: {loss:.5f}, lr: {optimiser.param_groups[0]['lr']:.7f}')

        loss.backward()
        optimiser.step()
        scheduler.step()

        break

# # Save checkpoint
# state = {
#     'model_state_dict': new_model.state_dict()
# }
# torch.save(state, f'./new_models/model.pth')

INFO:httpx:HTTP Request: HEAD https://huggingface.co/FacebookAI/roberta-base/resolve/main/config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/FacebookAI/roberta-base/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/FacebookAI/roberta-base/resolve/main/tokenizer_config.json "HTTP/1.1 307 Temporary Redirect"
INFO:httpx:HTTP Request: HEAD https://huggingface.co/api/resolve-cache/models/FacebookAI/roberta-base/e2da8e2f811d1448a5b465c236feacd80ffbac7b/tokenizer_config.json "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: GET https://huggingface.co/api/models/FacebookAI/roberta-base/tree/main/additional_chat_templates?recursive=false&expand=false "HTTP/1.1 404 Not Found"
INFO:httpx:HTTP Request: GET https://huggingface.co/api/models/FacebookAI/roberta-base/tree/main?recursive=true&expand=false "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: HEAD https://hugg

torch.Size([2, 225])


  country_one_hot = torch.tensor(batch['country']).to(device)


RuntimeError: running_mean should contain 225 elements not 64