# Classifier experiments

Now that we have an annotated dataset, we will try training a classifier to identify subtypes of sarcasm in news headlines.

I'll start with a straightforward text classifier, a Random Forest over TF-IDF features, to serve as a baseline for further experiments.

In [69]:
import pandas as pd
from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [70]:
# Load the annotated headlines: `headline` is the input text, `labels` the target.
df = pd.read_csv("results_final.csv")
X, y = df['headline'], df['labels']

In [71]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [72]:
# Fit the TF-IDF vectorizer on the training text only, then apply the same
# fitted vectorizer to both splits so the test text never influences the fit.
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train)
X_train_vector = vectorizer.transform(X_train)
X_test_vector = vectorizer.transform(X_test)

In [73]:
# Baseline: Random Forest with library-default hyperparameters (seeded for repeatability),
# trained on the TF-IDF training matrix.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_vector, y_train)

In [74]:
# Predict labels for the held-out test matrix with the baseline forest.
y_pred = rf.predict(X_test_vector)

In [75]:
# Accuracy of the baseline forest's predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6486982031536487


In [76]:
# Second attempt: 200 trees and max_features='log2'.
# Rebinding `rf` replaces the default-parameter forest trained above.
rf = RandomForestClassifier(random_state=42, n_estimators=200, max_features='log2')
rf.fit(X_train_vector, y_train)

In [77]:
# Predict again with the re-trained forest; overwrites the earlier `y_pred`.
y_pred = rf.predict(X_test_vector)

In [78]:
# Accuracy of the re-trained forest on the same held-out split.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6894022735606894


In [79]:
# Macro-averaged F1 for the re-trained forest on the held-out split.
f1 = f1_score(y_test, y_pred, average='macro')
print("F1-Score:", f1)

F1-Score: 0.6881524581617572


It works only modestly better than random guessing. Now let's try fine-tuning BERT for this task.

In [80]:
# Read the CSV as a single Hugging Face split, then carve it into 70/15/15.
dataset = load_dataset('csv', data_files='results_final.csv')['train']

# First cut: 70% for training, 30% held out for validation + test.
train_testval = dataset.train_test_split(test_size=0.3, seed=42)
train_dataset, testval_dataset = train_testval['train'], train_testval['test']

# Second cut: halve the held-out 30% into validation (15%) and test (15%).
val_test = testval_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset, test_dataset = val_test['train'], val_test['test']

# Bundle the three splits under the names the later cells index by.
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset,
})

Generating train split: 13633 examples [00:00, 486828.65 examples/s]


In [81]:
# BERT base (uncased): load its tokenizer and a sequence-classification model
# configured with a 3-label output layer.
model_path = 'google-bert/bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_path)

# NOTE(review): this run uses num_labels=3, while the RoBERTa run later in the
# notebook uses num_labels=5 with five named classes. Confirm which value
# matches the `labels` column of results_final.csv.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [82]:
def preprocess_function(examples):
    """Tokenize the `headline` field of a batch using the global `tokenizer`.

    Truncation is enabled; no padding argument is passed here.
    Returns whatever the tokenizer call produces for the batch.
    """
    return tokenizer(examples["headline"], truncation=True)


# Apply the tokenizer to every split in batched mode.
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 9543/9543 [00:00<00:00, 45438.21 examples/s]
Map: 100%|██████████| 2045/2045 [00:00<00:00, 45443.40 examples/s]
Map: 100%|██████████| 2045/2045 [00:00<00:00, 46996.35 examples/s]


In [83]:
# Collator built from the current tokenizer; handed to the Trainer below as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [84]:
# The metric objects get their own names so they do not rebind `accuracy`
# (the baseline's float result) or `f1_score` (sklearn's function imported at
# the top of the notebook). Rebinding those made the baseline cells fail when
# re-run after this cell.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    Args:
        eval_pred: Pair `(logits, labels)`. The predicted class for each row is
            the argmax over the last axis of `logits`.

    Returns:
        dict with keys "accuracy" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Local names deliberately avoid `accuracy_score`, which is sklearn's import.
    accuracy_result = accuracy_metric.compute(predictions=predictions, references=labels)
    f1_result = f1_metric.compute(predictions=predictions, references=labels, average="macro")

    # Flatten both results into the single dict the Trainer logs per evaluation.
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

In [85]:
# Fine-tuning configuration for the BERT run.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    # Save on the same schedule as evaluation, alongside load_best_model_at_end.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
)




In [86]:
# Wire the model, tokenized splits, collator and metrics into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    # `processing_class` replaces the deprecated `tokenizer` keyword, which is
    # what triggered the warning printed under this cell.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 2 evaluations, tracking
    # metric_for_best_model from the training arguments.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

  trainer = Trainer(


In [87]:
# Fine-tune: configured for up to 10 epochs, with per-epoch evaluation and early stopping.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.708,0.317074,0.877262,0.878189
2,0.3066,0.361813,0.870905,0.870431
3,0.2077,0.385435,0.89291,0.893211
4,0.1333,0.472927,0.89291,0.893718
5,0.0797,0.518187,0.900733,0.90126
6,0.0292,0.578034,0.896333,0.895529
7,0.0223,0.544445,0.904645,0.904619
8,0.0297,0.540724,0.915403,0.915698
9,0.0114,0.527407,0.916381,0.916349
10,0.0057,0.547101,0.914425,0.914395


TrainOutput(global_step=5970, training_loss=0.13373354016436603, metrics={'train_runtime': 3680.6579, 'train_samples_per_second': 25.927, 'train_steps_per_second': 1.622, 'total_flos': 1209119724900234.0, 'train_loss': 0.13373354016436603, 'epoch': 10.0})

In [88]:
# Run the trainer's model over the held-out test split
# (load_best_model_at_end=True was requested in the training arguments).
predictions = trainer.predict(tokenized_data["test"])

In [89]:
# Score the test-split predictions with the same metric function used during training.
logits, labels = predictions.predictions, predictions.label_ids
metrics = compute_metrics((logits, labels))
print(metrics)

{'accuracy': 0.9026894865525672, 'f1': 0.9018800351249393}


Repeat the process with RoBERTa.

In [90]:
# RoBERTa base: rebinding `model_path`, `tokenizer` and `model` replaces the BERT objects.
model_path = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Class index -> human-readable sarcasm subtype.
id2label = {
    0: "contextual contradiction",
    1: "mock enthusiasm",
    2: "stylistic irony",
    3: "institutional critique",
    4: "behavioral observation",
}
# The reverse mapping is derived from id2label so the two cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=5,
    id2label=id2label,
    label2id=label2id,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
# Re-tokenize every split: preprocess_function reads the global `tokenizer`,
# which now refers to the RoBERTa tokenizer loaded above.
tokenized_data = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 9543/9543 [00:00<00:00, 60567.18 examples/s]
Map: 100%|██████████| 2045/2045 [00:00<00:00, 49874.99 examples/s]
Map: 100%|██████████| 2045/2045 [00:00<00:00, 47463.68 examples/s]


In [92]:
# Rebuild the collator around the RoBERTa tokenizer; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [61]:
# Fine-tuning configuration for the RoBERTa run (same hyperparameters as the BERT run).
# NOTE(review): output_dir is the same "./results" used by the BERT run, so both
# runs write their checkpoints into one directory. Consider a per-model directory.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    # Save on the same schedule as evaluation, alongside load_best_model_at_end.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # "f1" is the key returned by compute_metrics; higher is better.
    metric_for_best_model="f1",
    greater_is_better=True,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
)




In [93]:
# Wire the RoBERTa model, re-tokenized splits, collator and metrics into a new Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['validation'],
    # `processing_class` replaces the deprecated `tokenizer` keyword, which is
    # what triggered the warning printed under this cell.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Early stopping with a patience of 2 evaluations, tracking
    # metric_for_best_model from the training arguments.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

  trainer = Trainer(


In [94]:
# Fine-tune RoBERTa: configured for up to 10 epochs, with per-epoch evaluation and early stopping.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.8508,0.577363,0.759902,0.764903
2,0.4776,0.46792,0.814181,0.81511
3,0.3408,0.541477,0.82445,0.820535
4,0.242,0.717076,0.817604,0.815565
5,0.1842,0.920755,0.822005,0.822874
6,0.1004,0.975965,0.825428,0.826492
7,0.0769,1.116072,0.826895,0.82637
8,0.0545,1.288256,0.818093,0.81832


TrainOutput(global_step=4776, training_loss=0.26047007063126043, metrics={'train_runtime': 3111.8884, 'train_samples_per_second': 30.666, 'train_steps_per_second': 1.918, 'total_flos': 984898486283976.0, 'train_loss': 0.26047007063126043, 'epoch': 8.0})

In [95]:
# Run the RoBERTa trainer's model over the held-out test split.
predictions = trainer.predict(tokenized_data["test"])

In [96]:
# Score the RoBERTa test-split predictions with the shared metric function.
logits, labels = predictions.predictions, predictions.label_ids
metrics = compute_metrics((logits, labels))
print(metrics)

{'accuracy': 0.8400977995110025, 'f1': 0.8390753079557882}
