In [132]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.utils import resample

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [14]:
train_df = pd.read_csv(r"\TalkHealth\Datasets\train_sarcasm.csv")
test_df = pd.read_csv(r"\TalkHealth\Datasets\test_sarcasm.csv")

print(train_df.head())
print(test_df.head())

                                               tweet  sarcastic
0  the only thing i get from college be a caffein...          1
1  i love it when professor draw a big question m...          1
2  remember the hundred email from company when c...          1
3  today my pop pop tell me i be not force to go ...          1
4  i do too and i also report cancun cruz not wor...          1
                                               tweet  sarcastic
0  size on the the toulouse team that pack be mon...          0
1                                            pinball          0
2  so the scottish government want people to get ...          1
3  villainous pro tip change the device name on h...          0
4                    i would date any of these men 🥺          0


In [15]:
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (3467, 2)
Test shape: (1400, 2)


In [16]:
print("Train null values", train_df.isnull().sum())
print("Train null values", test_df.isnull().sum())

Train null values tweet        1
sarcastic    0
dtype: int64
Train null values tweet        0
sarcastic    0
dtype: int64


In [17]:
train_df.dropna(inplace=True)

In [146]:
print(train_df['sarcastic'].value_counts())

sarcastic
0    2599
1     867
Name: count, dtype: int64


In [19]:
train_df['tweet_length'] = train_df['tweet'].apply(lambda x: len(x.split()))
print(train_df['tweet_length'].describe())

count    3466.000000
mean       18.574437
std        11.250773
min         1.000000
25%        10.000000
50%        16.000000
75%        24.000000
max        61.000000
Name: tweet_length, dtype: float64


In [20]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+', '', text)  
    text = re.sub(r'@\w+', '', text)  
    text = re.sub(r'#', '', text) 
    text = re.sub(r'[\U00010000-\U0010ffff]', '', text)  
    text = re.sub(r'[^\w\s.,!?]', '', text)  
    text = re.sub(r'\s+', ' ', text).strip()  
    return text

In [33]:
train_df['tweet'] = train_df['tweet'].apply(preprocess_text)
test_df['tweet'] = test_df['tweet'].apply(preprocess_text)

In [68]:
majority = train_df[train_df['sarcastic'] == 0]
minority = train_df[train_df['sarcastic'] == 1]
minority_upsampled = resample(minority, replace=True, n_samples=len(majority), random_state=42)
train_df_balanced = pd.concat([majority, minority_upsampled])

In [70]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df_balanced['tweet'].tolist(),
    train_df_balanced['sarcastic'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=train_df_balanced['sarcastic']
)

In [72]:
print("Train:", pd.Series(train_labels).value_counts())
print("Val:", pd.Series(val_labels).value_counts())

Train: 1    2079
0    2079
Name: count, dtype: int64
Val: 0    520
1    520
Name: count, dtype: int64


In [74]:
# Dataset class
class IronyDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [76]:
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [78]:
train_dataset = IronyDataset(train_texts, train_labels, tokenizer)
val_dataset = IronyDataset(val_texts, val_labels, tokenizer)

In [80]:
def compute_metrics(p):
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [82]:
training_args = TrainingArguments(
    output_dir='./results_distilroberta',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs_distilroberta',
    logging_steps=10,
    eval_steps=50,
    load_best_model_at_end=True,
    weight_decay=0.01,
    learning_rate=2e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [84]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6716,0.658639,0.617308,0.595781,0.64906,0.617308
2,0.4737,0.527751,0.734615,0.7346,0.734671,0.734615
3,0.3985,0.497694,0.771154,0.770848,0.77261,0.771154


TrainOutput(global_step=390, training_loss=0.5546855095105293, metrics={'train_runtime': 262.2815, 'train_samples_per_second': 47.56, 'train_steps_per_second': 1.487, 'total_flos': 413099582708736.0, 'train_loss': 0.5546855095105293, 'epoch': 3.0})

In [86]:
model.save_pretrained('./irony_detector_distilroberta')
tokenizer.save_pretrained('./irony_detector_distilroberta')

('./irony_detector_distilroberta\\tokenizer_config.json',
 './irony_detector_distilroberta\\special_tokens_map.json',
 './irony_detector_distilroberta\\vocab.json',
 './irony_detector_distilroberta\\merges.txt',
 './irony_detector_distilroberta\\added_tokens.json',
 './irony_detector_distilroberta\\tokenizer.json')

In [88]:
test_dataset = IronyDataset(test_df['tweet'].tolist(), test_df['sarcastic'].tolist(), tokenizer)
test_loader = DataLoader(test_dataset, batch_size=64)

In [126]:
trainer.model.eval()
preds, labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].numpy()
        outputs = trainer.model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs.logits, dim=1)
        batch_preds = (probs[:, 1] > 0.4).long().cpu().numpy()
        preds.extend(batch_preds)
        labels.extend(batch_labels)

In [136]:
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
acc = accuracy_score(labels, preds)
print(f"\nRoberta Accuracy: {acc:.4f}")
print(f"Roberta Precision: {precision:.4f}")
print(f"Roberta Recall: {recall:.4f}")
print(f"Roberta F1 Score: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(labels, preds, digits=4))
print("Confusion Matrix:")
print(confusion_matrix(labels, preds))



Roberta Accuracy: 0.5621
Roberta Precision: 0.7834
Roberta Recall: 0.5621
Roberta F1 Score: 0.6277

Classification Report:
              precision    recall  f1-score   support

           0     0.8847    0.5625    0.6877      1200
           1     0.1758    0.5600    0.2676       200

    accuracy                         0.5621      1400
   macro avg     0.5302    0.5613    0.4777      1400
weighted avg     0.7834    0.5621    0.6277      1400

Confusion Matrix:
[[675 525]
 [ 88 112]]


In [116]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(train_df_balanced['tweet'])
y_train = train_df_balanced['sarcastic']

X_test_tfidf = vectorizer.transform(test_df['tweet'])
y_test = test_df['sarcastic']

In [118]:
svm_model = LinearSVC(class_weight='balanced')
svm_model.fit(X_train_tfidf, y_train)
svm_preds = svm_model.predict(X_test_tfidf)



In [134]:
svm_precision, svm_recall, svm_f1, _ = precision_recall_fscore_support(y_test, svm_preds, average='weighted')
svm_acc = accuracy_score(y_test, svm_preds)
print(f"SVM Accuracy: {svm_acc:.4f}")
print(f"SVM Precision: {svm_precision:.4f}")
print(f"SVM Recall: {svm_recall:.4f}")
print(f"SVM F1 Score: {svm_f1:.4f}")
print("SVM Classification Report:")
print(classification_report(y_test, svm_preds, digits=4))
print("SVM Confusion Matrix:")
print(confusion_matrix(y_test, svm_preds))

SVM Accuracy: 0.7457
SVM Precision: 0.7654
SVM Recall: 0.7457
SVM F1 Score: 0.7551
SVM Classification Report:
              precision    recall  f1-score   support

           0     0.8638    0.8350    0.8492      1200
           1     0.1750    0.2100    0.1909       200

    accuracy                         0.7457      1400
   macro avg     0.5194    0.5225    0.5200      1400
weighted avg     0.7654    0.7457    0.7551      1400

SVM Confusion Matrix:
[[1002  198]
 [ 158   42]]
