# SARCASM DETECTOR MODEL

This file includes all of the code towards building the model. You can test the model by downloading the trained model and tokenizer attached in the google drive link below and running the code starting the libraries and everything below the "Gradio Implementation" title. If you desire to run the whole thing, the dataset is in the link below too.

https://drive.google.com/drive/folders/13wmqXMNSEvwgCQEG32Vvtjxr_PVUqUir?usp=sharing 

In [None]:
pip install transformers datasets torch scikit-learn gradio

# Install Necessary Libraries

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
import re
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import gradio as gr



# Data Preprocessing

## Define a function to clean the text

In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r"[^\w\s]", "", text)
    return text

## Import the dataset

In [17]:
dataset = pd.read_csv("sarcasm_dataset.csv")
dataset.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


## Explote the Dataset

In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862408 entries, 0 to 862407
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   label           862408 non-null  int64  
 1   comment         862375 non-null  object 
 2   author          862407 non-null  object 
 3   subreddit       862407 non-null  object 
 4   score           862407 non-null  float64
 5   ups             862407 non-null  float64
 6   downs           862407 non-null  float64
 7   date            862407 non-null  object 
 8   created_utc     862407 non-null  object 
 9   parent_comment  862407 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 65.8+ MB


## Delete Null Values

In [18]:
dataset = dataset.dropna(subset=['comment'])
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1010771 entries, 0 to 1010825
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   label           1010771 non-null  int64 
 1   comment         1010771 non-null  object
 2   author          1010771 non-null  object
 3   subreddit       1010771 non-null  object
 4   score           1010771 non-null  int64 
 5   ups             1010771 non-null  int64 
 6   downs           1010771 non-null  int64 
 7   date            1010771 non-null  object
 8   created_utc     1010771 non-null  object
 9   parent_comment  1010771 non-null  object
dtypes: int64(4), object(6)
memory usage: 84.8+ MB


## Extract Comment and Label columns

In [19]:
texts = dataset['comment'].apply(clean_text).tolist()
labels = dataset['label'].tolist()

## Split the data

In [None]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

## Select 100,000 rows for train,  8000 for validation

More than this was too computationally expensive

In [21]:
train_texts = train_texts[:100000]
val_texts = val_texts[:8000]
train_labels = train_labels[:100000]
val_labels = val_labels[:8000]

In [22]:
train_texts[:5]

['waow le qop shite farmer rite',
 'hes a dom for realsies though',
 'correct',
 'ding dong the fries are done',
 'rip boss hog']

# Tokenization

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=64)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=64)

# Dataset and DataLoader

In [24]:
class SarcasmDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = SarcasmDataset(train_encodings, train_labels)
val_dataset = SarcasmDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Model Architecture

In [None]:
class SarcasmDetector(nn.Module):
    def __init__(self):
        super(SarcasmDetector, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bilstm = nn.LSTM(input_size=768, hidden_size=128, num_layers=1,
                              batch_first=True, bidirectional=True)
        self.cnn = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():  
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state 
        lstm_out, _ = self.bilstm(x)   
        cnn_in = lstm_out.permute(0, 2, 1)  
        cnn_out = self.cnn(cnn_in)         
        pooled = torch.max(cnn_out, dim=2)[0] 
        x = self.relu(pooled)
        x = self.dropout(x)
        x = self.fc(x)
        return self.sigmoid(x).squeeze()

# Training and Evaluating the model

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SarcasmDetector().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()

def train(model, loader):
    model.train()
    total_loss = 0
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].float().to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            preds = (outputs > 0.5).int().cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return acc, f1

# Run Training Loop

In [None]:
epochs = 5
for epoch in range(epochs):
    train_loss = train(model, train_loader)
    val_acc, val_f1 = evaluate(model, val_loader)
    print(f"Epoch {epoch+1} | Loss: {train_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")

Epoch 1 | Loss: 0.6298 | Val Acc: 0.6666 | Val F1: 0.6871
Epoch 2 | Loss: 0.5998 | Val Acc: 0.6817 | Val F1: 0.6789
Epoch 3 | Loss: 0.5866 | Val Acc: 0.6851 | Val F1: 0.6824
Epoch 4 | Loss: 0.5787 | Val Acc: 0.6963 | Val F1: 0.6813
Epoch 5 | Loss: 0.5719 | Val Acc: 0.6975 | Val F1: 0.7034


# Save model state and tokenizer

In [None]:
torch.save(model.state_dict(), "sarcasm_model.pt")
tokenizer.save_pretrained("sarcasm_tokenizer")

# Gradio Implementation

## Preprocessing function

In [1]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r"[^\w\s]", "", text)
    return text

## Model Definition (same as before)

In [4]:
class SarcasmDetector(nn.Module):
    def __init__(self):
        super(SarcasmDetector, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bilstm = nn.LSTM(input_size=768, hidden_size=128, num_layers=1,
                              batch_first=True, bidirectional=True)
        self.cnn = nn.Conv1d(in_channels=256, out_channels=128, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        with torch.no_grad():
            outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.last_hidden_state
        lstm_out, _ = self.bilstm(x)
        cnn_in = lstm_out.permute(0, 2, 1)
        cnn_out = self.cnn(cnn_in)
        pooled = torch.max(cnn_out, dim=2)[0]
        x = self.relu(pooled)
        x = self.dropout(x)
        x = self.fc(x)
        return self.sigmoid(x).squeeze()

## Load model and tokenizer

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SarcasmDetector()
model.load_state_dict(torch.load("sarcasm_model.pt", map_location=device))
model.to(device)
model.eval()

tokenizer = BertTokenizer.from_pretrained("sarcasm_tokenizer")

## Prediction Function

In [11]:
def predict_sarcasm(text):
    cleaned = clean_text(text)
    inputs = tokenizer(cleaned, return_tensors="pt", truncation=True, padding=True, max_length=64)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask)
        prob = output.item()

    label = "Sarcastic" if prob > 0.5 else "Not Sarcastic"
    confidence = f"{prob*100:.2f}%"
    return f"{label} (Confidence: {confidence})"

## Gradio UI

In [12]:
interface = gr.Interface(
    fn=predict_sarcasm,
    inputs=gr.Textbox(lines=3, placeholder="Enter a comment..."),
    outputs="text",
    title="Sarcasm Detector",
    description="Enter a sentence",
)

## Launch the app

In [13]:
interface.launch()

* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


