In [1]:
# ===============================
# SarcasmLens: Classical Baselines (Final Subtask 2 Version)
# Models: RandomForest, LogisticRegression, LinearSVM, RBFSVM
# ===============================

# 1) Imports
import json
import os
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC


In [4]:
# ===============================
# 2) Load Dataset (CSV)
# ===============================
# The dataset location can be overridden without editing the notebook by
# setting the SARCASM_DATA_PATH environment variable. The original
# machine-specific location is kept as the fallback so existing setups
# continue to work unchanged.
DEFAULT_DATA_PATH = r"C:\MAIN\Projects\Sarcasm Detection\Dataset\unique_tweets.csv"
path = os.environ.get("SARCASM_DATA_PATH", DEFAULT_DATA_PATH)
df = pd.read_csv(path)

# Quick structural summary of the raw frame before any column selection.
print("\n===== Original Dataset Info =====")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("=================================\n")


===== Original Dataset Info =====
Shape: (11367, 3)
Columns: ['ID', 'Tweet', 'Label']



In [6]:


# Automatically detect tweet text and label columns by case-insensitive
# substring match on the column name. str() guards against non-string
# column labels (e.g. integer headers), which have no .lower().
possible_text_cols = [
    col for col in df.columns
    if "tweet" in str(col).lower() or "text" in str(col).lower()
]
possible_label_cols = [col for col in df.columns if "label" in str(col).lower()]

# Fall back to positional guesses when no name matches: second column for
# the text, last column for the label.
text_col = possible_text_cols[0] if possible_text_cols else df.columns[1]
label_col = possible_label_cols[0] if possible_label_cols else df.columns[-1]

# Subset and rename. The explicit .copy() makes the result an independent
# frame, so later column assignments (e.g. overwriting "text" with its
# cleaned version) do not write into a slice of the original frame.
df = df[[text_col, label_col]].copy()
df.columns = ["text", "label"]

print(f"‚úÖ Selected columns: {text_col} ‚Üí text | {label_col} ‚Üí label")
print(f"Dataset shape after selection: {df.shape}")
print(df.tail(5), "\n")
print(df.head(5), "\n")

‚úÖ Selected columns: text ‚Üí text | label ‚Üí label
Dataset shape after selection: (11367, 2)
                                                    text label
11362  Khiladi anari, aur shaamat equipment ki aye! B...    NO
11363  #irony RT @techno_charan: pallu k neche chhupa...    NO
11364                          Jab Thak Hai Jaan. #Irony    NO
11365  @beeba_puttar Acha! Aur koi nae mila tha #sarc...    NO
11366  @Nirmalogy sacchi mucchi mein? Yah ye bhi #Sar...    NO 

                                                text label
0  takeout burrito shielded from cold as though i...   YES
1  sight of coworkers' stupid fucking faces endur...   YES
2                                porch ceded to bats   YES
3  panicked donald trump jr. tries to cover up co...   YES
4  mike gravel can't believe his polling numbers ...   YES 



In [7]:
# ===============================
# 3) Text Cleaning
# ===============================
def clean_text(text):
    """Normalize a raw tweet into a lowercase, whitespace-collapsed string.

    Steps, in order: lowercase, strip URLs, strip @mentions, drop the '#'
    symbol (the hashtag word itself is kept), replace every character that
    is not an ASCII letter, a Devanagari code point (U+0900-U+097F), '!',
    '?', an apostrophe or whitespace with a space, then collapse runs of
    whitespace and trim.

    Parameters
    ----------
    text : object
        The raw value from the text column. Non-string values are converted
        with ``str()``; missing values (``None``, NaN, ``pd.NA``) are treated
        as empty text.

    Returns
    -------
    str
        The cleaned text; an empty string for missing input.
    """
    # Missing values would otherwise be stringified into the literal tokens
    # "none" / "nan" and end up as spurious vocabulary entries.
    if text is None or (
        not isinstance(text, str)
        and pd.api.types.is_scalar(text)
        and pd.isna(text)
    ):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)          # Remove URLs
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)          # Remove mentions
    text = re.sub(r"#", "", text)                       # Drop '#' only; the tag word stays
    text = re.sub(r"[^a-zA-Z\u0900-\u097F!?'\s]", " ", text)  # Keep English/Hindi letters, !, ?, '
    text = re.sub(r"\s+", " ", text).strip()            # Normalize spaces
    return text

# Run every row of the text column through clean_text and store the result
# back in place of the raw text.
cleaned_text_column = df["text"].apply(clean_text)
df["text"] = cleaned_text_column

# Show a fixed-seed sample so the preview is the same on every run.
print("‚úÖ Cleaning done! Sample cleaned tweets:")
cleaned_preview = df.sample(5, random_state=42)
print(cleaned_preview, "\n")

‚úÖ Cleaning done! Sample cleaned tweets:
                                                    text label
10763  hahahahhaa rajeev jaise log bas khans' ke pais...    NO
6021   baby faced muscular jimmy carter tells democra...   YES
3048   bad ass engagement ring also tells the time an...   YES
5640       shopper takes bizarre journey beyond bed bath   YES
1237   nation flattered brand would go to the trouble...   YES 



In [8]:
# ===============================
# 4) Train / Test Split (Stratified)
# ===============================
# 80/20 split with a fixed seed; stratifying on the label keeps the class
# proportions the same in both partitions.
features = df["text"]
targets = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
    stratify=targets,
)

train_count, test_count = len(X_train), len(X_test)
print(f"Train size: {train_count} | Test size: {test_count}")

Train size: 9093 | Test size: 2274


In [9]:
# ===============================
# 5) TF-IDF Vectorizer
# ===============================
# Unigram + bigram features, capped at 8000 terms, with sublinear (log-scaled)
# term frequency.
# NOTE(review): no token_pattern is set, so the vectorizer's default tokenizer
# applies; it presumably discards the '!' and '?' characters that the cleaning
# step deliberately keeps -- confirm whether punctuation features were intended.
tfidf_settings = {
    "max_features": 8000,
    "ngram_range": (1, 2),
    "sublinear_tf": True,
}
tfidf = TfidfVectorizer(**tfidf_settings)

# ===============================
# 6) Define Models
# ===============================
# Candidate classifiers, each built with a fixed random_state of 42.
random_forest = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
logistic_regression = LogisticRegression(max_iter=1000, solver="liblinear", random_state=42)
linear_svm = LinearSVC(C=1.0, random_state=42)
rbf_svm = SVC(kernel="rbf", C=1.0, gamma="scale", random_state=42)

# Keyed by display name; the insertion order below is the iteration order
# seen by any loop over models.items().
models = dict(
    RandomForest=random_forest,
    LogisticRegression=logistic_regression,
    LinearSVM=linear_svm,
    RBFSVM=rbf_svm,
)

In [10]:
# ===============================
# 7) Track Best Model
# ===============================
# Nothing has been evaluated yet: no winning name or pipeline, a zero
# baseline score, and an empty per-model results history.
best_model_name = best_model_pipeline = None
best_f1 = 0.0
results_summary = list()

In [12]:
# ===============================
# 8) Training & Evaluation Loop
# ===============================
# Reset the tracking state at the top of this cell so that re-running it is
# idempotent: without this, a second run appends duplicate rows to
# results_summary and compares new scores against a stale best_f1.
best_model_name = None
best_model_pipeline = None
best_f1 = 0.0
results_summary = []

# NOTE(review): the best model is selected on the same test split whose
# scores are reported, so the winning score is optimistically biased;
# consider a separate validation split or cross-validation for selection.
for name, model in models.items():
    print(f"\n{'='*40}\nüîπ Training {name}\n{'='*40}")

    # Each pipeline gets its own unfitted copy of the vectorizer. Sharing the
    # single `tfidf` instance would mean every later fit() re-fits the
    # vectorizer held by previously trained (and possibly saved) pipelines.
    # Each classifier is used by exactly one pipeline, so it is not cloned.
    clf = Pipeline([
        ("tfidf", clone(tfidf)),
        ("model", model),
    ])

    # Fit on the training split, predict on the held-out split.
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Evaluate. Scores are cast to plain floats so the summary holds only
    # built-in types when it is serialized later.
    acc = float(accuracy_score(y_test, y_pred))
    f1 = float(f1_score(y_test, y_pred, average="weighted"))
    report = classification_report(y_test, y_pred, digits=4)
    cm = confusion_matrix(y_test, y_pred)

    print(f"Accuracy: {acc:.4f}")
    print(f"Weighted F1-Score: {f1:.4f}")
    print("\nClassification Report:\n", report)
    print("Confusion Matrix:\n", cm)

    # Store results
    results_summary.append({
        "model": name,
        "accuracy": acc,
        "weighted_f1": f1,
    })

    # Track best. The `is None` check guarantees a model is selected even if
    # every candidate scores 0.0; ties keep the earlier model.
    if best_model_pipeline is None or f1 > best_f1:
        best_f1 = f1
        best_model_name = name
        best_model_pipeline = clf




üîπ Training RandomForest
Accuracy: 0.9565
Weighted F1-Score: 0.9567

Classification Report:
               precision    recall  f1-score   support

          NO     0.9122    0.9916    0.9503       954
         YES     0.9935    0.9311    0.9613      1320

    accuracy                         0.9565      2274
   macro avg     0.9529    0.9613    0.9558      2274
weighted avg     0.9594    0.9565    0.9567      2274

Confusion Matrix:
 [[ 946    8]
 [  91 1229]]

üîπ Training LogisticRegression
Accuracy: 0.9639
Weighted F1-Score: 0.9640

Classification Report:
               precision    recall  f1-score   support

          NO     0.9467    0.9686    0.9575       954
         YES     0.9769    0.9606    0.9687      1320

    accuracy                         0.9639      2274
   macro avg     0.9618    0.9646    0.9631      2274
weighted avg     0.9642    0.9639    0.9640      2274

Confusion Matrix:
 [[ 924   30]
 [  52 1268]]

üîπ Training LinearSVM




Accuracy: 0.9732
Weighted F1-Score: 0.9732

Classification Report:
               precision    recall  f1-score   support

          NO     0.9646    0.9717    0.9681       954
         YES     0.9794    0.9742    0.9768      1320

    accuracy                         0.9732      2274
   macro avg     0.9720    0.9730    0.9725      2274
weighted avg     0.9732    0.9732    0.9732      2274

Confusion Matrix:
 [[ 927   27]
 [  34 1286]]

üîπ Training RBFSVM
Accuracy: 0.9727
Weighted F1-Score: 0.9727

Classification Report:
               precision    recall  f1-score   support

          NO     0.9685    0.9665    0.9675       954
         YES     0.9758    0.9773    0.9765      1320

    accuracy                         0.9727      2274
   macro avg     0.9721    0.9719    0.9720      2274
weighted avg     0.9727    0.9727    0.9727      2274

Confusion Matrix:
 [[ 922   32]
 [  30 1290]]


In [13]:

# ===============================
# 9) Save Best Model & Results
# ===============================
# Announce the winner between two 50-character rules.
rule = "=" * 50
print(f"\n{rule}")
print(f"üèÜ Best Model: {best_model_name}")
print(f"üèÜ Best Weighted F1: {best_f1:.4f}")
print(f"{rule}\n")

# Persist the winning pipeline to the working directory.
joblib.dump(best_model_pipeline, "best_baseline_model.pkl")

# Write the per-model metrics as indented JSON.
summary_json = json.dumps(results_summary, indent=2)
with open("results_summary.json", "w") as summary_file:
    summary_file.write(summary_json)

for line in (
    "‚úÖ All tasks completed successfully!",
    "   -> Saved model: best_baseline_model.pkl",
    "   -> Results summary: results_summary.json",
):
    print(line)


üèÜ Best Model: LinearSVM
üèÜ Best Weighted F1: 0.9732

‚úÖ All tasks completed successfully!
   -> Saved model: best_baseline_model.pkl
   -> Results summary: results_summary.json
