-------------------

In [19]:
"""
SarcasmLens – Baseline System (Subtask 2)
-----------------------------------------
Goal:
Train and evaluate multiple classical ML models for sarcasm detection in
code-mixed (Hindi-English) text using TF-IDF embeddings.

Models implemented:
1. Random Forest
2. Logistic Regression
3. Linear SVM
4. RBF SVM

The code includes:
- Data preprocessing and cleaning
- TF-IDF vectorization
- Model training and evaluation
- Saving best model and metrics
- Testing on an external test set
"""

# Imports
import json
import os
import re

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC

In [20]:
# =======================================================
# 1. Load Dataset
# =======================================================
# The training CSV location can be overridden with the SARCASM_TRAIN_CSV
# environment variable so the notebook runs on other machines without edits.
# The default is the path this notebook was originally written against.
path = os.environ.get(
    "SARCASM_TRAIN_CSV",
    r"C:\MAIN\Projects\Sarcasm Detection\Dataset\unique_tweets.csv",
)
df = pd.read_csv(path)

print("\n===== Dataset Information =====")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("===============================\n")


===== Dataset Information =====
Shape: (11367, 3)
Columns: ['ID', 'Tweet', 'Label']



In [21]:
# Choose the text and label columns by name. When no column name matches,
# fall back to position: second column for the text, last column for the label.
possible_text_cols = []
possible_label_cols = []
for col in df.columns:
    lowered = col.lower()
    if "tweet" in lowered or "text" in lowered:
        possible_text_cols.append(col)
    if "label" in lowered:
        possible_label_cols.append(col)

if possible_text_cols:
    text_col = possible_text_cols[0]
else:
    text_col = df.columns[1]

if possible_label_cols:
    label_col = possible_label_cols[0]
else:
    label_col = df.columns[-1]

# Keep only the two working columns under fixed names used by the later cells.
df = df[[text_col, label_col]]
df.columns = ["text", "label"]

print(f"Columns used: {text_col} → text | {label_col} → label")
print(f"Dataset shape after selection: {df.shape}\n")

Columns used: Tweet → text | Label → label
Dataset shape after selection: (11367, 2)



In [22]:
# =======================================================
# 2. Text Cleaning
# =======================================================
def clean_text(text):
    """
    Normalize one tweet before TF-IDF vectorization.

    Steps, in order:
      1. lowercase the text;
      2. drop URLs (anything starting with ``http`` or ``www``);
      3. drop @mentions;
      4. strip the ``#`` symbol (the hashtag word itself is kept);
      5. replace every character that is not an ASCII letter, a Devanagari
         character (U+0900-U+097F), ``!``, ``?``, an apostrophe or whitespace
         with a space (so digits and other punctuation are removed);
      6. collapse whitespace runs and trim both ends.

    Parameters
    ----------
    text : object
        Raw tweet. Non-string values are converted with ``str()``.
        Missing values (``None`` or a float NaN, e.g. an empty cell loaded by
        pandas) produce an empty string instead of the literal word "nan".

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # NaN is the only float that is not equal to itself. Without this guard
    # str(nan) would inject a spurious "nan" token into the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)       # Remove URLs
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)       # Remove mentions
    text = re.sub(r"#", "", text)                    # Drop '#' but keep the tag word
    text = re.sub(r"[^a-zA-Z\u0900-\u097F!?'\s]", " ", text)  # Keep Hindi & English
    text = re.sub(r"\s+", " ", text).strip()         # Normalize spaces
    return text

# Clean every tweet, then print a fixed (seeded) sample as a sanity check.
df["text"] = df["text"].map(clean_text)
cleaned_sample = df.sample(5, random_state=42)
print("Text cleaning completed. Sample:\n")
print(cleaned_sample)
print("\n")

Text cleaning completed. Sample:

                                                    text label
10763  hahahahhaa rajeev jaise log bas khans' ke pais...    NO
6021   baby faced muscular jimmy carter tells democra...   YES
3048   bad ass engagement ring also tells the time an...   YES
5640       shopper takes bizarre journey beyond bed bath   YES
1237   nation flattered brand would go to the trouble...   YES




In [23]:
# =======================================================
# 3. Train/Test Split
# =======================================================
# 80/20 hold-out split, stratified on the label and seeded for repeatability.
split_parts = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
X_train, X_test, y_train, y_test = split_parts

print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}\n")

Training samples: 9093, Testing samples: 2274



In [24]:
# =======================================================
# 4. TF-IDF Vectorization
# =======================================================
# TF-IDF (Term Frequency - Inverse Document Frequency) represents each
# document as a weighted vector of terms.
#
# Settings:
# - max_features=8000: keep only the 8000 most frequent terms, which bounds
#   the feature space and the training cost.
# - ngram_range=(1, 2): use unigrams and bigrams, so short phrases such as
#   "wah kya" or "oh great" become features of their own.
# - sublinear_tf=True: use log-scaled term frequency, which damps the
#   influence of terms repeated many times inside one tweet.
#
# NOTE(review): no token_pattern is set, so the vectorizer's default tokenizer
# applies. Per the scikit-learn docs that default keeps only runs of two or
# more word characters and treats punctuation as a separator, so the "!" and
# "?" that clean_text retains never become features. Confirm whether that is
# intended before relying on punctuation cues.
tfidf_settings = {
    "max_features": 8000,
    "ngram_range": (1, 2),
    "sublinear_tf": True,
}
tfidf = TfidfVectorizer(**tfidf_settings)

In [25]:
# =======================================================
# 5. Model Definitions
# =======================================================
# Four classical baselines, all seeded with the same random_state.
# Insertion order matters: it is the order in which the models are trained
# and reported.
model_seed = 42
models = {}

# Random forest: 200 trees, trained on all available cores.
models["RandomForest"] = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=model_seed,
)

# Logistic regression: liblinear solver with a generous iteration cap so the
# optimizer has room to converge.
models["LogisticRegression"] = LogisticRegression(
    solver="liblinear",
    max_iter=1000,
    random_state=model_seed,
)

# Linear SVM with regularization strength C=1.0.
models["LinearSVM"] = LinearSVC(
    C=1.0,
    random_state=model_seed,
)

# RBF-kernel SVM: non-linear decision boundary, gamma chosen by the "scale" rule.
models["RBFSVM"] = SVC(
    kernel="rbf",
    gamma="scale",
    C=1.0,
    random_state=model_seed,
)

In [26]:
# =======================================================
# 6. Training and Evaluation
# =======================================================
# Fit one TF-IDF + classifier pipeline per candidate, report accuracy and
# weighted F1 on the held-out split, and remember the pipeline with the
# highest weighted F1.
#
# Each pipeline is built from *clones* of the shared `tfidf` and of the
# estimator in `models`. Pipeline does not copy its steps, so without cloning
# every candidate would re-fit the same vectorizer object that the stored
# "best" pipeline still points to.
#
# NOTE: the best model is selected on the same split whose score is reported,
# so the reported best score is an optimistic estimate.
best_model_name = None
best_model_pipeline = None
best_f1 = float("-inf")  # any real score beats this, even an F1 of 0.0
results_summary = []

for name, model in models.items():
    print(f"\n==============================\nTraining {name}\n==============================")
    clf = Pipeline([
        ("tfidf", clone(tfidf)),
        ("model", clone(model)),
    ])

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    print(f"Accuracy: {acc:.4f}")
    print(f"Weighted F1-Score: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_test, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Plain floats keep the summary JSON-serialisable regardless of the
    # numeric type returned by the metric functions.
    results_summary.append({
        "model": name,
        "accuracy": float(acc),
        "weighted_f1": float(f1)
    })

    # Strict ">" keeps the earliest model when two candidates tie.
    if f1 > best_f1:
        best_f1 = f1
        best_model_name = name
        best_model_pipeline = clf


Training RandomForest
Accuracy: 0.9565
Weighted F1-Score: 0.9567

Classification Report:
               precision    recall  f1-score   support

          NO     0.9122    0.9916    0.9503       954
         YES     0.9935    0.9311    0.9613      1320

    accuracy                         0.9565      2274
   macro avg     0.9529    0.9613    0.9558      2274
weighted avg     0.9594    0.9565    0.9567      2274

Confusion Matrix:
 [[ 946    8]
 [  91 1229]]

Training LogisticRegression
Accuracy: 0.9639
Weighted F1-Score: 0.9640

Classification Report:
               precision    recall  f1-score   support

          NO     0.9467    0.9686    0.9575       954
         YES     0.9769    0.9606    0.9687      1320

    accuracy                         0.9639      2274
   macro avg     0.9618    0.9646    0.9631      2274
weighted avg     0.9642    0.9639    0.9640      2274

Confusion Matrix:
 [[ 924   30]
 [  52 1268]]

Training LinearSVM




Accuracy: 0.9732
Weighted F1-Score: 0.9732

Classification Report:
               precision    recall  f1-score   support

          NO     0.9646    0.9717    0.9681       954
         YES     0.9794    0.9742    0.9768      1320

    accuracy                         0.9732      2274
   macro avg     0.9720    0.9730    0.9725      2274
weighted avg     0.9732    0.9732    0.9732      2274

Confusion Matrix:
 [[ 927   27]
 [  34 1286]]

Training RBFSVM
Accuracy: 0.9727
Weighted F1-Score: 0.9727

Classification Report:
               precision    recall  f1-score   support

          NO     0.9685    0.9665    0.9675       954
         YES     0.9758    0.9773    0.9765      1320

    accuracy                         0.9727      2274
   macro avg     0.9721    0.9719    0.9720      2274
weighted avg     0.9727    0.9727    0.9727      2274

Confusion Matrix:
 [[ 922   32]
 [  30 1290]]


In [28]:
# Display a reproducible random sample of the terms learned by the TF-IDF
# step of the best pipeline.
tfidf_features = best_model_pipeline.named_steps["tfidf"].get_feature_names_out()

# A dedicated seeded generator makes the sample identical on every run.
token_rng = np.random.default_rng(42)
# Sampling without replacement fails when asked for more items than exist,
# so cap the sample size at the vocabulary size.
n_examples = min(20, len(tfidf_features))

print("Example tokens extracted by TF-IDF:")
print(token_rng.choice(tfidf_features, size=n_examples, replace=False))

Example tokens extracted by TF-IDF:
['prime' 'mexican' 'executives' 'one on' 'itna' 'un ka' 'meri'
 'inspired by' 'push' 'struggle' 'politics kiya' 'eyes' 'ek hai' 'never'
 'dharmik' 'concert' 'oral' 'fat' 'inki' 'time when']


In [29]:
# =======================================================
# 7. Save Best Model and Results
# =======================================================
# Report the winning model, then persist the fitted pipeline and the
# per-model metrics to the current working directory.
summary_lines = [
    "\n========================================",
    f"Best Model: {best_model_name}",
    f"Best Weighted F1: {best_f1:.4f}",
    "========================================\n",
]
print("\n".join(summary_lines))

# Fitted pipeline (vectorizer + classifier) as a single artefact.
joblib.dump(best_model_pipeline, "best_baseline_model.pkl")

# Accuracy / weighted F1 of every candidate, as indented JSON.
with open("results_summary.json", "w") as metrics_file:
    metrics_file.write(json.dumps(results_summary, indent=2))


print("\nModel and metrics saved successfully.\n")


Best Model: LinearSVM
Best Weighted F1: 0.9732


Model and metrics saved successfully.



In [30]:
# =======================================================
# 8. Evaluate on External Test Set
# =======================================================
# Score the best pipeline on a separate CSV. If the file has a label column
# the usual metrics are printed; either way the predictions are written out.
#
# The test CSV location can be overridden with the SARCASM_TEST_CSV
# environment variable; the default is the original author's path.
test_path = os.environ.get(
    "SARCASM_TEST_CSV",
    r"C:\MAIN\Projects\Sarcasm Detection\Dataset\test.csv",
)
df_test = pd.read_csv(test_path)

print("===== External Test Data =====")
print("Shape:", df_test.shape)
print("Columns:", df_test.columns.tolist())

# Detect text and label columns (same name-based rule as for the training
# data, except that a missing label column is allowed here).
possible_text_cols = [col for col in df_test.columns if "tweet" in col.lower() or "text" in col.lower()]
possible_label_cols = [col for col in df_test.columns if "label" in col.lower()]
text_col = possible_text_cols[0] if possible_text_cols else df_test.columns[1]
label_col = possible_label_cols[0] if possible_label_cols else None

# Take an explicit copy so the column assignments below modify an independent
# frame rather than a slice of the loaded one.
if label_col is not None:
    df_test = df_test[[text_col, label_col]].copy()
    df_test.columns = ["text", "label"]
else:
    df_test = df_test[[text_col]].copy()
    df_test.columns = ["text"]

# Clean test data with the same function used for the training data
df_test["text"] = df_test["text"].apply(clean_text)

# Leakage diagnostic: count test texts that also occur in the training split.
# A large overlap would make the scores below an over-estimate of real
# generalisation.
n_seen_in_training = int(df_test["text"].isin(set(X_train)).sum())
print(f"\nTest texts also present in the training split: {n_seen_in_training} of {len(df_test)}")

# Predictions
y_pred = best_model_pipeline.predict(df_test["text"])
has_labels = "label" in df_test.columns

if has_labels:
    y_true = df_test["label"]
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")

    print("\n===== Test Set Evaluation =====")
    print(f"Accuracy: {acc:.4f}")
    print(f"Weighted F1: {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# Attach the predictions once, then write to the file matching the case.
df_test["predicted_label"] = y_pred
if has_labels:
    df_test.to_csv("test_predictions_with_truth.csv", index=False, encoding="utf-8")
    print("Predictions saved to test_predictions_with_truth.csv")
else:
    df_test.to_csv("test_predictions.csv", index=False, encoding="utf-8")
    print("Predictions saved to test_predictions.csv (no true labels).")


===== External Test Data =====
Shape: (2109, 3)
Columns: ['ID', 'Tweet', 'Label']

===== Test Set Evaluation =====
Accuracy: 0.9919
Weighted F1: 0.9919

Classification Report:
               precision    recall  f1-score   support

          NO     0.9887    0.9873    0.9880       706
         YES     0.9936    0.9943    0.9939      1403

    accuracy                         0.9919      2109
   macro avg     0.9911    0.9908    0.9909      2109
weighted avg     0.9919    0.9919    0.9919      2109

Confusion Matrix:
 [[ 697    9]
 [   8 1395]]
Predictions saved to test_predictions_with_truth.csv
