In [1]:
import pandas as pd
import json
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
with open('sarcasm_data.json') as f:
    data = json.load(f)

In [4]:
# Convert JSON object to list
data_list = []
N = 3  # Number of previous messages to keep as context

for key, value in data.items():
    # Combine last N messages with the main utterance
    context_messages = " ".join(value["context"][-N:])  # Take last N messages
    full_text = context_messages + " " + value["utterance"]  # Merge context + utterance

    data_list.append({
        "text": full_text,  # Use concatenated text
        "label": 1 if value["sarcasm"] else 0  # Convert sarcasm True/False to 1/0
    })

In [5]:
df = pd.DataFrame(data_list)

In [6]:
# Initialize preprocessing tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(f"[{string.punctuation}]", "", text)
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

In [7]:
import nltk

In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
nltk.download('omw-1.4')  # Optional, for WordNet Lemmatizer

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:

# Check if punkt is correctly installed
nltk.data.find('tokenizers/punkt')

FileSystemPathPointer('/root/nltk_data/tokenizers/punkt')

In [14]:
import nltk
print(nltk.data.find('tokenizers/punkt'))


/root/nltk_data/tokenizers/punkt


In [15]:
import nltk
nltk.data.path.append('/root/nltk_data/tokenizers/punkt')


In [17]:
print(df.columns)


Index(['text', 'label'], dtype='object')


In [19]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [20]:
df["clean_text"] = df["text"].apply(preprocess_text)  # Apply preprocessing
print(df.head())  # Verify that "clean_text" now exists


                                                text  label  \
0  I never would have identified the fingerprints...      1   
1  Anyway, if you had your own game character we ...      1   
2  Here we go. Pad thai, no peanuts. But does it ...      0   
3  You realize that scene was rife with scientifi...      0   
4  The Mandelbrot set of complex numbers is a lit...      1   

                                          clean_text  
0  never would identified fingerprint string theo...  
1  anyway game character could hang maybe go ques...  
2  go pad thai peanut peanut oil im sure everyone...  
3  realize scene rife scientific inaccuracy yes k...  
4  mandelbrot set complex number little messy cha...  


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 words

# Fit and transform the cleaned text
X = vectorizer.fit_transform(df["clean_text"])

# Labels (target variable)
y = df["label"]  # Assuming 'label' is 1 for sarcasm, 0 for non-sarcasm

# Print shape of TF-IDF matrix
print("TF-IDF Matrix Shape:", X.shape)


TF-IDF Matrix Shape: (690, 3370)


In [22]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [23]:
# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Initialize and train SVM model
svm_model = SVC(kernel='linear')  # Linear kernel works well for text classification
svm_model.fit(X_train, y_train)


In [25]:
# Make predictions
y_pred = svm_model.predict(X_test)

In [26]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

SVM Accuracy: 0.644927536231884
Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.77      0.63        53
           1       0.80      0.56      0.66        85

    accuracy                           0.64       138
   macro avg       0.66      0.67      0.64       138
weighted avg       0.69      0.64      0.65       138



In [27]:
df['label'].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,345
0,345


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure the split happens BEFORE TF-IDF transformation
X = df["clean_text"]  # This is your text column
y = df["label"]  # Labels

# Split dataset
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now apply TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Check shape
print("TF-IDF Train Shape:", X_train_tfidf.shape)
print("TF-IDF Test Shape:", X_test_tfidf.shape)


TF-IDF Train Shape: (552, 2929)
TF-IDF Test Shape: (138, 2929)


In [33]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Train Linear SVM
svm_linear = SVC(kernel="linear", C=1.0)
svm_linear.fit(X_train_tfidf, y_train)

# Predict using Linear SVM
y_pred_linear = svm_linear.predict(X_test_tfidf)

# Evaluate performance
print("SVM (Linear) Accuracy:", accuracy_score(y_test, y_pred_linear))
print("Classification Report:\n", classification_report(y_test, y_pred_linear))


SVM (Linear) Accuracy: 0.6304347826086957
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.79      0.62        53
           1       0.80      0.53      0.64        85

    accuracy                           0.63       138
   macro avg       0.66      0.66      0.63       138
weighted avg       0.69      0.63      0.63       138



In [34]:
# Train SVM with RBF Kernel
svm_rbf = SVC(kernel="rbf", C=1.0, gamma='scale')
svm_rbf.fit(X_train_tfidf, y_train)

# Predict using RBF SVM
y_pred_rbf = svm_rbf.predict(X_test_tfidf)

# Evaluate performance
print("SVM (RBF) Accuracy:", accuracy_score(y_test, y_pred_rbf))
print("Classification Report:\n", classification_report(y_test, y_pred_rbf))


SVM (RBF) Accuracy: 0.6159420289855072
Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.89      0.64        53
           1       0.86      0.45      0.59        85

    accuracy                           0.62       138
   macro avg       0.68      0.67      0.61       138
weighted avg       0.72      0.62      0.61       138



In [38]:
print(preprocess_text("Test sentence"))

test sentence


In [39]:
print(tfidf.transform(["Test sentence"]))

  (0, 2563)	1.0


In [40]:
print(svm_linear.predict(tfidf.transform(["Test sentence"])))

[1]


In [41]:
def predict_sarcasm(text, model, vectorizer):
    text_cleaned = preprocess_text(text)  # Preprocess the input text
    text_tfidf = vectorizer.transform([text_cleaned])  # Convert to TF-IDF
    prediction = model.predict(text_tfidf)  # Predict with trained model
    return "Sarcastic 😏" if prediction[0] == 1 else "Not Sarcastic 🙂"

# Example usage
while True:
    user_input = input("Enter a sentence (or type 'exit' to quit): ")
    if user_input.lower() == "exit":
        break
    print("Prediction:", predict_sarcasm(user_input, svm_linear, tfidf))


Enter a sentence (or type 'exit' to quit): Oh great, another Monday!
Prediction: Not Sarcastic 🙂
Enter a sentence (or type 'exit' to quit): wow
Prediction: Sarcastic 😏
Enter a sentence (or type 'exit' to quit): I love spending time with my friends
Prediction: Not Sarcastic 🙂
Enter a sentence (or type 'exit' to quit): exit
