# Data Loading

In [1]:
import pandas as pd
# Reading JSON file with lines=True
df = pd.read_csv("Sarcasm.csv")
# Display the dataframe
df = df[['tweet','sarcastic']]
df.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [2]:
df.dropna(inplace=True)


# Data Cleaning

In [3]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from contractions import fix  # To handle contractions like don't -> do not

nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def clean_text_advanced(text):
    # Expand contractions
    text = fix(text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords and lemmatize
    cleaned_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Join tokens back to a single string
    cleaned_text = ' '.join(cleaned_tokens)
    
    # Remove excessive whitespaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    
    return cleaned_text

df['tweet'] = df['tweet'].astype(str).apply(clean_text_advanced)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
df

Unnamed: 0,tweet,sarcastic
0,thing got college caffeine addiction,1
1,love professor draw big question mark next ans...,1
2,remember hundred email company covid started g...,1
3,today poppop told “ forced ” go college 🙃 okay...,1
4,volphancarol littlewhitty mysticalmanatee also...,1
...,...,...
3463,population spike chicago 9 month ridiculous,0
3464,would think second last english class year pro...,0
3465,finally surfacing holiday scotland difficult d...,0
3466,could prouder today well done every student go...,0


# Data Balancing

In [5]:
df['sarcastic'].value_counts()

sarcastic
0    2600
1     867
Name: count, dtype: int64

In [7]:
from imblearn.over_sampling import RandomOverSampler
# Splitting the data into features and labels
X = df['tweet'].values.reshape(-1, 1)  # Reshaping for the oversampler
y = df['sarcastic']

# Applying Random Oversampling
oversampler = RandomOverSampler(random_state=42)
X_balanced, y_balanced = oversampler.fit_resample(X, y)

# Creating a balanced DataFrame
df = pd.DataFrame({'tweet': X_balanced.flatten(), 'sarcastic': y_balanced})

# Check the new class distribution
print("\nBalanced Class Distribution:\n", df['sarcastic'].value_counts())


Balanced Class Distribution:
 sarcastic
1    2600
0    2600
Name: count, dtype: int64


# Vectorization and train test split

In [8]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split



tfidf = TfidfVectorizer() 
X = tfidf.fit_transform(df['tweet'])

#  Define target variable
y = df['sarcastic']




#  Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(with_mean=False)  # `with_mean=False` due to sparse matrix
X_train = scaler.fit_transform(X_train)
X_test  = scaler.transform(X_test)


# Model Building and evaluation

In [10]:
# 3. Train SVM Model
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

model = SVC(kernel='linear')  # Linear kernel is common for text classification
model.fit(X_train, y_train)

# 4. Predictions
y_pred = model.predict(X_test)

# 5. Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Display results
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 88.56%
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.85      0.88       493
           1       0.87      0.91      0.89       547

    accuracy                           0.89      1040
   macro avg       0.89      0.88      0.88      1040
weighted avg       0.89      0.89      0.89      1040

Confusion Matrix:
 [[421  72]
 [ 47 500]]


In [11]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print("Classification Report (Random Forest):\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix (Random Forest):\n", confusion_matrix(y_test, y_pred_rf))

Random Forest Accuracy: 90.58%
Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.88      0.92      0.90       493
           1       0.93      0.89      0.91       547

    accuracy                           0.91      1040
   macro avg       0.91      0.91      0.91      1040
weighted avg       0.91      0.91      0.91      1040

Confusion Matrix (Random Forest):
 [[454  39]
 [ 59 488]]


# Sarcasm Detection System Function

In [12]:
def detect_sarcasm(new_headline):
    cleaned_headline = clean_text_advanced(new_headline)  
    transformed_headline = tfidf.transform([cleaned_headline])
    prediction = model.predict(transformed_headline)
    
    if prediction == 1:
        return "Sarcastic"
    else:
        return "Not Sarcastic"

In [13]:
# Example usage of detection system
test_headline = "Oh great, another Monday! I just love waking up early after the weekend."

print(f"Prediction for comment: '{test_headline}' -> \n {detect_sarcasm(test_headline)}")


Prediction for comment: 'Oh great, another Monday! I just love waking up early after the weekend.' -> 
 Not Sarcastic


In [14]:
# Example usage of detection system
test_headline = "Oh, how thoughtful! I really needed someone to explain the obvious to me."
print(f"Prediction for comment: '{test_headline}' -> \n {detect_sarcasm(test_headline)}")

Prediction for comment: 'Oh, how thoughtful! I really needed someone to explain the obvious to me.' -> 
 Not Sarcastic


In [15]:
import pickle

pickle.dump(model,open("model.pkl",'wb'))
pickle.dump(tfidf,open("tfidf.pkl",'wb'))