In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score


In [2]:
pd.set_option('max_colwidth', 100)

In [3]:
df =  pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
df.rename(columns={'headline': 'text'}, inplace=True)
df.head()


Unnamed: 0,article_link,text,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5,former versace store clerk sues over secret 'black code' for minority shoppers,0
1,https://www.huffingtonpost.com/entry/roseanne-revival-review_us_5ab3a497e4b054d118e04365,"the 'roseanne' revival catches up to our thorny political mood, for better and worse",0
2,https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697,mom starting to fear son's web series closest thing she will have to grandchild,1
3,https://politics.theonion.com/boehner-just-wants-wife-to-listen-not-come-up-with-alt-1819574302,"boehner just wants wife to listen, not come up with alternative debt-reduction ideas",1
4,https://www.huffingtonpost.com/entry/jk-rowling-wishes-snape-happy-birthday_us_569117c4e4b0cad15...,j.k. rowling wishes snape happy birthday in the most magical way,0


In [4]:
df.shape

(26709, 3)

In [5]:
df.is_sarcastic.value_counts()

0    14985
1    11724
Name: is_sarcastic, dtype: int64

In [6]:
df_serious = df[df['is_sarcastic']==0].head(10)
df_serious[['text']]


Unnamed: 0,text
0,former versace store clerk sues over secret 'black code' for minority shoppers
1,"the 'roseanne' revival catches up to our thorny political mood, for better and worse"
4,j.k. rowling wishes snape happy birthday in the most magical way
5,advancing the world's women
6,the fascinating case for eating lab-grown meat
7,"this ceo will send your kids to school, if you work for his company"
9,friday's morning email: inside trump's presser for the ages
10,airline passengers tackle man who rushes cockpit in bomb threat
11,facebook reportedly working on healthcare features and apps
12,north korea praises trump and urges us voters to reject 'dull hillary'


In [7]:
df_sarcastic = df[df['is_sarcastic']==1].head(10)
df_sarcastic[['text']]

Unnamed: 0,text
2,mom starting to fear son's web series closest thing she will have to grandchild
3,"boehner just wants wife to listen, not come up with alternative debt-reduction ideas"
8,top snake handler leaves sinking huckabee campaign
15,nuclear bomb detonates during rehearsal for 'spider-man' musical
16,cosby lawyer asks why accusers didn't come forward to be smeared by legal team years ago
17,"stock analysts confused, frightened by boar market"
20,courtroom sketch artist has clear manga influences
21,trump assures nation that decision for syrian airstrikes came after carefully considering all hi...
27,ex-con back behind bar
28,"after careful consideration, bush recommends oil drilling"


In [8]:
df['text'] = df['text'].replace('!', ' exclamation ')
df['text'] = df['text'].replace('?', ' question ')
df['text'] = df['text'].replace('\'', ' quotation ')
df['text'] = df['text'].replace('\"', ' quotation ')

In [9]:
count_vec = CountVectorizer()
bow = count_vec.fit_transform(df['text'])
bow = np.array(bow.todense())

In [10]:
X = bow
y = df['is_sarcastic']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    stratify=y)

In [12]:
model = MultinomialNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [13]:
print('Accuracy:', accuracy_score(y_test, y_pred))
print('F1 score:', f1_score(y_test, y_pred, average="macro"))
print('ROC AUC:', roc_auc_score(y_test, y_pred))

Accuracy: 0.8451266691626108
F1 score: 0.841376961635799
ROC AUC: 0.8388638129795593


In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.88      0.87      4496
           1       0.84      0.80      0.82      3517

    accuracy                           0.85      8013
   macro avg       0.85      0.84      0.84      8013
weighted avg       0.85      0.85      0.85      8013



In [15]:
lr_model = LogisticRegression(max_iter=1000)

In [16]:
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

In [19]:
print('Accuracy:', accuracy_score(y_test, y_pred_lr))
print('F1 score:', f1_score(y_test, y_pred_lr, average="macro"))
print('ROC AUC:', roc_auc_score(y_test, y_pred_lr))

Accuracy: 0.8375140396855111
F1 score: 0.8341753211455383
ROC AUC: 0.8326062682830825


In [18]:
results = pd.DataFrame(data={'predicted': y_pred, 'actual': y_test})
predictions = results.join(df)

In [19]:
def is_correct(predicted, actual):
    if predicted == actual:
        return True
    else:
        return False

predictions['correct'] = predictions.apply(lambda x: is_correct(x.predicted, x.actual), axis=1)
predictions = predictions[['text','predicted','actual','correct']]

In [20]:
predictions[predictions['correct']==True].sample(100)


Unnamed: 0,text,predicted,actual,correct
18049,10 habits that make you look older,0,0,True
17969,rihanna throws support behind hillary clinton with perfect throwback tee,0,0,True
14899,on labor day: the tale of generational struggle for middle class wages,0,0,True
26370,if hamster only knew what happened to last hamster,1,1,True
22357,14 truths about being an asexual person,0,0,True
...,...,...,...,...
9027,woman quickly cycles through non-threatening voice inflections before expressing concern,1,1,True
23253,first-time carjacker wasn't expecting a stick shift,1,1,True
8513,"brandon grant, vice president of impulse group, shares what pride means to him",0,0,True
9563,terrified fda warns something making bananas black after several days,1,1,True
