# Sarcasm Detector

## Get and Load Data

In [2]:
import contractions
from bs4 import BeautifulSoup
import re
import unicodedata
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
import nltk

# Fetch the NLTK resources this notebook requests: the 'punkt' tokenizer data,
# the 'stopwords' corpus and the 'averaged_perceptron_tagger' tagger.
# NOTE(review): none of these resources is referenced in the cells visible
# here - confirm they are still needed further down.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [3]:
# Load the dataset; lines=True tells pandas the file holds one JSON record per line.
df = pd.read_json('../data/processed/SarcasmDetect.json', lines=True)
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


## Remove all records with no headline text. Clean and split data

In [4]:
# Keep only rows that really carry headline text.
# A plain `!= ''` comparison lets missing values (NaN/None) and whitespace-only
# strings through, and those would later reach the HTML-stripping step, which
# expects a string. Treat them as "no headline" too.
df = df[
    df['headline'].notna()
    & df['headline'].astype(str).str.strip().ne('')
]
# Summarise the remaining rows, columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


In [5]:
# Seed value intended for steps that accept a `random_state` argument.
random_state=42 # for reproducibility

In [6]:
# Class balance of the target: number of rows per `is_sarcastic` value.
df['is_sarcastic'].value_counts() # see data distribution

is_sarcastic
0    14985
1    13634
Name: count, dtype: int64

In [7]:
# Split the data.
# Stratified 70/30 split: stratifying on the label keeps the class ratio the
# same in both partitions. The label and the article link are dropped from the
# features. The seed comes from the `random_state` variable defined earlier in
# the notebook instead of a repeated literal, so it only has to be changed in
# one place.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['is_sarcastic', 'article_link']),
    df['is_sarcastic'],
    test_size=0.3,
    random_state=random_state,
    stratify=df['is_sarcastic'],
)
X_train.shape, X_test.shape

((20033, 1), (8586, 1))

In [8]:
from collections import Counter
# Label counts in each partition, to confirm both keep a similar class ratio.
Counter(y_train), Counter(y_test) #check split is correct

(Counter({0: 10489, 1: 9544}), Counter({0: 4496, 1: 4090}))

In [9]:
# Preview the training features (original row indices are preserved by the split).
X_train.head()

Unnamed: 0,headline
24819,the most dangerous beaches for shark attacks i...
11682,frustrated man doesn't know what else he can d...
2096,americans more polarized than at any time in l...
5230,megachurch threatened by new ultrachurch
7238,dog waited in this spot for a month for her fa...


In [10]:
# Show one raw headline in full, since the table preview truncates the text.
X_train["headline"].iloc[0]

'the most dangerous beaches for shark attacks in the u.s.'

In [11]:
# Display the training labels; their index matches the rows of X_train.
y_train

24819    0
11682    1
2096     0
5230     1
7238     0
        ..
5671     1
1230     1
17862    1
26039    1
11202    1
Name: is_sarcastic, Length: 20033, dtype: int64

In [16]:
# Auxiliary text-cleaning functions
def strip_html_tags(text):
    """Return ``text`` with HTML markup removed and line breaks normalised.

    The input is parsed with BeautifulSoup's ``html.parser``. ``<iframe>`` and
    ``<script>`` elements are removed together with their contents, the
    remaining text is extracted, and every run of carriage-return / line-feed
    characters is collapsed into a single line feed.

    Parameters
    ----------
    text : str
        Raw text that may contain HTML markup.

    Returns
    -------
    str
        The text content without tags.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Plain loop: the elements are removed for the side effect only, so there
    # is no reason to build a throw-away list.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Inside a character class '|' is a literal, not an alternation, so the
    # old pattern [\r|\n|\r\n]+ also turned every pipe character in the text
    # into a newline. Match only CR / LF.
    stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    return stripped_text

def remove_accented_chars(text):
    """Reduce ``text`` to the characters that survive an ASCII round-trip.

    The string is decomposed with Unicode NFKD normalisation (which separates
    base letters from their combining marks), encoded to ASCII while dropping
    every code point that cannot be encoded, and decoded back to ``str``.

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        ``text`` with accents and other non-ASCII characters removed.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
    """Normalise every document in ``docs`` and return the results as a list.

    Each document goes through these steps, in order:

    1. HTML markup is removed via ``strip_html_tags``.
    2. Newlines, tabs and carriage returns are replaced by spaces.
    3. The text is lower-cased.
    4. Accented / non-ASCII characters are removed via ``remove_accented_chars``.
    5. Contractions are expanded with ``contractions.fix``.
    6. Every character that is not an ASCII letter, digit or whitespace is
       replaced by a space.
    7. Runs of spaces are collapsed and the ends are trimmed.

    Parameters
    ----------
    docs : iterable of str
        Raw documents.

    Returns
    -------
    list of str
        The normalised documents, in input order.
    """
    whitespace_table = str.maketrans("\n\t\r", "   ")

    def _normalise(raw):
        # Steps 1-3: strip markup, flatten line breaks/tabs, lower-case.
        text = strip_html_tags(raw).translate(whitespace_table).lower()
        # Steps 4-5: ASCII-fold, then expand contractions.
        text = contractions.fix(remove_accented_chars(text))
        # Step 6: blank out punctuation and other special characters.
        text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text, flags=re.I|re.A)
        # Step 7: squeeze repeated spaces and trim.
        return re.sub(' +', ' ', text).strip()

    return [_normalise(doc) for doc in docs]

In [17]:
# Clean the data

# Run the same normalisation over the training and the test headlines so both
# partitions are cleaned identically.
norm_train_texts = pre_process_corpus(X_train['headline'].values)
norm_test_texts = pre_process_corpus(X_test['headline'].values)

  soup = BeautifulSoup(text, "html.parser")


## Build a baseline logistic regression model

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts (binary=False keeps counts rather than 0/1 flags).
# min_df=2 ignores terms that occur in fewer than two documents; max_df=1.0
# applies no upper document-frequency cut-off.
cv = CountVectorizer(binary=False, min_df=2, max_df=1.0)

# Learn the vocabulary from the training texts only, then reuse it to encode
# the test texts so both matrices share the same columns.
cv_train_features = cv.fit_transform(norm_train_texts)
cv_test_features = cv.transform(norm_test_texts)
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)

BOW model:> Train features shape: (20033, 12246)  Test features shape: (8586, 12246)


In [19]:
# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model: L2-regularised logistic regression (C=1) fitted with
# lbfgs, allowing up to 500 iterations. The seed is taken from the
# `random_state` variable defined earlier in the notebook rather than a
# second hard-coded 42, so the notebook has a single seed to change.
lr = LogisticRegression(
    penalty='l2',
    max_iter=500,
    C=1,
    solver='lbfgs',
    random_state=random_state,
)

# train model
lr.fit(cv_train_features, y_train)

# predict on test data
lr_bow_predictions = lr.predict(cv_test_features)

In [20]:
# Test model on test data
# Print per-class precision / recall / F1, then display the confusion matrix
# (scikit-learn convention: rows are true labels, columns are predicted labels).
print(classification_report(y_test, lr_bow_predictions))
pd.DataFrame(confusion_matrix(y_test, lr_bow_predictions))


              precision    recall  f1-score   support

           0       0.84      0.86      0.85      4496
           1       0.84      0.82      0.83      4090

    accuracy                           0.84      8586
   macro avg       0.84      0.84      0.84      8586
weighted avg       0.84      0.84      0.84      8586



Unnamed: 0,0,1
0,3845,651
1,716,3374
