# Sarcasm Detection

## Import libraries

In [1]:
# To store data
import pandas as pd

# To do linear algebra
import numpy as np

# To plot graphs
import matplotlib.pyplot as plt
from matplotlib.cm import get_cmap
from matplotlib.colors import rgb2hex

# To create nicer graphs
import seaborn as sns

# To create interactive graphs
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# To vectorize texts
from sklearn.feature_extraction.text import CountVectorizer
# To decompose texts
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import PCA
from sklearn.decomposition import SparsePCA
# To visualize high dimensional dataset
from sklearn.manifold import TSNE

# To tag words
from textblob import TextBlob

# To use new datatypes
from collections import Counter

# To filter out stop words (the NLTK 'stopwords' corpus must be available locally)
from nltk.corpus import stopwords
# English stop-word list; passed to CountVectorizer(stop_words=...) when the headlines are vectorized
stop = stopwords.words('english')

## Load the data

In [2]:
# The dataset is stored as JSON Lines (one record per line), hence lines=True
data = pd.read_json(
    'Sarcasm_Headlines_Dataset.json',
    lines=True,
)
# Preview the first ten records
data.head(n=10)

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
5,https://www.huffingtonpost.com/entry/advancing...,advancing the world's women,0
6,https://www.huffingtonpost.com/entry/how-meat-...,the fascinating case for eating lab-grown meat,0
7,https://www.huffingtonpost.com/entry/boxed-col...,"this ceo will send your kids to school, if you...",0
8,https://politics.theonion.com/top-snake-handle...,top snake handler leaves sinking huckabee camp...,1
9,https://www.huffingtonpost.com/entry/fridays-m...,friday's morning email: inside trump's presser...,0


## Vectorize headlines

In [30]:
# pickle is needed below; importing it here keeps the cell runnable on a fresh kernel
import pickle

# Create vectorizer; words in the `stop` list are excluded from the vocabulary
countVectorizer = CountVectorizer(stop_words=stop)

# Vectorize text: apostrophes are removed from every headline before the
# bag-of-words matrix is built (presumably so contractions/possessives are not
# split into separate tokens -- TODO confirm intent)
vectorizedText = countVectorizer.fit_transform(data['headline'].str.replace("'", '').values)

# Persist the fitted vectorizer so the same vocabulary can be reloaded later
with open('counter.pickle', 'wb') as f:
    # Pickle the fitted CountVectorizer using the highest protocol available.
    pickle.dump(countVectorizer, f, pickle.HIGHEST_PROTOCOL)
print('Shape Vectorized Text: {}'.format(vectorizedText.shape))

Shape Vectorized Text: (26709, 25957)


In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(countVectorizer, 'get_feature_names_out'):
    feature_names = countVectorizer.get_feature_names_out()
else:
    feature_names = countVectorizer.get_feature_names()

# NOTE: toarray() materialises the whole sparse matrix as a dense array, which
# is memory-hungry for a vocabulary of this size.
vec = pd.DataFrame(vectorizedText.toarray(), columns=feature_names)
vec.head()

Unnamed: 0,00,000,00000000001,00003,000th,025,03,047,071,10,...,zoos,zoroastrianism,zs,zsa,zucker,zuckerberg,zuckerbergs,zz,éclairs,ünited
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# vec['is_sarcastic'] = data.is_sarcastic

In [21]:
# Alias for the dense bag-of-words frame: NLP and vec refer to the same DataFrame (no copy is made)
NLP = vec


## Train/test split

In [22]:
# Features: the bag-of-words frame; target: the 0/1 sarcasm flag of each headline
X, y = NLP, data['is_sarcastic']

In [23]:
# Sanity check: the feature matrix and the label vector must have the same number of rows
(X.shape, y.shape)

((26709, 25957), (26709,))

In [24]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# Hold out half of the headlines for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

In [25]:
# Null accuracy: the score obtained by always predicting the majority class.
# For a binary target coded as 0/1 the mean is the share of 1s in the test labels.
positive_share = y_test.mean()
max(positive_share, 1 - positive_share)

0.5592661924372895

## DecisionTreeClassifier

In [13]:
from sklearn.tree import DecisionTreeClassifier
# instantiate the model; `criterion` is passed by keyword because recent
# scikit-learn releases reject positional estimator parameters
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
# fit the model to the data
clf.fit(X_train, y_train)
# mean accuracy on the held-out split
clf.score(X_test, y_test)

0.7191314114563834

In [None]:
# instantiate the model with the default split criterion; seeded so the score
# is reproducible across runs, like the other models in this notebook.
# NOTE: this rebinds `clf`, replacing the entropy tree fitted in the previous cell.
clf = DecisionTreeClassifier(random_state=42)
# fit the model to the data
clf.fit(X_train, y_train)
# mean accuracy on the held-out split
clf.score(X_test, y_test)

## LogisticRegression

In [26]:
# import the class
from sklearn.linear_model import LogisticRegression
# Disabled experiment, kept for reference: an L1-penalised model with C = 0.001,
# scored on the test split and serialised with pickle.dumps.
# # instantiate the model
# logreg = LogisticRegression(C =  0.001, penalty = 'l1')
# # fit the model to the data
# logreg.fit(X_train,y_train)
# logreg.score(X_test,y_test)
# pickle.dumps(logreg)

In [16]:
# Smoke test: classify a single hand-written headline.
# The apostrophe stripping mirrors the preprocessing applied before fitting the vectorizer.
# NOTE(review): `logreg` is only assigned in the cell below, so this cell fails
# on Restart & Run All unless that cell is executed first.
test_text = countVectorizer.transform(
    ['hello'.replace("'", '')]
)

logreg.predict(test_text)

array([0])

In [27]:
import pickle

# instantiate the model
logreg = LogisticRegression()
# fit the model to the data
logreg.fit(X_train, y_train)

# Persist the fitted classifier
with open('data.pickle', 'wb') as f:
    # Pickle the fitted LogisticRegression model using the highest protocol available.
    pickle.dump(logreg, f, pickle.HIGHEST_PROTOCOL)

# Evaluate last: only the final expression of a cell is rendered as its output,
# so the test accuracy is now actually displayed.
logreg.score(X_test, y_test)

## RandomForestClassifier

In [None]:
# Random forest lives in scikit-learn's ensemble package
from sklearn.ensemble import RandomForestClassifier

# Build the forest (all CPU cores, fixed seed) and train it on the training split
model = RandomForestClassifier(
    n_jobs=-1,
    random_state=123,
)
model.fit(X_train, y_train)

# Mean accuracy on the held-out split
model.score(X_test, y_test)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Hyper-parameter grid: 29 neighbour counts x 2 weightings x 2 metrics = 116
# candidates, each fitted 5 times by the cross-validation below -- expect a long runtime.
knn_parameters = {
    'n_neighbors': range(1, 30),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
# instantiate the grid search (5-fold CV, all CPU cores)
knn_gridsearcher = GridSearchCV(KNeighborsClassifier(), knn_parameters, cv=5, verbose=1, n_jobs=-1)
# fit the model to the data
knn_gridsearcher.fit(X_train, y_train)
# A bare expression in the middle of a cell is not displayed, so report the
# winning combination explicitly.
print('Best parameters:', knn_gridsearcher.best_params_)
# accuracy on the held-out split
knn_gridsearcher.score(X_test, y_test)

In [None]:
# `metrics` was never imported anywhere in the notebook; import it here so the cell runs
from sklearn import metrics

# Prediction with `model` (the estimator fitted in the RandomForestClassifier section)
prediction = model.predict(X_test)
# Evaluation (accuracy); scikit-learn metrics take (y_true, y_pred) in that order
print("Accuracy:", metrics.accuracy_score(y_test, prediction))
# Evaluation (confusion matrix); with y_true first, rows are the actual classes
# and columns the predicted ones -- the swapped order transposed the matrix
print("Confusion Metrix:\n", metrics.confusion_matrix(y_test, prediction))