In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import re,string,unicodedata
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import accuracy_score

from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import BaggingClassifier

In [2]:
#Importing data
SH1=pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
SH2=pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
data = pd.concat([SH1, SH2]).reset_index(drop = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [3]:
#Displaying data columns
data.columns

Index(['article_link', 'headline', 'is_sarcastic'], dtype='object')

In [4]:
#Displaying shape 
print(data.shape)

(55328, 3)


In [5]:
#Peek at data
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [6]:
#Checking distribution
data['is_sarcastic'].value_counts()

0    29970
1    25358
Name: is_sarcastic, dtype: int64

In [7]:
#Removing stop words
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stop:
            final_text.append(i.strip())
    return " ".join(final_text)

data['headline'] = data['headline'].apply(remove_stopwords)

In [8]:
X=data['headline']
y=data['is_sarcastic']

cv=CountVectorizer()
X=cv.fit_transform(X)

In [9]:
#Splitting the data into training and testing datasets
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=101)

In [10]:
#Defining Naive Bayes model for training
model = MultinomialNB()
model.fit(X_train,y_train)
predictions=model.predict(X_test)

In [11]:
precision = metrics.precision_score(y_test,predictions)
recall = metrics.recall_score(y_test,predictions)
accuracy= metrics.accuracy_score(y_test,predictions)
print("Precision:",precision)
print("Recall:",recall)
print("Accuracy:",accuracy)

Precision: 0.8637178720860729
Recall: 0.8538506992318298
Accuracy: 0.8711368154708115


In [12]:
model2=LogisticRegression()
model2.fit(X_train,y_train)
predictions2=model2.predict(X_test)



In [13]:
precision = metrics.precision_score(y_test,predictions2)
recall = metrics.recall_score(y_test,predictions2)
accuracy= metrics.accuracy_score(y_test,predictions2)
print("Precision:",precision)
print("Recall:",recall)
print("Accuracy:",accuracy)

Precision: 0.9141497724451799
Recall: 0.8703959030923774
Accuracy: 0.9030363274896078


In [14]:
#Create tree
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.9427977589011386
Precision: 0.9458266452648475
Recall: 0.9285010833169195


In [15]:
#Create the Bagging classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (number of trees trained) (i.e. weak learners)

model = BaggingClassifier(n_estimators=2)

#Fit the training feature Xs and training label Ys
model.fit(X_train, y_train)

#Use the trained model to predict the test data
y_pred = model.predict(X_test)

print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8637267305259353
