### Import Libraries


In [1]:
import pandas as pd
# Display configuration: raise the row/column/width limits so previews are not
# truncated, and render floats with three decimal places.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1100
pd.options.display.width = 1000
pd.options.display.float_format = lambda value: '%.3f' % value

### Read the dataset

In [2]:
# Load the headlines dataset; lines=True reads the file as one JSON record per line.
sarcasm = pd.read_json('./Sarcasm_Headlines_Dataset_v2.json', lines=True)

In [3]:
# (rows, columns) of the loaded frame.
sarcasm.shape

(28619, 3)

### Look at sample data

In [4]:
# Preview the first rows of the raw data.
sarcasm.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


### Check the DataFrame info

In [5]:
# Column dtypes and non-null counts (used here to confirm there are no missing values).
sarcasm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28619 entries, 0 to 28618
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   is_sarcastic  28619 non-null  int64 
 1   headline      28619 non-null  object
 2   article_link  28619 non-null  object
dtypes: int64(1), object(2)
memory usage: 670.9+ KB


### Display summary statistics of the DataFrame

In [6]:
# Summary statistics; by default only the numeric column (is_sarcastic) is summarised.
sarcasm.describe()

Unnamed: 0,is_sarcastic
count,28619.0
mean,0.476
std,0.499
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [7]:
import nltk
# Fetch the NLTK stopwords corpus needed by the stopword-removal step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
# English stopword list; read by remove_stopwords() further down.
stop = stopwords.words('english')

### Remove punctuation

In [9]:
import string

def remove_punct(text):
    """Return `text` with every character found in `string.punctuation` removed.

    Only the ASCII punctuation listed in `string.punctuation` is dropped;
    all other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept in their original order.
    """
    kept_chars = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

# Strip punctuation from every headline; the 'headline' column is overwritten in place.
sarcasm['headline'] = sarcasm['headline'].apply(remove_punct)

sarcasm.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep totally nails why congress is falling ...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word stream...,https://www.theonion.com/mother-comes-pretty-c...


### Tokenize the headlines

In [10]:
import re

def tokenize(text):
    """Split `text` into word tokens.

    The text is split on runs of non-word characters (anything outside
    letters, digits and underscore).

    Parameters
    ----------
    text : str
        The string to tokenize.

    Returns
    -------
    list of str
        The non-empty tokens, in order. An empty or delimiter-only input
        yields an empty list.
    """
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    # re.split emits '' when the text is empty or starts/ends with a
    # delimiter; those empty strings are not tokens, so filter them out.
    return [token for token in re.split(r'\W+', text) if token]

# Lower-case each headline and replace it with its list of tokens.
# NOTE: this overwrites 'headline' (str -> list), so running this cell a second
# time without reloading the data fails: a list has no .lower() method.
sarcasm['headline'] = sarcasm['headline'].apply(lambda x: tokenize(x.lower()))

sarcasm.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,"[thirtysomething, scientists, unveil, doomsday...",https://www.theonion.com/thirtysomething-scien...
1,0,"[dem, rep, totally, nails, why, congress, is, ...",https://www.huffingtonpost.com/entry/donna-edw...
2,0,"[eat, your, veggies, 9, deliciously, different...",https://www.huffingtonpost.com/entry/eat-your-...
3,1,"[inclement, weather, prevents, liar, from, get...",https://local.theonion.com/inclement-weather-p...
4,1,"[mother, comes, pretty, close, to, using, word...",https://www.theonion.com/mother-comes-pretty-c...


### Remove stopwords

In [11]:
def remove_stopwords(tokenized_list, stop_words=None):
    """Return the tokens of `tokenized_list` that are not stopwords.

    Parameters
    ----------
    tokenized_list : iterable of str
        Tokens of a single headline.
    stop_words : iterable of str, optional
        Words to drop. When omitted, the notebook-level `stop` list is used,
        which matches the original single-argument behaviour.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Build a set once per call so each membership test is a hash lookup
    # instead of a scan through the whole stopword list.
    excluded = set(stop if stop_words is None else stop_words)
    return [word for word in tokenized_list if word not in excluded]

# Drop stopwords from every token list; 'headline' is overwritten in place.
# NOTE(review): punctuation was stripped in an earlier cell, so apostrophe
# forms in the stopword list (e.g. "don't") presumably never match - confirm.
sarcasm['headline'] = sarcasm['headline'].apply(remove_stopwords)

sarcasm.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,"[thirtysomething, scientists, unveil, doomsday...",https://www.theonion.com/thirtysomething-scien...
1,0,"[dem, rep, totally, nails, congress, falling, ...",https://www.huffingtonpost.com/entry/donna-edw...
2,0,"[eat, veggies, 9, deliciously, different, reci...",https://www.huffingtonpost.com/entry/eat-your-...
3,1,"[inclement, weather, prevents, liar, getting, ...",https://local.theonion.com/inclement-weather-p...
4,1,"[mother, comes, pretty, close, using, word, st...",https://www.theonion.com/mother-comes-pretty-c...


### Creating two objects X and y

In [12]:
# X: token lists per headline; y: the 0/1 sarcasm label.
# NOTE: X is not referenced by the later cells shown here - the vectorizer
# reads sarcasm['headline'] directly and the split uses X_counts.
X = sarcasm['headline']
y = sarcasm['is_sarcastic']

### Instantiating CountVectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
# CountVectorizer expects raw strings, so re-join each token list with spaces
# before building the document-term count matrix.
X_counts = count_vect.fit_transform(sarcasm['headline'].apply(' '.join))
print(X_counts.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
# Show a short preview instead of dumping the entire vocabulary into the output.
print(list(feature_names[:20]))

(28619, 29451)


### Split train and test data

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle for reproducibility.
# NOTE(review): the vectorizer above was fit on all headlines before this split,
# so the vocabulary includes words from the test rows - consider fitting on
# the training rows only.
X_train, X_test, y_train, y_test = train_test_split(X_counts, y, test_size=0.2, random_state=42)

In [15]:
# Sanity check: feature and label row counts must match within each split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(22895, 29451) (22895,)
(5724, 29451) (5724,)


### Training a MultinomialNB model for classifying the headlines

In [16]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier (default hyper-parameters) on the training counts.
model = MultinomialNB().fit(X_train, y_train)

### Predict the test data

In [17]:
# Predicted 0/1 labels for the held-out rows.
y_pred = model.predict(X_test)
print(y_pred)

[0 1 1 ... 0 0 1]


### Displaying confusion matrix

In [18]:
from sklearn.metrics import confusion_matrix
# Rows are true labels, columns are predicted labels (label order 0, 1).
print(confusion_matrix(y_test, y_pred))

[[2485  510]
 [ 550 2179]]


### Displaying classification report

In [19]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      2995
           1       0.81      0.80      0.80      2729

    accuracy                           0.81      5724
   macro avg       0.81      0.81      0.81      5724
weighted avg       0.81      0.81      0.81      5724



### Displaying POS tags (NLTK averaged perceptron tagger)

In [20]:
# Tokenizer models used by nltk.sent_tokenize / nltk.word_tokenize in get_pos_tag().
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
# Tagger model used by nltk.pos_tag in get_pos_tag().
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [22]:
def get_pos_tag(text):
    """Part-of-speech tag `text`, one sentence at a time.

    The text is split into sentences with `nltk.sent_tokenize`; each sentence
    is split into words with `nltk.word_tokenize` and tagged with
    `nltk.pos_tag`.

    Returns a list with one entry per sentence, each entry being the list of
    (token, tag) pairs for that sentence.
    """
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(text)
    ]

# Re-join each token list into a string and tag it. At this point the headlines
# are already lower-cased with punctuation and stopwords removed, so the tagger
# sees that reduced text rather than the original sentences.
sarcasm['pos_tags'] = (
    sarcasm['headline']
    .apply(' '.join)
    .apply(get_pos_tag)
)

sarcasm.head()

Unnamed: 0,is_sarcastic,headline,article_link,pos_tags
0,1,"[thirtysomething, scientists, unveil, doomsday...",https://www.theonion.com/thirtysomething-scien...,"[[(thirtysomething, VBG), (scientists, NNS), (..."
1,0,"[dem, rep, totally, nails, congress, falling, ...",https://www.huffingtonpost.com/entry/donna-edw...,"[[(dem, NN), (rep, NN), (totally, RB), (nails,..."
2,0,"[eat, veggies, 9, deliciously, different, reci...",https://www.huffingtonpost.com/entry/eat-your-...,"[[(eat, NN), (veggies, NNS), (9, CD), (delicio..."
3,1,"[inclement, weather, prevents, liar, getting, ...",https://local.theonion.com/inclement-weather-p...,"[[(inclement, NN), (weather, NN), (prevents, N..."
4,1,"[mother, comes, pretty, close, using, word, st...",https://www.theonion.com/mother-comes-pretty-c...,"[[(mother, NN), (comes, VBZ), (pretty, RB), (c..."


In [23]:
# Lift the column-width limit so the full tag lists are shown untruncated.
pd.set_option('max_colwidth', None)
sarcasm['pos_tags'].head(4)

0                                 [[(thirtysomething, VBG), (scientists, NNS), (unveil, JJ), (doomsday, JJ), (clock, NN), (hair, NN), (loss, NN)]]
1    [[(dem, NN), (rep, NN), (totally, RB), (nails, JJ), (congress, NN), (falling, VBG), (short, JJ), (gender, NN), (racial, JJ), (equality, NN)]]
2                                                       [[(eat, NN), (veggies, NNS), (9, CD), (deliciously, RB), (different, JJ), (recipes, NNS)]]
3                                                     [[(inclement, NN), (weather, NN), (prevents, NNS), (liar, VBP), (getting, VBG), (work, NN)]]
Name: pos_tags, dtype: object