<h1>Detection Of Sarcasm in news headlines using NLP</h1>

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import json
import numpy as np

In [2]:
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O sarcasm.json

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
datastore = None
with open("sarcasm.json", 'r') as f:
    datastore = json.load(f)

sentences = []
labels = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])

In [3]:
print(type(datastore))
print(datastore[0])
print(datastore[2])

<class 'list'>
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697', 'headline': "mom starting to fear son's web series closest thing she will have to grandchild", 'is_sarcastic': 1}


In [4]:
len(sentences)

26709

In [5]:
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

In [6]:
print(padded[0])

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


In [7]:
print(padded.shape)

(26709, 40)


In [8]:
train_size = 20000
max_length = 15
max_words = 20000

In [9]:
trainX = sentences[:train_size]
testX = sentences[train_size:]

trainY = labels[:train_size]
testY = labels[train_size:]

In [10]:
tokenizer = Tokenizer(oov_token='<OOV>', num_words = max_words)
tokenizer.fit_on_texts(trainX)

word_index = tokenizer.word_index

trainS = tokenizer.texts_to_sequences(trainX)
testS = tokenizer.texts_to_sequences(testX)

In [11]:
padTrain = pad_sequences(trainS, maxlen=max_length, padding='post', truncating='post')
padTest = pad_sequences(testS, maxlen=max_length, padding='post', truncating='post')

In [12]:
print(padTest[0])

[17706  1100  6663  9423    30 11505  2439     5   519   109     0     0
     0     0     0]


<h3>Model creation</h3>

In [13]:
model = tf.keras.Sequential([
                             tf.keras.layers.Embedding(max_words, 16, input_length=max_length),
                             tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.Dense(24, activation='relu'),
                             tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
model.fit(padTrain, np.array(trainY), epochs=15, validation_data=(padTest, np.array(testY)), verbose=1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x11e054937c0>

<h3>Time to test</h3>

In [15]:
test_sentences = [
                  'granny starting to fear that spiders in the garden might be real',
                  'the weather today is bright and sunny'
]

sequences = tokenizer.texts_to_sequences(test_sentences)
padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [16]:
preds = model.predict(padded)

In [17]:
for i in range(2):
  print(test_sentences[i])
  if preds[i]>=0.5 :
    print("- Sarcastic\n")
  else :
    print("- Not Sarcastic\n")

granny starting to fear that spiders in the garden might be real
- Sarcastic

the weather today is bright and sunny
- Not Sarcastic

