In [32]:
import pandas as pd
import numpy as np
import json

In [33]:
# from google.colab import drive
# drive.mount('/content/drive/')

<span style="color:green;font-weight:bold; font-size:2em">1. Read and explore the data</span>

In [34]:
def parseJson(fname):
    """Yield one decoded record per non-blank line of a JSON-lines file.

    Parameters
    ----------
    fname : str or path-like
        Path to a text file that stores one JSON value per line.

    Yields
    ------
    object
        The value decoded from each non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle once iteration finishes or the
    # generator itself is closed, instead of leaving it to the garbage collector.
    with open(fname, encoding="utf8", errors='ignore') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            # json.loads rather than eval: eval would execute arbitrary code
            # found in the file and cannot decode JSON true/false/null.
            yield json.loads(line)

In [35]:
data = list(parseJson('Sarcasm_Headlines_Dataset.json'))

In [36]:
len(data)

26709

In [37]:
data[0]

{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5',
 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers",
 'is_sarcastic': 0}

<span style="color:green;font-weight:bold; font-size:2em">2. Retain relevant columns</span>

In [38]:
# Keep only the two fields used downstream: the headline text and its label.
headline = [record['headline'] for record in data]
sarcasm = [record['is_sarcastic'] for record in data]
    

In [39]:
print(len(headline))
print(len(sarcasm))

26709
26709


In [40]:
# Tally how many times each label value occurs in `sarcasm`.
count = {}

for x in sarcasm:
    # A label seen for the first time starts from 0 before being incremented.
    count[x] = count.get(x, 0) + 1

In [41]:
count

{0: 14985, 1: 11724}

In [42]:
# Report the class balance from the tally: count[1] is printed as the
# sarcastic total and count[0] as the non-sarcastic total.
print(f"The number of Sarcastic Headlines are:     {count[1]}")
print(f"The number of Non Sarcastic Headlines are: {count[0]}")
# len(sarcasm) counts every headline, so the label must not say "Sarcastic".
print(f"Total number of Headlines are:             {len(sarcasm)}")

The number of Sarcastic Headlines are:     11724
The number of Non Sarcastic Headlines are: 14985
Total number of Sarcastic Headlines are:   26709


<span style="color:green;font-weight:bold; font-size:2em">3. Get length of each sentence</span>

In [43]:
# Length of every headline string (characters, since len() is applied to the raw text).
length = [len(text) for text in headline]

In [44]:
max(length)

254

In [45]:
length[:10]

[78, 84, 79, 84, 64, 27, 46, 67, 50, 59]

<span style="color:green;font-weight:bold; font-size:2em">4. Define parameters</span>

<span style="color:green;font-weight:bold; font-size:2em">5. Get indices for words</span>

<span style="color:green;font-weight:bold; font-size:2em">7. Get vocabulary size</span>

In [46]:
# Text-preprocessing settings consumed by the tokenizer and padding cells.
# vocab_size is handed to Tokenizer as num_words.
vocab_size = 30000
# Placeholder string handed to Tokenizer as oov_token.
oov_token = "<OOV>"
# Used as maxlen for pad_sequences and as input_length of the Embedding layer.
max_length = 100
# Value of the `padding` argument of pad_sequences.
padding_type = "post"
# Value of the `truncating` argument of pad_sequences. The name is misspelled
# ("trunction") but is referenced with this exact spelling where padding is applied.
trunction_type="post"

In [47]:
from keras.preprocessing.text import Tokenizer
# Build the tokenizer with the configured num_words cap and OOV placeholder,
# then fit it on the headline list.
# NOTE(review): fitting uses every headline, i.e. it happens before the
# train/test split performed later in the notebook.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(headline)

In [48]:
word_index = tokenizer.word_index

In [49]:
len(word_index)

29657

In [50]:
X_sequences = tokenizer.texts_to_sequences(headline)

In [51]:
X_sequences[:5]

[[308, 15115, 679, 3337, 2298, 48, 382, 2576, 15116, 6, 2577, 8434],
 [4, 8435, 3338, 2746, 22, 2, 166, 8436, 416, 3112, 6, 258, 9, 1002],
 [145, 838, 2, 907, 1749, 2093, 582, 4719, 221, 143, 39, 46, 2, 10736],
 [1485, 36, 224, 400, 2, 1832, 29, 319, 22, 10, 2924, 1393, 6969, 968],
 [767, 719, 4720, 908, 10737, 623, 594, 5, 4, 95, 1309, 92]]

In [63]:
# Neither message interpolates a value, so plain string literals are enough.
print("The X parameter is X_Sequence")
print("The Y parameter is Sarcasm")

The X parameter is X_Sequence
The Y parameter is Sarcasm


<span style="color:green;font-weight:bold; font-size:2em">6. Create features and labels</span>

In [52]:
from keras.preprocessing.sequence import pad_sequences
# Turn the integer sequences into the feature array X: maxlen is max_length,
# and both padding and truncation use the "post" setting from the parameter cell.
X = pad_sequences(X_sequences,maxlen=max_length, padding=padding_type, 
                       truncating=trunction_type)

In [53]:
X

array([[  308, 15115,   679, ...,     0,     0,     0],
       [    4,  8435,  3338, ...,     0,     0,     0],
       [  145,   838,     2, ...,     0,     0,     0],
       ...,
       [10735,     9,    68, ...,     0,     0,     0],
       [ 1541,   392,  4164, ...,     0,     0,     0],
       [29656,  1647,     6, ...,     0,     0,     0]])

In [54]:
y = np.array(sarcasm)

<span style="color:green;font-weight:bold; font-size:2em">8. Create a weight matrix using GloVe embeddings</span>

In [55]:
# Build a word -> vector lookup from the GloVe text file. Each line is split on
# whitespace: the first field is the word, the remaining fields its coefficients.
embeddings_index = {}
# The context manager closes the file even if a line fails to convert.
with open('glove.6B.50d.txt', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word field; skip it instead of raising IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [56]:
# Weight matrix with len(word_index) + 1 rows of width 50; row i receives the
# GloVe vector of the word whose tokenizer index is i.
embedding_matrix = np.zeros((len(word_index) + 1, 50))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pretrained vector for this word: its row stays all-zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [57]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from sklearn.model_selection import train_test_split

In [58]:
# Embedding layer sized to match embedding_matrix (len(word_index) + 1 rows,
# 50 columns) and initialised from it. trainable=False marks these weights as
# non-trainable; input_length is the padded sequence length.
embedding_layer = Embedding(len(word_index) + 1,
                            50,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

<span style="color:green;font-weight:bold; font-size:2em">9. Define and compile a Bidirectional LSTM model.</span>

In [59]:
# NOTE: despite its name, embedding_dim is used below as the unit count of each
# LSTM; the embedding layer itself is built with width 50.
embedding_dim = 64
# Assigned for reference; not consumed by the model definition below.
input_length = max_length
# Stack: pretrained embeddings -> two bidirectional LSTMs -> dense head with a
# single sigmoid output for the binary label.
model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(units=embedding_dim, return_sequences=True)),
    Bidirectional(LSTM(units=embedding_dim)),
    Dense(units=128, activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [60]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 50)           1482900   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100, 128)          58880     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_4 (Dense)              (None, 128)               16512     
_________________________________________________________________
dense_5 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 33        
Total params: 1,661,269
Trainable params: 178,369
Non-trainable params: 1,482,900
______________________________________

<span style="color:green;font-weight:bold; font-size:2em">10. Fit the model and check the validation accuracy</span>

In [61]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=152,
)

In [62]:
# Train for 10 epochs with batches of 50 samples. The held-out split
# (X_test, y_test) is passed as validation_data, so the same data serves as
# both the validation set and the only held-out set in this notebook.
history = model.fit(X_train, y_train, epochs=10,batch_size=50,validation_data=(X_test, y_test))


Train on 22702 samples, validate on 4007 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
