In [1]:
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
vocab_size = 10000
embed_dim = 16
max_len = 100
trunc = 'post'
padd = 'post'
oov_tok = "<OOV>"
train_size = 20001

In [4]:
import json

def parse_data(file):
    for l in open(file,'r'):
        yield json.loads(l)

data = list(parse_data('./Sarcasm_Headlines_Dataset_v2.json'))

In [5]:
sentences = []
labels = []

for items in data:
    sentences.append(items['headline'])
    labels.append(items['is_sarcastic'])

In [6]:
train_sentences = sentences[0:train_size]
test_sentences = sentences[train_size:]
train_labels = labels[0:train_size]
test_labels = labels[train_size:]

In [7]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)

word_index = tokenizer.word_index

train_seq = tokenizer.texts_to_sequences(train_sentences)
train_pad = pad_sequences(train_seq,maxlen=max_len,padding=padd,truncating=trunc)

test_seq = tokenizer.texts_to_sequences(test_sentences)
test_pad = pad_sequences(test_seq,maxlen=max_len,padding=padd,truncating=trunc)

In [8]:
train_pad = np.array(train_pad)
test_labels = np.array(test_labels)
test_pad = np.array(test_pad)
test_labels = np.array(test_labels)

In [9]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embed_dim, input_length=max_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.


In [10]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [11]:
num_epochs = 35
history = model.fit(train_pad, train_labels, epochs=num_epochs, validation_data=(test_pad, test_labels), verbose=2)

Train on 20001 samples, validate on 8618 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/35
 - 5s - loss: 0.6587 - acc: 0.6138 - val_loss: 0.5320 - val_acc: 0.7788
Epoch 2/35
 - 4s - loss: 0.4142 - acc: 0.8329 - val_loss: 0.3699 - val_acc: 0.8416
Epoch 3/35
 - 4s - loss: 0.3107 - acc: 0.8757 - val_loss: 0.3384 - val_acc: 0.8557
Epoch 4/35
 - 4s - loss: 0.2615 - acc: 0.8981 - val_loss: 0.3286 - val_acc: 0.8580
Epoch 5/35
 - 4s - loss: 0.2251 - acc: 0.9142 - val_loss: 0.3333 - val_acc: 0.8512
Epoch 6/35
 - 4s - loss: 0.2003 - acc: 0.9246 - val_loss: 0.3374 - val_acc: 0.8544
Epoch 7/35
 - 4s - loss: 0.1796 - acc: 0.9339 - val_loss: 0.3461 - val_acc: 0.8548
Epoch 8/35
 - 3s - loss: 0.1598 - acc: 0.9411 - val_loss: 0.3602 - val_acc: 0.8510
Epoch 9/35
 - 4s - loss: 0.1451 - acc: 0.9477 - val_loss: 0.3774 - val_acc: 0.8485
Epoch 10/35
 - 4s - loss: 0.1310 - acc: 0.9528 - val_loss: 0.3965 - val_acc: 0.8461
Epoch 11/35
 - 4s - loss: 0.1201 - acc: 0.9579 - val_loss: 0.4183 - val_

In [12]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()
  
plot_graphs(history, "acc")
plot_graphs(history, "loss")

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

In [13]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_sentence(train_pad[0]))
print(train_sentences[2])
print(labels[2])

<OOV> scientists unveil doomsday clock of hair loss ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
eat your veggies: 9 deliciously different recipes
0


In [14]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

(10000, 16)


In [15]:
sentence = ["granny starting to fear spiders in the garden might be real",
            "game of thrones season finale showing this sunday night",
           "it is looking like it might be the end of the world eh?",
           "local barber arrested over homicide accuse",
           "i love you, dont i?",
           "i love eating toasted cheese and tuna sandwiches",
           "the government has shown the ability to do anything for the public and how much they care about them by opening up the beaches in Florida"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_len, padding=padd, truncating=trunc)
print(model.predict(padded))

[[6.7296398e-01]
 [1.0782140e-05]
 [2.0315547e-05]
 [9.9998641e-01]
 [8.8493005e-05]
 [3.2171738e-04]
 [7.2423718e-04]]
