In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
config = {
    "sentences": [
        "I love my dog",
        "I love my cat",
        "I really love my dog",
        "my dog loves my manatee",
    ]
}

In [3]:
tokenizer = Tokenizer(num_words=100)

In [4]:
tokenizer.fit_on_texts(config["sentences"])

In [5]:
word_index = tokenizer.word_index

In [6]:
print(word_index)

{'my': 1, 'i': 2, 'love': 3, 'dog': 4, 'cat': 5, 'really': 6, 'loves': 7, 'manatee': 8}


In [7]:
sequences = tokenizer.texts_to_sequences(config["sentences"])

In [8]:
print(sequences)

[[2, 3, 1, 4], [2, 3, 1, 5], [2, 6, 3, 1, 4], [1, 4, 7, 1, 8]]


In [9]:
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

In [10]:
text1 = tokenizer.fit_on_texts(config["sentences"])
print(text1)

None


In [11]:
sequences = tokenizer.texts_to_sequences(config["sentences"])

In [12]:
print(sequences)

[[3, 4, 2, 5], [3, 4, 2, 6], [3, 7, 4, 2, 5], [2, 5, 8, 2, 9]]


How to feed tensors of different sizes to a Neural Network?

The answer to that is padding, or Ragged Tensors

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [14]:
padded = pad_sequences(sequences)

In [15]:
print(padded)

[[0 3 4 2 5]
 [0 3 4 2 6]
 [3 7 4 2 5]
 [2 5 8 2 9]]


In [16]:
padded = pad_sequences(sequences, padding="post")

In [17]:
print(padded)

[[3 4 2 5 0]
 [3 4 2 6 0]
 [3 7 4 2 5]
 [2 5 8 2 9]]


In [18]:
padded = pad_sequences(sequences, padding="post", truncating="post", maxlen=4)

In [19]:
print(padded)

[[3 4 2 5]
 [3 4 2 6]
 [3 7 4 2]
 [2 5 8 2]]


In [20]:
padded = pad_sequences(sequences, padding="post", truncating="post", maxlen=10)

In [21]:
print(padded)

[[3 4 2 5 0 0 0 0 0 0]
 [3 4 2 6 0 0 0 0 0 0]
 [3 7 4 2 5 0 0 0 0 0]
 [2 5 8 2 9 0 0 0 0 0]]


In [22]:
import json
from pprint import pprint
import pandas as pd

datastore = pd.read_json(
    "./30764_533474_bundle_archive/Sarcasm_Headlines_Dataset.json", lines=True
)

In [23]:
datastore.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
2,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
3,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0
4,https://www.huffingtonpost.com/entry/advancing...,advancing the world's women,0


In [24]:
sentences = datastore["headline"].values.tolist()

In [25]:
labels = datastore["is_sarcastic"].values.tolist()

In [26]:
urls = datastore["article_link"].values.tolist()

In [27]:
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding="post")

In [28]:
print(padded[0])

[ 1  1  1  1  1 48  1  1  1  6  1  1  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]


In [29]:
print(padded.shape)

(26708, 40)


In [30]:
training_size = int(padded.shape[0] * 0.7)

In [31]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]

In [32]:
import numpy as np

In [33]:
training_padded = np.array(padded[0:training_size])
testing_padded = np.array(padded[training_size:])

In [34]:
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

In [35]:
training_padded[1]

array([ 1,  1,  2,  1,  1,  1,  1,  1,  1,  1, 39, 46,  2,  1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0])

In [36]:
vocab_size = 10000
embedding_dim = 40
max_length = 40
trunc_type = "post"
padding_type = "post"
oov_tok = "<OOV>"

In [37]:
len(training_sentences[0])

78

In [38]:
len(word_index)

29658

In [39]:
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            len(word_index), embedding_dim, input_length=max_length
        ),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(24, activation="relu"),
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)

In [40]:
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [41]:
num_epochs = 30

In [42]:
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)

Train on 18695 samples, validate on 8013 samples
Epoch 1/30
18695/18695 - 11s - loss: 0.6201 - accuracy: 0.6435 - val_loss: 0.5158 - val_accuracy: 0.7289
Epoch 2/30
18695/18695 - 10s - loss: 0.5022 - accuracy: 0.7420 - val_loss: 0.4961 - val_accuracy: 0.7425
Epoch 3/30
18695/18695 - 9s - loss: 0.4939 - accuracy: 0.7473 - val_loss: 0.4934 - val_accuracy: 0.7498
Epoch 4/30
18695/18695 - 10s - loss: 0.4916 - accuracy: 0.7488 - val_loss: 0.4917 - val_accuracy: 0.7442
Epoch 5/30
18695/18695 - 11s - loss: 0.4917 - accuracy: 0.7470 - val_loss: 0.5072 - val_accuracy: 0.7450
Epoch 6/30
18695/18695 - 11s - loss: 0.4906 - accuracy: 0.7471 - val_loss: 0.4924 - val_accuracy: 0.7489
Epoch 7/30
18695/18695 - 10s - loss: 0.4911 - accuracy: 0.7454 - val_loss: 0.4937 - val_accuracy: 0.7529
Epoch 8/30
18695/18695 - 9s - loss: 0.4893 - accuracy: 0.7506 - val_loss: 0.4925 - val_accuracy: 0.7550
Epoch 9/30
18695/18695 - 11s - loss: 0.4896 - accuracy: 0.7518 - val_loss: 0.4986 - val_accuracy: 0.7498
Epoch 10

In [43]:
sentence = [
    "granny starting to fear spiders in the garden might be real",
    "the weather today is bright and sunny"
]

In [44]:
sequences = tokenizer.texts_to_sequences(sentence)

In [45]:
sequences

[[1, 1, 2, 1, 1, 5, 4, 1, 1, 24, 1], [4, 1, 1, 11, 1, 9, 1]]

In [46]:
padded = pad_sequences(sequences, maxlen= max_length, padding = padding_type, truncating = trunc_type)

In [47]:
padded

array([[ 1,  1,  2,  1,  1,  5,  4,  1,  1, 24,  1,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [ 4,  1,  1, 11,  1,  9,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0]])

In [48]:
print(model.predict(padded))

[[0.09693062]
 [0.00513685]]
