In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Tiny toy corpus used to demonstrate tokenization and padding.
sentences = [
    'I love my dog',
    'I love my cat',
    'There is a dog on the street!',
]

In [None]:
# Build a tokenizer and fit it on the toy sentences.
#   * num_words=100 caps how many of the most frequent words are used when
#     text is later converted to sequences.
#   * oov_token='<OOV>' reserves an index for words not seen during fitting.
tokenizer = Tokenizer(
    oov_token='<OOV>',
    num_words=100,
)
tokenizer.fit_on_texts(sentences)

# word_index is the word -> integer index dictionary learned from the text.
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'i': 2, 'love': 3, 'my': 4, 'dog': 5, 'cat': 6, 'there': 7, 'is': 8, 'a': 9, 'on': 10, 'the': 11, 'street': 12}


In [None]:
# Encode every sentence as a list of word indexes and show one list per line.
sequences = tokenizer.texts_to_sequences(sentences)
print(*sequences, sep="\n")

[2, 3, 4, 5]
[2, 3, 4, 6]
[7, 8, 9, 5, 10, 11, 12]


In [None]:
# Encode sentences containing words the tokenizer did not see while fitting.
# Words outside the vocabulary are replaced by the index of the '<OOV>' token.
new_sentences = [
    'Hi! My dog is Tommy',
    'My dog love bones',
]

new_sequences = tokenizer.texts_to_sequences(new_sentences)
print(*new_sequences, sep="\n")

[1, 4, 5, 8, 1]
[4, 5, 3, 1]


In [None]:
# for training purpose, we need sentences of same length
# so we need to pad them with stuff

from tensorflow.keras.preprocessing.sequence import pad_sequences

print("Un-padded: ")
for seq in sequences:
    print(seq)

# With default arguments pad_sequences pads at the front ("pre") using the
# special index 0, and hands back a NumPy array instead of a list of lists.
padded = pad_sequences(sequences)

print("\nPadded: ")
for row in padded:
    print(row)

Un-padded: 
[2, 3, 4, 5]
[2, 3, 4, 6]
[7, 8, 9, 5, 10, 11, 12]

Padded: 
[0 0 0 2 3 4 5]
[0 0 0 2 3 4 6]
[ 7  8  9  5 10 11 12]


In [None]:
# Other padding / truncation options.

# padding='post': the zeros go after the tokens instead of before them.
padded = pad_sequences(sequences, padding='post')
print(*padded, sep="\n")
print("\n")

# maxlen=5: longer sequences are cut down to 5 tokens; by default the tokens
# at the front are dropped, so the last 5 are kept.
padded = pad_sequences(sequences, maxlen=5)
print(*padded, sep="\n")
print("\n")

# maxlen=5 with truncating='post': tokens at the end are dropped instead,
# so the first 5 are kept.
padded = pad_sequences(sequences, maxlen=5, truncating='post')
print(*padded, sep="\n")

[2 3 4 5 0 0 0]
[2 3 4 6 0 0 0]
[ 7  8  9  5 10 11 12]


[0 2 3 4 5]
[0 2 3 4 6]
[ 9  5 10 11 12]


[0 2 3 4 5]
[0 2 3 4 6]
[ 7  8  9  5 10]


In [None]:
# padding




# padding

In [None]:
# getting our file
!wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json

--2021-02-14 12:36:33--  https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.137.128, 142.250.101.128, 2607:f8b0:4023:c03::80, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.137.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5643545 (5.4M) [application/json]
Saving to: ‘/tmp/sarcasm.json’


2021-02-14 12:36:34 (52.7 MB/s) - ‘/tmp/sarcasm.json’ saved [5643545/5643545]



In [None]:
import json

# Parse the downloaded JSON file into `datastore` (a list of record dicts).
with open("/tmp/sarcasm.json", 'r') as json_file:
    datastore = json.load(json_file)

# Split the records into parallel lists: headline text and sarcasm label.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]


In [None]:
# Peek at the container type and at two sample records (indexes 0 and 2).
print(type(datastore))
for idx in (0, 2):
    print(datastore[idx])

<class 'list'>
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}
{'article_link': 'https://local.theonion.com/mom-starting-to-fear-son-s-web-series-closest-thing-she-1819576697', 'headline': "mom starting to fear son's web series closest thing she will have to grandchild", 'is_sarcastic': 1}


In [None]:
# Number of headlines collected from the dataset.
len(sentences)

26709

In [None]:
# Fit a fresh tokenizer (no num_words cap here) on every headline.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

# Encode all headlines, then pad at the end so they share one length.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(
    sequences,
    padding='post',
)

# Word -> index mapping learned from the full corpus.
word_index = tokenizer.word_index

In [None]:
# First encoded headline; the trailing zeros come from padding='post'.
print(padded[0])

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]


In [None]:
# Shape of the padded array: (number of headlines, padded sequence length).
print(padded.shape)

(26709, 40)


In [None]:
# PADDING





# PADDING

In [None]:
# Tunable settings for the sarcasm classifier.
train_size = 20_000   # headlines used for training; the remainder is the test set
max_length = 15       # every sequence is padded / truncated to this many tokens
max_words = 20_000    # tokenizer vocabulary cap and embedding input dimension

In [None]:
# Split headlines and labels at the same position: the first `train_size`
# items are for training, everything after that is held out for testing.
trainX, testX = sentences[:train_size], sentences[train_size:]
trainY, testY = labels[:train_size], labels[train_size:]

In [None]:
# Fit the tokenizer on the training headlines only, so words that appear
# solely in the test split are treated as out-of-vocabulary ('<OOV>').
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(trainX)
word_index = tokenizer.word_index

# Encode both splits with the same fitted tokenizer.
trainS, testS = (tokenizer.texts_to_sequences(split) for split in (trainX, testX))

In [None]:
# Pad (or cut) both splits to exactly `max_length` tokens, always working at
# the end of the sequence so the start of each headline is preserved.
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
padTrain = pad_sequences(trainS, **pad_options)
padTest = pad_sequences(testS, **pad_options)

In [None]:
# First test headline after encoding and padding/truncating to max_length.
print(padTest[0])

[17706  1100  6663  9423    30 11505  2439     5   519   109     0     0
     0     0     0]


In [None]:
# model creation 
import tensorflow as tf

# Seed TensorFlow's global random generator so weight initialisation (and with
# it the training run) is repeatable after Restart Kernel -> Run All.
tf.random.set_seed(42)

# Small binary text classifier:
#   Embedding              - maps each of up to `max_words` token ids to a 16-d vector
#   GlobalAveragePooling1D - averages the token vectors of a headline
#   Dense(24, relu)        - hidden layer
#   Dense(1, sigmoid)      - single output score in [0, 1]
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_words, 16, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output / 0-1 labels.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
import numpy as np

# Labels are plain Python lists, so convert them to NumPy arrays before
# handing them to Keras; the test split doubles as validation data.
train_labels = np.array(trainY)
test_labels = np.array(testY)

model.fit(
    padTrain,
    train_labels,
    epochs=15,
    validation_data=(padTest, test_labels),
    verbose=1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fa67daee6d8>

In [None]:
# Try the trained model on two hand-written headlines.
test_sentences = [
    'granny starting to fear that spiders in the garden might be real',
    'the weather today is bright and sunny',
]

# Encode and pad them exactly like the training data (same tokenizer,
# same max_length, padding and truncation at the end).
sequences = tokenizer.texts_to_sequences(test_sentences)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding='post',
    truncating='post',
)

In [None]:
# Run the model on the padded test sentences; the final layer is a single
# sigmoid unit, so each prediction is one score per sentence.
preds = model.predict(padded)

In [None]:
# Print a verdict for every test sentence.
# Iterating over the sentence/score pairs (instead of a hard-coded range(2))
# keeps this cell correct when test_sentences grows or shrinks. ravel()
# flattens the per-sentence prediction rows so each score is a plain scalar
# rather than a one-element array.
for sentence, score in zip(test_sentences, preds.ravel()):
    print(sentence)
    # Scores of 0.5 or more are reported as sarcastic.
    if score >= 0.5:
        print("- Sarcastic\n")
    else:
        print("- Not Sarcastic\n")

granny starting to fear that spiders in the garden might be real
- Sarcastic

the weather today is bright and sunny
- Not Sarcastic

