# 1. Tokenization

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [2]:
sentences = [
             "I love my dog",
             "I love my cat",
             "You love my dog!",
             "Do you think my dog is amazing?"
]

tokenizer = Tokenizer(num_words = 100, oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

# 2. Sequencing

In [3]:
test_data = [
    "i really love my dog",
    'my dog loves my manatee'

]

In [4]:
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]

In [5]:
# "post" truncating cutting from the back, "pre" truncating cutting from the front
padded_seq = pad_sequences(sequences, padding="post",truncating='post', maxlen=3)
padded_seq

array([[5, 3, 2],
       [5, 3, 2],
       [6, 3, 2],
       [8, 6, 9]])

In [6]:
test_seq = tokenizer.texts_to_sequences(test_data)
test_seq

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]

# 3. Training

## 3.1 Data Processing

In [7]:
# Commonly used constant values (can be changed)

vocab_size = 10000
tr_size = 20000
oov_tok = "<OOV>"
padding_style = "post"

max_length = 100
trunc_type='post'
embedding_dim = 16

In [8]:
import json

# obtain initial data
datastore = []
with open(r"News-Headlines-Dataset-For-Sarcasm-Detection/Sarcasm_Headlines_Dataset.json", "r") as f:
    for line in f:
        datastore.append(json.loads(line))

datastore[0] # a list of json formated `dictionary`

{'is_sarcastic': 1,
 'headline': 'thirtysomething scientists unveil doomsday clock of hair loss',
 'article_link': 'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205'}

In [9]:
# generate lists of raw data we need
labels = []
sentences = []
urls = []

for item in datastore:
    labels.append(item['is_sarcastic'])
    sentences.append(item['headline'])
    urls.append(item["article_link"])

In [10]:
urls[0]

'https://www.theonion.com/thirtysomething-scientists-unveil-doomsday-clock-of-hai-1819586205'

In [11]:
### Raw data preprocessing (need to do it separately for training and test dataset, tokenization and generating sequences are essentially pre-processing data)

# 1. Tokenizer
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# 2. Sequence
tr_sequence = tokenizer.texts_to_sequences(sentences)
# 3. Padding -> output a np array (pad on post)
padded = pad_sequences(tr_sequence, padding="post")

In [12]:
padded.shape

(28619, 152)

In [13]:
# slicing pre-processed data for training (can be randomly slicing to ensure fairness)
tr_sentences = sentences[0:tr_size]
tr_labels = labels[0:tr_size]

ts_sentences = sentences[tr_size:]
ts_labels = labels[tr_size:]

# Tokenizer can be shared for generating sequences for test and training data
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)


In [14]:
# since we do not want the training data to have access to unseen words that have been tokenized initially, we need to redo our tokenization based on ONLY training sentences
tokenizer.fit_on_texts(tr_sentences) # generate the tokens

tr_seq = tokenizer.texts_to_sequences(tr_sentences) # change the sentences into matrix form of token representations of words

tr_seq_padded = pad_sequences(tr_seq, padding=padding_style, maxlen=max_length, truncating=trunc_type) # pad the sequences to make sure every line has same length

len(tokenizer.word_index.keys()) # less than the one above with all data

25898

In [15]:
# Repeat the same procedure for test dataset as well
tokenizer.fit_on_texts(ts_sentences)
ts_seq = tokenizer.texts_to_sequences(ts_sentences)
ts_seq_padded = pad_sequences(ts_seq, padding=padding_style, maxlen=max_length, truncating=trunc_type)


## 3.2 Build Model (TensorFlow)

In [16]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding,GlobalAveragePooling1D,Dense

# TODO: Continue Here
model = Sequential(
    [
        Embedding(vocab_size, embedding_dim, input_length=max_length), # embedding_dim = N refers to that each word in the vocabulary is transfered into a 1 x N list. word1 -> [0, 1, 2, .... N-1]
        GlobalAveragePooling1D(), # [1,5,7] -> [3, 6] if padding='valid', pool_size=2, stride=1. output_dim = input_dim - pool_size + 1 / stride, we plus 1 to avoid error when pool_size = voca_size
        Dense(32,activation='relu'),
        Dense(1,activation="sigmoid")
    ]
)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dense_1 (Dense)             (None, 1)                 33        
                                                                 
Total params: 160,577
Trainable params: 160,577
Non-trainable params: 0
_________________________________________________________________


In [18]:
num_epochs = 30

# Note: labels and padded sequences MUST BE IN type of np.array, since we initially read the labels as list, we need to convert them into array here
tr_labels = np.array(labels[0:tr_size])
ts_labels = np.array(labels[tr_size:])

history = model.fit(
    tr_seq_padded,
    tr_labels,
    epochs=num_epochs,
    validation_data=(ts_seq_padded, ts_labels),
    verbose=1 # define the GUI display of the training progress
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [19]:
ts_sentences = [
    "Today is so nice",
    "You look like shit",
    "they're really on top of things"
]

tokenizer.fit_on_texts(ts_sentences)
tmp_seq = tokenizer.texts_to_sequences(ts_sentences)
tmp_padded = pad_sequences(tmp_seq, padding=padding_style, truncating=trunc_type, maxlen=max_length)



In [20]:
tmp_seq = tokenizer.texts_to_sequences(["they're really on top of things"])
tmp_seq

[[735, 110, 8, 265, 3, 140]]

In [21]:
model.predict(tmp_padded)

array([[5.1062523e-09],
       [4.7942996e-04],
       [1.1391908e-02]], dtype=float32)

# 4. Generate Text