In [1]:
%load_ext lab_black

In [2]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.layers as layers
import numpy as np
import dask.dataframe as dd
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import warnings

warnings.filterwarnings(action="ignore")

In [3]:
# reading data
data = dd.read_json("/home/sharoonsaxena/Datasets/Sarcasm.json").repartition(8)
data.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [4]:
# creating train validation partition
train, validation = data.random_split([0.8, 0.2], shuffle=True)

In [5]:
# isolating headlines and labels
train_headline = train.headline.values.compute()
train_labels = train.is_sarcastic.values.compute()

val_headline = validation.headline.values.compute()
val_labels = validation.is_sarcastic.values.compute()

train_headline.shape, type(train_headline)

((21317,), numpy.ndarray)

In [6]:
# setting hyper-params
W_MAX_LEN = 200
W_VOCAB = 20000
EMBED_DIM = 32
PADDING = "post"
TRUNC = "POST"
OOV = "<OOV>"

# word tokenizer
word_tokenizer = Tokenizer(num_words=W_MAX_LEN, oov_token=OOV)
word_tokenizer.fit_on_texts(train_headline)
len(word_tokenizer.word_index)

26491

In [7]:
# setting hyper-params
C_MAX_LEN = 500
C_VOCAB = 100
EMBED_DIM = 32
PADDING = "post"
TRUNC = "post"
OOV = "<OOV>"

# word tokenizer
char_tokenizer = Tokenizer(
    num_words=C_MAX_LEN,
    oov_token=OOV,
    char_level=True,
)
char_tokenizer.fit_on_texts(train_headline)
len(char_tokenizer.word_index)

95

In [8]:
# texts to sequences using word tokenizer
train_word_sequences = word_tokenizer.texts_to_sequences(train_headline)
val_word_sequences = word_tokenizer.texts_to_sequences(val_headline)

len(train_word_sequences), type(train_word_sequences)

(21317, list)

In [9]:
# texts to sequences using char tokenizer
train_char_sequences = char_tokenizer.texts_to_sequences(train_headline)
val_char_sequences = char_tokenizer.texts_to_sequences(val_headline)

len(train_char_sequences), type(train_char_sequences)

(21317, list)

In [10]:
# padding word sequences
train_word_padded_sequences = pad_sequences(
    train_word_sequences, maxlen=W_MAX_LEN, padding=PADDING, truncating=TRUNC
)
val_word_padded_sequences = pad_sequences(
    val_word_sequences, maxlen=W_MAX_LEN, padding=PADDING, truncating=TRUNC
)

len(train_word_padded_sequences), type(train_word_padded_sequences)

(21317, numpy.ndarray)

In [11]:
# padding char sequences
train_char_padded_sequences = pad_sequences(
    train_char_sequences, maxlen=W_MAX_LEN, padding=PADDING, truncating=TRUNC
)
val_char_padded_sequences = pad_sequences(
    val_char_sequences, maxlen=W_MAX_LEN, padding=PADDING, truncating=TRUNC
)

len(train_char_padded_sequences), type(train_char_padded_sequences)

(21317, numpy.ndarray)

In [12]:
reverse_word_index = dict(
    [(value, key) for (key, value) in word_tokenizer.word_index.items()]
)


def decode_word_sentence(text):
    return " ".join([reverse_word_index.get(i, "") for i in text])


print(decode_word_sentence(train_word_padded_sequences[1]))
print(train_headline[1])

the <OOV> <OOV> <OOV> to see your <OOV> in the <OOV>                                                                                                                                                                                             
the science-backed reason to see your therapist in the morning


In [13]:
reverse_char_index = dict(
    [(value, key) for (key, value) in char_tokenizer.word_index.items()]
)


def decode_char_sentence(text):
    return "".join([reverse_char_index.get(i, "") for i in text])


print(decode_char_sentence(train_char_padded_sequences[1]))
print(train_headline[1])

the science-backed reason to see your therapist in the morning
the science-backed reason to see your therapist in the morning


In [14]:
# converting everything to list
train_word_padded_sequences.tolist(), train_labels.tolist()
val_word_padded_sequences.tolist(), val_labels.tolist()

train_char_padded_sequences.tolist()
val_char_padded_sequences.tolist()
print("done")

done


### Model 1: embedding model at word and character level

In [15]:
word_embed_model = Sequential(
    [
        layers.Embedding(
            input_dim=W_VOCAB,
            output_dim=EMBED_DIM,
            input_length=W_MAX_LEN,
        ),
        layers.Flatten(),
        layers.Dense(units=16, activation="relu"),
        layers.Dense(units=1, activation="sigmoid"),
    ]
)

word_embed_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)
word_embed_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 32)           640000    
_________________________________________________________________
flatten (Flatten)            (None, 6400)              0         
_________________________________________________________________
dense (Dense)                (None, 16)                102416    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 742,433
Trainable params: 742,433
Non-trainable params: 0
_________________________________________________________________


In [16]:
EPOCHS = 5
history_word_embeds = word_embed_model.fit(
    x=train_word_padded_sequences,
    y=train_labels,
    epochs=EPOCHS,
    validation_data=(val_word_padded_sequences, val_labels),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [17]:
char_embed_model = Sequential(
    [
        layers.Embedding(
            input_dim=C_VOCAB,
            output_dim=EMBED_DIM,
            input_length=C_MAX_LEN,
        ),
        layers.Flatten(),
        layers.Dense(units=16, activation="relu"),
        layers.Dense(units=1, activation="sigmoid"),
    ]
)

char_embed_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)
char_embed_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           3200      
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                256016    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 259,233
Trainable params: 259,233
Non-trainable params: 0
_________________________________________________________________


In [None]:
EPOCHS = 5
history_char_embeds = word_embed_model.fit(
    x=train_char_padded_sequences,
    y=train_labels,
    epochs=EPOCHS,
    validation_data=(val_char_padded_sequences, val_labels),
)

Epoch 1/5
Epoch 2/5
104/667 [===>..........................] - ETA: 3s - loss: 0.5646 - accuracy: 0.6971

### Model 2: RNN at character and Word tokens

In [None]:
word_GRU_model = Sequential(
    [
        layers.Embedding(
            input_dim=W_VOCAB,
            output_dim=EMBED_DIM,
            input_length=W_MAX_LEN,
        ),
        layers.Bidirectional(layers.GRU(16, return_sequences=True)),
        layers.Bidirectional(layers.GRU(16)),
        layers.Dense(units=16, activation="relu"),
        layers.Dense(units=1, activation="sigmoid"),
    ]
)

word_GRU_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)
word_GRU_model.summary()

In [None]:
EPOCHS = 5
history_word_rnn = word_rnn_model.fit(
    x=train_word_padded_sequences,
    y=train_labels,
    epochs=EPOCHS,
    validation_data=(val_word_padded_sequences, val_labels),
)

In [None]:
char_rnn_model = Sequential(
    [
        layers.Embedding(
            input_dim=C_VOCAB,
            output_dim=EMBED_DIM,
            input_length=C_MAX_LEN,
        ),
        layers.Bidirectional(layers.RNN(units=16, return_sequences=True)),
        layers.Bidirectional(layers.RNN(units=16))
        layers.Dense(units=16, activation="relu"),
        layers.Dense(units=1, activation="sigmoid"),
    ]
)

char_embed_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)
char_embed_model.summary()