<a href="https://colab.research.google.com/github/https-deeplearning-ai/tensorflow-1-public/blob/master/C3/W3/ungraded_labs/C3_W3_Lab_5_sarcasm_with_bi_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ungraded Lab: Training a Sarcasm Detection Model using Bidirectional LSTMs

In this lab, you will revisit the [News Headlines Dataset for Sarcasm Detection](https://www.kaggle.com/rmisra/news-headlines-dataset-for-sarcasm-detection/home) and use it to train a Bi-LSTM model.


## Download the Dataset

First, you will download the JSON file and extract the contents into lists.

In [None]:
# Download the sarcasm dataset (a JSON file) into the current working
# directory; the next cell reads it back from ./sarcasm.json
!wget https://storage.googleapis.com/tensorflow-1-public/course3/sarcasm.json

In [None]:
import json

# Load the JSON file. The encoding is stated explicitly so the headlines are
# decoded the same way on every platform instead of depending on the
# locale's default text encoding.
with open("./sarcasm.json", 'r', encoding="utf-8") as f:
    datastore = json.load(f)

# Collect the headline text and the sarcasm label of every record into two
# parallel lists: sentences[i] belongs with labels[i]
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]

## Split the Dataset

You will then split the lists into train and test sets.

In [None]:
# Number of leading examples used for training; everything after is held out
training_size = 20000

# Headlines: the first `training_size` entries train, the remainder tests
training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]

# Labels: cut at the same index so they stay aligned with the headlines
training_labels, testing_labels = labels[:training_size], labels[training_size:]

## Data preprocessing

Next, you will generate the vocabulary and padded sequences.

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Preprocessing hyperparameters
vocab_size = 10000       # passed to the Tokenizer as num_words
max_length = 120         # every sequence is padded/truncated to this length
padding_type = 'post'    # padding mode handed to pad_sequences
trunc_type = 'post'      # truncation mode handed to pad_sequences
oov_tok = "<OOV>"        # token used for out-of-vocabulary words

# Build the vocabulary from the training headlines only
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Encode the headlines of both splits as integer sequences
training_sequences = tokenizer.texts_to_sequences(training_sentences)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Bring both splits to a uniform length
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Labels: Python lists -> numpy arrays
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

## Build and Compile the Model

The architecture here is almost identical to the one you used in the previous lab with the IMDB Reviews. Try tweaking the parameters and see how they affect the training time and accuracy (both training and validation).

In [None]:
import tensorflow as tf

# Hyperparameters
embedding_dim = 16
lstm_dim = 32
dense_dim = 24

# Bidirectional-LSTM binary classifier, assembled one layer at a time
model_lstm = tf.keras.Sequential()
model_lstm.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model_lstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_dim)))
model_lstm.add(tf.keras.layers.Dense(dense_dim, activation='relu'))
model_lstm.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Configure loss, optimizer and the metric reported during training
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Show the layer-by-layer summary
model_lstm.summary()

In [None]:

```
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# NOTE(review): the statements below repeat the "Data preprocessing" cell
# earlier in this notebook (same hyperparameters, same tokenizer and padding
# calls). Re-running them re-fits the tokenizer and re-wraps the label arrays.

# Preprocessing hyperparameters
vocab_size = 10000
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

# Initialize the Tokenizer class
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Generate the word index dictionary (fitted on the training headlines only)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Generate and pad the training sequences
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Generate and pad the testing sequences (encoded with the same tokenizer)
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Convert the labels lists into numpy arrays
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)


import tensorflow as tf

# Hyperparameters (embedding_dim and dense_dim are also assigned by the
# Bi-LSTM cell; the values below replace them once this code runs)
embedding_dim = 16
filters = 128
kernel_size = 5
dense_dim = 6

# Conv1D binary classifier, assembled one layer at a time
model_conv = tf.keras.Sequential()
model_conv.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model_conv.add(tf.keras.layers.Conv1D(filters, kernel_size, activation='relu'))
model_conv.add(tf.keras.layers.GlobalMaxPooling1D())
model_conv.add(tf.keras.layers.Dense(dense_dim, activation='relu'))
model_conv.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Configure loss, optimizer and the metric reported during training
model_conv.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Show the layer-by-layer summary
model_conv.summary()
```

In [None]:
```

# Functional-API GRU classifier that takes raw strings as input.
# NOTE(review): Input, layers, Model, text_vec_layer, embed_size, train_set
# and valid_set are not defined in the visible part of this notebook --
# confirm they are in scope before running this code.

# Fix TensorFlow's random seed
tf.random.set_seed(42)
# String input declared with an empty per-example shape
# (presumably one scalar string per example -- TODO confirm against the dataset)
ip=Input(shape=[],dtype=tf.string)
# Pass the input through text_vec_layer (presumably a text-vectorization layer -- verify)
text_vec_ly=text_vec_layer(ip)
embedding_ly=layers.Embedding(vocab_size, embed_size)(text_vec_ly)
gru_ly=layers.GRU(128)(embedding_ly)
# Single sigmoid unit, paired with the binary cross-entropy loss below
op=layers.Dense(1, activation="sigmoid")(gru_ly)
model_fnc_api_shap_empty_arr=Model(ip,op)

# model = model_fnc_api_shap_empty_arr
model_fnc_api_shap_empty_arr.compile(loss="binary_crossentropy", optimizer="nadam",
              metrics=["accuracy"])
# Train for 2 epochs, validating on valid_set
history = model_fnc_api_shap_empty_arr.fit(train_set, validation_data=valid_set, epochs=2)

# Number of passes over the training data
NUM_EPOCHS = 10

# Fit the Conv1D model, validating on the test split
history_conv = model_conv.fit(
    x=training_padded,
    y=training_labels,
    epochs=NUM_EPOCHS,
    validation_data=(testing_padded, testing_labels),
)

```

In [None]:
# Character-level Conv1D classifier built with the functional API
inputs = Input(dtype=tf.string, shape=(1,))
# Vectorize the text inputs
text_vectors = char_vectorizer(inputs)
# Create the embedding
char_embeddings = char_embed(text_vectors)
x = Conv1D(
    filters=128,
    kernel_size=5,
    padding="same",
    activation="relu",
)(char_embeddings)
# Condense the output of the feature vector
x = GlobalAveragePooling1D()(x)
outputs = Dense(units=no_classes, activation="softmax")(x)
model_char_conv = Model(inputs=inputs, outputs=outputs)

# Compile.
# If your labels are in integer form (not one-hot), use
# sparse_categorical_crossentropy instead.
model_char_conv.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

In [None]:
# Token embedding layer; no mask_zero argument is passed
no_mask_embed_ly = Embedding(
    input_dim=token_layer.vocabulary_size(),
    output_dim=embedding_dim,
    input_length=int(percntl_train_95),
)

# 1D convolutional model that processes token sequences
inputs = Input(dtype=tf.string, shape=(1,))
# Vectorize the text inputs
text_vectors = token_layer(inputs)
# Create the embedding
token_embeddings = no_mask_embed_ly(text_vectors)
x = Conv1D(
    filters=64,
    kernel_size=5,
    padding="same",
    activation="relu",
)(token_embeddings)
# Condense the output of the feature vector
x = GlobalAveragePooling1D()(x)
outputs = Dense(units=no_classes, activation="softmax")(x)
model_conv_no_mask_embed_ly = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile.
# If your labels are in integer form (not one-hot), use
# sparse_categorical_crossentropy instead.
model_conv_no_mask_embed_ly.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)
model_conv_no_mask_embed_ly.summary()

# Fit the model on 10% of the batches per epoch (for faster training) and
# validate on 10% of the validation batches
history_model_conv_no_mask_embed_ly = model_conv_no_mask_embed_ly.fit(
    train_dataset,
    steps_per_epoch=int(0.1 * len(train_dataset)),
    epochs=3,
    validation_data=val_dataset,
    validation_steps=int(0.1 * len(val_dataset)),
)

# Evaluate on the full validation dataset
model_conv_no_mask_embed_ly.evaluate(val_dataset)

# Predicted class probabilities for the validation dataset
y_proba_model_conv_no_mask_embed_ly = model_conv_no_mask_embed_ly.predict(val_dataset)
# TODO: look into this
# Index of the largest probability along the last axis
y_pred_model_conv_no_mask_embed_ly = tf.argmax(
    y_proba_model_conv_no_mask_embed_ly,
    axis=-1,
)

## Train the Model

In [None]:
# Number of passes over the training data
NUM_EPOCHS = 10

# Fit the Bi-LSTM model, validating on the test split
history_lstm = model_lstm.fit(
    x=training_padded,
    y=training_labels,
    epochs=NUM_EPOCHS,
    validation_data=(testing_padded, testing_labels),
)

In [None]:
import matplotlib.pyplot as plt

# Plot Utility
def plot_graphs(history, string):
    """Plot a training metric and, when recorded, its validation counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            sequences of values (e.g. the return value of ``model.fit``).
        string: Name of the metric to plot, such as ``'accuracy'`` or
            ``'loss'``. The validation series is looked up as
            ``'val_' + string``.

    Raises:
        KeyError: If ``string`` itself is not a recorded metric.
    """
    metrics = history.history
    val_key = 'val_' + string

    plt.plot(metrics[string])
    legend_labels = [string]

    # The validation series only exists when the model was fitted with
    # validation data; skip it instead of failing with a KeyError.
    if val_key in metrics:
        plt.plot(metrics[val_key])
        legend_labels.append(val_key)

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(legend_labels)
    plt.show()

# Draw one figure per tracked metric: accuracy first, then loss
for metric_name in ('accuracy', 'loss'):
    plot_graphs(history_lstm, metric_name)