### Import libraries and dependencies

In [1]:
import json
import tensorflow as tf

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Load data

In [4]:
from google.colab import files
uploaded = files.upload()

Saving sarcasm_meta.tsv to sarcasm_meta.tsv
Saving sarcasm_vecs.tsv to sarcasm_vecs.tsv
Saving sarcasm.json to sarcasm.json


In [7]:
with open("sarcasm.json", 'r') as f:
    datastore = json.load(f)

In [8]:
sentences = []
labels = []
urls = []

for item in datastore:
    sentences.append(item['headline'])
    labels.append(item['is_sarcastic'])
    urls.append(item['article_link'])

#### Initial setting of hyperparameters

In [9]:
#Initial setting of hyperparameters at the start of experimentation

# vocab_size = 10000
# embedding_dim = 16
# max_length = 32
# trunc_type = 'post'
# padding_type = 'post'
# oov_token = "<OOV>"
# training_size = 20000

### Set values for hyperparameters

In [10]:
#updated hypermater values in the final form
vocab_size = 1000
embedding_dim = 16
max_length = 16
trunc_type = 'post'
padding_type = 'post'
oov_token = "<OOV>"
training_size = 20000

### Preprocess data

In [11]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]

training_labels = labels[0:training_size]
testing_labels = labels [training_size:]

In [12]:
tokenzier = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenzier.fit_on_texts(training_sentences)
word_index = tokenzier.word_index
sequences = tokenzier.texts_to_sequences(training_sentences)
padded_training = pad_sequences(sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)

In [13]:
testing_sequences = tokenzier.texts_to_sequences(testing_sentences)
padding_testing = pad_sequences(testing_sequences, truncating=trunc_type, padding=padding_type, maxlen=max_length)

In [15]:
word_index = tokenizier.word_index
print(len(word_index))
print(word_index)

NameError: ignored

### Model Architecture

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
model.summary()

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
num_epochs=30
history = model.fit(padded_training, training_labels, epochs=num_epochs, validation_data=(padding_testing, testing_labels))

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

def plot_graph(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel('epochs')
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()
    
plot_graph(history, 'acc')
plot_graph(history, 'loss')

### Save Embedding for visualization

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_headline(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [None]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

In [None]:
import io

out_v = io.open('data/sarcasm_vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('data/sarcasm_meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + '\n')
    out_v.write('\t'.join(str(x) for x in embeddings) + '\n')
    
out_v.close()
out_m.close()