In [2]:
import json

def parse_data(file):
    """Lazily parse a JSON Lines file, yielding one decoded object per line.

    Args:
        file: Path to a text file that holds one JSON document per line.

    Yields:
        The Python object produced by ``json.loads`` for each non-blank line,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle once the generator is exhausted
    # (or closed early) instead of leaving it to the garbage collector.
    # JSON text is UTF-8 by specification, so do not depend on the locale default.
    with open(file, 'r', encoding='utf-8') as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            yield json.loads(line)
# Read the whole dataset into memory as a list of per-headline records.
data = [record for record in parse_data('Sarcasm_Headlines_Dataset.json')]

In [3]:
# Split every record into three parallel lists (same order as `data`):
# the headline text, its sarcasm label, and the link to the source article.
sentences = [record['headline'] for record in data]
labels = [record['is_sarcastic'] for record in data]
urls = [record['article_link'] for record in data]

In [4]:
# Vocabulary / embedding settings (passed to the Tokenizer and Embedding layer).
vocab_size, embedding_dim = 10000, 16
oov_tok = "<OOV>"

# Sequence shaping: `max_length` is the maxlen handed to pad_sequences, and the
# same 'post' mode is used for both truncating and padding.
max_length = 120
trunc_type = padding_type = 'post'

# Number of leading examples used for training; the remainder is held out.
training_size = 20000

In [5]:
# Positional hold-out split: the first `training_size` examples are used for
# training and everything after them is kept for evaluation.
train_sentences, test_sentences = sentences[:training_size], sentences[training_size:]
train_labels, test_labels = labels[:training_size], labels[training_size:]

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Fit the tokenizer on the training headlines only, so the held-out
# headlines do not contribute to the vocabulary.
token = Tokenizer(
    num_words=vocab_size,
    oov_token=oov_tok,
)
token.fit_on_texts(train_sentences)

# Word -> integer index mapping learned from the training text.
word_index = token.word_index

In [7]:
# One set of padding options shared by both splits, so train and test are
# padded/truncated with exactly the same settings.
_pad_options = {
    'maxlen': max_length,
    'truncating': trunc_type,
    'padding': padding_type,
}

# Encode the headlines as integer sequences using the fitted tokenizer.
train_seq = token.texts_to_sequences(train_sentences)
test_seq = token.texts_to_sequences(test_sentences)

# Pad/truncate each sequence according to the shared options.
train_pad = pad_sequences(train_seq, **_pad_options)
test_pad = pad_sequences(test_seq, **_pad_options)

In [10]:
import tensorflow as tf
# Small text classifier: embed each token, average the embeddings over the
# sequence, then pass the result through a hidden layer to a single sigmoid unit.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary target (sarcastic or not), hence binary cross-entropy with a sigmoid output.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [12]:
import numpy as np
# Convert the label lists to NumPy arrays before handing them to fit().
train_labels, test_labels = np.array(train_labels), np.array(test_labels)

num_epochs = 5

# Train on the padded training sequences; the held-out split is passed as
# validation data so its loss/accuracy are reported after every epoch.
history = model.fit(
    train_pad,
    train_labels,
    epochs=num_epochs,
    validation_data=(test_pad, test_labels),
    verbose=2,
)

Train on 20000 samples, validate on 6709 samples
Epoch 1/5
20000/20000 - 7s - loss: 0.6727 - accuracy: 0.5794 - val_loss: 0.6109 - val_accuracy: 0.7578
Epoch 2/5
20000/20000 - 5s - loss: 0.4491 - accuracy: 0.8188 - val_loss: 0.3890 - val_accuracy: 0.8360
Epoch 3/5
20000/20000 - 4s - loss: 0.3180 - accuracy: 0.8736 - val_loss: 0.3583 - val_accuracy: 0.8468
Epoch 4/5
20000/20000 - 3s - loss: 0.2667 - accuracy: 0.8943 - val_loss: 0.3430 - val_accuracy: 0.8565
Epoch 5/5
20000/20000 - 3s - loss: 0.2307 - accuracy: 0.9105 - val_loss: 0.3412 - val_accuracy: 0.8550
