# Deep CNN-LSTM with Word Embeddings for News Headline Sarcasm Detection
Created by Paul K. Mandal

This code is based on a paper that I submitted to ITNG 2019 titled "Deep CNN-LSTM with Word Embeddings for News Headline Sarcasm Detection." Unfortunately, the code was wiped from my computer before I pushed it to GitHub. Thus, there might be some slight variations in hyperparameters from what was outlined in the paper.

In [1]:
import json

# Load the dataset. The file is parsed one line at a time, each line being a
# separate JSON object. The context manager guarantees the file handle is
# closed, even if a line fails to parse.
data = []
with open('Sarcasm_Headlines_Dataset.json', 'r') as dataset_file:
    for line in dataset_file:
        data.append(json.loads(line))

In [2]:
# Split every record into its headline text and its sarcasm label.
# Both lists are built in the same order, so they stay index-aligned.
titles = [record['headline'] for record in data]
y_vals = [record['is_sarcastic'] for record in data]
    

In [3]:
# Preprocessing: break each headline into word-level tokens with NLTK.

import nltk
nltk.download('punkt')  # fetches the 'punkt' package (reported as up-to-date when already present)
from nltk import word_tokenize

# One token list per headline, in the same order as `titles`.
titles_tokenized = [word_tokenize(headline) for headline in titles]

[nltk_data] Downloading package punkt to /home/paul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Keep only purely alphabetic tokens (str.isalpha). Punctuation, numbers and
# tokens that mix letters with other characters are dropped.
titles_an = [
    [token for token in tokens if token.isalpha()]
    for tokens in titles_tokenized
]

In [5]:
# Spot-check: show the filtered tokens of the first headline.
titles_an[0]

['former',
 'versace',
 'store',
 'clerk',
 'sues',
 'over',
 'secret',
 'code',
 'for',
 'minority',
 'shoppers']

In [6]:
# Reduce every remaining token to its Porter stem (e.g. 'shoppers' -> 'shopper').
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
titles_preprocessed = [
    [porter.stem(token) for token in tokens]
    for tokens in titles_an
]

In [7]:
# Spot-check: show the stemmed tokens of the first headline.
titles_preprocessed[0]

['former',
 'versac',
 'store',
 'clerk',
 'sue',
 'over',
 'secret',
 'code',
 'for',
 'minor',
 'shopper']

In [8]:
# Both preprocessing helpers are imported from tensorflow.keras so this cell
# does not mix the standalone `keras` package with the copy bundled in
# TensorFlow.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 20        # every headline is later padded/truncated to this many word indices
max_words = 10000  # vocabulary cap passed to the tokenizer as `num_words`

# Build the vocabulary from the stemmed headlines (each headline is already a
# list of tokens), then map every headline to a list of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(titles_preprocessed)
sequences = tokenizer.texts_to_sequences(titles_preprocessed)

In [9]:
# Spot-check: integer word indices produced for the first headline.
sequences[0]

[370, 9791, 715, 2738, 1284, 55, 321, 1892, 5, 1285, 3308]

In [10]:
# Sanity check: decode the integer sequences back into text with the same
# tokenizer and display the first decoded headline.
text = tokenizer.sequences_to_texts(sequences)
text[0]

'former versac store clerk sue over secret code for minor shopper'

In [11]:
# Full word -> integer index mapping learned by the tokenizer. It is used later
# to fill the embedding matrix, which only keeps indices below `max_words`.
word_index = tokenizer.word_index
print('Found %s unique tokens.' %len(word_index))

Found 16436 unique tokens.


In [12]:
# Bring every sequence to exactly `maxlen` entries so the data forms one 2-D array.
# No `padding`/`truncating` arguments are passed, so pad_sequences uses its
# library defaults (see the Keras pad_sequences documentation).
x_vals = pad_sequences(sequences, maxlen=maxlen)

In [13]:
# Report the (num_headlines, maxlen) shape of the padded index array.
print('shape of data tensor:', x_vals.shape)

shape of data tensor: (26709, 20)


In [14]:
# Sizes of the held-out splits, named so they can be tuned in one place
# instead of repeating the literal 5000 in every slice.
TEST_SIZE = 5000
VAL_SIZE = 5000

y = np.asarray(y_vals).astype('float32')

# The split below is purely positional, so features and labels must be row-aligned.
assert len(x_vals) == len(y), "features and labels have different lengths"

# Test split: the first TEST_SIZE rows. Everything after them is training data.
# NOTE(review): no shuffling happens before slicing; this assumes the dataset
# order is not correlated with the label -- TODO confirm.
x_test, x_train = x_vals[:TEST_SIZE], x_vals[TEST_SIZE:]
y_test, y_train = y[:TEST_SIZE], y[TEST_SIZE:]

# Validation split: the first VAL_SIZE rows of the training data. The remainder
# is the "partial" training set used while validating.
x_val, x_partial_train = x_train[:VAL_SIZE], x_train[VAL_SIZE:]
y_val, y_partial_train = y_train[:VAL_SIZE], y_train[VAL_SIZE:]

In [15]:
#These two lines need to be uncommented to download the weight embeddings if it has not been done before
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

# Parse the GloVe text file into a dict mapping word -> float32 vector.
# Each line is "<word> <v1> <v2> ...". The context manager closes the file
# even if a malformed line raises during parsing.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype = 'float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [16]:
# Width of the pretrained vectors (the '100d' GloVe file is loaded for them).
embedding_dim = 100

# Row r holds the pretrained vector of the word whose tokenizer index is r.
# Rows stay all-zero for indices with no word or no pretrained vector.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, row in word_index.items():
    # Indices at or beyond the vocabulary cap have no row in the matrix.
    if row >= max_words:
        continue
    vector = embeddings_index.get(token)
    # Words missing from the pretrained vocabulary keep their zero row.
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [21]:
# All Keras symbols come from tensorflow.keras, so the model is never built from
# a mix of the standalone `keras` package and the TensorFlow-bundled one.
# The same names as before are bound (keras, layers, LSTM, Sequential,
# Embedding, Flatten, Dense, Bidirectional).
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, Flatten, LSTM
from tensorflow.keras.models import Sequential

BATCH_SIZE = 128  # mini-batch size used by the fit cells

# CNN-LSTM binary classifier over padded word-index sequences of length `maxlen`.
model = Sequential()
# Embedding over the `max_words` vocabulary, 128 dimensions per token.
# NOTE(review): `embedding_matrix` (100-d GloVe vectors) is built earlier in the
# notebook but is not passed to this layer in the visible code -- confirm
# whether pretrained weights were meant to be loaded here.
model.add(layers.Embedding(max_words, 128, input_length=maxlen))
# First convolution; 'same' padding keeps the sequence length unchanged.
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
# Downsample the sequence by a factor of 5.
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
# Flatten each timestep's features independently before the recurrent layer.
model.add(layers.TimeDistributed(Flatten()))
# Bidirectional LSTM; only the final output of each direction is returned.
model.add(Bidirectional(LSTM(16, recurrent_dropout=0.5)))
# Single sigmoid unit: probability that the headline is sarcastic.
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer = 'adam', loss = "binary_crossentropy", metrics = ['accuracy'])



In [20]:
# Exploratory run: train on the partial training set and report metrics on the
# validation split after every epoch.
# NOTE(review): the recorded execution count of this cell (In [20]) is lower than
# that of the model-definition cell (In [21]), and the recorded run ends in a
# KeyboardInterrupt -- the saved output does not come from a clean top-to-bottom run.
history = model.fit(x_partial_train, y_partial_train, epochs = 20, batch_size = BATCH_SIZE, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 

In [22]:
# Final training run on the whole training portion (x_train also contains the
# rows that served as the validation split), for a short, fixed number of epochs.
#
# Start from freshly initialised weights. On a top-to-bottom run `model` has
# already been fitted by the validation experiment, and fitting it again would
# continue that training rather than train a new model. clone_model rebuilds the
# same architecture with new weights, and the clone must be compiled before use.
model = keras.models.clone_model(model)
model.compile(optimizer = 'adam', loss = "binary_crossentropy", metrics = ['accuracy'])
history = model.fit(x_train, y_train, epochs = 2, batch_size = BATCH_SIZE)

Epoch 1/2
Epoch 2/2


In [23]:
# Score the trained model on the held-out test split.
# NOTE(review): `results` is stored but never displayed by this cell; the
# visible cell output is only the architecture summary printed below.
results = model.evaluate(x_test, y_test)
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 128)           1280000   
                                                                 
 conv1d_4 (Conv1D)           (None, 20, 32)            28704     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 4, 32)            0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 4, 32)             7200      
                                                                 
 time_distributed_2 (TimeDis  (None, 4, 32)            0         
 tributed)                                                       
                                                                 
 bidirectional_2 (Bidirectio  (None, 32)              