This is a very simple feed-forward neural network, written in a few hours, that classifies news headlines as sarcastic or not. As Andrew Ng said, the best thing to do is to immediately create a simple model and then add to it. I will attempt to create a more advanced model later.

In [2]:
import json

# The dataset is stored as JSON Lines: one JSON object per line.
# A context manager guarantees the file handle is closed, and UTF-8 (the JSON
# standard encoding) is requested explicitly instead of the platform default.
data = []
with open('Sarcasm_Headlines_Dataset.json', 'r', encoding='utf-8') as dataset_file:
    for line in dataset_file:
        if line.strip():  # tolerate blank lines (e.g. a trailing newline at EOF)
            data.append(json.loads(line))

In [143]:
# Split every record into its headline text and its 0/1 sarcasm label.
titles = [record['headline'] for record in data]
y_vals = [record['is_sarcastic'] for record in data]
    

In [25]:
#Let's now do some preprocessing

import nltk
nltk.download('punkt')
from nltk import word_tokenize

# Tokenize each headline with NLTK's word tokenizer.
titles_tokenized = [word_tokenize(headline) for headline in titles]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Obdyg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [31]:
# Keep only purely alphabetic tokens: str.isalpha() rejects punctuation and
# also any token containing a digit.
titles_an = [
    [token for token in tokens if token.isalpha()]
    for tokens in titles_tokenized
]

In [30]:
# Spot-check the first headline after filtering.
titles_an[0]

['former',
 'versace',
 'store',
 'clerk',
 'sues',
 'over',
 'secret',
 'code',
 'for',
 'minority',
 'shoppers']

In [32]:
#Let's now stem the words
from nltk.stem.porter import PorterStemmer
# Reduce every token to its Porter stem so inflected forms share one entry.
porter = PorterStemmer()
titles_preprocessed = [
    [porter.stem(token) for token in tokens]
    for tokens in titles_an
]

In [33]:
# Spot-check the first headline after stemming.
titles_preprocessed[0]

['former',
 'versac',
 'store',
 'clerk',
 'sue',
 'over',
 'secret',
 'code',
 'for',
 'minor',
 'shopper']

In [62]:
# Flatten all stemmed headlines into one long list of words; it is the input
# for counting word frequencies and picking the 10,000 most frequent ones.
word_list = [word for stems in titles_preprocessed for word in stems]

In [63]:
from collections import Counter #Here, we create a counter
# Count how many times each stemmed word occurs across all headlines.
freq_list = Counter(word_list)

In [64]:
# most_common() returns (word, count) pairs ordered from most to least frequent.
dictionary = freq_list.most_common(10000) #Get the 10,000 most common words

In [77]:
# Keep only the words (drop the counts), most frequent first.
# The tuple is rebuilt directly from freq_list so this cell gives the same
# result every time it is run; unzipping `dictionary` in place would mangle it
# on a second run and raise IndexError when there are no words at all.
dictionary = tuple(word for word, _count in freq_list.most_common(10000))

In [88]:
# The vocabulary of the 10,000 most common words is ready. Each headline will
# be converted to a list of vocabulary indices so it can be fed to the neural
# network; these are the candidate indices 0..9999.
nums = range(10000)

In [104]:
# Map each vocabulary word to its integer index (0 = most frequent word).
word_int = {word: index for word, index in zip(dictionary, nums)}

In [117]:
# Encode each headline as a list of vocabulary indices; words that are not in
# the vocabulary are silently skipped.
x_vals = [
    [word_int[token] for token in tokens if token in word_int]
    for tokens in titles_preprocessed
]

In [170]:
#Now, let's format the data for the Neural Network and divide the training, validation, and test sets
import numpy as np

# Headlines differ in length, so x_vals is a ragged list of lists.
# dtype=object stores one Python list per row; without it, recent NumPy
# releases refuse to build an array from ragged input.
x = np.array(x_vals, dtype=object)
test_data = x[:5000]   # the first 5,000 headlines are held out for testing
train_data = x[5000:]  # the remainder is used for training and validation

def vectorize_sequences(sequences, dimension=10000, dtype=np.float64):
    """Multi-hot encode sequences of integer indices.

    Args:
        sequences: Sized iterable whose items are sequences of integer
            column indices (for example lists of word indices).
        dimension: Number of columns in the output, i.e. the vocabulary size.
        dtype: NumPy dtype of the returned array. Defaults to float64, which
            is what ``np.zeros`` produces on its own; pass ``np.float32`` to
            halve the memory used by large inputs.

    Returns:
        Array of shape ``(len(sequences), dimension)`` where entry ``[i, j]``
        is 1 if index ``j`` occurs in ``sequences[i]`` and 0 otherwise.
        Repeated indices within a sequence still yield 1, not a count.

    Raises:
        IndexError: If an index is out of bounds for ``dimension``.
    """
    results = np.zeros((len(sequences), dimension), dtype=dtype)
    for i, sequence in enumerate(sequences):
        # Fancy indexing sets every listed column of row i in one assignment.
        results[i, sequence] = 1.
    return results

# Multi-hot encode the index lists; with the default dimension each row has
# 10,000 columns, one per vocabulary word.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

from keras.utils.np_utils import to_categorical

# Labels as float32, split at the same 5,000-row boundary as the features.
y = np.asarray(y_vals, dtype='float32')
y_test, y_train = y[:5000], y[5000:]

In [171]:
# Carve a validation set out of the training data: the first 5,000 rows are
# used for validation and the rest for the partial training run.
x_val, x_partial_train = x_train[:5000], x_train[5000:]
y_val, y_partial_train = y_train[:5000], y_train[5000:]

In [157]:
#Prevent Tensorflow from allocating my entire GPU
import keras

import tensorflow as tf
# Let TensorFlow grow its GPU allocation on demand instead of reserving all
# GPU memory up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)
# Register the configured session with Keras so the models below run in it;
# otherwise Keras lazily creates its own default session.
keras.backend.set_session(session)

In [202]:
#Now, let us define our model
from keras import models
from keras import layers

# Two small ReLU hidden layers on top of the 10,000-wide multi-hot input,
# followed by a single sigmoid unit for the binary sarcastic / not-sarcastic
# prediction.
model = models.Sequential([
    layers.Dense(32, activation='relu', input_shape=(10000,)),
    layers.Dense(4, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [203]:
# Train on the partial training set while monitoring the validation set, to
# see after how many epochs the model starts to overfit.
history = model.fit(
    x_partial_train,
    y_partial_train,
    epochs=20,
    batch_size=512,
    validation_data=(x_val, y_val),
)

Train on 16709 samples, validate on 5000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [204]:
# Rebuild the same architecture from scratch and train it for 4 epochs on the
# full training set (validation rows included) before scoring on the test set.
model = models.Sequential([
    layers.Dense(32, activation='relu', input_shape=(10000,)),
    layers.Dense(4, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.fit(x_train, y_train, epochs=4, batch_size=512)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x287aba25940>

In [205]:
# Score the retrained model on the held-out test set. With the compile
# settings used above, evaluate() returns [loss, accuracy].
results = model.evaluate(x_test, y_test)
results



[0.35768395495414734, 0.8456]