A text-based sarcasm detector trained on the reddit sarcasm dataset.

In [1]:
import keras
import tensorflow as tf
import numpy as np
import pandas as pd
import pickle
import bcolz
import re
import h5py

Using TensorFlow backend.


In [2]:
SARC_train_path = 'data/train-balanced.csv'
SARC_test_path = 'data/test-balanced.csv'
# This refers to the glove embedding size
DIM_SIZE = 100

# Get Glove Embeddings

In [3]:
def save_array(fname, arr):
    c=bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()

In [4]:
def load_array(fname):
    return bcolz.open(fname)[:]

In [5]:
def unpack_glove():
    path = 'glove/'
    name = '6B.100d'
    res_path = 'glove/pickled/'
    with open(path+ 'glove.' + name + '.txt', 'r') as f: lines = [line.split() for line in f]
    words = [d[0] for d in lines]
    vecs = np.stack(np.array(d[1:], dtype=np.float32) for d in lines)
    wordidx = {o:i for i,o in enumerate(words)}
    save_array(res_path+name+'.dat', vecs)
    pickle.dump(words, open(res_path+name+'_words.pkl','wb'))
    pickle.dump(wordidx, open(res_path+name+'_idx.pkl','wb'))

In [8]:
# Only run this once
# unpack_glove()

In [6]:
def load_glove():
    loc = 'glove/pickled/6B.100d'
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb')),
        pickle.load(open(loc+'_idx.pkl','rb')))

In [7]:
glove_vecs, glove_words, glove_word2id = load_glove()

# Get sarcasm training data

In [None]:
# Use this command in shell to take a peak at the beginning of a large file:
# "head -10 <large file>"

In [8]:
train_tbl = pd.read_csv(SARC_train_path,
                       names=["label", "comment", "author", "subreddit", "score", "ups", "downs", "date", "created_utc", "parent_comment"],
                       usecols=range(0, 2),
                       header=None,
                       encoding="ISO-8859-1",
                       sep='\t',
                       dtype={"label": int, "comment": object})

In [34]:
train_tbl.head()

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.


In [9]:
# Looks good! But it also looks like it's sorted and I don't want that, so let's shuffle
train_tbl = train_tbl.sample(frac=1).reset_index(drop=True)

In [36]:
train_tbl.head()

Unnamed: 0,label,comment
0,1,But we live in a completely black and white wo...
1,1,Thats to funny!
2,1,But... it's Danny Devito... and you're on Redd...
3,1,Yeah because that will really take away all th...
4,1,Yeah this tv phenomenom is ALWAYS over looked.


In [11]:
train_lbls = train_tbl.ix[:,0].values
train_coms = train_tbl.ix[:,1].values.astype(str).tolist()

# Get sarcasm test data

In [12]:
test_tbl = pd.read_csv(SARC_test_path,
                       names=["label", "comment", "author", "subreddit", "score", "ups", "downs", "date", "created_utc", "parent_comment"],
                       usecols=range(0, 2),
                       header=None,
                       encoding="ISO-8859-1",
                       sep='\t')

In [13]:
test_tbl = test_tbl.sample(frac=1).reset_index(drop=True)

In [15]:
test_lbls = test_tbl.ix[:,0].values
test_coms = test_tbl.ix[:,1].values.astype(str).tolist()

# Tokenize text and turn it into sequences

In [16]:
vocab_size = 10000
comment_length = 200

# Tokenize comments, turn them into sequences and pad them
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_coms)
train_seqs = tokenizer.texts_to_sequences(train_coms)
SARC_word2id = tokenizer.word_index
print(len(SARC_word2id))

train_seqs = pad_sequences(train_seqs, maxlen=comment_length)

180583


In [17]:
tokenizer.fit_on_texts(test_coms)
test_seqs = tokenizer.texts_to_sequences(test_coms)
test_seqs = pad_sequences(test_seqs, maxlen=comment_length)

# Get positive and negative sentiment words

In [18]:
pos = pd.read_csv('data/positive.txt', header=None)
neg = pd.read_csv('data/negative.txt', header=None)

# Build embedding matrix

When you add the sentiments into the embeddings, it takes a long time.

Save it the first time and then load it every time after that.

In [60]:
# Build embedding matrix without sentiments
embed_mat = np.zeros((len(SARC_word2id) + 1, DIM_SIZE))
for word, i in SARC_word2id.items():
    # Use .get() instead of [] so it will return None if key
    # is not present
    embed_id = glove_word2id.get(word)
    if embed_id == None:
        # If it's not in glove, then use random vector
        embed_vec = np.random.normal(0.6, size=(DIM_SIZE,))
    else:
        embed_vec = glove_vecs[embed_id]
        
    embed_mat[i,:] = embed_vec

embed_mat[-1,:] = np.random.normal(0.6, size=(DIM_SIZE,)) / 3.0

In [54]:
# Build embedding matrix with sentiments
embed_mat = np.zeros((len(SARC_word2id) + 1, DIM_SIZE + 1))
for word, i in SARC_word2id.items():
    # Use .get() instead of [] so it will return None if key
    # is not present
    embed_id = glove_word2id.get(word)
    embed_vec = np.empty((DIM_SIZE + 1,))
    if embed_id == None:
        # If it's not in glove, then use random vector
        embed_vec[:DIM_SIZE] = np.random.normal(0.6, size=(DIM_SIZE,))
    else:
        embed_vec[:DIM_SIZE] = glove_vecs[embed_id]
    
    # Append sentiment signifier
    if pos[0].str.contains(word).any():
        np.append(embed_vec, 0.5)
    elif neg[0].str.contains(word).any():
        np.append(embed_vec, -0.5)
    else:
        np.append(embed_vec, 0.0)
        
    embed_mat[i,:] = embed_vec
    
    # Update on progress
    if (i % 10000 == 0):
        print(i, " embeddings constructed")

embed_mat[-1,:DIM_SIZE] = np.random.normal(0.6, size=(DIM_SIZE,)) / 3.0
embed_mat[-1,-1] = 0.0


10000  embeddings constructed
20000  embeddings constructed
30000  embeddings constructed
40000  embeddings constructed
50000  embeddings constructed
60000  embeddings constructed
70000  embeddings constructed
80000  embeddings constructed
90000  embeddings constructed
100000  embeddings constructed
110000  embeddings constructed
120000  embeddings constructed
130000  embeddings constructed
140000  embeddings constructed
150000  embeddings constructed
160000  embeddings constructed
170000  embeddings constructed
180000  embeddings constructed


In [55]:
# Save embedding matrix so you don't have to re-build it every time
# with sentiments
outfile = 'data/embed.npy'
np.save(outfile, embed_mat)

In [None]:
# Load embedding matrix with sentiments
outfile = 'data/embed.npy'
embed_mat = np.load(outfile)

# Build the model

In [25]:
from keras import backend as K
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers.core import Dropout, Dense, Flatten
from keras.layers.convolutional import Convolution1D
from keras.layers import Embedding, merge
from keras.layers.pooling import MaxPooling1D
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization

In [65]:
K.clear_session()

In [66]:
# Add one to dim_size if you're including sentiments
Embed_Layer = Embedding(len(SARC_word2id)+1,
                            DIM_SIZE,
                            weights=[embed_mat],
                            input_length=comment_length,
                            trainable=False)

In [67]:
# Keras model
model = Sequential([
    Embed_Layer,
    Convolution1D(128, 3, padding='same', activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    Dropout(0.5),
    MaxPooling1D(),
    Convolution1D(256, 3, padding='same', activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    BatchNormalization(),
    Dropout(0.5),
    MaxPooling1D(),
    Convolution1D(512, 3, padding='same', activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
    BatchNormalization(),
    Flatten(),
    Dense(100, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01)),
#     BatchNormalization(),
#     Dropout(0.7),
    Dense(1, activation='sigmoid', kernel_regularizer=keras.regularizers.l2(0.01))])

In [68]:
# Note: may have to weight this loss if the classes aren't balanced
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

model.fit(train_seqs, train_lbls, validation_data=(test_seqs, test_lbls), epochs=2, batch_size=64)

Train on 1010826 samples, validate on 251608 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9f38f84dd8>

In [144]:
# Now allow it to train embedding as well
model.layers[0].trainable=True
model.optimizer.lr=1e-4

In [146]:
model.fit(train_seqs, train_lbls, validation_data=(test_seqs, test_lbls), epochs=2, batch_size=64)

In [112]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 100)          18058400  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 200, 128)          64128     
_________________________________________________________________
batch_normalization_1 (Batch (None, 200, 128)          512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 128)          0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 100, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 100, 128)          82048     
_________________________________________________________________
batch_normalization_2 (Batch (None, 100, 128)          512       
__________

In [69]:
def preprocess_for_prediction(text):
    
    text = text.lower()
    
    tokenizer.fit_on_texts([text])
    text_seqs = tokenizer.texts_to_sequences([text])
    text_seqs = pad_sequences(text_seqs, maxlen=comment_length)
    
    return text_seqs

In [71]:
sample_sarcastic = "Isn't getting cheated on great? I just love it."

In [72]:
sample_normal = "Your dog is beautiful, I'd like to get a labrador some day as well."

In [73]:
sample_in = preprocess_for_prediction(sample_sarcastic)
prediction = model.predict(sample_in)
print(prediction)

[[ 0.36336055]]


In [74]:
sample_in = preprocess_for_prediction(sample_normal)
prediction = model.predict(sample_in)
print(prediction)

[[ 0.51159787]]


# Save model

In [None]:
model_path = 'model/'
model.save_weights(model_path+'SARC_100.h5')