In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, CuDNNGRU, Bidirectional
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import gensim.downloader as api
import string
import numpy as np
import json


#######
# Load data
#######
# Path to the dataset; load_data() below reads it as one JSON object per line.
fp = 'Sarcasm_Headlines_Dataset.json'

def load_data(fp):
    """Read a JSON-lines file of headlines into a two-column string array.

    Parameters
    ----------
    fp : str
        Path to a text file holding one JSON object per line. Every object
        must provide the keys 'headline' and 'is_sarcastic'.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, 2) with a string dtype. Column 0 is the
        headline, column 1 is the 'is_sarcastic' value rendered as text
        (e.g. '0' / '1'). An input without records gives shape (0, 2).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks one of the two required keys.
    """
    rows = []
    # Explicit UTF-8 so non-ASCII headlines decode identically on every platform
    # instead of depending on the locale's default encoding.
    with open(fp, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (typically a trailing newline at end of file).
            if not line.strip():
                continue
            record = json.loads(line)
            # Labels are stored as text because a NumPy array has one dtype;
            # mixing str and int would coerce them to strings anyway.
            rows.append([record['headline'], str(record['is_sarcastic'])])
    # reshape keeps the result 2-D even when there are no rows, so callers'
    # column indexing (data[:, 0], data[:, 1]) never fails on rank.
    return np.array(rows, dtype=str).reshape(-1, 2)

# Load the whole dataset once; each model class below is constructed from this array.
data = load_data(fp)

Using TensorFlow backend.


In [5]:
########
# Bernoulli model (naive bayes)
########

class bernoulli_model():
    """Bag-of-words -> TF-IDF -> Bernoulli naive Bayes text classifier."""

    def __init__(self, data):
        """Hold out 30% of ``data`` for testing and assemble the unfitted pipeline.

        ``data`` is indexed as a 2-D array: column 0 is the text, column 1
        the label.
        """
        texts, labels = data[:, 0], data[:, 1]
        # Fixed random_state keeps the split reproducible between runs.
        split = train_test_split(texts, labels, test_size=0.30, random_state=20)
        self.X_train, self.X_test, self.y_train, self.y_test = split

        # NOTE(review): BernoulliNB binarizes its input by default, so the TF-IDF
        # weighting probably has no effect on this classifier -- confirm.
        steps = [
            ('vect', CountVectorizer(stop_words='english')),
            ('tfidf', TfidfTransformer(use_idf=True)),
            ('clf', BernoulliNB()),
        ]
        self.sarcasm_clf = Pipeline(steps)

    def model_results(self):
        """Fit on the training split; print test accuracy and positive-class share."""
        self.sarcasm_clf.fit(self.X_train, self.y_train)
        predicted = self.sarcasm_clf.predict(self.X_test)

        is_hit = predicted == self.y_test
        # Labels are compared against the string '1', i.e. they are expected as text.
        is_positive = self.y_test == '1'

        print('Accuracy is:', np.mean(is_hit))
        print('Positive class ratio is:', np.mean(is_positive))

    def model_GS(self):
        """Grid-search n-gram range, IDF usage and NB smoothing with 5-fold CV.

        Prints the best cross-validated score and the parameters achieving it.
        """
        search_space = {
            'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
            'tfidf__use_idf': (True, False),
            'clf__alpha': (1.0, .1, .01),
        }

        search = GridSearchCV(self.sarcasm_clf, search_space, n_jobs=-1, cv=5)
        search.fit(self.X_train, self.y_train)

        print('GS best score:', search.best_score_)
        print('GS best params:', search.best_params_)

# Build the naive Bayes model, print its held-out metrics, then run the grid search.
m1_bernoulli = bernoulli_model(data)
m1_bernoulli.model_results()
m1_bernoulli.model_GS()

Accuracy is: 0.7984525146636715
Positive class ratio is: 0.44128291526269814
GS best score: 0.7968014548566539
GS best params: {'clf__alpha': 1.0, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


In [4]:
########
# Logistic regression model
########

class logistic():
    """Bag-of-words -> TF-IDF -> logistic regression text classifier."""

    def __init__(self, data):
        """Create a 70/30 train/test split of ``data`` and the unfitted pipeline.

        ``data`` is indexed as a 2-D array: column 0 is the text, column 1
        the label.
        """
        (self.X_train,
         self.X_test,
         self.y_train,
         self.y_test) = train_test_split(
            data[:, 0], data[:, 1], test_size=0.30, random_state=20)

        vectorizer = CountVectorizer(stop_words='english')
        weighting = TfidfTransformer(use_idf=True)
        classifier = LogisticRegression(solver='lbfgs')
        self.sarcasm_clf = Pipeline([('vect', vectorizer),
                                     ('tfidf', weighting),
                                     ('clf', classifier)])

    def model_results(self):
        """Fit on the training split; print test accuracy and positive-class share."""
        fitted = self.sarcasm_clf.fit(self.X_train, self.y_train)
        y_hat = fitted.predict(self.X_test)

        accuracy = np.mean(y_hat == self.y_test)
        # Labels are compared against the string '1', i.e. they are expected as text.
        positive_ratio = np.mean(self.y_test == '1')

        print('Accuracy is:', accuracy)
        print('Positive class ratio is:', positive_ratio)

# Build the logistic-regression model and print its held-out metrics.
m2_logistic = logistic(data)
m2_logistic.model_results()

Accuracy is: 0.7855984025957818
Positive class ratio is: 0.44128291526269814


In [43]:
########
# Using a word embedding learned from scratch (no pre-trained vectors) and an LSTM
########

class LSTM_model():
    """Sarcasm classifier: embedding learned from scratch -> LSTM -> sigmoid unit."""

    def __init__(self, data):
        """Split ``data`` 70/30 and convert the labels to ints.

        ``data`` is indexed as a 2-D array: column 0 holds the headline text
        and column 1 a label that ``int()`` can parse.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = \
                    train_test_split(data[:,0], data[:,1], test_size=0.30, random_state=20)
        self.y_train, self.y_test = list(map(int, self.y_train)), list(map(int, self.y_test))
        self.model = None

    def run_model(self):
        """Clean the headlines, train the network and print test-set metrics.

        Side effects: ``self.X_train`` / ``self.X_test`` are replaced by their
        cleaned versions and the trained network is stored in ``self.model``.
        Prints the ``evaluate`` result on the test split followed by the
        metric names.
        """
        # Both objects are identical for every headline, so build them once
        # instead of once per call to clean_sen.
        stop_words = set(stopwords.words('english'))
        table = str.maketrans('', '', string.punctuation)

        def clean_sen(sen):
            """Lower-case, strip punctuation, keep alphabetic non-stop-words."""
            tokens = [w.lower() for w in word_tokenize(sen)]
            stripped = [w.translate(table) for w in tokens]
            return ' '.join(w for w in stripped
                            if w.isalpha() and w not in stop_words)

        self.X_train = [clean_sen(sen) for sen in self.X_train]
        self.X_test = [clean_sen(sen) for sen in self.X_test]

        # NOTE(review): the vocabulary and max length are derived from train AND
        # test text, as in the original; fit on the training split only if a
        # strictly leakage-free evaluation is required.
        all_data = self.X_train + self.X_test
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(all_data)

        # Max num of words in headline
        max_len = max(len(s.split()) for s in all_data)
        # +1 because the tokenizer's word indices start at 1 (0 is the padding value).
        vocab_size = len(tokenizer.word_index) + 1

        X_train_pad = pad_sequences(tokenizer.texts_to_sequences(self.X_train), maxlen=max_len)
        X_test_pad = pad_sequences(tokenizer.texts_to_sequences(self.X_test), maxlen=max_len)

        # Keras is given arrays rather than Python lists for the targets.
        y_train = np.array(self.y_train)
        y_test = np.array(self.y_test)

        EMBEDDING_DIMS = 50

        self.model = Sequential()
        self.model.add(Embedding(vocab_size, EMBEDDING_DIMS, input_length=max_len))
        self.model.add(LSTM(units=20, dropout=0.2, recurrent_dropout=0.2))
        self.model.add(Dense(1, activation='sigmoid'))
        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # The original passed a bare `y_train`, which only resolved when a global
        # of that name happened to exist in the kernel; use this instance's labels.
        self.model.fit(X_train_pad, y_train, batch_size=128, epochs=3, validation_split=0.2)
        print("Evaluating model against test set:")
        print(self.model.evaluate(X_test_pad, y_test))
        print(self.model.metrics_names)

# Build the LSTM model, train it and print its test-set loss/accuracy.
m3_LSTM = LSTM_model(data)
m3_LSTM.run_model()

Train on 14956 samples, validate on 3740 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Evaluating model against test set:
[0.47915095404770497, 0.8003244727466878]
['loss', 'acc']


In [41]:
#######
# Using pre-trained word vectors (GloVe) and 2 stacked GRUs
#######

class GRU_model():
    """Sarcasm classifier: GloVe-initialised embedding -> 2 stacked GRUs -> sigmoid unit."""

    def __init__(self, data):
        """Split ``data`` 70/30 and convert the labels to ints.

        ``data`` is indexed as a 2-D array: column 0 holds the headline text
        and column 1 a label that ``int()`` can parse.
        """
        self.X_train, self.X_test, self.y_train, self.y_test = \
                    train_test_split(data[:,0], data[:,1], test_size=0.30, random_state=20)
        self.y_train, self.y_test = list(map(int, self.y_train)), list(map(int, self.y_test))
        self.model = None

    def run_model(self):
        """Clean the headlines, train the network and print test-set metrics.

        Side effects: ``self.X_train`` / ``self.X_test`` are replaced by their
        cleaned versions, the "glove-wiki-gigaword-100" vectors are loaded via
        the gensim downloader, and the trained network is stored in
        ``self.model``. Prints the ``evaluate`` result on the test split
        followed by the metric names.
        """
        # Both objects are identical for every headline, so build them once
        # instead of once per call to clean_sen.
        stop_words = set(stopwords.words('english'))
        table = str.maketrans('', '', string.punctuation)

        def clean_sen(sen):
            """Lower-case, strip punctuation, keep alphabetic non-stop-words."""
            tokens = [w.lower() for w in word_tokenize(sen)]
            stripped = [w.translate(table) for w in tokens]
            return ' '.join(w for w in stripped
                            if w.isalpha() and w not in stop_words)

        self.X_train = [clean_sen(sen) for sen in self.X_train]
        self.X_test = [clean_sen(sen) for sen in self.X_test]

        # The original body used `all_data` and `tokenizer` without defining
        # them, so it only ran when those names leaked in from elsewhere in the
        # kernel. They are built here the same way the LSTM cell builds them.
        # NOTE(review): this fits the vocabulary on train AND test text; fit on
        # the training split only if a leakage-free evaluation is required.
        all_data = self.X_train + self.X_test
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(all_data)

        # Max num of words in headline
        max_len = max(len(s.split()) for s in all_data)

        X_train_pad = pad_sequences(tokenizer.texts_to_sequences(self.X_train), maxlen=max_len)
        X_test_pad = pad_sequences(tokenizer.texts_to_sequences(self.X_test), maxlen=max_len)

        # Keras is given arrays rather than Python lists for the targets.
        y_train = np.array(self.y_train)
        y_test = np.array(self.y_test)

        # Width of one embedding row; must match the "-100" vectors loaded below.
        EMBEDDING_DIMS = 100
        pretrained_embedding = api.load("glove-wiki-gigaword-100")
        word_index = tokenizer.word_index
        # +1 because the tokenizer's word indices start at 1 (0 is the padding value).
        num_words = len(word_index) + 1

        # Rows for words missing from the pre-trained vectors stay all-zero.
        embedding_matrix = np.zeros((num_words, EMBEDDING_DIMS))
        for word, index in word_index.items():
            if word in pretrained_embedding:
                embedding_matrix[index] = pretrained_embedding[word]

        # create model
        self.model = Sequential()
        embedding_layer = Embedding(num_words,
                                    EMBEDDING_DIMS,
                                    embeddings_initializer=Constant(embedding_matrix),
                                    input_length=max_len,
                                    trainable=True)

        self.model.add(embedding_layer)
        # NOTE(review): CuDNNGRU presumably requires a CUDA-capable GPU -- confirm
        # before running this cell on a CPU-only machine.
        self.model.add(CuDNNGRU(units=20, return_sequences=True))
        self.model.add(CuDNNGRU(units=20))
        self.model.add(Dense(1, activation='sigmoid'))

        self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # The original passed a bare `y_train` (undefined in this scope); use
        # this instance's labels.
        self.model.fit(X_train_pad, y_train, validation_split=0.2, epochs = 3)
        print("Evaluating model against test set:")
        print(self.model.evaluate(X_test_pad, y_test))
        print(self.model.metrics_names)

# Build the GRU model, train it and print its test-set loss/accuracy.
m4_GRU = GRU_model(data)
m4_GRU.run_model()

Train on 14956 samples, validate on 3740 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Evaluating model against test set:
[0.47489634850610435, 0.8109322351551258]
['loss', 'acc']
