In [2]:
import keras
import os
import pandas as pd
import urllib

from gensim.models import Word2Vec
from keras.layers import LSTM
from keras.layers.core import Dense, Activation, Flatten, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from nltk.tokenize import word_tokenize


Using Theano backend.


In [156]:
# Download annotated comments and annotations. 
# If you're Tracy, Courtney, or Amandalynne, don't run this step 
# because you already have the data! If you aren't us, you will 
# probably need to do this step. 
# It will take a while. 
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7038044'
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7383751'


def download_file(url, fname):
    """Download ``url`` and store the result at ``fname``.

    The payload is first written to ``fname + '.part'`` and only moved onto
    ``fname`` once the transfer has completed, so an interrupted or failed
    download never leaves a truncated file under the final name.

    Args:
        url: Address of the resource to fetch.
        fname: Path the finished download is stored under.

    Raises:
        Any error raised by ``urllib.request.urlretrieve`` or by the final
        rename is propagated unchanged; the ``.part`` file is removed first.
    """
    # A bare `import urllib` does not load the `request` submodule, so import
    # it explicitly here instead of relying on another package having done so.
    import urllib.request

    partial_name = str(fname) + '.part'
    try:
        urllib.request.urlretrieve(url, partial_name)
        os.replace(partial_name, fname)
    finally:
        # Only still present when the transfer or the rename did not finish.
        if os.path.exists(partial_name):
            os.remove(partial_name)

                
# Fetch both TSV files into the working directory, comments first.
for source_url, target_name in (
    (ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv'),
    (ANNOTATIONS_URL, 'attack_annotations.tsv'),
):
    download_file(source_url, target_name)

KeyboardInterrupt: 

In [3]:
class SentenceGenerator():
    """Tokenise the ``comment`` column of a table and collect its vocabulary.

    Attributes:
        df: The table passed to the constructor; only ``df['comment']`` is read.
        sentences: One token list per comment, filled by ``gen_sentences``.
        vocab: Set of every distinct token seen by ``gen_sentences``.
    """

    def __init__(self, df):
        """Store ``df``; no tokenising happens until ``gen_sentences`` is called."""
        self.df = df
        self.sentences = []
        self.vocab = set()

    def gen_sentences(self):
        """Tokenise every comment with ``word_tokenize`` and return the token lists.

        The results are rebuilt from scratch on every call, so calling the
        method again (for example by re-running a notebook cell) yields the
        same list instead of appending a second copy of every sentence.

        Returns:
            ``self.sentences``: a list with one list of tokens per comment,
            in the iteration order of ``self.df['comment']``.
        """
        sentences = [word_tokenize(comment) for comment in self.df['comment']]
        vocab = set()
        for tokens in sentences:
            vocab.update(tokens)
        # Publish only after everything tokenised successfully.
        self.sentences = sentences
        self.vocab = vocab
        return self.sentences


In [4]:
# Load the comment texts (indexed by their first column) and the
# per-annotator judgements into Pandas dataframes.
comments = pd.read_csv('attack_annotated_comments.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('attack_annotations.tsv', sep='\t')

# A comment counts as an attack when strictly more than half of its
# annotators flagged it. We can tinker with this threshold later.
attack_share = annotations.groupby('rev_id')['attack'].mean()
labels = attack_share > 0.5

# Attach the labels to the comments; the assignment aligns on the index
# (presumably rev_id, the first TSV column -- confirm against the data).
comments['attack'] = labels


def _strip_placeholders(text):
    """Replace newline/tab placeholder tokens and backticks with a space."""
    # Something to consider: remove Wikipedia style markup (::'s and =='s)
    return text.replace("NEWLINE_TOKEN", " ").replace("TAB_TOKEN", " ").replace("`", " ")


# Preprocess the data -- remove newlines, tabs, quotes
comments['comment'] = comments['comment'].apply(_strip_placeholders)

In [5]:
# Partition the comments by their predefined `split` value
# (the training portion seems to be 60%).
split_of = comments['split']
train_data = comments.loc[split_of == 'train']
valid_data = comments.loc[split_of == 'dev']
test_data = comments.loc[split_of == 'test']


In [6]:
# Tokenise each split; every generator keeps the token lists and the set of
# distinct tokens of its own split.
train, valid, test = (
    SentenceGenerator(frame) for frame in (train_data, valid_data, test_data)
)
train_sentences, valid_sentences, test_sentences = (
    generator.gen_sentences() for generator in (train, valid, test)
)



In [7]:
# Number of distinct tokens seen in the training comments.
distinct_train_tokens = train.vocab
vocab_size = len(distinct_train_tokens)
print(vocab_size)

55806


In [8]:
# Train 200-dimensional word2vec embeddings (sg=1 selects skip-gram) on the
# training sentences only.
w2v_options = dict(size=200, sg=1)
wordvec_model = Word2Vec(sentences=train_sentences, **w2v_options)
# Keep a handle on the model's embedding weights.
weights = wordvec_model.syn0

In [9]:
import numpy as np

def vectorize_sentences(sentences, w2v_model, vocab_size, embedding_dim=200):
    """Turn tokenised sentences into a dense, zero-padded array of word vectors.

    Args:
        sentences: Sequence of sentences, each an iterable of tokens.
        w2v_model: Word-vector lookup supporting ``token in w2v_model`` and
            ``w2v_model[token]``; each vector must have ``embedding_dim``
            entries.
        vocab_size: Number of token slots reserved per sentence. Despite its
            name, this is the size of the second axis of the result: longer
            sentences are truncated to it and shorter ones are zero-padded.
        embedding_dim: Length of each word vector (defaults to 200, the
            value that was previously hard-coded).

    Returns:
        ``float32`` array of shape ``(len(sentences), vocab_size,
        embedding_dim)``. Slots for tokens missing from ``w2v_model`` and
        padding slots are all zeros.
    """
    # NOTE(review): memory grows as len(sentences) * vocab_size * embedding_dim;
    # passing the real vocabulary size here makes the array enormous -- confirm
    # that callers intend a per-sentence length cap instead.
    X = np.zeros((len(sentences), vocab_size, embedding_dim), dtype=np.float32)

    for idx, tokens in enumerate(sentences):
        # zip() stops after vocab_size tokens, which truncates long sentences.
        for jdx, word in zip(range(vocab_size), tokens):
            if word in w2v_model:
                X[idx, jdx, :] = w2v_model[word]
            # Unknown words keep the zeros the array was initialised with.
    return X



In [10]:
# Pull the `attack` column of each split to use as the targets.
train_labels, valid_labels, test_labels = (
    split["attack"] for split in (train_data, valid_data, test_data)
)
print(train_labels.shape)

(11238,)


In [11]:
# Convert every split's token lists into padded word-vector arrays.
train_X, valid_X, test_X = (
    vectorize_sentences(split_sentences, wordvec_model, vocab_size)
    for split_sentences in (train_sentences, valid_sentences, test_sentences)
)

In [12]:
# Show the array shape of each split, in train / validation / test order.
print(*(features.shape for features in (train_X, valid_X, test_X)))

(11238, 55806, 200) (3817, 55806, 200) (3778, 55806, 200)


In [None]:
# Binary attack classifier: an LSTM reads one 200-dimensional word vector per
# timestep, followed by dropout and a single sigmoid output.
# NOTE(review): the LSTM has int(vocab_size*1.5) units and vocab_size timesteps,
# both tied to the vocabulary size -- confirm this is intended, it is very large.
model = Sequential()

model.add(LSTM(int(vocab_size*1.5), input_shape=(vocab_size, 200)))
model.add(Dropout(0.3))
# One output unit: the targets hold a single attack flag per comment, and
# binary_crossentropy needs the output shape to match the target shape.
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [96]:
# Untested so far: patas was down when this cell was written.
# Train model, monitoring the validation split after every epoch.
model.fit(train_X, train_labels, batch_size=128, nb_epoch=4, validation_data=(valid_X, valid_labels))

# Evaluate model on the held-out test split. The arrays are named
# test_X / test_labels in this notebook; X_test / Y_test were never defined.
score, acc = model.evaluate(test_X, test_labels, batch_size=128)

print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)