In [2]:
import keras
import os
import pandas as pd
import urllib

from gensim.models import Word2Vec
from keras.layers import LSTM
from keras.layers.core import Dense, Activation, Flatten, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from nltk.tokenize import word_tokenize


Using Theano backend.


In [14]:
# Download annotated comments and annotations. 
# If you're Tracy, Courtney, or Amandalynne, don't run this step 
# because you already have the data! If you aren't us, you will 
# probably need to do this step. 
# It will take a while. 
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7038044' 
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7383751' 


def download_file(url, fname):
    urllib.request.urlretrieve(url, fname)

                
download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

In [4]:
class SentenceGenerator():
        def __init__(self, df):
            self.df = df
            self.sentences = []
            self.vocab = set()
        def gen_sentences(self):
            for sentence in self.df['comment']:
                tokens = word_tokenize(sentence)
                self.sentences.append(tokens)
                self.vocab.update(tokens)
            return self.sentences


In [5]:
# Read the data into a Pandas dataframe.
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')

# Label a comment as an attack if over half of annotators did so.
# We can tinker with this threshold later.
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

# Join labels and comments
comments['attack'] = labels

# Preprocess the data -- remove newlines, tabs, quotes
# Something to consider: remove Wikipedia style markup (::'s and =='s)
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("`", " "))

In [47]:
# Grab the training data (seems to be 60%)
train_data = comments.loc[comments['split'] == 'train']
valid_data = comments.loc[comments['split'] == 'dev']
test_data = comments.loc[comments['split'] == 'test']


In [48]:
train = SentenceGenerator(train_data)
train_sentences = train.gen_sentences()[:11000]

valid = SentenceGenerator(valid_data)
valid_sentences = valid.gen_sentences()[:3000]

test = SentenceGenerator(test_data)
test_sentences = test.gen_sentences()[:3000]
print(len(train_sentences))

11000


In [49]:
# max number of words found in all docs
#vocab_size = len(train.vocab)
vocab_size = 5000
print(vocab_size)

5000


In [51]:
wordvec_model = Word2Vec(sentences=train_sentences, size=200, sg=1, max_vocab_size=vocab_size)

In [52]:
import numpy as np

def vectorize_sentences(sentences, w2v_model, vocab_size):
    num_sentences = len(sentences) *0.5
    X = np.zeros((num_sentences, vocab_size, 200), dtype=np.float32)
    
    empty_word = np.zeros((200), dtype=np.float32)
    for idx, s in enumerate(sentences):
        if idx == num_sentences:
            break
        for jdx, word in enumerate(s):
            if jdx == vocab_size:
                break 
            else:
                if word in w2v_model:
                    X[idx, jdx, :] = w2v_model[word]
                else:
                    X[idx, jdx, :] = empty_word
    yield X



In [53]:
train_labels = train_data["attack"]
train_labels = train_labels.iloc[:11000]
valid_labels = valid_data["attack"]
valid_labels = valid_labels.iloc[:3000]
test_labels = test_data["attack"]
test_labels = test_labels.iloc[:3000]
print(train_labels.shape, valid_labels.shape, test_labels.shape)

(11000,) (3000,) (3000,)


In [54]:
train_X = list(vectorize_sentences(train_sentences, wordvec_model, vocab_size))
valid_X = list(vectorize_sentences(valid_sentences, wordvec_model, vocab_size))
test_X = list(vectorize_sentences(test_sentences, wordvec_model, vocab_size))

In [55]:
print(train_X.shape, valid_X.shape, test_X.shape)
print(train_labels.shape)

AttributeError: 'list' object has no attribute 'shape'

In [89]:
# emulating the model by having 200 "hashes" with 50 layers
model = Sequential()
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Dense(200, input_shape=(vocab_size, 200)))
model.add(Flatten())
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [90]:
# don't know if this works yet because patas is down.
# Train model
model.fit(train_X, train_labels, batch_size=128, nb_epoch=4)
# Evaluate model
#score, acc = model.evaluate(X_test, Y_test, batch_size=128)
    
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)



ValueError: Error when checking model target: expected flatten_11 to have shape (None, 11161200) but got array with shape (11238, 1)