In [1]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import urllib


from keras.layers.core import Dense, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from sklearn import metrics

Using Theano backend.


In [2]:
# Read the data into a Pandas dataframe.
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')

# Label a comment as an attack if over half of annotators did so.
# We can tinker with this threshold later.
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

# Join labels and comments
comments['attack'] = labels

# Preprocess the data -- remove newlines, tabs, quotes (following Wulczyn)
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("`", " "))

In [3]:
train_data = comments.loc[comments['split'] == 'train']
dev_data = comments.loc[comments['split'] == 'dev']
test_data = comments.loc[comments['split'] == 'test']

# The list of gold-standard labels for the data
train_labels = train_data["attack"].tolist()
dev_labels = dev_data["attack"].tolist()
test_labels = test_data["attack"].tolist()
print(len([x for x in train_labels if x]))

# Put all the training data (comments) into a list
train_texts = train_data["comment"].tolist()
dev_texts = dev_data["comment"].tolist()
test_texts = test_data["comment"].tolist()

65104


In [4]:
# Put all the comments into lists
train_texts = train_data["comment"].tolist()
dev_texts = dev_data["comment"].tolist()
test_texts = test_data["comment"].tolist()

In [5]:
# A set of character unigrams.
char_unigrams = set(''.join(train_texts))

# Note: there are 1557 unique characters.
len(char_unigrams)

1557

In [6]:
def char_ngram(text, n):
    """Create ngrams for a single line."""
    z = []
    text2 = '*'+text+'*'
    for k in range(n, n+1):
        new_info = [text2[i:i+k] for i in range(len(text2)-k+1)]
        z.append(new_info)
    z = [ngram for ngrams in z for ngram in ngrams]
    return " ".join(z)

def create_ngrams(text, n=1):
    """Create ngrams for a whole matrix/list of texts."""
    ngrammed_text = []
    for t in text:
        ngrams = char_ngram(t, n)
        ngrammed_text.append(ngrams)
    return ngrammed_text

In [7]:
train_texts = create_ngrams(train_texts, 1)
dev_texts = create_ngrams(dev_texts, 1)
test_texts = create_ngrams(test_texts, 1)

In [8]:
print(train_texts[0])

*   -   T h i s   i s   n o t       c r e a t i v e     .     T h o s e   a r e   t h e   d i c t i o n a r y   d e f i n i t i o n s   o f   t h e   t e r m s       i n s u r a n c e       a n d       e n s u r a n c e       a s   p r o p e r l y   a p p l i e d   t o       d e s t r u c t i o n     .     I f   y o u   d o n ' t   u n d e r s t a n d   t h a t ,   f i n e ,   l e g i t i m a t e   c r i t i c i s m ,   I ' l l   w r i t e   u p       t h r e e   m a n   c e l l       a n d       b o u n t y   h u n t e r       a n d   t h e n   i t   w i l l   b e   e a s y   t o   u n d e r s t a n d   w h y       e n s u r e d       a n d       i n s u r e d       a r e   d i f f e r e n t   -   a n d   w h y   b o t h   d i f f e r   f r o m       a s s u r e d     .     T h e   s e n t e n c e   y o u   q u o t e   i s   a b s o l u t e l y   n e u t r a l .     Y o u   j u s t   a r e n ' t   f a m i l i a r   w i t h   t h e   u n d e r l y i n g   t h e o r y   o f   s t r i k 

In [9]:
def even_split(comments, labels):
    """Return a slice of the data with randomly selected False (non-attack) data to create an 
    even split with True (attack) data"""
    
    # grab all the attacks to see how many we need to match
    attack_indices = [i for i in range(len(comments)) if labels[i] == True]
    new_training = [comments[i] for i in attack_indices]
    new_labels = [labels[i] for i in attack_indices]
    
    # grab all the ones that are not attacks, shuffle them
    # select the same number of non-attacks
    non_attack_indices = [i for i in range(len(comments)) if labels[i] == False]
    random.shuffle(non_attack_indices)
    for i in range(len(attack_indices)):
        new_training.append(comments[i])
        new_labels.append(labels[i])
    
    return new_training, new_labels

In [10]:

# AUC measure
def auc_score(y_true, y_pred):
    return metrics.roc_auc_score(y_true, y_pred)

# Plot the AUROC
def plot_ROC(fpr, tpr, roc_auc, name):
    plt.figure()
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.savefig("{0}.png".format(name))

In [11]:
# The char level tokenizer
tokenizer = keras.preprocessing.text.Tokenizer()

tokenizer.fit_on_texts(train_texts)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)


1310


In [12]:
tokenizer.fit_on_texts(dev_texts)
tokenizer.fit_on_texts(test_texts)

In [13]:
tokenizer.word_counts

{'ڰ': 2,
 'ह': 23,
 '李': 1,
 '£': 70,
 '合': 2,
 '\xa0': 3188,
 '‟': 3,
 'ਧ': 1,
 'ۻ': 3,
 'ɓ': 1,
 '貢': 3,
 '७': 2,
 'ܘ': 3,
 'ु': 9,
 'ḥ': 3,
 '留': 3,
 '島': 3,
 'ằ': 1,
 'і': 9,
 'ร': 2,
 'я': 94,
 'λ': 50,
 '八': 3,
 'ὂ': 1,
 'グ': 1,
 '✋': 1,
 'ਅ': 2,
 '㊟': 2,
 'ш': 40,
 'ਿ': 4,
 '文': 1,
 'ь': 76,
 'ǖ': 1,
 'ϊ': 1,
 'ম': 1,
 '通': 1,
 '风': 2,
 'は': 6,
 '付': 1,
 '₴': 1,
 '獨': 2,
 'ế': 2,
 'κ': 36,
 '倶': 1,
 '₦': 1,
 'ặ': 2,
 '¨': 30,
 '云': 1,
 '很': 3,
 'ї': 5,
 'ｷ': 1,
 'ੰ': 5,
 '国': 4,
 'ˌ': 13,
 'ة': 41,
 'ή': 17,
 '̞': 1,
 '口': 1,
 '▪': 2,
 '庄': 1,
 'ਚ': 1,
 '路': 1,
 '県': 1,
 '也': 7,
 'り': 3,
 'ギ': 1,
 '衛': 1,
 'ে': 1,
 'س': 38,
 '騁': 1,
 '∞': 11,
 'ư': 6,
 'ḷ': 10,
 '條': 1,
 'व': 5,
 'ɢ': 1,
 '曰': 3,
 'া': 3,
 'チ': 3,
 '⌊': 1,
 '穀': 4,
 '現': 2,
 'њ': 3,
 'य': 8,
 'ż': 32,
 'ʙ': 1,
 '済': 7,
 '第': 1,
 'j': 82139,
 'τ': 26,
 'ῶ': 2,
 'ī': 28,
 'ݜ': 1,
 'ठ': 2,
 'ă': 65,
 'ệ': 1,
 'ె': 1,
 '顧': 1,
 'γ': 19,
 '✰': 11,
 '਼': 4,
 '玉': 4,
 '寧': 1,
 'ு': 2,
 '者': 2,
 'ׁ': 3,
 'ụ': 5,
 'ʋ': 1

In [14]:
def texts_to_matrix(texts, tokenizer):
    """Given a section of the data, return a matrix representing comments"""
    matrix = tokenizer.texts_to_matrix(texts)
    return matrix

In [15]:
# evenly split the data to have equal amounts of attacks 
train_texts, train_labels = even_split(train_texts, train_labels)
dev_texts, dev_labels = even_split(dev_texts, dev_labels)
test_texts, test_labels = even_split(test_texts, test_labels)

In [16]:
train_matrix = texts_to_matrix(train_texts, tokenizer)
dev_matrix = texts_to_matrix(dev_texts, tokenizer)
test_matrix = texts_to_matrix(test_texts, tokenizer)

In [17]:
print(train_matrix[0])

[ 0.  0.  1. ...,  0.  0.  0.]


In [18]:
# Dimensions of our training matrix
train_matrix.shape

(840, 1692)

In [19]:
# Make a model.
model = Sequential()
# Add embedding layer.
# Recall from earlier that char unigram vocab size is 1557.
# Train_matrix shape is 69526 x 1558, use that as input dimension.
model.add(Embedding(vocab_size, 10, input_length=train_matrix.shape[1]))
#model.add(Dense(train_matrix.shape[1], input_dim=train_matrix.shape[1]))
model.add(Flatten())
# Commented out for now bc... the example I was following only had one Dense layer

model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [20]:
model.fit(train_matrix, train_labels,
          batch_size=100,
          epochs=4,
          validation_data=(dev_matrix, dev_labels))

Train on 840 samples, validate on 308 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x1132df5f8>

In [22]:
# Output predictions so that we can run AUC eval
# Remember to recompile the model first before running this
y_pred = model.predict(test_matrix, batch_size=2)
roc_auc = auc_score(test_labels, y_pred)
print('AUROC:{}'.format(roc_auc))

AUROC:0.6319406392694064


In [23]:
fpr, tpr, _ = metrics.roc_curve(test_labels, y_pred)

plot_ROC(fpr, tpr, roc_auc, "unigram-model")