# Training a neural net to classify personal attacks in Wikipedia comments

In [64]:
import keras
import os
import pandas as pd
import urllib


from keras.layers.core import Dense, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer

## Load and pre-process the data.
### This is the exact preprocessing method used by Wulczyn, et al, the curators of the data.

In [None]:
# Download annotated comments and annotations. 
# If you're Tracy, Courtney, or Amandalynne, don't run this step 
# because you already have the data! If you aren't us, you will 
# probably need to do this step. 
# It will take a while. 
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7038044' 
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7383751' 


def download_file(url, fname):
    urllib.request.urlretrieve(url, fname)

                
download_file(ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv')
download_file(ANNOTATIONS_URL, 'attack_annotations.tsv')

In [33]:
# Read the data into a Pandas dataframe.
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')

# Label a comment as an attack if over half of annotators did so.
# We can tinker with this threshold later.
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

# Join labels and comments
comments['attack'] = labels

# Preprocess the data -- remove newlines, tabs, quotes
# Something to consider: remove Wikipedia style markup (::'s and =='s)
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("`", " "))

In [34]:
# Take a look at the comments
# Look: they've already split the data into train / dev / test :) 
comments

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37675,- This is not creative . Those are the di...,2002,False,article,random,train,False
44816,:: the term standard model is itself le...,2002,False,article,random,train,False
49851,"True or false, the situation as of March 200...",2002,False,article,random,train,False
89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev,False
93890,This page will need disambiguation.,2002,True,article,random,train,False
102817,- Important note for all sysops: There is a ...,2002,True,user,random,train,False
103624,I removed the following: All names of early P...,2002,True,article,random,train,False
111032,:If you ever claimed in a Judaic studies prog...,2002,True,article,random,dev,False
120283,"My apologies I'm English, I watch cricket,...",2002,True,article,random,dev,False
128532,"Someone wrote: More recognizable, perhaps, is...",2002,True,article,random,train,False


In [30]:
# Grab the training data (seems to be 60%)
train_data = comments.loc[comments['split'] == 'train']

In [31]:
train_data

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37675,- This is not creative . Those are the di...,2002,False,article,random,train,False
44816,:: the term standard model is itself le...,2002,False,article,random,train,False
49851,"True or false, the situation as of March 200...",2002,False,article,random,train,False
93890,This page will need disambiguation.,2002,True,article,random,train,False
102817,- Important note for all sysops: There is a ...,2002,True,user,random,train,False
103624,I removed the following: All names of early P...,2002,True,article,random,train,False
128532,"Someone wrote: More recognizable, perhaps, is...",2002,True,article,random,train,False
133562,:Correct. Full biographical details will put...,2002,True,article,random,train,False
138117,Care should be taken to distinguish when an...,2002,True,article,random,train,False
192579,:<>> :I fail to see the distinction. Who ...,2002,True,article,random,train,False


In [37]:
# Put all the training data (comments) into a list
train_texts = train_data["comment"].tolist()

In [38]:
# If you want to see a snippet of it for a sanity check:
train_text[:10]

[" - This is not   creative  .  Those are the dictionary definitions of the terms   insurance   and   ensurance   as properly applied to   destruction  .  If you don't understand that, fine, legitimate criticism, I'll write up   three man cell   and   bounty hunter   and then it will be easy to understand why   ensured   and   insured   are different - and why both differ from   assured  .  The sentence you quote is absolutely neutral.  You just aren't familiar with the underlying theory of strike-back (e.g. submarines as employed in nuclear warfare) guiding the insurance, nor likely the three man cell structure that kept the IRA from being broken by the British.  If that's my fault, fine, I can fix that to explain.  But ther'es nothing   personal   or   creative   about it.  I'm tired of arguing with you.  Re: the other article,   multi-party   turns up plenty, and there is more use of   mutually   than   mutual  .  If I were to apply your standard I'd be moving   Mutual Assured Destr

In [29]:
len(train_text)

69526

In [39]:
# A set of character unigrams.
# This number may come in handy later
char_unigrams = set(train_text)

# Note: there are 728 unique characters.

In [32]:
# This function makes the char ngrams we want. 
# Default to 1-5, as in the paper
def char_ngram_generator(text, n1=1, n2=5):
    z = []
    text2 = '*'+text+'*'
    for k in range(n1,n2):
        z.append([text2[i:i+k] for i in range(len(text2)-k+1)])
    z = [ngram for ngrams in z for ngram in ngrams]
    z.append(text)
    return set(z)

# Fiddling around with keras Tokenizer to make the character embeddings. Work in progress.

In [40]:
# The char-level tokenizer will only produce character unigrams.
# Figuring out the 2..5 grams will come another day.
# We may want to set a threshold for frequency, e.g. only care about
# top 100 most frequent chars / char sequences. 
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)

In [41]:
# Fit it to the training data.
tokenizer.fit_on_texts(train_texts)

In [44]:
# Take a look at the character counts for fun
tokenizer.word_counts

{' ': 5351570,
 '-': 51321,
 'T': 113251,
 'h': 943891,
 'i': 1572357,
 's': 1326059,
 'n': 1417949,
 'o': 1659847,
 't': 1889196,
 'c': 612939,
 'r': 1188214,
 'e': 2486567,
 'a': 1696248,
 'v': 214828,
 '.': 294108,
 'd': 764831,
 'y': 454043,
 'f': 392185,
 'm': 487940,
 'u': 657833,
 'p': 426631,
 'l': 856363,
 'I': 189734,
 "'": 98998,
 ',': 209650,
 'g': 427725,
 'w': 341809,
 'b': 310011,
 'q': 19512,
 'Y': 40097,
 'j': 31726,
 'k': 223359,
 '(': 31615,
 ')': 34335,
 'R': 55088,
 'A': 114695,
 'B': 42454,
 'x': 39931,
 ':': 105879,
 'M': 48770,
 'D': 44483,
 'U': 48932,
 'S': 88004,
 '1': 35413,
 '9': 15515,
 '5': 12718,
 '0': 41861,
 '2': 28258,
 'G': 40618,
 'W': 67605,
 '?': 34090,
 'C': 68171,
 'N': 63965,
 'F': 44156,
 'P': 55064,
 'O': 71824,
 'V': 14918,
 'K': 29262,
 'z': 19348,
 'L': 46460,
 'H': 62782,
 '[': 3258,
 ']': 3661,
 'J': 18911,
 '6': 10077,
 '!': 66567,
 ';': 11675,
 'E': 76531,
 '<': 203,
 '>': 721,
 '/': 27683,
 'ö': 139,
 '4': 11216,
 '3': 14261,
 '_': 81

In [48]:
# Transform each comment in the training data to arrays of equal length
train_matrix = tokenizer.texts_to_matrix(train_text)

In [55]:
# Dimensions of our training matrix
train_matrix.shape[0], train_matrix.shape[1]

(69526, 1558)

In [54]:
# The list of gold-standard labels for training data
train_labels = train_data["attack"].tolist()

In [72]:
# Make a model.
model = Sequential()
model.add(Dense(2, input_dim=len(char_unigrams)))
model.add(Dense(1, activation='sigmoid'))

In [73]:
# Recall from earlier that char unigram vocab size is 728.
# Make the embedding... 
# There's probably something wrong here so fix later.
model.add(Embedding(728, 10, input_length=train_matrix.shape[1]))

In [74]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [75]:
# Validate on the training data to start out.
# This doesn't work at the moment.
model.fit(train_matrix, train_labels,
          batch_size=100,
          epochs=2,
          validation_data=(train_matrix, train_labels))

ValueError: Error when checking model input: expected dense_4_input to have shape (None, 69449) but got array with shape (69526, 1558)