# Training a neural net to classify personal attacks in Wikipedia comments

In [1]:
import os
import pandas as pd
import urllib

## Load and pre-process the data.
### This is the exact preprocessing method used by Wulczyn et al., the curators of the data.

In [None]:
# Download annotated comments and annotations.
# If you're Tracy, Courtney, or Amandalynne, don't run this step
# because you already have the data! If you aren't us, you will
# probably need to do this step.
# It will take a while.

# figshare download link for the comments file
# (saved further down in this cell as 'attack_annotated_comments.tsv').
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7038044'
# figshare download link for the annotations file
# (saved further down in this cell as 'attack_annotations.tsv').
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7383751'


def download_file(url, fname):
    """Download ``url`` and store the response body in the local file ``fname``.

    Args:
        url: Address of the resource to fetch.
        fname: Path of the file to write; it is handed straight to
            ``urllib.request.urlretrieve``.

    Returns:
        None. The only effect is the file written to ``fname``.
    """
    # A bare ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded; import it explicitly so this works on a fresh
    # kernel regardless of what other libraries happened to import.
    import urllib.request

    urllib.request.urlretrieve(url, fname)

                
# Fetch each remote file and save it under its local file name.
for _url, _fname in (
    (ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv'),
    (ANNOTATIONS_URL, 'attack_annotations.tsv'),
):
    download_file(_url, _fname)

In [10]:
# Load both TSV files. The comments frame uses its first column as the index
# so that the per-revision labels computed below can be aligned with it.
comments = pd.read_csv('attack_annotated_comments.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('attack_annotations.tsv', sep='\t')

# A comment is labelled an attack when strictly more than half of its
# annotators marked it as one (mean of 'attack' per rev_id above 0.5).
# We can tinker with this threshold later.
labels = annotations['attack'].groupby(annotations['rev_id']).mean().gt(0.5)

# Attach the labels to the comments; the assignment aligns on the index.
comments['attack'] = labels

# Preprocess the data -- replace the NEWLINE_TOKEN / TAB_TOKEN placeholders
# and backtick quotes with a single space.
# Something to consider: remove Wikipedia style markup (::'s and =='s)
# The vectorised .str.replace with regex=False does plain substring
# replacement in one chained pass and leaves missing comments as NaN,
# whereas calling x.replace inside .apply raises on a non-string value.
# The three replacements stay sequential (in the original order) on purpose.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
    .str.replace("`", " ", regex=False)
)

In [8]:
# Take a look at the comments (first rows only, not the whole frame).
# Look: they've already split the data into train / dev / test :)
comments.head(10)

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37675,- This is not creative . Those are the di...,2002,True,article,random,train,False
44816,:: the term standard model is itself le...,2002,True,article,random,train,False
49851,"True or false, the situation as of March 200...",2002,True,article,random,train,False
89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev,False
93890,This page will need disambiguation.,2002,True,article,random,train,False
102817,- Important note for all sysops: There is a ...,2002,True,user,random,train,False
103624,I removed the following: All names of early P...,2002,True,article,random,train,False
111032,:If you ever claimed in a Judaic studies prog...,2002,True,article,random,dev,False
120283,"My apologies I'm English, I watch cricket,...",2002,True,article,random,dev,False
128532,"Someone wrote: More recognizable, perhaps, is...",2002,False,article,random,train,False


In [15]:
# Keep only the rows whose 'split' column is 'train' (seems to be 60%).
train_data = comments[comments['split'].eq('train')]

In [14]:
# Preview the first rows of the training split rather than rendering the whole frame.
train_data.head(10)

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37675,- This is not creative . Those are the di...,2002,True,article,random,train,False
44816,:: the term standard model is itself le...,2002,True,article,random,train,False
49851,"True or false, the situation as of March 200...",2002,True,article,random,train,False
93890,This page will need disambiguation.,2002,True,article,random,train,False
102817,- Important note for all sysops: There is a ...,2002,True,user,random,train,False
103624,I removed the following: All names of early P...,2002,True,article,random,train,False
128532,"Someone wrote: More recognizable, perhaps, is...",2002,False,article,random,train,False
133562,:Correct. Full biographical details will put...,2002,True,article,random,train,False
138117,Care should be taken to distinguish when an...,2002,True,article,random,train,False
192579,:<>> :I fail to see the distinction. Who ...,2002,True,article,random,train,False


In [25]:
# Join every training comment into one very large string (no separator).
# Don't print it, it's huge.
train_text = train_data.loc[:, "comment"].str.cat(sep="")

In [26]:
# Sanity check: look at just the first 100 characters of the concatenated text.
train_text[:100]

' - This is not   creative  .  Those are the dictionary definitions of the terms   insurance   and   '

In [30]:
# Collect the distinct characters (character unigrams) of the training text.
char_unigrams = {char for char in train_text}

# Note: there are 728 unique characters.

In [32]:
# This function makes the char ngrams we want.
# Default to 1-5, as in the paper
def char_ngram_generator(text, n1=1, n2=5):
    """Return the set of character n-grams of ``text`` for sizes n1..n2.

    The text is wrapped in ``'*'`` boundary markers before slicing, so
    n-grams at the edges carry a marker (and ``'*'`` itself shows up as a
    unigram). The unpadded ``text`` is always added to the result as well.

    Args:
        text: String to split into character n-grams.
        n1: Smallest n-gram size.
        n2: Largest n-gram size, inclusive, so the defaults really cover
            sizes 1 through 5.

    Returns:
        A ``set`` of substrings. Sizes larger than the padded text simply
        contribute nothing.
    """
    padded = '*' + text + '*'
    ngrams = {
        padded[start:start + size]
        # n2 + 1: range() excludes its stop value, which previously dropped
        # the largest requested size.
        for size in range(n1, n2 + 1)
        for start in range(len(padded) - size + 1)
    }
    ngrams.add(text)
    return ngrams