# Training a neural net to classify personal attacks in Wikipedia comments

In [1]:
import os
import urllib
import urllib.request

import pandas as pd

## Load and pre-process the data.
### This is the exact preprocessing method used by Wulczyn et al., the curators of the data.

In [None]:
# Where to fetch the annotated comments and their annotations from.
# Tracy, Courtney, and Amandalynne: you already have both files, so skip
# this download step. Everyone else will probably need to run it.
# Be warned that it takes a while.
ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7038044'
ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7383751'


def download_file(url, fname):
    """Download `url` and save its contents to the local path `fname`.

    The data is first written to a temporary sibling file (`fname` + '.part')
    and only moved over `fname` once the transfer has finished, so an
    interrupted download never leaves a truncated file under the final name.

    Args:
        url: Address of the resource to fetch.
        fname: Destination path (str or path-like). Overwritten if it exists.

    Raises:
        OSError: If the resource cannot be fetched or the file cannot be
            written (urllib's URLError is a subclass of OSError).
    """
    partial_name = os.fspath(fname) + '.part'
    try:
        urllib.request.urlretrieve(url, partial_name)
        # Same-directory rename: swaps the finished file into place in one step.
        os.replace(partial_name, fname)
    finally:
        # Only present here if the download or the rename failed.
        if os.path.exists(partial_name):
            os.remove(partial_name)

                
# Fetch each data file only if it is not already on disk, so re-running the
# notebook does not repeat the slow download. Delete a file to force a fresh
# copy of it.
for url, fname in [
    (ANNOTATED_COMMENTS_URL, 'attack_annotated_comments.tsv'),
    (ANNOTATIONS_URL, 'attack_annotations.tsv'),
]:
    if not os.path.exists(fname):
        download_file(url, fname)

In [10]:
# Read the data into Pandas dataframes. `comments` uses its first column
# (rev_id, per the frame displayed below) as the index.
comments = pd.read_csv('attack_annotated_comments.tsv', sep='\t', index_col=0)
annotations = pd.read_csv('attack_annotations.tsv', sep='\t')

# Label a comment as an attack if over half of annotators did so.
# We can tinker with this threshold later.
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

# Join labels and comments (the assignment aligns on the rev_id index).
comments['attack'] = labels

# Preprocess the data -- replace the newline/tab placeholder tokens and
# backticks with a space.
# Something to consider: remove Wikipedia style markup (::'s and =='s)
# Vectorized literal replacement (regex=False) instead of a Python lambda per
# row; unlike the lambda, a missing comment stays NaN rather than raising.
comments['comment'] = (
    comments['comment']
    .str.replace("NEWLINE_TOKEN", " ", regex=False)
    .str.replace("TAB_TOKEN", " ", regex=False)
    .str.replace("`", " ", regex=False)
)

In [8]:
# Take a look at the first few comments (a preview, not the whole frame).
# Look: they've already split the data into train / dev / test :)
comments.head(10)

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37675,- This is not creative . Those are the di...,2002,True,article,random,train,False
44816,:: the term standard model is itself le...,2002,True,article,random,train,False
49851,"True or false, the situation as of March 200...",2002,True,article,random,train,False
89320,"Next, maybe you could work on being less cond...",2002,True,article,random,dev,False
93890,This page will need disambiguation.,2002,True,article,random,train,False
102817,- Important note for all sysops: There is a ...,2002,True,user,random,train,False
103624,I removed the following: All names of early P...,2002,True,article,random,train,False
111032,:If you ever claimed in a Judaic studies prog...,2002,True,article,random,dev,False
120283,"My apologies I'm English, I watch cricket,...",2002,True,article,random,dev,False
128532,"Someone wrote: More recognizable, perhaps, is...",2002,False,article,random,train,False


In [15]:
# Keep only the rows whose `split` column is 'train' (looks like ~60% of the data).
train_data = comments[comments['split'].eq('train')]

In [14]:
# Preview the first few training rows rather than rendering the whole frame.
train_data.head(10)

Unnamed: 0_level_0,comment,year,logged_in,ns,sample,split,attack
rev_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
37675,- This is not creative . Those are the di...,2002,True,article,random,train,False
44816,:: the term standard model is itself le...,2002,True,article,random,train,False
49851,"True or false, the situation as of March 200...",2002,True,article,random,train,False
93890,This page will need disambiguation.,2002,True,article,random,train,False
102817,- Important note for all sysops: There is a ...,2002,True,user,random,train,False
103624,I removed the following: All names of early P...,2002,True,article,random,train,False
128532,"Someone wrote: More recognizable, perhaps, is...",2002,False,article,random,train,False
133562,:Correct. Full biographical details will put...,2002,True,article,random,train,False
138117,Care should be taken to distinguish when an...,2002,True,article,random,train,False
192579,:<>> :I fail to see the distinction. Who ...,2002,True,article,random,train,False


In [25]:
# Join every training comment, back to back with no separator, into one
# very large string. Don't display it whole -- it's huge.
train_text = train_data["comment"].str.cat(sep="")

In [26]:
# Sanity check: display only the first 100 characters of the concatenated
# training text.
train_text[:100]

' - This is not   creative  .  Those are the dictionary definitions of the terms   insurance   and   '

In [30]:
# Every distinct character that occurs anywhere in the training text.
char_unigrams = {ch for ch in train_text}

# Note: there are 728 unique characters.

In [32]:
# This function makes the char ngrams we want.
# Default to 1-5 (both ends inclusive), as in the paper
def char_ngram_generator(text, n1=1, n2=5):
    """Return the set of character n-grams of `text` for every n in n1..n2.

    The text is wrapped in '*' boundary markers before slicing, so n-grams
    touching the start or end of the text carry a leading or trailing '*'.
    The unpadded `text` itself is always added to the result as well.

    Args:
        text: String to extract n-grams from.
        n1: Smallest n-gram length, inclusive.
        n2: Largest n-gram length, inclusive.

    Returns:
        A set of strings: each substring of '*' + text + '*' whose length is
        between n1 and n2 inclusive, plus `text`.
    """
    padded = '*' + text + '*'
    ngrams = {
        padded[start:start + size]
        # range() excludes its stop value, so n2 + 1 is required for the
        # upper bound to be inclusive (the default must produce 5-grams).
        for size in range(n1, n2 + 1)
        for start in range(len(padded) - size + 1)
    }
    ngrams.add(text)
    return ngrams

# Fiddling around with the Keras Tokenizer to make character embeddings. Work in progress.

In [38]:
import keras

In [60]:
# The char-level tokenizer will only produce character unigrams.
# Figuring out the 2..5 grams will come another day.
# We may want to set a threshold for frequency, e.g. only care about
# top 100 most frequent chars / char sequences.
# lower=False: the recorded counts below keep 'A' and 'a' separate, so case
# must be preserved. Stating it explicitly avoids depending on how a given
# Keras version applies `lower` to char-level input.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True, lower=False)

In [64]:
# Fit it to the training data, one comment per text.
# fit_on_texts iterates over its argument and treats each item as a separate
# text; passing the single concatenated string would make every individual
# character its own "document". The per-character counts and indices come
# out the same either way, but the per-document statistics only make sense
# when each item is a whole comment.
tokenizer.fit_on_texts(train_data["comment"])

In [65]:
# Take a look at the character counts: per the output below, this maps each
# character seen during fitting to how many times it occurred.
tokenizer.word_counts

{' ': 2547708,
 '!': 50185,
 '#': 2120,
 '$': 190,
 '%': 1296,
 '&': 1139,
 "'": 45785,
 '(': 15054,
 ')': 16122,
 '*': 7660,
 '+': 517,
 ',': 100465,
 '-': 22935,
 '.': 140606,
 '/': 10610,
 '0': 20203,
 '1': 14070,
 '2': 11252,
 '3': 5612,
 '4': 4936,
 '5': 5594,
 '6': 4799,
 '7': 4822,
 '8': 4732,
 '9': 7358,
 ':': 43474,
 ';': 4925,
 '<': 106,
 '=': 54563,
 '>': 327,
 '?': 16129,
 '@': 255,
 'A': 59948,
 'B': 19659,
 'C': 34860,
 'D': 21347,
 'E': 40705,
 'F': 24124,
 'G': 20172,
 'H': 36527,
 'I': 91392,
 'J': 9979,
 'K': 15923,
 'L': 23195,
 'M': 24157,
 'N': 32154,
 'O': 38611,
 'P': 24934,
 'Q': 1142,
 'R': 26993,
 'S': 39356,
 'T': 56853,
 'U': 27008,
 'V': 7975,
 'W': 37668,
 'X': 810,
 'Y': 19269,
 'Z': 1086,
 '[': 1532,
 '\\': 92,
 ']': 1812,
 '^': 174,
 '_': 3488,
 'a': 805785,
 'b': 147174,
 'c': 287161,
 'd': 359355,
 'e': 1181147,
 'f': 186884,
 'g': 202572,
 'h': 449346,
 'i': 743982,
 'j': 15266,
 'k': 107298,
 'l': 409604,
 'm': 233064,
 'n': 668659,
 'o': 785890,
 '

In [66]:
# Character -> integer index assigned by the tokenizer. Judging by the output
# below, more frequent characters get smaller indices (' ' is 1, 'e' is 2).
tokenizer.word_index

{' ': 1,
 '!': 30,
 '#': 76,
 '$': 98,
 '%': 82,
 '&': 86,
 "'": 31,
 '(': 57,
 ')': 54,
 '*': 65,
 '+': 93,
 ',': 25,
 '-': 46,
 '.': 22,
 '/': 60,
 '0': 48,
 '1': 58,
 '2': 59,
 '3': 68,
 '4': 70,
 '5': 69,
 '6': 73,
 '7': 72,
 '8': 74,
 '9': 66,
 ':': 32,
 ';': 71,
 '<': 104,
 '=': 29,
 '>': 95,
 '?': 53,
 '@': 96,
 'A': 27,
 'B': 50,
 'C': 38,
 'D': 47,
 'E': 33,
 'F': 44,
 'G': 49,
 'H': 37,
 'I': 26,
 'J': 61,
 'K': 55,
 'L': 45,
 'M': 43,
 'N': 39,
 'O': 35,
 'P': 42,
 'Q': 84,
 'R': 41,
 'S': 34,
 'T': 28,
 'U': 40,
 'V': 64,
 'W': 36,
 'X': 89,
 'Y': 52,
 'Z': 87,
 '[': 78,
 '\\': 106,
 ']': 77,
 '^': 99,
 '_': 75,
 'a': 4,
 'b': 21,
 'c': 14,
 'd': 12,
 'e': 2,
 'f': 19,
 'g': 18,
 'h': 10,
 'i': 6,
 'j': 56,
 'k': 23,
 'l': 11,
 'm': 15,
 'n': 7,
 'o': 5,
 'p': 17,
 'q': 63,
 'r': 9,
 's': 8,
 't': 3,
 'u': 13,
 'v': 24,
 'w': 20,
 'x': 51,
 'y': 16,
 'z': 62,
 '{': 80,
 '|': 67,
 '}': 79,
 '~': 81,
 '\x93': 196,
 '\x94': 197,
 '\x95': 427,
 '\x97': 339,
 '\xa0': 92,
 '¡': 3

In [67]:
# Convert the training text to integer indices.
# NOTE(review): train_text is one long string, so it is walked character by
# character and the result is one single-element list per character (see the
# recorded output) rather than one sequence per comment. Passing
# train_data["comment"] would give per-comment sequences -- confirm which is
# intended. The following cell's `len(_)` reads this cell's output.
tokenizer.texts_to_sequences(train_text)

[[1],
 [46],
 [1],
 [28],
 [10],
 [6],
 [8],
 [1],
 [6],
 [8],
 [1],
 [7],
 [5],
 [3],
 [1],
 [1],
 [1],
 [14],
 [9],
 [2],
 [4],
 [3],
 [6],
 [24],
 [2],
 [1],
 [1],
 [22],
 [1],
 [1],
 [28],
 [10],
 [5],
 [8],
 [2],
 [1],
 [4],
 [9],
 [2],
 [1],
 [3],
 [10],
 [2],
 [1],
 [12],
 [6],
 [14],
 [3],
 [6],
 [5],
 [7],
 [4],
 [9],
 [16],
 [1],
 [12],
 [2],
 [19],
 [6],
 [7],
 [6],
 [3],
 [6],
 [5],
 [7],
 [8],
 [1],
 [5],
 [19],
 [1],
 [3],
 [10],
 [2],
 [1],
 [3],
 [2],
 [9],
 [15],
 [8],
 [1],
 [1],
 [1],
 [6],
 [7],
 [8],
 [13],
 [9],
 [4],
 [7],
 [14],
 [2],
 [1],
 [1],
 [1],
 [4],
 [7],
 [12],
 [1],
 [1],
 [1],
 [2],
 [7],
 [8],
 [13],
 [9],
 [4],
 [7],
 [14],
 [2],
 [1],
 [1],
 [1],
 [4],
 [8],
 [1],
 [17],
 [9],
 [5],
 [17],
 [2],
 [9],
 [11],
 [16],
 [1],
 [4],
 [17],
 [17],
 [11],
 [6],
 [2],
 [12],
 [1],
 [3],
 [5],
 [1],
 [1],
 [1],
 [12],
 [2],
 [8],
 [3],
 [9],
 [13],
 [14],
 [3],
 [6],
 [5],
 [7],
 [1],
 [1],
 [22],
 [1],
 [1],
 [26],
 [19],
 [1],
 [16],
 [5],
 [13],
 [1],
 [

In [68]:
# Number of entries produced by texts_to_sequences(train_text): one per
# character of train_text, so this is simply the length of the string.
# Computed from the named variable instead of IPython's `_` (previous cell
# output), which is only valid right after that cell has just been run.
len(train_text)

13620046

For help with the Keras Embedding layer and Tokenizer, see:
http://www.orbifold.net/default/2017/01/10/embedding-and-tokenizer-in-keras/