In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [2]:
path = 'data/'

TRAIN_DATA_FILE = path + 'train.csv'
TEST_DATA_FILE = path + 'test.csv'
TRAIN_DATA_FILE2 = path + 'wiki_debias_train.csv'
TEST_DATA_FILE2 = path + 'wiki_debias_test.csv'
train_df = pd.read_csv(TRAIN_DATA_FILE)
test_df = pd.read_csv(TEST_DATA_FILE)
train_df2 = pd.read_csv(TRAIN_DATA_FILE)
test_df2 = pd.read_csv(TEST_DATA_FILE)
print(len(train_df))
print(len(test_df))

95692
31866


In [3]:
print(len(train_df2))
print(len(test_df2))

95692
31866


In [4]:
train_df.head(10)

Unnamed: 0,rev_id,toxicity,comment,year,logged_in,ns,sample,split,is_toxic
0,2232.0,0.1,This: :One can make an analogy in mathematical...,2002,True,article,random,train,False
1,4216.0,0.0,` :Clarification for you (and Zundark's righ...,2002,True,user,random,train,False
2,26547.0,0.0,`This is such a fun entry. Devotchka I once...,2002,True,article,random,train,False
3,37330.0,0.3,` I fixed the link; I also removed ``homeopa...,2002,True,article,random,train,False
4,37346.0,0.1,`If they are ``indisputable`` then why does th...,2002,True,article,random,train,False
5,44377.0,0.0,` The concept of ``viral meme`` is not a mai...,2002,True,article,random,train,False
6,66667.0,0.0,"`just quick notes, since i don't have the time...",2002,True,article,random,train,False
7,91460.0,0.1,`The actual idea behind time-out is to get the...,2002,True,article,random,train,False
8,114890.0,0.1,"` Gjalexei, you asked about whether there...",2002,True,user,random,train,False
9,132491.0,0.1,"`] :: When I'm angry, I can't write from the ...",2002,True,article,random,train,False


## Preprocess The Text

In [5]:
print('Processing text dataset')
from nltk.tokenize import WordPunctTokenizer
from collections import Counter
from string import punctuation, ascii_lowercase
import regex as re
from tqdm import tqdm

# replace urls
re_url = re.compile(r"((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\
                    .([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*",
                    re.MULTILINE|re.UNICODE)
# replace ips
re_ip = re.compile("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")

# setup tokenizer
tokenizer = WordPunctTokenizer()
vocab = Counter()

def text_to_wordlist(text, lower=False):
    # replace URLs
    text = re_url.sub("URL", text)
    
    # replace IPs
    text = re_ip.sub("IPADDRESS", text)
    
    # Tokenize
    text = tokenizer.tokenize(text)
    
    # optional: lower case
    if lower:
        text = [t.lower() for t in text]
    
    # Return a list of words
    vocab.update(text)
    return text

def process_comments(list_sentences, lower=False):
    comments = []
    for text in tqdm(list_sentences):
        txt = text_to_wordlist(text, lower=lower)
        comments.append(txt)
    return comments


list_sentences_train = list(train_df["comment"].fillna("NAN_WORD").values)
list_sentences_test = list(test_df["comment"].fillna("NAN_WORD").values)

comments = process_comments(list_sentences_train + list_sentences_test, lower=True)

Processing text dataset


100%|████████████████████████████████| 127558/127558 [00:13<00:00, 9206.97it/s]


In [6]:
print("The vocabulary contains {} unique tokens".format(len(vocab)))

The vocabulary contains 168550 unique tokens


In [7]:
print(comments[0])

['this', ':', ':', 'one', 'can', 'make', 'an', 'analogy', 'in', 'mathematical', 'terms', 'by', 'envisioning', 'the', 'distribution', 'of', 'opinions', 'in', 'a', 'population', 'as', 'a', 'gaussian', 'curve', '.', 'we', 'would', 'then', 'say', 'that', 'the', 'consensus', 'would', 'be', 'a', 'statement', 'that', 'represents', 'the', 'range', 'of', 'opinions', 'within', 'perhaps', 'three', 'standard', 'deviations', 'of', 'the', 'mean', 'opinion', '.', 'sounds', 'arbitrary', 'and', 'ad', 'hoc', '.', 'does', 'it', 'really', 'belong', 'in', 'n', 'encyclopedia', 'article', '?', 'i', 'don', "'", 't', 'see', 'that', 'it', 'adds', 'anything', 'useful', '.', 'the', 'paragraph', 'that', 'follows', 'seems', 'much', 'more', 'useful', '.', 'are', 'there', 'any', 'political', 'theorists', 'out', 'there', 'who', 'can', 'clarify', 'the', 'issues', '?', 'it', 'seems', 'to', 'me', 'that', 'this', 'is', 'an', 'issue', 'that', 'locke', ',', 'rousseau', ',', 'de', 'toqueville', ',', 'and', 'others', 'must', 

## Model The Word Vectors With Gensim

####  CBOW

In [8]:
from gensim.models import Word2Vec



In [9]:
model = Word2Vec(comments, size=100, window=1, min_count=5, workers=3, sg=0, negative=5)

In [13]:
word_vectors = list(model.wv.vocab)

In [14]:
print(word_vectors)






In [11]:
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

Number of word vectors: 38626


In [20]:
word_vectors.vocab['this']

<gensim.models.keyedvectors.Vocab at 0x1e586732fd0>

### Let’s see if we have trained semantically reasonable word vectors.

In [22]:
model.wv.most_similar_cosmul(positive=['dog'])

[('mothers', 0.8320167660713196),
 ('filthy', 0.8199912309646606),
 ('commie', 0.8071396350860596),
 ('fat', 0.806575357913971),
 ('tits', 0.8059297204017639),
 ('soul', 0.8041910529136658),
 ('kkk', 0.8035542964935303),
 ('pig', 0.7999925017356873),
 ('mother', 0.7999717593193054),
 ('pussy', 0.7991694808006287)]

## Initialize The Embeddings In Keras

In [23]:
MAX_NB_WORDS = len(word_vectors.vocab)
MAX_SEQUENCE_LENGTH = 200

In [24]:
train_df.head(1)

Unnamed: 0,rev_id,toxicity,comment,year,logged_in,ns,sample,split,is_toxic
0,2232.0,0.1,This: :One can make an analogy in mathematical...,2002,True,article,random,train,False


In [25]:
from keras.preprocessing.sequence import pad_sequences

word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NB_WORDS))}
sequences = [[word_index.get(t, 0) for t in comment]
             for comment in comments[:len(list_sentences_train)]]
test_sequences = [[word_index.get(t, 0)  for t in comment] 
                  for comment in comments[len(list_sentences_train):]]

# pad
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, 
                     padding="pre", truncating="post")
#list_classes = ["toxicity","year","logged_in","ns","sample","split"]
list_classes = ["toxicity"]
y = train_df[list_classes].values
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")
print('Shape of test_data tensor:', test_data.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


Shape of data tensor: (95692, 200)
Shape of label tensor: (95692, 1)
Shape of test_data tensor: (31866, 200)


Now we finally create the embedding matrix. This is what we will feed to the keras embedding layer. Note, that you can use the same code to easily initialize the embeddings with Glove or other pretrained word vectors.

In [26]:
WV_DIM = 100
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab))
# we initialize the matrix with random numbers
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        # words not found in embedding index will be all-zeros.
        wv_matrix[i] = embedding_vector
    except:
        pass   

### Setup The Comment Classifier

In [27]:
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization

In [28]:
wv_layer = Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)

In [29]:
# Inputs
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = wv_layer(comment_input)

# biGRU
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(embedded_sequences)

In [30]:
# Output
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
preds = Dense(6, activation='sigmoid')(x)

In [31]:
# build the model
model = Model(inputs=[comment_input], outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99),
              metrics=[])

In [35]:
print(data)
hist = model.fit([data], y, validation_split=0.1,epochs=10, batch_size=10, shuffle=True)

[[   0    0    0 ... 6442  112 5682]
 [   0    0    0 ...  211    0   23]
 [  23   20   11 ...    3    6  252]
 ...
 [  23   16  535 ...  575  107  157]
 [   0    0    0 ...  350    1   23]
 [   0    0    0 ...   24   13    1]]


ValueError: Error when checking target: expected dense_1 to have shape (6,) but got array with shape (1,)