In [1]:
import pandas as pd
import numpy as np

## Import the data

In [2]:
# raw data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train_index=train.shape[0]

# ouput id
_id = test['id']

# exogenous variables
train_y = train[['toxic','severe_toxic','obscene',
                 'threat','insult','identity_hate']].values
# all txt data
all_txt = pd.concat([train['comment_text'], test['comment_text']])

del train
del test

## Adjust Text for issues

In [3]:
# remove url's
all_txt = all_txt.replace(r"(f|ht)tp(s?)://\\S+", " ", regex=True)
all_txt = all_txt.replace(r"http\\S+", " ", regex=True)
all_txt = all_txt.replace(r"xml\\S+", " ", regex=True)

# remove newline
all_txt = all_txt.str.replace('\n', " ")

# lowercase
all_txt = all_txt.str.lower()

# fix word transformations
all_txt = all_txt.replace("'ll", " will")
all_txt = all_txt.replace("i'm", "i am")
all_txt = all_txt.replace("'re", " are")
all_txt = all_txt.replace("'s", " is")
all_txt = all_txt.replace("'ve", " have")
all_txt = all_txt.replace("'d", " would")

# fix curseword and strange words to get real ones
all_txt = all_txt.replace(r"(a|e)w+\\b", "", regex=True)
all_txt = all_txt.replace(r"(y)a+\\b", "", regex=True)
all_txt = all_txt.replace(r"a?(ha)+\\b", "", regex=True)
all_txt = all_txt.replace(r"(w)w+\\b", "", regex=True)
all_txt = all_txt.replace(r"((lol)(o?))+\\b", "laugh out loud", regex=True)
all_txt = all_txt.replace("n ig ger", "nigger")
all_txt = all_txt.replace("s hit", "shit")
all_txt = all_txt.replace("g ay", "gay")
all_txt = all_txt.replace("f ag got", "faggot")
all_txt = all_txt.replace("c ock", "cock")
all_txt = all_txt.replace("cu nt", "cunt")
all_txt = all_txt.replace("idi ot", "idiot")
all_txt = all_txt.replace(r"(?<=\\b(fu|su|di|co|li))\\s(?=(ck)\\b)", "", regex=True)
all_txt = all_txt.replace(r"(?<=\\w(ck))\\s(?=(ing)\\b)", "", regex=True)

# remove punctuation and whitespace
all_txt = all_txt.replace(r'[^\w\s]',' ', regex=True)
all_txt = all_txt.replace("_", " ")
all_txt = all_txt.replace('\s+', " ", regex=True)


# Tokenize & Lemmentize

In [4]:
from nltk import word_tokenize
from nltk import WordNetLemmatizer

In [5]:
all_txt = all_txt.apply(word_tokenize) # takes ~ 3 min on my machine

In [6]:
wordnet = WordNetLemmatizer()
all_txt = all_txt.apply(lambda row: [wordnet.lemmatize(x) for x in row]) # takes ~5 min

# Read in Pretrained GloVe file

In [7]:
# reads a GloVe Trained txt file and appends it with new words it hasn't seen.
# A way of transfer learning to help fit new files.

import itertools
def return_index_map(glove_file, unique_words):
    with open(glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for word in unique_words:
            if word not in words:
                words.add(word)
                word_to_vec_map[word] = np.zeros((50, ), dtype=np.float64)
        
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

In [8]:
word_to_index, index_to_word, word_to_vec_map = return_index_map('embeddings.txt', set(itertools.chain.from_iterable(all_txt.tolist())))

In [9]:
def comment_to_index(X, word_to_index, max_comment_length):
    """Takes a tokenized comment and returns the index of the words
       in the embedding.
    """
    m = X.shape[0]
    out = np.zeros((m, max_comment_length))
    
    for i in range(m):
        j = 0
#         import pdb; pdb.set_trace()
        for word in X[i][:max_comment_length]:
            out[i, j] = word_to_index[word]
            j += 1
    return out

# Impliment RNN

In [10]:
import numpy as np
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [11]:
def create_embedding(word_to_vec_map, word_to_index):
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["hi"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    embedding_layer = Embedding(vocab_len, emb_dim, trainable=True)

    embedding_layer.build((None,))
    
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [12]:
embedding_layer = create_embedding(word_to_vec_map, word_to_index)

In [20]:
def get_model(input_shape, word_to_vec_map, word_to_index):
    
    # create input layer
    model_input = Input(shape=input_shape)
    
    # initialize embedding layer
    embedding = create_embedding(word_to_vec_map, word_to_index)
    
    # propagate words through embedding
    word_embeddings = embedding(model_input)
    
    # first LSTM network
    X = LSTM(128, return_sequences=True)(word_embeddings)
    
    # dropout
    X = Dropout(0.2)(X)
    
    # second layer of LSTM
    X = LSTM(128, return_sequences=False)(X)
    
    # dropout
    X = Dropout(0.2)(X)
    
    # add a dense layer to help classify
    X = Dense(128, activation='relu')(X)
    
    # dropout
    X = Dropout(0.2)(X)
    
    # output is 6 dimensional
    X = Dense(6)(X)
    
    X = Activation('softmax')(X)
    
    return Model(inputs=model_input, outputs=X)

In [21]:
model = get_model(((20,)), word_to_vec_map, word_to_index)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_validation, y_train, y_validation = train_test_split(
    all_txt.values[:train_index], train_y, test_size=0.1
)

In [23]:
model.fit(
    comment_to_index(X_train, word_to_index, 20),
    y_train,
    epochs=10,
    batch_size=256
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
  8704/143613 [>.............................] - ETA: 6:12 - loss: 0.2492 - acc: 0.9352

KeyboardInterrupt: 