In [6]:
# Load the autoreload extension and reload modules before each cell runs,
# so edits to the project's `bin` package are picked up without a kernel restart.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

import warnings
# Silence all Python warnings from here on.
warnings.filterwarnings('ignore')

import sys

In [2]:
import datetime
from bin.imports import *
from bin.config import *
from bin.utils import *
from bin.models import GRU_LSTM_model, CV_predictor, CAPSULE_model, DPCNN_model
from bin.text_cleaner import TextCleaner
from bin.contractions import contractions, negative_100, positive_100

Using TensorFlow backend.


In [3]:
# Trim both imported word lists the same way: keep words longer than
# 3 characters, then cap each list at its first 200 entries.
negative_100, positive_100 = (
    [candidate for candidate in words if len(candidate) > 3][:200]
    for words in (negative_100, positive_100)
)
# negative_100 = sorted(negative_100, key= lambda x: -len(x))
# positive_100 = sorted(positive_100, key= lambda x: -len(x))

# Combined list used for partial matching; negative words come first.
valuable_words = negative_100 + positive_100

In [4]:
def unify_tokens(comment):
    """Map every token of a comment onto a key of the embedding vocabulary.

    Each token is tried in this order:

    1. lemmatized (as verb, then as adjective) in its original case and
       looked up exactly in ``emb_keys``;
    2. lowercased, lemmatized again and looked up exactly in ``emb_keys``;
    3. if it still has at least 3 characters, partial matching: every entry
       of ``valuable_words`` and then every lowercased entry of ``emb_sorted``
       that occurs inside the token is emitted and cut out of the token.

    Tokens that match nothing are dropped from the result.

    Relies on the notebook globals ``emb_keys``, ``valuable_words`` and
    ``emb_sorted``, which must be defined before the function is called.

    Parameters
    ----------
    comment : iterable of str
        Tokens of a single comment.
        NOTE(review): a plain string would be iterated character by
        character - presumably the text cleaner already yields token
        sequences; confirm against ``TextCleaner.transform``.

    Returns
    -------
    list of str
        The tokens (or token fragments) that were found in the vocabulary,
        in the order they were matched.
    """
    nl = []
    wl = WordNetLemmatizer().lemmatize
    for word in comment:
        word = wl(wl(word, pos='v'), pos='a')
        # token in original form, exact matching
        if emb_keys.get(word) is not None:
            nl.append(word)
            continue

        word = wl(wl(word.lower(), pos='v'), pos='a')
        # token lowercased, exact matching
        if emb_keys.get(word) is not None:
            nl.append(word)
            continue

        # skip words of fewer than 3 symbols: partial matching is not reliable for them
        if len(word) < 3:
            continue

        # top pos/neg words by LR weights lowercased, partial matching
        for w in valuable_words:
            if w in word:
                word = word.replace(w, '')
                nl.append(w)
                # Stop scanning once too little of the word is left; a
                # `continue` here would only advance this inner loop.
                if len(word) < 3:
                    break
        if len(word) < 3:
            # Nothing meaningful remains, so skip the (very long) embedding scan.
            continue

        # embedding keys lowercased, partial matching
        for w in emb_sorted:
            w_lower = w.lower()
            if w_lower in word:
                # Remove the lowercased form that actually matched. Removing
                # the original-case key leaves the fragment in place, so every
                # case variant of the key would append the same token again.
                word = word.replace(w_lower, '')
                nl.append(w_lower)
                if len(word) < 3:
                    break
        # words which were not found in dict are excluded from the comment
    return nl

def process_comment(df):
    """Run ``unify_tokens`` over the ``comment_text`` column of ``df``.

    The column is overwritten on the frame that was passed in, and that same
    frame is returned so the function can be used as a map-style worker.
    """
    unified = df['comment_text'].apply(unify_tokens)
    # Write back the underlying array rather than the Series itself.
    df['comment_text'] = unified.values
    return df

In [5]:
EMBEDDING_FILE = '/home/ser/Downloads/fasttext/crawl-300d-2M.vec'
# Read the vectors file inside a context manager so the handle is closed as
# soon as the dictionary is built, and decode it explicitly as UTF-8 instead
# of relying on the platform's default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as emb_file:
    emb_keys = dict(get_emb_dict(*line.rstrip().rsplit(' ')) for line in emb_file)
# Keys of 3 to 14 characters, longest first, used for partial matching.
emb_sorted = [key for key in sorted(emb_keys, key=lambda x: -len(x)) if 2 < len(key) < 15]

FileNotFoundError: [Errno 2] No such file or directory: '/home/ser/Downloads/fasttext/crawl-300d-2M.vec'

In [None]:
# Read the train and test frames, replacing every missing value with the
# literal string "fillna".
train, test = (
    pd.read_csv(csv_path).fillna("fillna")
    for csv_path in ('../data/train.csv.zip', '../data/test.csv.zip')
)
submission = pd.read_csv('../data/sample_submission.csv.zip')

print(f'text cleaner processing: {datetime.datetime.now()}')
tc = TextCleaner(contractions)
# Apply the same cleaner to the comment column of both frames, in place.
for frame in (train, test):
    frame['comment_text'] = tc.transform(frame['comment_text'].fillna('na').values)

In [None]:
# Unify the tokens of both frames through the project's parallel helper,
# printing the wall-clock time at which each pass starts.
train_started = datetime.datetime.now()
print(f'train cleaning: {train_started}')
train = parallelize_dataframe(train, process_comment)

test_started = datetime.datetime.now()
print(f'test cleaning: {test_started}')
test = parallelize_dataframe(test, process_comment)

train cleaning: 2018-03-19 19:10:07.279024


In [None]:
EMBEDDING_FILE = '/home/ser/Downloads/fasttext/crawl-300d-2M.vec'
# Read the vectors file inside a context manager so the handle is closed once
# the index is built, and decode it explicitly as UTF-8 instead of relying on
# the platform's default encoding.
with open(EMBEDDING_FILE, encoding='utf-8') as emb_file:
    embeddings_index = dict(get_coefs(*line.rstrip().rsplit(' ')) for line in emb_file)

In [None]:
# Vocabulary cap, padded sequence length and embedding dimensionality.
max_features = 150000
maxlen = 150
embed_size = 300

label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y_train = train[label_columns].values
X_train = train["comment_text"].values
X_test = test["comment_text"].values

# Fit one case-sensitive tokenizer on train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features, lower=False)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

# Turn the comments into index sequences, then pad them to `maxlen`.
X_train, X_test = (
    tokenizer.texts_to_sequences(comments) for comments in (X_train, X_test)
)
x_train, x_test = (
    sequence.pad_sequences(sequences, maxlen=maxlen) for sequences in (X_train, X_test)
)

In [None]:
word_index = tokenizer.word_index
# Keras word indices start at 1 (row 0 is never assigned to a word), so a
# vocabulary of N words needs N + 1 rows. Sizing the matrix with N rows made
# the highest index raise IndexError whenever the vocabulary was smaller than
# `max_features`. With a vocabulary of at least `max_features` words the
# shape is unchanged.
# NOTE(review): the models are also given `max_features`; if the vocabulary
# is smaller, confirm they accept a matrix with fewer rows than that.
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((nb_words, embed_size))
missed = []
for word, i in word_index.items():
    # Skip words whose index has no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # The row stays all-zero; keep the word for inspection below.
        missed.append(word)

In [None]:
# Number of in-range vocabulary words for which `embeddings_index` had no vector.
len(missed)

In [None]:
# Peek at the first ten words that have no pretrained vector.
missed[:10]

### Tests

In [18]:
# Single hold-out run of the recurrent model with a CuDNN LSTM layer.
batch_size = 128
epochs = 10

model = GRU_LSTM_model(CuDNNLSTM, maxlen, max_features, embed_size, embedding_matrix)

# 90/10 split with a fixed seed; the held-out part feeds both Keras'
# validation and the ROC-AUC callback.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.9, random_state=233
)
holdout = (X_val, y_val)
RocAuc = RocAucEvaluation(validation_data=holdout, interval=1)
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=holdout,
    callbacks=[RocAuc],
    verbose=1,
)

In [58]:
# Single hold-out run of the DPCNN model.
dpcnn_kwargs = dict(
    maxlen=maxlen,
    max_features=max_features,
    embed_size=embed_size,
    embedding_matrix=embedding_matrix,
    spatial_dropout=0.25,
    filter_nr=64,
    filter_size=3,
    max_pool_size=3,
    max_pool_strides=2,
    dense_nr=256,
    dense_dropout=0.5,
)

model = DPCNN_model(**dpcnn_kwargs)

batch_size = 128
epochs = 15

# 90/10 split with a fixed seed; the held-out part feeds both Keras'
# validation and the ROC-AUC callback.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.9, random_state=233
)
holdout = (X_val, y_val)
RocAuc = RocAucEvaluation(validation_data=holdout, interval=1)
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=holdout,
    callbacks=[RocAuc],
    verbose=1,
)

Train on 143613 samples, validate on 15958 samples
Epoch 1/15

 ROC-AUC - epoch: 1 - score: 0.971133 

Epoch 2/15

 ROC-AUC - epoch: 2 - score: 0.979113 

Epoch 3/15

 ROC-AUC - epoch: 3 - score: 0.981239 

Epoch 4/15

 ROC-AUC - epoch: 4 - score: 0.983787 

Epoch 5/15

 ROC-AUC - epoch: 5 - score: 0.984971 

Epoch 6/15

 ROC-AUC - epoch: 6 - score: 0.984433 

Epoch 7/15

 ROC-AUC - epoch: 7 - score: 0.986733 

Epoch 8/15

 ROC-AUC - epoch: 8 - score: 0.985978 

Epoch 9/15

 ROC-AUC - epoch: 9 - score: 0.987321 

Epoch 10/15

 ROC-AUC - epoch: 10 - score: 0.987668 

Epoch 11/15

 ROC-AUC - epoch: 11 - score: 0.986771 

Epoch 12/15

 ROC-AUC - epoch: 12 - score: 0.987882 

Epoch 13/15

 ROC-AUC - epoch: 13 - score: 0.987454 

Epoch 14/15

 ROC-AUC - epoch: 14 - score: 0.987154 

Epoch 15/15

 ROC-AUC - epoch: 15 - score: 0.985625 



<keras.callbacks.History at 0x7f959a4426a0>

### Predictions

#### DPCNN

In [None]:
# Settings passed to CV_predictor.
# NOTE(review): presumably `epochs` is iterated once per training epoch and
# `n_splits` is the number of folds - confirm in bin.models.CV_predictor.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
batch_size = 128
n_splits = 10
epochs = range(10)

dpcnn_kwargs = dict(
    maxlen=maxlen,
    max_features=max_features,
    embed_size=embed_size,
    embedding_matrix=embedding_matrix,
    spatial_dropout=0.25,
    filter_nr=64,
    filter_size=3,
    max_pool_size=3,
    max_pool_strides=2,
    dense_nr=256,
    dense_dropout=0.5,
)

cv = CV_predictor(
    DPCNN_model, x_train, y_train, x_test,
    n_splits, batch_size, epochs, list_classes, dpcnn_kwargs,
)
cv.predict()

train_p, test_p = cv.train_predictions, cv.test_predictions
# Index the test predictions by comment id so reset_index() writes it as a column.
test_p.index = test['id']

train_out = '/home/ser/DL/toxic/train_predictions/f_dpcnn.csv'
test_out = '/home/ser/DL/toxic/test_predictions/f_dpcnn.csv'
train_p.to_csv(train_out, index=False)
test_p.reset_index().to_csv(test_out, index=False)

#### GRU

In [None]:
# Settings passed to CV_predictor.
# NOTE(review): presumably `epochs` is iterated once per training epoch and
# `n_splits` is the number of folds - confirm in bin.models.CV_predictor.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
batch_size = 128
n_splits = 10
epochs = range(4)

# Recurrent model configured with the CuDNN GRU layer.
gru_kwargs = dict(
    CuDNN=CuDNNGRU,
    maxlen=maxlen,
    max_features=max_features,
    embed_size=embed_size,
    embedding_matrix=embedding_matrix,
)

cv = CV_predictor(
    GRU_LSTM_model, x_train, y_train, x_test,
    n_splits, batch_size, epochs, list_classes, gru_kwargs,
)
cv.predict()

train_p, test_p = cv.train_predictions, cv.test_predictions
# Index the test predictions by comment id so reset_index() writes it as a column.
test_p.index = test['id']

train_out = '/home/ser/DL/toxic/train_predictions/f_gru_lem_low.csv'
test_out = '/home/ser/DL/toxic/test_predictions/f_gru_lem_low.csv'
train_p.to_csv(train_out, index=False)
test_p.reset_index().to_csv(test_out, index=False)

#### LSTM

In [None]:
# Settings passed to CV_predictor.
# NOTE(review): presumably `epochs` is iterated once per training epoch and
# `n_splits` is the number of folds - confirm in bin.models.CV_predictor.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
batch_size = 128
n_splits = 10
epochs = range(4)

# Same recurrent model, this time with the CuDNN LSTM layer. The variable
# keeps the name `gru_kwargs` even though it now holds the LSTM configuration.
gru_kwargs = dict(
    CuDNN=CuDNNLSTM,
    maxlen=maxlen,
    max_features=max_features,
    embed_size=embed_size,
    embedding_matrix=embedding_matrix,
)

cv = CV_predictor(
    GRU_LSTM_model, x_train, y_train, x_test,
    n_splits, batch_size, epochs, list_classes, gru_kwargs,
)
cv.predict()

train_p, test_p = cv.train_predictions, cv.test_predictions
# Index the test predictions by comment id so reset_index() writes it as a column.
test_p.index = test['id']

train_out = '/home/ser/DL/toxic/train_predictions/f_lstm_lem_low.csv'
test_out = '/home/ser/DL/toxic/test_predictions/f_lstm_lem_low.csv'
train_p.to_csv(train_out, index=False)
test_p.reset_index().to_csv(test_out, index=False)

#### Capsule

In [None]:
# Settings passed to CV_predictor.
# NOTE(review): presumably `epochs` is iterated once per training epoch and
# `n_splits` is the number of folds - confirm in bin.models.CV_predictor.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
batch_size = 128
n_splits = 10
epochs = range(4)

capsule_kwargs = dict(
    maxlen=maxlen,
    max_features=max_features,
    embed_size=embed_size,
    embedding_matrix=embedding_matrix,
    rate_drop_dense=0.3,
    Num_capsule=10,
    Dim_capsule=16,
    Routings=5,
    gru_len=128,
)

cv = CV_predictor(
    CAPSULE_model, x_train, y_train, x_test,
    n_splits, batch_size, epochs, list_classes, capsule_kwargs,
)
cv.predict()

train_p, test_p = cv.train_predictions, cv.test_predictions
# Index the test predictions by comment id so reset_index() writes it as a column.
test_p.index = test['id']

train_out = '/home/ser/DL/toxic/train_predictions/f_capsule_lem_low.csv'
test_out = '/home/ser/DL/toxic/test_predictions/f_capsule_lem_low.csv'
train_p.to_csv(train_out, index=False)
test_p.reset_index().to_csv(test_out, index=False)