In [5]:
import numpy as np
np.random.seed(42)
import pandas as pd
import os,sys,time,datetime

import re
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from scipy.special import erfinv

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras.optimizers import Adam

import warnings
#warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

# Contraction replacement patterns
cont_patterns = [
    (b'(W|w)on\'t', b'will not'),
    (b'(C|c)an\'t', b'can not'),
    (b'(I|i)\'m', b'i am'),
    (b'(A|a)in\'t', b'is not'),
    (b'(\w+)\'ll', b'\g<1> will'),
    (b'(\w+)n\'t', b'\g<1> not'),
    (b'(\w+)\'ve', b'\g<1> have'),
    (b'(\w+)\'s', b'\g<1> is'),
    (b'(\w+)\'re', b'\g<1> are'),
    (b'(\w+)\'d', b'\g<1> would'),
]
patterns = [(re.compile(regex), repl) for (regex, repl) in cont_patterns]

def count_regexp_occ(regexp="", text=None):
    """ Simple way to get the number of occurence of a regex"""
    return len(re.findall(regexp, text))

DataBaseDir = '../../data/version2'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/l1' % DataBaseDir
kfold = 4
strategy = 'cnn-num'
# load data
start = time.time()
valid_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid = pd.read_csv('%s/valid.csv' % FoldInputDir).reset_index(drop= True)#.sample(frac= 0.1)
    ## for valid/holdout data set
    if(fold == 0):
        TestData = pd.read_csv('%s/test.csv' % FoldInputDir).reset_index(drop= True)#.sample(frac= 0.1)
    valid['fold'] = fold
    valid_dfs.append(valid)
    print('load data for fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, axis= 0, ignore_index= True)
end = time.time()
print('load data done, train %s, time elapsed %s' % (len(TrainData), (end - start)))
##
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

def LoadEmbeddingVectors(f):
    ## debug
    k = 1000
    EmbeddingDict = {}
    with open(f, 'r') as i_file:
        for line in i_file:
            #if(k == 0):
            #    break
            w, coe_vec= get_coefs(*line.rstrip().rsplit(' '))
            EmbeddingDict[w] = coe_vec
            k -= 1
    i_file.close()
    return EmbeddingDict

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
EmbeddingFile = '../../data/raw/crawl-300d-2M.vec'
max_features = 80000
maxlen = 256
#max_features = 3000
#maxlen = 10
embed_size = 300
batch_size = 128
epochs = 2
start = time.time()
EmbeddingIndex = LoadEmbeddingVectors(EmbeddingFile)
end = time.time()
print('load embedding features done, corpus size %s, time elapsed %s' % (len(EmbeddingIndex), (end - start)))

pred_cols = ['%s_%s' % (strategy, c) for c in targets]
for c in pred_cols:
    TestData[c] = .0

def get_num_feats(df):
    # Get length in words and characters
    df["raw_word_len"] = df["comment_text"].apply(lambda x: len(x.split())) + 1
    df["raw_char_len"] = df["comment_text"].apply(lambda x: len(x)) + 1
    # Count number of \n
    df["ant_slash_n"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\n", x))
    df['ant_slash_n_ratio'] = df["ant_slash_n"]/df["raw_char_len"]
    # Check number of upper case, if you're angry you may write in upper case
    df["nb_upper"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[A-Z]", x))
    df['nb_upper_ratio'] = df["nb_upper"]/df["raw_char_len"]
    # Number of F words - f..k contains folk, fork,
    df["nb_fk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[Ff]\S{2}[Kk]", x))
    df['nb_fk_ratio'] = df["nb_fk"]/df['raw_word_len']
    # Number of S word
    df["nb_sk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[Ss]\S{2}[Kk]", x))
    df['nb_sk_ratio'] = df["nb_sk"]/df['raw_word_len']
    # Number of D words
    df["nb_dk"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"[dD]ick", x))
    df['nb_dk_ratio'] = df['nb_dk']/df['raw_word_len']
    # Number of occurence of You, insulting someone usually needs someone called : you
    df["nb_you"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\W[Yy]ou\W", x))
    df['nb_you_ratio'] = df["nb_you"]/df['raw_word_len']
    
    # Just to check you really refered to my mother ;-)
    df["nb_mother"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\Wmother\W", x))
    df['nb_mother_ratio'] = df["nb_mother"]/df['raw_word_len']
    # Just checking for toxic 19th century vocabulary
    df["nb_ng"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\Wnigger\W", x))
    df['nb_ng_ratio'] = df["nb_ng"]/df['raw_word_len']
    # Some Sentences start with a <:> so it may help
    df["start_with_columns"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"^\:+", x))
    df['start_with_columns_ratio'] = df["start_with_columns"]/(1 + df["ant_slash_n"])
    
    df['num_question_marks'] = df['comment_text'].apply(lambda comment: comment.count('?'))
    df['question_mask_ratio'] = df['num_question_marks']/df["raw_char_len"]
    
    df['num_exclamation_marks'] = df['comment_text'].apply(lambda comment: comment.count('!'))
    df['exclamation_mark_ratio'] = df['num_exclamation_marks']/df["raw_char_len"]
    
    df['num_punctuation'] = df['comment_text'].apply( lambda comment: sum(comment.count(w) for w in '.,;:'))
    df['punctuation_ratio'] = df['num_punctuation']/df["raw_char_len"]
    
    df['imcomplete_punctuation'] = df['comment_text'].apply( lambda comment: sum(comment.count(w) for w in '*,#,$'))
    df['imcomplete_punctuation_ratio'] = df['imcomplete_punctuation']/df["raw_char_len"]
    
    
    ##
    df['num_unique_words'] = df['comment_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['raw_word_len']
    df['num_smilies'] = df['comment_text'].apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
    df['similes_ratio'] = df['num_smilies'] / df['raw_word_len']
    
    df["count_standard_punctuations"] = df["comment_text"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
    df["standard_punctuations_ratio"] = df["count_standard_punctuations"]/df['raw_char_len']
    df["count_words_title"] = df["comment_text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
    df["words_title_ratio"] = df["count_words_title"]/df['raw_word_len']
    
    df['unique_words_greater_200'] = (df['num_unique_words'] > 200).astype(int)
    
    # Check for time stamp
    df["has_timestamp"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\d{2}|:\d{2}", x))
    # Check for dates 18:44, 8 December 2010
    df["has_date_long"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\D\d{2}:\d{2}, \d{1,2} \w+ \d{4}", x))
    # Check for date short 8 December 2010
    df["has_date_short"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\D\d{1,2} \w+ \d{4}", x))
    # Check for http links
    df["has_http"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"http[s]{0,1}://\S+", x))
    # check for mail
    df["has_mail"] = df["comment_text"].apply(
        lambda x: count_regexp_occ(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', x)
    )
#     # Looking for words surrounded by == word == or """" word """"
    df["has_emphasize_equal"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\={2}.+\={2}", x))
    df["has_emphasize_quotes"] = df["comment_text"].apply(lambda x: count_regexp_occ(r"\"{4}\S+\"{4}", x))
    
    return df

TrainData = get_num_feats(TrainData)
TestData = get_num_feats(TestData)

## numeric features
tmp_cols = ['id', 'fold', 'comment_text']
tmp_cols.extend(targets)
num_feats = [c for c in TrainData.columns if(c not in tmp_cols)]

# print(num_feats)
# sys.exit(1)

## MinMax Normalization
# entire_num_df = pd.concat([TrainData[num_feats], TestData[num_feats]], axis= 0, ignore_index= True)
# scaler = MinMaxScaler().fit(entire_num_df)
# TrainData[num_feats] = scaler.transform(TrainData[num_feats].values)
# TestData[num_feats] = scaler.transform(TestData[num_feats].values)

## Guass Rank Normalization
TrainData['source'] = 'train'
TestData['source'] = 'test'
# tmp_cols = num_feats.copy()
# tmp_cols.append('source')
all_data = pd.concat([TrainData, TestData], ignore_index= True)
for c in num_feats:
    rank = np.argsort(all_data[c], axis= 0)
    upper = np.max(rank)
    lower = np.min(rank)
    # linear normalization to 0-1
    all_data[c] = (all_data[c] - lower)/(upper - lower)
    # gauss normalization
    all_data[c] = erfinv(all_data[c])
    all_data[c] -= np.mean(all_data[c])

TrainData = all_data[all_data['source'] == 'train'].drop(['source'], axis= 1)
TestData = all_data[all_data['source'] == 'test'].drop(['source'], axis= 1)

## tokenized features
tokenizer = text.Tokenizer(num_words= max_features)
EntireCorpus = list(TrainData['comment_text'].values) + list(TestData['comment_text'].values)
tokenizer.fit_on_texts(EntireCorpus)

## embedding with pre-trained embedding library
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = EmbeddingIndex.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

def get_model(X_input):
    # input for token words
    inp_token = Input(shape=(maxlen, ), name= 'token_words')
    # input for num feats
    inp_num = Input(shape=[X_input['num_feats'].shape[1]], name= "num_feats")
    # embedding for token words
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp_token)
    x = SpatialDropout1D(0.5)(x)
    
    x1 = Conv1D(64, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    
    x2 = Conv1D(128, kernel_size = 2, padding = "valid", kernel_initializer = "glorot_uniform")(x)

    x3 = Conv1D(32, kernel_size = 4, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    
    x4 = Conv1D(16, kernel_size = 5, padding = "valid", kernel_initializer = "glorot_uniform")(x)
    
    avg_pool1 = GlobalAveragePooling1D()(x1)
    max_pool1 = GlobalMaxPooling1D()(x1)
    
    avg_pool2 = GlobalAveragePooling1D()(x2)
    max_pool2 = GlobalMaxPooling1D()(x2)
    
    avg_pool3 = GlobalAveragePooling1D()(x3)
    max_pool3 = GlobalMaxPooling1D()(x3)
    
    avg_pool4 = GlobalAveragePooling1D()(x4)
    max_pool4 = GlobalMaxPooling1D()(x4)
    
    # concate results of bi-gru with num features
    conc = concatenate([avg_pool1, max_pool1, 
                        avg_pool2, max_pool2,
                        avg_pool3, max_pool3,
                        avg_pool4, max_pool4,
                        inp_num])
    outp = Dense(6, activation="sigmoid")(conc)
    # 
    model = Model(inputs= [inp_token, inp_num], outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer= Adam(lr=1e-3),
                  metrics=['accuracy'])
    return model

##
cv_score = .0
start = time.time()
for fold in range(kfold):
    print('====== fold %s ======\n' % fold)
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        'valid': TrainData[TrainData['fold'] == fold],
        'test': TestData
    }
    for c in pred_cols:
        FoldData['valid'][c] = .0
        FoldData['test'][c] = .0
    ## input for X
    X_train_input = {}
    X_valid_input = {}
    X_test_input = {}
    ## tokenize with entire corpus composed by train/valid/test
    X_train_input['token_words'] = tokenizer.texts_to_sequences(FoldData['train']['comment_text'].values)
    X_valid_input['token_words'] = tokenizer.texts_to_sequences(FoldData['valid']['comment_text'].values)
    X_test_input['token_words'] = tokenizer.texts_to_sequences(FoldData['test']['comment_text'].values)
    
    X_train_input['token_words'] = sequence.pad_sequences(X_train_input['token_words'], maxlen= maxlen)
    X_valid_input['token_words'] = sequence.pad_sequences(X_valid_input['token_words'], maxlen= maxlen)
    X_test_input['token_words'] = sequence.pad_sequences(X_test_input['token_words'], maxlen= maxlen)

    ## num data
    X_train_input['num_feats'] = FoldData['train'][num_feats].values
    X_valid_input['num_feats'] = FoldData['valid'][num_feats].values
    X_test_input['num_feats'] = FoldData['test'][num_feats].values
    
    ## input for Y
    Y_train = FoldData['train'][targets].values
    Y_valid = FoldData['valid'][targets].values
    
    ## construct bi-gru model
    model = get_model(X_train_input)
    RocAuc = RocAucEvaluation(validation_data= (X_valid_input, Y_valid), interval=1)
    hist = model.fit(X_train_input, Y_train, 
                     batch_size= batch_size, 
                     epochs= epochs, 
                     validation_data= (X_valid_input, Y_valid),
                     callbacks=[RocAuc],
                     verbose=2)
    end = time.time()
    print('fitting done, time elapsed %s.' % (end - start))
    ## predict for valid
    pred_valid = model.predict(X_valid_input, batch_size=1024)
    FoldData['valid'][pred_cols] = pred_valid
    ## predict for test
    pred_test = model.predict(X_test_input, batch_size=1024)
    FoldData['test'][pred_cols] = pred_test
    TestData[pred_cols] += pred_test
    ## evaluate
    score = roc_auc_score(FoldData['valid'][targets], FoldData['valid'][pred_cols])
    cv_score += score
    ## output
    FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
    if(os.path.exists(FoldOutputDir) == False):
        os.makedirs(FoldOutputDir)
    for mod in ['valid', 'test']:
        if(mod == 'test'):
            out_cols = ['id']
            out_cols.extend(pred_cols)
        else:
            out_cols = pred_cols.copy()
            out_cols.extend(targets)
        FoldData[mod][out_cols].to_csv('%s/%s_%s.csv' % (FoldOutputDir, mod, strategy),float_format='%.8f', index= False) 
    end = time.time()
    print('fold %s, score %.5f, time elapsed %.2fs' % (fold, score, (end - start)))
cv_score /= kfold
TestData[pred_cols] /= kfold
end = time.time()

print('\n================')
print('cv score %.5f,  time elapsed %s' % (cv_score, (end - start)))
print('================')

## submit
sub = TestData[['id']].copy()
sub[targets] = TestData[pred_cols]
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
SubmitDir = '%s/l0/submit' % DataBaseDir
if(os.path.exists(SubmitDir) == False):
    os.makedirs(SubmitDir)
sub.to_csv('%s/%s.csv' % (SubmitDir, OutputFileName), float_format='%.8f', index=False)
print('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))
os.system('zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName))

load data for fold 0 done.
load data for fold 1 done.
load data for fold 2 done.
load data for fold 3 done.
load data done, train 159571, time elapsed 1.0719916820526123
load embedding features done, corpus size 2000000, time elapsed 76.90698409080505



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Train on 119662 samples, validate on 39909 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.985881 

31s - loss: 0.0632 - acc: 0.9781 - val_loss: 0.0465 - val_acc: 0.9822
Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.988516 

30s - loss: 0.0421 - acc: 0.9840 - val_loss: 0.0482 - val_acc: 0.9815
Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.988644 

30s - loss: 0.0369 - acc: 0.9856 - val_loss: 0.0432 - val_acc: 0.9834
fitting done, time elapsed 104.09131240844727.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


fold 0, score 0.98864, time elapsed 111.42s

Train on 119671 samples, validate on 39900 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.985084 

31s - loss: 0.0615 - acc: 0.9789 - val_loss: 0.0478 - val_acc: 0.9821
Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.987385 

30s - loss: 0.0420 - acc: 0.9839 - val_loss: 0.0427 - val_acc: 0.9836
Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.987659 

30s - loss: 0.0366 - acc: 0.9858 - val_loss: 0.0453 - val_acc: 0.9826
fitting done, time elapsed 215.92129468917847.
fold 1, score 0.98766, time elapsed 223.33s

Train on 119684 samples, validate on 39887 samples
Epoch 1/3

 ROC-AUC - epoch: 1 - score: 0.982749 

31s - loss: 0.0616 - acc: 0.9789 - val_loss: 0.0480 - val_acc: 0.9822
Epoch 2/3

 ROC-AUC - epoch: 2 - score: 0.985942 

30s - loss: 0.0421 - acc: 0.9839 - val_loss: 0.0434 - val_acc: 0.9834
Epoch 3/3

 ROC-AUC - epoch: 3 - score: 0.985557 

30s - loss: 0.0368 - acc: 0.9857 - val_loss: 0.0451 - val_acc: 0.9829
fitting done, time elapsed 327.9664

0