In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd
import os,sys,time,datetime

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
#warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

# Directory layout: per-fold level-0 inputs are read from InputDir, level-1
# outputs are written under OutputDir.
DataBaseDir = '../../data/version2'
InputDir = '%s/l0/kfold' % DataBaseDir
OutputDir = '%s/l1' % DataBaseDir
kfold = 4
strategy = 'bi-gru-num'

# Load a 10% random sample of every fold's validation split and tag each row
# with the fold it came from; the concatenation of those samples is TrainData.
# The test set is read once, from fold 0's directory, and sampled the same way.
start = time.time()
valid_dfs = []
for fold in range(kfold):
    FoldInputDir = '%s/%s' % (InputDir, fold)
    valid_path = '%s/valid.csv' % FoldInputDir
    valid = pd.read_csv(valid_path).reset_index(drop=True)
    valid = valid.sample(frac=0.1)
    if fold == 0:
        test_path = '%s/test.csv' % FoldInputDir
        TestData = pd.read_csv(test_path).reset_index(drop=True)
        TestData = TestData.sample(frac=0.1)
    valid['fold'] = fold
    valid_dfs.append(valid)
    print('load data for fold %s done.' % fold)
TrainData = pd.concat(valid_dfs, axis=0, ignore_index=True)
end = time.time()
elapsed = end - start
print('load data done, train %s, time elapsed %s' % (len(TrainData), elapsed))
##
def get_coefs(word, *arr):
    """Pair a word with its coefficients converted to a float32 array.

    Args:
        word: the token the coefficients belong to; returned unchanged.
        *arr: the coefficient values (numbers or numeric strings).

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 array.
    """
    vector = np.array(arr, dtype=np.float32)
    return word, vector

def LoadEmbeddingVectors(f, limit=1000):
    """Load word vectors from a space-delimited text embedding file.

    Each data line is expected to look like ``word v1 v2 ... vN``.

    Args:
        f: path of the embedding file.
        limit: maximum number of words to load. The default of 1000 is a
            debug-sized cap; pass ``None`` to load the whole file.

    Returns:
        dict mapping each word to a 1-D float32 numpy array.
    """
    EmbeddingDict = {}
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(f, 'r', encoding='utf-8') as i_file:
        for line_no, line in enumerate(i_file):
            if limit is not None and len(EmbeddingDict) >= limit:
                break
            fields = line.rstrip().split(' ')
            # A first line made of exactly two integers is a "<count> <dim>"
            # header, not a word vector; storing it would add a bogus
            # one-element vector under the key "<count>".
            if line_no == 0 and len(fields) == 2 and all(p.isdigit() for p in fields):
                continue
            # Skip blank or value-less lines instead of storing empty vectors.
            if len(fields) < 2:
                continue
            EmbeddingDict[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return EmbeddingDict

class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC after an epoch.

    The training loop below instantiates this class, so it must be defined
    (it was previously commented out, which raises NameError on a fresh run).

    Args:
        validation_data: ``(X_val, y_val)`` pair used for scoring. It must
            contain exactly two elements; the empty default cannot be unpacked.
        interval: score every ``interval``-th epoch (0-based epoch index).
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself; super(Callback, self)
        # would skip Callback.__init__ and start from its parent.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation inputs and print the ROC-AUC score."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

# Label columns predicted by the model (one sigmoid output per label).
targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
EmbeddingFile = '../../data/raw/crawl-300d-2M.vec'

# Model / training hyper-parameters.
max_features = 30000
maxlen = 150
# Smaller alternatives, kept for reference:
#max_features = 3000
#maxlen = 10
embed_size = 300
batch_size = 32
epochs = 2

# Load the pre-trained word vectors and report how many were read.
start = time.time()
EmbeddingIndex = LoadEmbeddingVectors(EmbeddingFile)
end = time.time()
elapsed = end - start
print('load embedding features done, corpus size %s, time elapsed %s' % (len(EmbeddingIndex), elapsed))

# One prediction column per target, prefixed with the strategy name;
# create them on the test frame initialised to zero.
pred_cols = ['_'.join((strategy, label)) for label in targets]
for c in pred_cols:
    TestData[c] = 0.0

def get_num_feats(text_df):
    """Add count-based numeric features computed from ``comment_text``.

    The frame is modified in place and also returned. Columns added:

    * ``num_words``  - number of whitespace-delimited tokens
    * ``num_comas``  - number of ``.`` characters (periods, despite the name)
    * ``num_bangs``  - number of ``!`` characters
    * ``num_quotas`` - number of ``"`` characters
    * ``avg_word``   - text length divided by ``1 + num_words``

    Args:
        text_df: DataFrame with a string column ``comment_text``.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    comments = text_df['comment_text']
    # Raw strings keep the regex escapes out of Python's own escape handling
    # ('\S' and '\.' in a plain literal are invalid escape sequences).
    text_df['num_words'] = comments.str.count(r'\S+')
    text_df['num_comas'] = comments.str.count(r'\.')
    text_df['num_bangs'] = comments.str.count(r'!')
    text_df['num_quotas'] = comments.str.count(r'"')
    # The +1 avoids division by zero for empty comments.
    text_df['avg_word'] = comments.str.len() / (1 + text_df['num_words'])

    return text_df

## numeric features: derive them for train and test, then min-max scale both
## frames with a scaler fitted on their concatenation
num_feats = ['num_words','num_comas','num_bangs','num_quotas','avg_word']
TrainData = get_num_feats(TrainData)
TestData = get_num_feats(TestData)
entire_num_df = pd.concat(
    [TrainData[num_feats], TestData[num_feats]],
    axis=0,
    ignore_index=True,
)
scaler = MinMaxScaler()
scaler.fit(entire_num_df)
for frame in (TrainData, TestData):
    frame[num_feats] = scaler.transform(frame[num_feats].values)

## tokenized features: the vocabulary is fitted on train + test comments
tokenizer = text.Tokenizer(num_words=max_features)
EntireCorpus = TrainData['comment_text'].tolist() + TestData['comment_text'].tolist()
tokenizer.fit_on_texts(EntireCorpus)

## embedding with pre-trained embedding library
word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (row 0 is left as the all-zero padding row),
# so a vocabulary of V words needs rows 0..V.
nb_words = min(max_features, len(word_index) + 1)
# Allocate max_features rows so the matrix always matches the
# Embedding(max_features, embed_size) layer it is loaded into, even when the
# fitted vocabulary is smaller than max_features.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = EmbeddingIndex.get(word)
    # Words without a pre-trained vector keep their all-zero row. Vectors of
    # the wrong length are skipped too: assigning e.g. a one-element vector
    # would silently broadcast it across the whole row.
    if embedding_vector is not None and len(embedding_vector) == embed_size:
        embedding_matrix[i] = embedding_vector

def get_model(X_input):
    """Build and compile the two-input bidirectional-GRU classifier.

    The token branch embeds the padded word-index sequence with the
    pre-built ``embedding_matrix``, applies spatial dropout and a
    bidirectional GRU, then average- and max-pools over time. The pooled
    vectors are concatenated with the numeric features and fed to a
    6-unit sigmoid output layer.

    Args:
        X_input: dict whose ``'num_feats'`` entry is a 2-D array; only its
            second dimension is read, to size the numeric input.

    Returns:
        A compiled Keras ``Model`` taking ``[token_words, num_feats]``.
    """
    n_num_feats = X_input['num_feats'].shape[1]

    # the two named inputs: token ids and numeric features
    tokens_in = Input(shape=(maxlen, ), name='token_words')
    nums_in = Input(shape=[n_num_feats], name="num_feats")

    # token branch
    seq = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens_in)
    seq = SpatialDropout1D(0.25)(seq)
    seq = Bidirectional(GRU(80, return_sequences=True))(seq)
    pooled = [GlobalAveragePooling1D()(seq), GlobalMaxPooling1D()(seq)]

    # merge the pooled GRU outputs with the numeric features
    merged = concatenate(pooled + [nums_in])
    probs = Dense(6, activation="sigmoid")(merged)

    model = Model(inputs=[tokens_in, nums_in], outputs=probs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

## k-fold training / evaluation.
## TestData[pred_cols] is the running sum of the per-fold test predictions; it
## is divided by kfold after the loop to obtain the fold-averaged prediction.
cv_score = .0
TestData[pred_cols] = .0
start = time.time()

## the test inputs do not depend on the fold, so build them once
X_test_input = {
    'token_words': sequence.pad_sequences(
        tokenizer.texts_to_sequences(TestData['comment_text'].values), maxlen= maxlen),
    'num_feats': TestData[num_feats].values,
}

for fold in range(kfold):
    print('====== fold %s ======\n' % fold)
    FoldData = {
        'train': TrainData[TrainData['fold'] != fold],
        # Copies, because prediction columns are written into these frames:
        # - 'valid' would otherwise be a slice of TrainData (SettingWithCopyWarning);
        # - 'test' must not alias TestData, or the per-fold writes would reset
        #   and overwrite the running sum of test predictions.
        'valid': TrainData[TrainData['fold'] == fold].copy(),
        'test': TestData.copy(),
    }
    for c in pred_cols:
        FoldData['valid'][c] = .0
        FoldData['test'][c] = .0
    ## input for X
    X_train_input = {}
    X_valid_input = {}
    ## tokenize with the tokenizer fitted on the entire train/valid/test corpus
    X_train_input['token_words'] = tokenizer.texts_to_sequences(FoldData['train']['comment_text'].values)
    X_valid_input['token_words'] = tokenizer.texts_to_sequences(FoldData['valid']['comment_text'].values)

    X_train_input['token_words'] = sequence.pad_sequences(X_train_input['token_words'], maxlen= maxlen)
    X_valid_input['token_words'] = sequence.pad_sequences(X_valid_input['token_words'], maxlen= maxlen)

    ## num data
    X_train_input['num_feats'] = FoldData['train'][num_feats].values
    X_valid_input['num_feats'] = FoldData['valid'][num_feats].values

    ## input for Y
    Y_train = FoldData['train'][targets].values
    Y_valid = FoldData['valid'][targets].values

    ## construct bi-gru model
    model = get_model(X_train_input)
    RocAuc = RocAucEvaluation(validation_data= (X_valid_input, Y_valid), interval=1)
    hist = model.fit(X_train_input, Y_train,
                     batch_size= batch_size,
                     epochs= epochs,
                     validation_data= (X_valid_input, Y_valid),
                     callbacks=[RocAuc],
                     verbose=2)
    end = time.time()
    print('fitting done, time elapsed %s.' % (end - start))
    ## predict for valid
    pred_valid = model.predict(X_valid_input, batch_size=1024)
    FoldData['valid'][pred_cols] = pred_valid
    ## predict for test: keep this fold's predictions in the fold copy and add
    ## them to the running sum held in TestData
    pred_test = model.predict(X_test_input, batch_size=1024)
    FoldData['test'][pred_cols] = pred_test
    TestData[pred_cols] += pred_test
    ## evaluate
    score = roc_auc_score(FoldData['valid'][targets], FoldData['valid'][pred_cols])
    cv_score += score
    ## output
    FoldOutputDir = '%s/kfold/%s' % (OutputDir, fold)
    os.makedirs(FoldOutputDir, exist_ok=True)
    for mod in ['valid', 'test']:
        if(mod == 'test'):
            out_cols = ['id'] + pred_cols
        else:
            out_cols = pred_cols + targets
        FoldData[mod][out_cols].to_csv('%s/%s_%s.csv' % (FoldOutputDir, mod, strategy),float_format='%.8f', index= False)
    end = time.time()
    print('fold %s, score %.5f, time elapsed %.2fs' % (fold, score, (end - start)))
cv_score /= kfold
TestData[pred_cols] /= kfold
end = time.time()
print('\n================')
print('cv score %.5f,  time elapsed %s' % (cv_score, (end - start)))
print('================')

## submit: write the averaged test predictions under the original target
## names, then zip the csv with the external `zip` command
SubmitDir = '%s/l0/submit' % DataBaseDir
OutputFileName = '%s_submit_%s' % (strategy, datetime.datetime.now().strftime("%Y-%m-%d"))
if not os.path.exists(SubmitDir):
    os.makedirs(SubmitDir)

sub = TestData[['id']].copy()
sub[targets] = TestData[pred_cols]
csv_path = '%s/%s.csv' % (SubmitDir, OutputFileName)
sub.to_csv(csv_path, float_format='%.8f', index=False)

zip_cmd = 'zip %s/%s.zip %s/%s.csv' % (SubmitDir, OutputFileName, SubmitDir, OutputFileName)
print(zip_cmd)
os.system(zip_cmd)

load data for fold 0 done.
load data for fold 1 done.
load data for fold 2 done.
load data for fold 3 done.
load data done, train 15958, time elapsed 1.6504640579223633
load embedding features done, corpus size 1000, time elapsed 0.08364319801330566



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Epoch 1/2


KeyboardInterrupt: 