In [1]:
%load_ext autoreload
%autoreload 2
from __future__ import print_function
import os,sys
sys.path.append('../')

## Math and dataFrame
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

import matplotlib.pyplot as plt

#ML
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score, confusion_matrix
from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Input, LSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Model
from keras.preprocessing.text import Tokenizer


Using TensorFlow backend.


In [2]:
# Target columns; the same order is used for the training targets and the submission file
label_cols = ['toxic', 'severe_toxic', 'obscene',  'threat', 'insult', 'identity_hate']
COMMENT = 'comment_text'

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# 'dirtyness' = number of labels set on a comment (0 means a clean comment).
# Summing the label columns by name is vectorised and does not depend on column position.
train['dirtyness'] = train[label_cols].sum(axis = 1)
# For the test frame keep the positional rule "everything after the first two columns".
# NOTE(review): presumably test.csv carries no label columns, so this is all zeros - confirm.
test['dirtyness'] = test.iloc[:, 2:].sum(axis = 1)

# Assign the filled column back: fillna(inplace=True) on a column selection is a chained
# in-place update that may act on a temporary copy and leave the frame unchanged.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

print("train set len ", len(train) )
print("test set len ", len(test) )
print("clean samples", int((train['dirtyness'] == 0).sum()))
print("toxic samples", int((train['dirtyness'] != 0).sum()))

train set len  159571
test set len  153164
clean samples 143346
toxic samples 16225


In [3]:
# Model family built by buildModel, and the list of feature strategies:
# entry 0 selects the embedding strategy, an optional entry 1 selects the second (auxiliary) input.
modelType = 'CNN1d'
modelSubTypeList = ['MultiClass_Embedding_Random']

def secondInput(modelSubTypeList):
    '''Return the name of the second (auxiliary) input strategy, or None.

    :modelSubTypeList: list of sub-type names; index 1, when present, names the second input
    :returns: modelSubTypeList[1], or None when the list has fewer than two entries
              (an empty list used to raise IndexError)
    '''
    if len(modelSubTypeList) < 2:
        return None
    return modelSubTypeList[1]

modelSubType = '+'.join(modelSubTypeList)
reSample = False
runNumber = 1
modelName = modelType + '_' + modelSubType + '_' + str(reSample) + '_arch' + str(runNumber)

# training hyper parameters
batch_size = 64
epochs = 8

# tokenizer / padding
max_features = 20000   # vocabulary size kept by the Tokenizer and the Embedding layer
maxlen = 120           # every sequence is padded / truncated to this length

# network dimensions
embedding_dims = 10
filters = 12
hidden_dims = 2
kernel_size = 10

In [4]:
#preprocessing
# Lower-case each corpus once and reuse it for fitting and for sequence conversion.
train_text = train.comment_text.str.lower()
test_text = test.comment_text.str.lower()

# The vocabulary is fitted on the train and test text together.
raw_text = np.hstack([train_text, test_text])
tok_raw = Tokenizer(num_words=max_features)
tok_raw.fit_on_texts(raw_text)

train["seq"] = tok_raw.texts_to_sequences(train_text)
test["seq"] = tok_raw.texts_to_sequences(test_text)

# Pad / truncate every sequence to maxlen with one pad_sequences call per frame
# (instead of one call per row); each cell of 'seq_pad' holds one 1-D array of length maxlen.
train["seq_pad"] = pd.Series(list(sequence.pad_sequences(train["seq"].tolist(), maxlen=maxlen)),
                             index=train.index)
test["seq_pad"] = pd.Series(list(sequence.pad_sequences(test["seq"].tolist(), maxlen=maxlen)),
                            index=test.index)


In [6]:
# FE factory
def FEEmbedding(EmbeddingStrategy):
    '''Return the embedding matrix for the requested strategy.

    'MultiClass_Embedding_Random' has no pre-built matrix, so None is returned.
    Every other strategy name raises ValueError.
    '''
    # guard clause: only one strategy is known
    if EmbeddingStrategy != 'MultiClass_Embedding_Random':
        raise ValueError("Undefined embedding strategy")
    return None
        
def secondFE(secondFEStrategy):
    '''Build the optional second (auxiliary) model input for the train and test sets.

    :secondFEStrategy: None (no auxiliary input), 'KeyWordTermFreq' or 'tfIdf'
    :returns: (trn_2nd_input, test_2nd_input, aux_input_dim). All three are None when
              secondFEStrategy is None; otherwise the first two are the train / test row
              slices of the term-document matrix and the third is its number of columns.
    :raises ValueError: for any other strategy name (previously this silently
              returned three Nones)

    Reads the notebook-level frames `train` and `test`.
    '''
    if secondFEStrategy is None:
        return None, None, None
    if secondFEStrategy not in ('KeyWordTermFreq', 'tfIdf'):
        raise ValueError("Undefined second feature extraction strategy")

    # imported lazily: runs without an auxiliary input do not need the module at all
    from models.FeatureExtraction import FeatureExtraction
    fe = FeatureExtraction()
    allComments = pd.concat([train, test])

    if secondFEStrategy == 'KeyWordTermFreq':
        keyword_dir = '../../ZhiHaoSun/'
        # NOTE(review): 'identity_hate_words.txt' appears twice and there is no
        # severe_toxic keyword file - looks like a copy/paste slip; confirm which
        # file was intended before changing the list.
        keyfiles = [
                keyword_dir + 'toxic_words.txt',
                keyword_dir + 'identity_hate_words.txt',
                keyword_dir + 'insult_words.txt',
                keyword_dir + 'obscene_words.txt',
                keyword_dir + 'threat_words.txt',
                keyword_dir + 'identity_hate_words.txt',
                ]
        term_doc = fe.tfKeyWordEnsemble(
                allComments, n_feature = 80000, vocabfile = keyfiles,
                COMMENT = 'comment_text'
                )
    else:
        term_doc = fe.tfIdf(allComments, 'comment_text')

    # rows were stacked train-first, so the first len(train) rows are the training part
    term_doc = term_doc.tocsr()
    trn_2nd_input = term_doc[0:len(train), :]
    test_2nd_input = term_doc[len(train)::, :]
    aux_input_dim = trn_2nd_input.shape[1]

    return trn_2nd_input, test_2nd_input, aux_input_dim


# embedding matrix for the first sub-type entry
embMatrix = FEEmbedding(modelSubTypeList[0])
print(">> EmbMatrix of type {}".format(type(embMatrix)))

# secondInput() yields None for a single-entry list and entry 1 otherwise,
# which is exactly the strategy name secondFE expects
secondFEStrategy = secondInput(modelSubTypeList)
trn_2nd_input, test_2nd_input, aux_input_dim = secondFE(secondFEStrategy)
print(">> 2ndInput of type {}".format(type(trn_2nd_input)))



>> EmbMatrix of type <type 'NoneType'>
>> 2ndInput of type <type 'NoneType'>


In [7]:
#model factory
def buildModel(modelType):
    '''Build and compile a Keras model with a 6-unit sigmoid output.

    :modelType: 'CNN1d' -> Embedding + Conv1D + global max pooling + one hidden layer;
                'LSTM'  -> Embedding + bidirectional LSTM + global max pooling; when
                secondInput(modelSubTypeList) is not None a second dense input of
                width aux_input_dim is concatenated before the hidden layer
    :returns: the compiled model (binary cross-entropy loss, adam optimizer, accuracy metric)
    :raises ValueError: if modelType is neither 'CNN1d' nor 'LSTM'

    Reads the notebook-level settings max_features, embedding_dims, maxlen, filters,
    kernel_size and hidden_dims (plus modelSubTypeList / aux_input_dim for 'LSTM').
    '''
    if modelType == 'CNN1d':

        model = Sequential()

        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        model.add(Embedding(max_features,
                            embedding_dims,
                            input_length=maxlen))
        model.add(Dropout(0.3))

        # we add a Convolution1D, which will learn `filters`
        # word group filters of size kernel_size:
        model.add(Conv1D(filters,
                         kernel_size,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(BatchNormalization())
        # we use max pooling:
        model.add(GlobalMaxPooling1D())

        # We add a vanilla hidden layer:
        model.add(Dense(hidden_dims))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))
        model.add(Activation('relu'))

        # We project onto a 6-unit output layer and squash every unit with a sigmoid:
        model.add(Dense(6))
        model.add(Activation('sigmoid'))

        # define metrics and compile model
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

    elif (modelType == 'LSTM') and (secondInput(modelSubTypeList) is None) :
        inp = Input(shape=(maxlen,))
        x = Embedding(max_features, embedding_dims)(inp)
        x = Bidirectional(LSTM(hidden_dims, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        x = GlobalMaxPool1D()(x)
        x = BatchNormalization()(x)

        #x = Dense(hidden_dims, activation="relu")(x)
        x = Dropout(0.5)(x)
        x = Dense(6, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    elif (modelType == 'LSTM') and (secondInput(modelSubTypeList) is not None):
        # Imported here because the bare name `keras` is not bound by the
        # `from keras... import` lines at the top of the notebook.
        from keras.layers import concatenate

        # text branch
        inp = Input(shape=(maxlen,))
        x = Embedding(max_features, embedding_dims)(inp)
        x = Bidirectional(LSTM(hidden_dims, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(x)
        x = GlobalMaxPool1D()(x)
        x = BatchNormalization()(x)

        # auxiliary feature branch
        auxiliary_input = Input(shape=(aux_input_dim,), name='aux_input')
        y = Dense(4)(auxiliary_input)
        y = BatchNormalization()(y)
        y = Activation('relu')(y)
        y = Dropout(0.5)(y)

        # merge both branches
        x = concatenate([x, y])

        x = Dense(hidden_dims, activation="relu")(x)
        x = Dropout(0.5)(x)

        x = Dense(6, activation="sigmoid")(x)
        model = Model(inputs=[inp, auxiliary_input], outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    else:
        raise ValueError("undefined model type")

    return model

# one freshly built model per label column, keyed by the label name
modeldict = {}
for lc in label_cols:
    net = buildModel(modelType)
    modeldict[lc] = net
    net.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 10)           200000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 120, 10)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 111, 12)           1212      
_________________________________________________________________
batch_normalization_1 (Batch (None, 111, 12)           48        
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 12)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 26        
_________________________________________________________________
batch_normalization_2 (Batch (None, 2)                 8         
__________

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 120, 10)           200000    
_________________________________________________________________
dropout_11 (Dropout)         (None, 120, 10)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 111, 12)           1212      
_________________________________________________________________
batch_normalization_11 (Batc (None, 111, 12)           48        
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 12)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 26        
_________________________________________________________________
batch_normalization_12 (Batc (None, 2)                 8         
__________

In [8]:
# reSample factory
def reSampleToBalance(reSample, train, trn_2nd_input):
    '''Prepare the training matrix, targets, auxiliary input and sample weights.

    :reSample: True            -> resample through FeatureExtraction.reSample
                                  (only allowed when there is no second input)
               False           -> use the data unchanged
               "covCorrection" -> data unchanged, plus weights from
                                  FeatureExtraction.covarianceShiftCorrection computed
                                  on the tf-idf matrices of train and the global `test`
    :train: frame holding the 'seq_pad' column and the label_cols columns
    :trn_2nd_input: auxiliary training input, or None
    :returns: (trn_re, label_re, train2nd_re, trn_weights); trn_weights is None
              unless reSample == "covCorrection"
    :raises ValueError: for any other reSample value
    '''
    trainOrig = np.array(train['seq_pad'].tolist())
    assert(trainOrig.shape == (len(train), maxlen))
    trainSparse = scipy.sparse.csr_matrix(trainOrig)
    labels = train[label_cols].values

    # The auxiliary input is passed through unchanged by every strategy; binding it
    # up-front keeps the return statement valid for the reSample == True branch too.
    train2nd_re = trn_2nd_input
    trn_weights = None

    if reSample == True:
        print("reSample to balance")
        assert((secondInput(modelSubTypeList) is None))
        # `fe` is not defined at notebook level, so create the extractor here
        from models.FeatureExtraction import FeatureExtraction
        trn_re, label_re = FeatureExtraction().reSample( trainSparse , y = labels)

    elif reSample == False:
        print("Not reSample to balance")
        trn_re, label_re = trainSparse, labels

    elif reSample == "covCorrection":
        print("covCorrection")
        trn_re, label_re = trainSparse, labels

        from models.FeatureExtraction import FeatureExtraction
        fe = FeatureExtraction()
        # rows are stacked train-first, so the first len(train) rows belong to train
        term_doc = fe.tfIdf(pd.concat([train, test]), 'comment_text').tocsr()
        trn_term_doc = term_doc[0:len(train), :]
        test_term_doc = term_doc[len(train)::, :]

        trn_weights = fe.covarianceShiftCorrection(trn_term_doc, test_term_doc)

    else:
        raise ValueError("Undefined reSample strategy")

    return trn_re, label_re, train2nd_re, trn_weights




In [None]:
# train the model of every label, checkpointing the weights with the best validation loss
import keras

# The prepared inputs do not depend on the label, so build them once instead of
# once per loop iteration.
train_re, label_re, train2nd_re, trn_weights = reSampleToBalance(reSample, train, trn_2nd_input)

y_train = label_re
if secondInput(modelSubTypeList) is not None:
    assert(train2nd_re is not None)
    x_train = [train_re, train2nd_re]
else:
    x_train = train_re

for lc in label_cols:
    # add check point: keeps the weights of the epoch with the lowest validation loss on disk
    filepath= lc + "_weights_best.hdf5"
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
    callbacks_list = [checkpoint]

    # trn_weights is None unless reSample == "covCorrection"; previously the weights
    # were computed but never handed to fit().
    # NOTE(review): assumes covarianceShiftCorrection returns one weight per training row - confirm.
    modeldict[lc].fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2,
          sample_weight=trn_weights,
          callbacks=callbacks_list
             )

In [124]:
# implement Trained Model
from models.TrainedModel import TrainedModel
import modelDB

class TrainedModelCNNEmbedding(TrainedModel):
    '''TrainedModel implementation for a dict {topic: keras model}.

    predict builds a submission-shaped frame; _save / load serialise every model
    as a json architecture file plus an HDF5 weights file and record the file
    names in the model meta data frame.
    '''

    # column order of the submission file; the notebook's training targets use the same order
    LABEL_ORDER = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    def __init__(self, md = None):
        super(TrainedModelCNNEmbedding, self).__init__(md)

    def _outputColumn(self, topic, n_outputs):
        '''Index of the model output column holding the probability of `topic`.'''
        # A model with one output per label carries `topic` at its LABEL_ORDER position;
        # any other width (e.g. a single-output model) keeps the former "column 0" rule.
        if n_outputs == len(self.LABEL_ORDER) and topic in self.LABEL_ORDER:
            return self.LABEL_ORDER.index(topic)
        return 0

    def predict(self, test, **kwargs):
        '''
        predict the probability of every topic
        :test: padded sequence matrix, one row per sample
        :testid: (keyword, required) ids written into the 'id' column
        :returns: data frame with columns 'id' + LABEL_ORDER
        '''
        testid = kwargs['testid']

        print ("Predict using Model: ")
        for topic, submd in self.md.items():
            print( "{} - {}".format(topic, type(submd).__name__ ) )

        res = {}
        for topic, submd in self.md.items():
            print( " predicitng topic {}".format(topic))
            proba = np.asarray(submd.predict(test, batch_size = 1024))
            proba = proba.reshape(len(proba), -1)
            # Take the column of this topic. Taking column 0 unconditionally returned
            # the 'toxic' output for every topic when the model has six outputs.
            res[topic] = proba[:, self._outputColumn(topic, proba.shape[1])]

        #important to keep the order as required by the submission file
        dfres = pd.DataFrame(res)
        dfres['id'] = testid

        assert(dfres.shape[0] == test.shape[0])

        #reshape to submission file format
        dfres = dfres[['id'] + self.LABEL_ORDER]

        return dfres

    def _save(self, mDB, nameKey, modelpath, **kwargs):
        '''
        save model into model data base
        :mDB: meta data frame storing all models info
        :nameKey: unique identifier for each saved model
        :modelpath: subdir inside modelDB dir, e.g. if modelDB is /root/modelDB, then modelpath is /cnn
        :modelType: (keyword, required) stored in the 'type' column
        :modelSubType: (keyword, required) stored in the 'subType' column
        :returns: mDB with one new row appended
        '''
        print("Saving model")
        import datetime
        import json

        #save a dict (topic => ('modelname', 'weightname')) into db
        model_saved_toDB = {}
        for topic, submd in self.md.items():
            #this is the file we will save model to
            mdname = os.path.join(modelpath, nameKey + '_'+ topic +'_' + '.sav' )
            mdnameAbs = os.path.join(modelDB.MODEL_DB_ROOT, mdname )

            #this is the file we will save weights to
            weightName = os.path.join(modelpath, nameKey +  '_'+ topic + '_' + '_weights.h5' )
            weightNameAbs = os.path.join( modelDB.MODEL_DB_ROOT, weightName )

            #convert md to json and save to file
            print(" Saving model {}".format(mdname) )
            with open(mdnameAbs, "w") as json_file:
                json_file.write(submd.to_json())

            # serialize weights to HDF5
            print(" Saving weights {}".format(weightName) )
            submd.save_weights(weightNameAbs)

            model_saved_toDB[topic] = (mdname, weightName)

        #db schema
        # 'modelName', type {rnn, cnn, rf}, date, model
        print( "Info: custom saving options" )
        for i in kwargs:
            print("{} - {}".format(i , kwargs[i]))

        newRow = pd.DataFrame({
            'modelName': [nameKey],
            'type': kwargs['modelType'],
            'subType': kwargs['modelSubType'],
            'date': str(datetime.datetime.now().strftime("%Y-%m-%d")),
            'model' : json.dumps(model_saved_toDB)
        }
        )

        #add a new row (DataFrame.append is deprecated and removed in pandas 2.0)
        mDB = pd.concat([mDB, newRow], ignore_index = True)
        display(mDB)

        return mDB

    def load(self, mDB, nameKey, modelpath):
        '''
        load the models registered under nameKey into this (empty) instance
        :mDB: meta data frame storing all models info
        :nameKey: unique identifier for each saved model
        :modelpath: subdir inside modelDB dir, e.g. if modelDB is /root/modelDB, then modelpath is /cnn
        :returns: self
        :raises ValueError: if nameKey is not present in mDB
        '''
        from keras.models import model_from_json
        import json

        # loading into an instance that already holds models is not supported;
        # check before touching any file
        assert(self.md is None)

        if (mDB.empty) or mDB[ mDB['modelName'] == nameKey ].empty:
            raise ValueError("Model name does not exist")
        print("loadModel")
        #the saved model is of format: dict (topic => ('modelname', 'weightname')) into db
        matches = mDB[mDB['modelName'] == nameKey]
        assert(len(matches) == 1)
        saved_model_inDB = json.loads(matches.iloc[0]['model'])

        print("saved json string representing the model is {}".format(saved_model_inDB))
        loaded_model = {}
        for topic, mdPointer in saved_model_inDB.items():
            print(" load model for topic {}".format(topic))
            mdname = mdPointer[0]
            mdnameAbs = os.path.join(modelDB.MODEL_DB_ROOT, mdname )
            print("  model file in {}".format(mdnameAbs))

            #this is the file the weights were saved to
            weightName = mdPointer[1]
            weightNameAbs = os.path.join( modelDB.MODEL_DB_ROOT, weightName )
            print("  weight file in {}".format(weightNameAbs))

            with open(mdnameAbs, 'r') as json_file:
                tmpModel_json = json_file.read()
            tmpModel = model_from_json(tmpModel_json)
            #load weights
            tmpModel.load_weights(weightNameAbs)

            #assign the model into loaded model dict
            loaded_model[topic] = tmpModel

        self.setModel(loaded_model)
        return self



In [125]:
# #Utest test predict
# myCNN1d = TrainedModelCNNEmbedding(modeldict)
# dfres = myCNN1d.predict(trainOrig[0:1000, :], testid = train['id'])
# display(dfres.head(20))
# display(train.head(20)[['id', 'toxic','severe_toxic','obscene','threat','insult','identity_hate']])

# #Utest test save
# mdDB = pd.read_json("../modelDB/modelMetaDB.json")
# modelpath = 'cnn/' # NOTE: this is relative to the modelDB path
# print("current modelDB")
# display(mdDB)

# mdDB = myCNN1d.save(mdDB, 'CNN1d-Embedding_Random-OneperClass', modelpath, modelType = 'CNN1d', modelSubType = 'Embedding_Random')
# #mdDB.to_json("../modelDB/modelMetaDB.json")

# #Utest test load
# loadedCNN = TrainedModelCNNEmbedding().load( mdDB, 'CNN1d-Embedding_Random-OneperClass', modelpath)
# for i in loadedCNN.md.keys():  
#     print( "{} - {}".format(i, loadedCNN.md[i] ) )
# dfres_loaded = loadedCNN.predict(trainOrig[0:1000, :], testid = train['id'])
# assert(dfres_loaded.equals(dfres))

In [126]:
# forward pass to inference:
# stack the per-row padded sequences into one 2-D array, then predict with all models
testOrig = np.array(list(test['seq_pad']))
myCNN1d = TrainedModelCNNEmbedding(modeldict)
dfres = myCNN1d.predict(testOrig, testid = test['id'])

Predict using Model: 
severe_toxic - Sequential
identity_hate - Sequential
obscene - Sequential
insult - Sequential
threat - Sequential
toxic - Sequential
 predicitng topic severe_toxic
 predicitng topic identity_hate
 predicitng topic obscene
 predicitng topic insult
 predicitng topic threat
 predicitng topic toxic


In [128]:
# Persist the trained models and register them in the pickled model meta DB.
metaDBFile = "../modelDB/modelMetaDB.pkl"
modelpath = 'cnn/' # NOTE: this is relative to the modelDB path

myCNN1d = TrainedModelCNNEmbedding(modeldict)

# NOTE(review): `save` is not defined in this notebook - presumably inherited from TrainedModel; confirm.
mdDB = myCNN1d.save(
    pd.read_pickle(metaDBFile),
    'CNN1d-Embedding_Random-OneperClass_arch2',
    modelpath,
    modelType = 'CNN1d',
    modelSubType = 'Embedding_Random'
)
mdDB.to_pickle(metaDBFile)


Saving model
 Saving model cnn/CNN1d-Embedding_Random-OneperClass_arch2_severe_toxic_.sav
 Saving weights cnn/CNN1d-Embedding_Random-OneperClass_arch2_severe_toxic__weights.h5
 Saving model cnn/CNN1d-Embedding_Random-OneperClass_arch2_identity_hate_.sav
 Saving weights cnn/CNN1d-Embedding_Random-OneperClass_arch2_identity_hate__weights.h5
 Saving model cnn/CNN1d-Embedding_Random-OneperClass_arch2_obscene_.sav
 Saving weights cnn/CNN1d-Embedding_Random-OneperClass_arch2_obscene__weights.h5
 Saving model cnn/CNN1d-Embedding_Random-OneperClass_arch2_insult_.sav
 Saving weights cnn/CNN1d-Embedding_Random-OneperClass_arch2_insult__weights.h5
 Saving model cnn/CNN1d-Embedding_Random-OneperClass_arch2_threat_.sav
 Saving weights cnn/CNN1d-Embedding_Random-OneperClass_arch2_threat__weights.h5
 Saving model cnn/CNN1d-Embedding_Random-OneperClass_arch2_toxic_.sav
 Saving weights cnn/CNN1d-Embedding_Random-OneperClass_arch2_toxic__weights.h5
Info: custom saving options
modelSubType - Embedding_Ra

Unnamed: 0,date,model,modelName,subType,type,weights
0,1970-01-01 00:00:00,"{""severe_toxic"": [""cnn/CNN1d-Embedding_Random-...",CNN1d-Embedding_Random-OneperClass,Embedding_Random,CNN1d,
1,1970-01-01 00:00:00,cnn/utestModel.sav,utestModel,TfIdf,testType,
2,2018-02-13,"{""severe_toxic"": [""cnn/CNN1d-Embedding_Random-...",CNN1d-Embedding_Random-OneperClass_arch1,Embedding_Random,CNN1d,
3,2018-02-13,svc/SVC_tfidf_1.sav,SVC_tfidf_1,TfIdf,SVC,
4,2018-02-13,svc/SVC_keywordTermFreq_1.sav,SVC_keywordTermFreq_1,TfIdf,SVC,
5,2018-02-14,"{""severe_toxic"": [""cnn/CNN1d-Embedding_Random-...",CNN1d-Embedding_Random-OneperClass_arch2,Embedding_Random,CNN1d,


In [129]:
# write the prediction frame as the submission csv (without the frame index)
submissionFile = '../submission/submission_cnn1d_arch2.csv'
dfres.to_csv(submissionFile, index = False)