In [130]:
%load_ext autoreload
%autoreload 2
from __future__ import print_function
import os,sys
sys.path.append('../')

## Math and dataFrame
import numpy as np
import pandas as pd
import scipy
import seaborn as sns

import matplotlib.pyplot as plt

#ML
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold, train_test_split
from sklearn.metrics import log_loss, accuracy_score, f1_score, confusion_matrix
from keras.callbacks import ModelCheckpoint, EarlyStopping

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization, Input, LSTM
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Model
from keras.preprocessing.text import Tokenizer


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Target columns of the multi-label task, in the order used for the submission file.
label_cols = ['toxic', 'severe_toxic', 'obscene',  'threat', 'insult', 'identity_hate']
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# 'dirtyness' = row-wise sum of every column from the third one onwards
# (presumably the label columns of train.csv; test.csv is expected to have
# none, which yields 0 for every row -- verify against the CSV headers).
# Vectorized column sum instead of a Python-level apply over every row.
train['dirtyness'] = train.iloc[:, 2:].sum(axis=1)
test['dirtyness'] = test.iloc[:, 2:].sum(axis=1)

COMMENT = 'comment_text'
# Assign the result back: `df[col].fillna(..., inplace=True)` works on an
# intermediate object and is not guaranteed to update the frame (it is a
# no-op under pandas Copy-on-Write).
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

print("train set len ", len(train) )
print("test set len ", len(test) )
print("clean samples", len(train[train['dirtyness'] == 0]))
print("toxic samples", len(train[train['dirtyness'] != 0]))

train set len  159571
test set len  153164
clean samples 143346
toxic samples 16225


In [460]:
# config: which architecture to build and which feature sets feed it.
modelType = 'LSTM'
# First entry: embedding strategy; optional second entry: auxiliary input.
modelSubTypeList = [
    'MultiClass_Embedding_Random',
    'tfIdf',
]

def secondInput(modelSubTypeList):
    """Return the name of the auxiliary (second) input strategy, if any.

    :modelSubTypeList: list of sub-type names; element 0 is the embedding
        strategy and element 1, when present, names the second input.
    :returns: the second element, or None when the list is None or holds
        fewer than two entries (an empty list no longer raises IndexError).
    """
    if modelSubTypeList is None or len(modelSubTypeList) < 2:
        return None
    return modelSubTypeList[1]
    
# Run identity: the model name encodes type, sub-types, resampling mode and run number.
modelSubType = '+'.join(modelSubTypeList)
reSample = False
runNumber = 1
modelName = (
    modelType
    + '_' + modelSubType
    + '_' + str(reSample)
    + '_arch' + str(runNumber)
)

# training hyper-parameters
batch_size = 64
epochs = 8

# tokenizer: vocabulary size and padded sequence length
max_features = 20000
maxlen = 120

# architecture sizes read by buildModel
embedding_dims = 10
filters = 12
hidden_dims = 2
kernel_size = 10

In [461]:
#preprocessing
# Lower-case each corpus once and reuse it for fitting and for encoding.
train_text = train.comment_text.str.lower()
test_text = test.comment_text.str.lower()

# The tokenizer vocabulary is fitted on train and test text together.
raw_text = np.hstack([train_text, test_text])
tok_raw = Tokenizer(num_words=max_features)
tok_raw.fit_on_texts(raw_text)

train["seq"] = tok_raw.texts_to_sequences(train_text)
test["seq"] = tok_raw.texts_to_sequences(test_text)

# Pad/truncate every sequence to `maxlen` in one batched call per frame
# (instead of one pad_sequences call per row); each cell of 'seq_pad' still
# holds a 1-D integer array of length `maxlen`.
train["seq_pad"] = list(sequence.pad_sequences(train["seq"].tolist(), maxlen=maxlen))
test["seq_pad"] = list(sequence.pad_sequences(test["seq"].tolist(), maxlen=maxlen))


In [462]:
# FE factory
def FEEmbedding(EmbeddingStrategy):
    '''Return the embedding matrix for the given strategy.

    Only 'MultiClass_Embedding_Random' is defined; it has no pre-built
    matrix, so None is returned. Any other value raises ValueError.
    '''
    if EmbeddingStrategy != 'MultiClass_Embedding_Random':
        raise ValueError("Undefined embedding strategy")
    return None
        
def secondFE(secondFEStrategy):
    '''Build the auxiliary (second) model input for train and test.

    The term-document matrix is computed over train and test concatenated
    (module-level frames), converted to CSR and split back by row count.

    :secondFEStrategy: 'KeyWordTermFreq' or 'tfIdf'; any other value
        (including None) produces no auxiliary input.
    :returns: (train matrix, test matrix, number of columns), or
        (None, None, None) when the strategy is not recognised.
    '''
    from models.FeatureExtraction import FeatureExtraction
    fe = FeatureExtraction()

    if secondFEStrategy == 'KeyWordTermFreq':
        keyword_dir = '../../ZhiHaoSun/'
        # NOTE(review): 'identity_hate_words.txt' is listed twice -- possibly
        # another label's word list was intended; confirm with the data owner.
        keyfiles = [keyword_dir + fname for fname in (
            'toxic_words.txt',
            'identity_hate_words.txt',
            'insult_words.txt',
            'obscene_words.txt',
            'threat_words.txt',
            'identity_hate_words.txt',
        )]
        term_doc = fe.tfKeyWordEnsemble(
            pd.concat([train, test]),
            n_feature=80000,
            vocabfile=keyfiles,
            COMMENT='comment_text',
        )
    elif secondFEStrategy == "tfIdf":
        term_doc = fe.tfIdf(pd.concat([train, test]), 'comment_text')
    else:
        return None, None, None

    # Rows [0, len(train)) belong to train, the remainder to test.
    n_train = len(train)
    csr_doc = term_doc.tocsr()
    trn_2nd_input = csr_doc[0:n_train, :]
    test_2nd_input = csr_doc[n_train::, :]
    return trn_2nd_input, test_2nd_input, trn_2nd_input.shape[1]


# Embedding matrix for the primary input (None for the random-init strategy).
embMatrix = FEEmbedding(modelSubTypeList[0])
print(">> EmbMatrix of type {}".format(type(embMatrix)))

# Go through secondInput() instead of indexing modelSubTypeList[1] directly:
# with a single configured sub-type the direct index raises IndexError, while
# secondFE(None) simply yields (None, None, None).
trn_2nd_input, test_2nd_input, aux_input_dim = secondFE(secondInput(modelSubTypeList))
print(">> 2ndInput of type {}".format(type(trn_2nd_input)))




>> EmbMatrix of type <type 'NoneType'>
>> 2ndInput of type <class 'scipy.sparse.csr.csr_matrix'>


In [466]:
#model factory
def buildModel(modelType):
    """Build and compile the Keras model selected by `modelType`.

    Sizes come from the notebook-level settings `max_features`,
    `embedding_dims`, `maxlen`, `filters`, `kernel_size` and `hidden_dims`;
    the two-input LSTM additionally reads `aux_input_dim`.

    :modelType: 'CNN1d' for the 1-D convolutional model, or 'LSTM' for the
        bidirectional LSTM. For 'LSTM', a second (auxiliary) input branch is
        added when secondInput(modelSubTypeList) is not None.
    :returns: the compiled model (binary cross-entropy, adam, accuracy).
    :raises ValueError: if `modelType` is not one of the above.
    """
    if modelType == 'CNN1d':

        model = Sequential()

        # we start off with an efficient embedding layer which maps
        # our vocab indices into embedding_dims dimensions
        model.add(Embedding(max_features,
                            embedding_dims,
                            input_length=maxlen,
                            #embeddings_regularizer = keras.regularizers.l1(0.01)
                            ))
        model.add(Dropout(0.3))

        # we add a Convolution1D, which will learn `filters`
        # word group filters of size `kernel_size`:
        model.add(Conv1D(filters,
                         kernel_size,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(BatchNormalization())
        # we use max pooling:
        model.add(GlobalMaxPooling1D())

        # We add a vanilla hidden layer:
        model.add(Dense(hidden_dims))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))
        model.add(Activation('relu'))

        # We project onto a 6-unit output layer and squash each unit with a
        # sigmoid (independent per-output probabilities):
        model.add(Dense(6))
        model.add(Activation('sigmoid'))

        # define metrics and compile model
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

    elif (modelType == 'LSTM') and (secondInput(modelSubTypeList) is None) :
        # Single-input variant: token sequence only.
        inp = Input(shape=(maxlen,))
        x = Embedding(max_features, embedding_dims)(inp)
        x = Bidirectional(LSTM(hidden_dims, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        x = GlobalMaxPool1D()(x)
        x = BatchNormalization()(x)

        #x = Dense(hidden_dims, activation="relu")(x)
        x = Dropout(0.5)(x)
        x = Dense(6, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    elif (modelType == 'LSTM') and (secondInput(modelSubTypeList) is not None):
        # Imported locally: the notebook only runs `import keras` in a later
        # cell, so referring to `keras.layers.concatenate` here raised
        # NameError when the cells were executed top to bottom.
        from keras.layers import concatenate

        # Branch 1: token sequence -> embedding -> BiLSTM -> max pool.
        inp = Input(shape=(maxlen,))
        x = Embedding(max_features, embedding_dims)(inp)
        x = Bidirectional(LSTM(hidden_dims, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(x)
        x = GlobalMaxPool1D()(x)
        x = BatchNormalization()(x)

        # Branch 2: auxiliary feature vector of width aux_input_dim,
        # compressed to 4 units before being merged.
        auxiliary_input = Input(shape=(aux_input_dim,), name='aux_input')
        y = Dense(4)(auxiliary_input)
        y = BatchNormalization()(y)
        y = Activation('relu')(y)
        y = Dropout(0.5)(y)

        x = concatenate([x, y])

        x = Dense(hidden_dims, activation="relu")(x)
        x = Dropout(0.5)(x)

        x = Dense(6, activation="sigmoid")(x)
        model = Model(inputs=[inp, auxiliary_input], outputs=x)
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    else:
        raise ValueError("undefined model type")

    return model

# Build the architecture selected by `modelType` and print its layer summary.
model = buildModel(modelType)
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_39 (InputLayer)           (None, 120)          0                                            
__________________________________________________________________________________________________
aux_input (InputLayer)          (None, 689099)       0                                            
__________________________________________________________________________________________________
embedding_48 (Embedding)        (None, 120, 10)      200000      input_39[0][0]                   
__________________________________________________________________________________________________
dense_92 (Dense)                (None, 4)            2756400     aux_input[0][0]                  
__________________________________________________________________________________________________
bidirectio

In [470]:
# reSample factory
def reSampleToBalance(reSample, train, trn_2nd_input):
    """Prepare the training inputs according to the resampling strategy.

    :reSample: True  -> resample via FeatureExtraction.reSample (only valid
                        when no second input is configured);
               False -> use the data unchanged;
               "covCorrection" -> data unchanged, plus per-sample weights
                        from FeatureExtraction.covarianceShiftCorrection,
                        computed on tf-idf features of train vs. the
                        module-level `test` frame.
    :train: frame with a 'seq_pad' column (arrays of length `maxlen`) and
        the `label_cols` columns.
    :trn_2nd_input: auxiliary training input, or None.
    :returns: (sequence input as CSR matrix, label array,
        auxiliary input or None, sample weights or None).
    :raises ValueError: for any other `reSample` value.
    """
    trainOrig = np.array(train['seq_pad'].tolist())
    assert(trainOrig.shape == (len(train), maxlen))
    trn_weights = None

    if reSample == True:
        print("reSample to balance")
        assert((secondInput(modelSubTypeList) is None))
        # `fe` was never defined in this scope (it is a local of secondFE),
        # so build the feature-extraction helper here.
        from models.FeatureExtraction import FeatureExtraction
        fe = FeatureExtraction()
        trn_re, label_re = fe.reSample( scipy.sparse.csr_matrix(trainOrig) , y = train[label_cols].values)
        # No second input exists in this mode (asserted above); previously
        # this name was left unassigned and the return statement failed.
        train2nd_re = None

    elif reSample == False:
        print("Not reSample to balance")
        trn_re, label_re = scipy.sparse.csr_matrix(trainOrig), train[label_cols].values
        train2nd_re = trn_2nd_input

    elif reSample == "covCorrection":
        print("covCorrection")
        trn_re, label_re = scipy.sparse.csr_matrix(trainOrig), train[label_cols].values
        train2nd_re = trn_2nd_input

        from models.FeatureExtraction import FeatureExtraction
        fe = FeatureExtraction()
        term_doc = fe.tfIdf(pd.concat([train, test]), 'comment_text').tocsr()
        trn_term_doc = term_doc[0:len(train), :]
        test_term_doc = term_doc[len(train)::, :]

        trn_weights = fe.covarianceShiftCorrection(trn_term_doc, test_term_doc)

    else:
        raise ValueError("Undefined reSample strategy")

    return trn_re, label_re, train2nd_re, trn_weights


# Resample (or not) according to the `reSample` setting.
train_re, label_re, train2nd_re, trn_weights = reSampleToBalance(reSample, train, trn_2nd_input)

# Labels are used as returned; the model input is either the sequence matrix
# alone or a [sequence, auxiliary] pair for the two-input architecture.
y_train = label_re
if secondInput(modelSubTypeList) is None:
    x_train = train_re
else:
    assert(train2nd_re is not None)
    x_train = [train_re, train2nd_re]

Not reSample to balance


In [None]:
# add check point and fit
import keras
# Checkpoint file is named after the model's Python class; only the weights
# with the best validation loss so far are kept.
filepath= type(model).__name__ + "_weights_best.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
# Early stopping is constructed but currently not registered in callbacks_list.
esCallback = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=0, mode='auto')
callbacks_list = [checkpoint]#, esCallback]

# Train for the configured number of epochs: the hard-coded `epochs=1`
# silently ignored the `epochs` hyper-parameter from the config cell.
hist = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          #validation_data=(x_val, y_val), 
          validation_split=0.2,
          shuffle = True,
          sample_weight = trn_weights,
          callbacks=callbacks_list
             )

In [None]:
#print(len(y_val))
#print(len(y_val[ [not np.array_equal(i, np.array([0,0,0,0,0,0])) for i in y_val], : ]) )

In [None]:
history = hist
# Older Keras releases log accuracy as 'acc'/'val_acc', newer ones as
# 'accuracy'/'val_accuracy'; use whichever key is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

fig = plt.figure(1)
# summarize history for accuracy (upper panel of a 2x1 grid)
ax = fig.add_subplot(211)
ax.plot(history.history[acc_key])
ax.plot(history.history['val_' + acc_key])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'val'], loc='upper left');

In [None]:
# implement Trained Model
from models.TrainedModel import TrainedModel
import modelDB

class TrainedModelCNNEmbeddingMultiClass(TrainedModel):
    '''Wrapper around a multi-label Keras model: predict into the submission
    layout, and save/load the architecture (JSON) and weights (HDF5) through
    the model meta data frame.
    '''

    # Output columns, in the order required by the submission file.
    LABEL_COLUMNS = ['toxic','severe_toxic','obscene','threat','insult','identity_hate']
    # Key under which the (model file, weight file) pair is stored in the DB row.
    MODEL_KEY = 'MultiClassModel'

    def __init__(self, md = None):
        super(TrainedModelCNNEmbeddingMultiClass, self).__init__(md)

    def predict(self, test, **kwargs):
        '''
        Run the wrapped model and return the predictions as a data frame.
        :test: model input, passed unchanged to the wrapped model's predict()
        :kwargs: must contain 'testid', the ids to attach to the predictions
                 (KeyError if missing)
        :returns: data frame with columns 'id' followed by the six label columns
        '''
        print ("Predict using Model: ")
        print( type(self.md).__name__ )

        res = self.md.predict(test, batch_size = 1024)
        #important to keep the order as required by the submission file
        testid = kwargs['testid']
        dfres = pd.DataFrame(res, columns = list(self.LABEL_COLUMNS))
        dfres['id'] = testid

        #reshape to submission file format
        dfres = dfres[['id'] + list(self.LABEL_COLUMNS)]

        return dfres

    def _save(self, mDB, nameKey, modelpath, **kwargs):
        '''
        save model into model data base
        :mDB: meta data frame storing all models info
        :nameKey: unique identifier for each saved model
        :modelpath: subdir inside modelDB dir, e.g. if modelDB is /root/modelDB, then modelpath is /cnn
        :kwargs: must contain 'modelType' and 'modelSubType' (KeyError if missing)
        :returns: a new meta data frame with one row appended for this model
        '''
        print("Saving model")
        import datetime
        import json

        #save a dict (topic => ('modelname', 'weightname')) into db
        model_saved_toDB = {}

        submd = self.md
        #this is the file we will save model to
        mdname = os.path.join(modelpath, nameKey +'_' + '.sav' )
        mdnameAbs = os.path.join(modelDB.MODEL_DB_ROOT, mdname )

        #this is the file we will save weights to
        weightName = os.path.join(modelpath, nameKey + '_' + '_weights.h5' )
        weightNameAbs = os.path.join( modelDB.MODEL_DB_ROOT, weightName )

        #convert md to json and save to file
        print(" Saving model {}".format(mdname) )
        model_json = submd.to_json()
        with open(mdnameAbs, "w") as json_file:
            json_file.write(model_json)

        # serialize weights to HDF5
        print(" Saving weights {}".format(weightName) )
        submd.save_weights(weightNameAbs)

        # paths are stored relative to the modelDB root
        model_saved_toDB[self.MODEL_KEY] = (mdname, weightName)

        #db schema
        # 'modelName', type {rnn, cnn, rf}, date, model
        print( "Info: custom saving options" )
        for i in kwargs:
            print("{} - {}".format(i , kwargs[i]))

        newRow = pd.DataFrame({
            'modelName': [nameKey],
            'type': kwargs['modelType'],
            'subType': kwargs['modelSubType'],
            'date': str(datetime.datetime.now().strftime("%Y-%m-%d")),
            'model' : json.dumps(model_saved_toDB)
        }
        )

        #add a new row; pd.concat replaces DataFrame.append, which newer
        #pandas releases no longer provide
        mDB = pd.concat([mDB, newRow], ignore_index = True)
        display(mDB)

        return mDB

    def load(self, mDB, nameKey, modelpath):
        '''
        load a previously saved model into this (empty) wrapper
        :mDB: meta data frame storing all models info
        :nameKey: unique identifier for each saved model
        :modelpath: subdir inside modelDB dir, e.g. if modelDB is /root/modelDB, then modelpath is /cnn
                    (not read here: the paths stored in the DB row are used)
        :returns: self, with the loaded model set
        :raises ValueError: if no row with this nameKey exists
        '''
        from keras.models import model_from_json
        import json

        # Check emptiness first so an empty frame without a 'modelName'
        # column is reported as "not found" rather than as a KeyError.
        if mDB.empty:
            raise ValueError("Model name does not exist")
        matches = mDB[ mDB['modelName'] == nameKey ]
        if matches.empty:
            raise ValueError("Model name does not exist")
        print("loadModel")
        #the saved model is of format: dict (topic => ('modelname', 'weightname')) into db
        assert(len(matches) == 1)
        saved_model_inDB = json.loads(matches.iloc[0]['model'])

        print("saved json string representing the model is {}".format(saved_model_inDB))

        mdPointer = saved_model_inDB[self.MODEL_KEY]
        print(" load model")
        mdname = mdPointer[0]
        mdnameAbs = os.path.join(modelDB.MODEL_DB_ROOT, mdname )
        print("  model file in {}".format(mdnameAbs))

        #this is the file the weights were saved to
        weightName = mdPointer[1]
        weightNameAbs = os.path.join( modelDB.MODEL_DB_ROOT, weightName )
        print("  weight file in {}".format(weightNameAbs))

        with open(mdnameAbs, 'r') as json_file:
            tmpModel_json = json_file.read()
        loaded_model = model_from_json(tmpModel_json)
        #load weights
        loaded_model.load_weights(weightNameAbs)

        # loading is only allowed into a wrapper that holds no model yet
        assert(self.md is None)
        self.setModel(loaded_model)
        return self



In [None]:
#Utest test predict
# Build a 1000-row smoke-test input with the same layout the model was
# trained on. `trainOrig`/`train2ndOrig` only exist inside
# reSampleToBalance, so the input is rebuilt here from the notebook-level
# frames; the two-input layout is chosen with secondInput(), as in the
# training and inference cells.
utestSeq = np.array(train['seq_pad'].iloc[0:1000].tolist())
if secondInput(modelSubTypeList) is not None:
    utestInput = [utestSeq, trn_2nd_input[0:1000, :]]
else:
    utestInput = utestSeq


myCNN1d = TrainedModelCNNEmbeddingMultiClass(model)
dfres = myCNN1d.predict(utestInput, testid = train['id'])
display(dfres.head(20))
display(train.head(20)[['id', 'toxic','severe_toxic','obscene','threat','insult','identity_hate']])

#Utest test save
mdDB = pd.read_pickle("../modelDB/modelMetaDB.pkl")
modelpath = 'cnn/' # NOTE: this is relative to the modelDB path
print("current modelDB")
display(mdDB)

mdDB = myCNN1d.save(mdDB, 'utestModel', modelpath, modelType = 'CNN1d', modelSubType = 'Embedding_Random')
#mdDB.to_json("../modelDB/modelMetaDB.json")

#Utest test load: a reloaded model must reproduce the predictions exactly
loadedCNN = TrainedModelCNNEmbeddingMultiClass().load( mdDB, 'utestModel', modelpath)
dfres_loaded = loadedCNN.predict(utestInput, testid = train['id'])
assert(dfres_loaded.equals(dfres))

In [None]:
#forward pass to inference
# Padded test sequences as one 2-D array; paired with the auxiliary test
# input when the model was built with a second input.
test_seq = np.array(test['seq_pad'].tolist())
testOrig = test_seq if secondInput(modelSubTypeList) is None else [test_seq, test_2nd_input]

myCNN1d = TrainedModelCNNEmbeddingMultiClass(model)

dfres = myCNN1d.predict(testOrig, testid = test['id'])

In [None]:
# Register the trained model in the model meta DB and persist the updated DB.
myCNN1d = TrainedModelCNNEmbeddingMultiClass(model)

modelpath = 'cnn/' # NOTE: this is relative to the modelDB path
mdDB = pd.read_pickle("../modelDB/modelMetaDB.pkl")
mdDB = myCNN1d.save(
    mdDB,
    modelName,
    modelpath,
    modelType = modelType,
    modelSubType = modelSubType,
)
mdDB.to_pickle("../modelDB/modelMetaDB.pkl")


In [None]:
#save
# Submission file is named after the model; the index is not written.
submission_path = '../submission/' + modelName + '.csv'
dfres.to_csv(submission_path, index = False)