#  Bankruptcy Prediction using Keras

2016-12-20, Feng Mai  
Uses a Keras `Merge` layer to combine textual and numerical features for bankruptcy prediction.

In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, Convolution2D, Convolution1D, Reshape, Lambda, AveragePooling1D, AveragePooling2D, MaxPooling1D
from keras.layers import LSTM, SimpleRNN, GRU
from keras.regularizers import l1, l2, activity_l2, l1, activity_l1
from keras.preprocessing.text import Tokenizer
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import Merge

import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier


Using TensorFlow backend.


## Data Preparation

Parameters

In [95]:
# --- Text input settings ---
# Vocabulary size: maximum number of words to include (lower-frequency words are removed).
max_features = 5000
# Texts are cut after this number of words (passed as `maxlen` when padding).
maxlen = 5000

# --- Evaluation settings ---
# Rows with fyear < this value go to the training set, all others to the test set.
test_train_split_year = 2011
# Forecast horizon: number of years between the features and the label they predict.
forecast_year = 1

In [94]:
def n_year_before(df, n = 1):
    """Pair every label ``Y`` with the features recorded ``n`` years earlier.

    The feature rows get their ``fyear`` moved forward by ``n`` and are then
    inner-joined with the original ``(fyear, gvkey, Y)`` triples, so a row
    survives only if the same ``gvkey`` also has a label ``n`` years later.

    Args:
        df: DataFrame containing at least the columns ``fyear``, ``gvkey``
            and ``Y``. It is not modified.
        n: number of years between the features and the label.

    Returns:
        DataFrame with the feature columns of ``df`` (``fyear`` shifted by
        ``n``) followed by the matched ``Y`` column.
    """
    labels = df[['fyear', 'gvkey', 'Y']]
    # Drop the contemporaneous label and relabel the features with the year
    # of the label they are supposed to predict.
    shifted_features = (
        df.drop('Y', axis=1)
          .assign(fyear=lambda frame: frame['fyear'] + n)
    )
    return pd.merge(shifted_features, labels, how='inner', on=['fyear', 'gvkey'])


def pad_text_data():
    """Read the tokenized word sequences from disk and pad them for deep learning.

    Loads ``data/10k/X_keras_unigram.npy`` and runs it through Keras'
    ``sequence.pad_sequences`` using the module-level ``maxlen``.

    Returns:
        The padded sequences as returned by ``pad_sequences``.
    """
    print('Loading data')
    raw_sequences = np.load("data/10k/X_keras_unigram.npy")

    # Bring every text to the same length (maxlen) before feeding the network.
    print('Pad sequences')
    return sequence.pad_sequences(raw_sequences, maxlen=maxlen)
    
    
def load_data(X_padded_text):
    """Align text features, numerical features and labels, then split by year.

    Reads the text index (``data/10k/10k_index.csv``) and the numerical
    variables (``data/final_variables.csv``), inner-joins them on
    ``gvkey``/``fyear`` and uses ``n_year_before`` so that every label ``Y``
    is paired with the features observed ``forecast_year`` years earlier.

    Args:
        X_padded_text: array of padded word sequences. It is indexed with the
            row positions of ``10k_index.csv``, so it is expected to be
            row-aligned with that file.

    Returns:
        Tuple ``(X_text, X_num, train_index, test_index, y)`` where
        ``X_text`` holds the rows of ``X_padded_text`` for the matched
        observations, ``X_num`` is the array of all columns left after
        dropping the identifier/label columns, ``train_index`` and
        ``test_index`` are lists of row positions (label year before / from
        ``test_train_split_year``), and ``y`` is the array of labels.
    """
    index_10k = pd.read_csv('data/10k/10k_index.csv', usecols=['gvkey', 'fyear'])
    # Remember each filing's row position so the matching text row can be
    # looked up in X_padded_text after the merges below.
    index_10k['index_10k'] = index_10k.index
    final_variable = pd.read_csv('data/final_variables.csv')
    # axis is passed by keyword: the positional form was removed in pandas 2.0.
    final_variable = final_variable.drop('Unnamed: 0', axis=1)
    final_variable = final_variable.replace([np.inf, -np.inf], 0)

    # Same-year match of text rows with numerical variables and labels
    # (no forecasting horizon applied yet).
    index_10k_y = pd.merge(left=index_10k, right=final_variable, how='inner', on=['gvkey', 'fyear'])
    print("Total number of observations with no forecasting: ")
    print(index_10k_y.shape)

    # Re-pair each label with the features from forecast_year years earlier;
    # after this step 'fyear' is the year of the label.
    index_10k_y_n_year = n_year_before(index_10k_y, n = forecast_year)
    print("Total number of observations: ")
    print(index_10k_y_n_year.shape)

    y = np.array(index_10k_y_n_year['Y'])
    # Pick the text rows that belong to the matched observations.
    X_text = X_padded_text[index_10k_y_n_year['index_10k'].tolist()]
    X_num = index_10k_y_n_year.drop(
        ['gvkey', 'fyear', 'datadate', 'cusip', 'PERMCO', 'Y', 'PERMNO', 'index_10k'], axis=1)
    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
    X_num = X_num.values
    # split train-test by year; the index is reset so the returned indices are
    # row positions usable on X_text, X_num and y.
    index_10k_y_n_year = index_10k_y_n_year.reset_index(drop=True)
    train_index = index_10k_y_n_year[index_10k_y_n_year['fyear'] < test_train_split_year].index.tolist()
    test_index = index_10k_y_n_year[index_10k_y_n_year['fyear'] >= test_train_split_year].index.tolist()
    print(X_text.shape, X_num.shape, y.shape)
    return X_text, X_num, train_index, test_index, y

In [96]:
X_padded = pad_text_data()
# Cache the padded array on disk; the context manager guarantees the file is
# flushed and closed before a later cell reads it back.
with open("data/10k/X_padded.pickle", 'wb') as padded_file:
    pickle.dump(X_padded, padded_file)

Loading data
Pad sequences


In [97]:
# Reload the cached padded sequences. NOTE: unpickling can execute arbitrary
# code, so only load a cache file that this notebook produced itself.
with open("data/10k/X_padded.pickle", 'rb') as padded_file:
    X_padded = pickle.load(padded_file)

In [98]:
# Build the aligned text / numerical inputs, the year-based split and the labels.
(X_text, X_num,
 train_index, test_index,
 y) = load_data(X_padded_text=X_padded)

Total number of observations with no forecasting: 
(79222, 44)
Total number of observations: 
(64999, 44)
(64999, 5000) (64999, 36) (64999,)


## Train and Evaluate a Deep Learning Model

Define different deep learning models

In [136]:
def create_model_text(no_merge = False):
    """Build the Deep Averaging Network (DAN) model for the text input.

    See "Deep Unordered Composition Rivals Syntactic Methods for Text
    Classification", Iyyer et al. 2015. The stack is: embedding -> dropout ->
    average pooling over the whole sequence -> flatten -> Dense(4) ->
    dropout -> relu.

    Reads the module-level ``embedding_dims``, ``max_features`` and ``maxlen``.

    Args:
        no_merge: when True, a single sigmoid output unit is appended and the
            model is compiled so it can be trained on its own; when False the
            uncompiled stack is returned without an output unit.

    Returns:
        The Keras ``Sequential`` model.
    """
    embed_dim = embedding_dims
    print('Build model...')
    net = Sequential()
    net.add(Embedding(max_features, embed_dim, input_length=maxlen))
    print(net.layers[-1].output_shape)

    # Use dropout if implementing the DAN model
    net.add(Dropout(0.1))
    # Pool across the full sequence dimension so each text collapses to a
    # single averaged embedding.
    net.add(AveragePooling1D(pool_length=net.output_shape[1]))
    print(net.layers[-1].output_shape)
    net.add(Flatten())

    # A vanilla hidden layer
    net.add(Dense(4))
    net.add(Dropout(0.1))
    net.add(Activation('relu'))

    if not no_merge:
        return net

    # Stand-alone use: add the classifier head and compile.
    net.add(Dense(1, activation='sigmoid'))
    print(net.layers[-1].output_shape)
    net.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return net


# def create_model_text(no_merge = False):
#     # this is the fast-text model
#     model = Sequential()
#     # we start off with an efficient embedding layer which maps
#     # our vocab indices into embedding_dims dimensions
#     model.add(Embedding(max_features,
#                         embedding_dims,
#                         input_length=maxlen))
#     # we add a GlobalAveragePooling1D, which will average the embeddings
#     # of all words in the document
#     model.add(GlobalAveragePooling1D())
#     if no_merge:
#         # We project onto a single unit output layer, and squash it with a sigmoid:
#         model.add(Dense(1, activation='sigmoid'))
#         model.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
#     return model


# def create_model_text(no_merge = False):
#     # this is a CNN from Keras example
#     nb_filter = 50
#     filter_length = 3
#     hidden_dims = 8
#     model = Sequential()

#     # we start off with an efficient embedding layer which maps
#     # our vocab indices into embedding_dims dimensions
#     model.add(Embedding(max_features,
#                         embedding_dims,
#                         input_length=maxlen,
#                         dropout=0.0))
#     # we add a Convolution1D, which will learn nb_filter
#     # word group filters of size filter_length:
#     model.add(Convolution1D(nb_filter=nb_filter,
#                             filter_length=filter_length,
#                             border_mode='valid',
#                             activation='relu',
#                             subsample_length=1))
#     # we use max pooling:
#     model.add(GlobalMaxPooling1D())
#     # We add a vanilla hidden layer:
#     model.add(Dense(hidden_dims))
#     model.add(Dropout(0.0))
#     model.add(Activation('relu'))
#     model.add(Dense(hidden_dims))
#     model.add(Dropout(0.0))
#     model.add(Activation('relu'))

#     if no_merge:
#         model.add(Dense(1))
#         model.add(Activation('sigmoid'))
#         model.compile(loss='binary_crossentropy',
#                       optimizer='adam',
#                       metrics=['accuracy'])
#     return model


# def create_model_text(no_merge = False):
#     # CNN/LSTM https://github.com/fchollet/keras/blob/master/examples/imdb_cnn_lstm.py
#     # Embedding
#     embedding_size = 128

#     # Convolution
#     filter_length = 5
#     nb_filter = 64
#     pool_length = 4

#     # LSTM
#     lstm_output_size = 70

#     '''
#     Note:
#     batch_size is highly sensitive.
#     Only 2 epochs are needed as the dataset is very small.
#     '''
    
#     model = Sequential()
#     model.add(Embedding(max_features, embedding_size, input_length=maxlen))
#     model.add(Dropout(0.25))
#     model.add(Convolution1D(nb_filter=nb_filter,
#                             filter_length=filter_length,
#                             border_mode='valid',
#                             activation='relu',
#                             subsample_length=1))
#     model.add(MaxPooling1D(pool_length=pool_length))
#     model.add(LSTM(lstm_output_size))
    
    
#     if no_merge:
#         model.add(Dense(1))
#         model.add(Activation('sigmoid'))

#         model.compile(loss='binary_crossentropy',
#                       optimizer='adam',
#                       metrics=['accuracy'])
#     return model

        
        
def create_model_num(no_merge = False):
    """Build the small dense model for the numerical features.

    The first layer is a 4-unit relu ``Dense`` layer declared with
    ``input_dim=36`` and an L2 activity regularizer.

    Args:
        no_merge: when True, a zero-initialised sigmoid output unit is
            appended and the model is compiled so it can be trained on its
            own; when False only the uncompiled hidden layer is returned.

    Returns:
        The Keras ``Sequential`` model.
    """
    net = Sequential()
    net.add(Dense(4, input_dim=36, activation='relu', activity_regularizer=activity_l2(0.0001)))
    if not no_merge:
        return net

    # NOTE(review): input_dim=36 on this second layer looks redundant (its
    # input is the 4-unit layer above) -- kept as in the original; confirm.
    net.add(Dense(1, input_dim=36, activation='sigmoid', init='zero',
                  activity_regularizer=activity_l2(0.0001)))
    net.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return net

def create_model_merge_layer():
    """Build the combined model that concatenates the text and numerical models.

    Both sub-models are created without their own output unit, joined with a
    ``Merge`` layer in ``concat`` mode, and followed by a single sigmoid unit.
    The compiled model expects its inputs as a two-element list in the order
    ``[text, numerical]``.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    text_branch = create_model_text()
    num_branch = create_model_num()
    joined = Merge([text_branch, num_branch], mode='concat')

    combined = Sequential()
    combined.add(joined)
    print(combined.layers[-1].output_shape)
    combined.add(Dense(1, activation='sigmoid'))
    combined.compile(loss='binary_crossentropy',
                     optimizer='adam',
                     metrics=['accuracy'])
    return combined

In [139]:
# Size of each word-embedding vector used by the text model.
embedding_dims = 100
# Default mini-batch size and number of training epochs; these are captured
# as default arguments of train_and_evaluate_model when it is defined.
batch_size = 32
nb_epoch = 3

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, batch_size = batch_size, nb_epoch = nb_epoch, verbose = 1):
    """Fit ``model`` on the training data and report the ROC AUC on the test data.

    The test data is also passed to ``fit`` as ``validation_data``.

    Args:
        model: compiled Keras model.
        X_train, y_train: training inputs and labels.
        X_test, y_test: test inputs and labels.
        batch_size: mini-batch size forwarded to ``fit``.
        nb_epoch: number of epochs forwarded to ``fit``.
        verbose: verbosity flag forwarded to ``fit``.

    Returns:
        The ROC AUC of the model's predictions on ``X_test`` (also printed).
    """
    model.fit(
        X_train, y_train,
        batch_size=batch_size,
        nb_epoch=nb_epoch,
        validation_data=(X_test, y_test),
        verbose=verbose,
    )
    test_scores = model.predict(X_test)
    auc = metrics.roc_auc_score(y_test, test_scores)
    print(auc)
    return auc

def forecast_performace(X_text, X_num, train_index, test_index, y, model_type = "text"):
    """Train one model on the year-based split and print its ROC AUC.

    Args:
        X_text: text inputs (rows selectable by ``train_index``/``test_index``).
        X_num: numerical inputs, row-aligned with ``X_text``.
        train_index: row positions of the training observations.
        test_index: row positions of the test observations.
        y: labels, row-aligned with the inputs.
        model_type: ``"text"`` (text model only), ``"num"`` (numerical model
            only, trained silently) or ``"merge"`` (combined model fed with
            ``[text, numerical]``). Any other value trains nothing.
    """
    print("Running Model")

    if model_type == "text":
        net = create_model_text(no_merge=True)
        train_and_evaluate_model(net,
                                 X_text[train_index], y[train_index],
                                 X_text[test_index], y[test_index])
    elif model_type == "num":
        net = create_model_num(no_merge=True)
        train_and_evaluate_model(net,
                                 X_num[train_index], y[train_index],
                                 X_num[test_index], y[test_index],
                                 verbose = 0)
    elif model_type == "merge":
        net = create_model_merge_layer()
        # The merged model takes one array per branch: text first, numbers second.
        train_inputs = [X_text[train_index], X_num[train_index]]
        train_labels = y[train_index]
        test_inputs = [X_text[test_index], X_num[test_index]]
        test_labels = y[test_index]
        train_and_evaluate_model(net, train_inputs, train_labels, test_inputs, test_labels)

    

def kfold_performace(X_text, X_num, y, n_folds = 2, random_state = None):
    """Calculate and print the average stratified K-fold ROC AUC of the merged model.

    A fresh merged (text + numerical) model is built and trained for every
    fold, then scored on the held-out part of that fold.

    Args:
        X_text: text inputs.
        X_num: numerical inputs, row-aligned with ``X_text``.
        y: array of labels, row-aligned with the inputs; used for stratification.
        n_folds: number of folds.
        random_state: optional seed forwarded to ``StratifiedKFold`` so the
            shuffled folds can be reproduced.

    Returns:
        The mean ROC AUC over the folds (also printed).
    """
    # sklearn.model_selection API: the splitter is configured with n_splits and
    # yields index pairs from .split(X, y) rather than taking y in the constructor.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    performance_scores = []
    for i, (train_index, test_index) in enumerate(skf.split(X_num, y)):
        print("Running Fold", i+1, "/", n_folds)
        model = create_model_merge_layer()  # new, untrained network for each fold
        # The merged model takes one array per branch: text first, numbers second.
        X_train = [X_text[train_index], X_num[train_index]]
        X_test = [X_text[test_index], X_num[test_index]]
        performance_scores.append(
            train_and_evaluate_model(model, X_train, y[train_index], X_test, y[test_index]))
    mean_score = sum(performance_scores)/n_folds
    print(mean_score)
    return mean_score

#kfold_performace(X_text, X_num, y, n_folds = 2)

In [140]:
# Evaluate the text-only model on the year-based train/test split.
forecast_performace(X_text, X_num, train_index, test_index, y, model_type="text")

Running Model
Build model...
(None, 5000, 100)
(None, 1, 100)
(None, 1)
Train on 50286 samples, validate on 14713 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
0.74122075476
