# Bankruptcy Prediction 


In [1]:
## Modules

import pickle
from matplotlib import pyplot as plt
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, SimpleRNN, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.model_selection import train_test_split
from keras import optimizers
from tensorflow import set_random_seed
from numpy.random import seed
%matplotlib inline

#set up matplotlib
params = {'legend.fontsize': 'xx-large',
          'figure.figsize': (30, 15),
         'axes.labelsize': 'xx-large',
         'axes.titlesize':'xx-large',
         'xtick.labelsize':'xx-large',
         'ytick.labelsize':'xx-large'}
plt.rcParams.update(params)


Using TensorFlow backend.


In [2]:
## Functions
def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2)) 

def train_model(x_train, y_train, units, dropout, num_lstm_layers, model_type, epoch, batch_size):
    model = Sequential(
        [
            LSTM(units, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])) if model_type == 'lstm' 
            else GRU(units, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])) if model_type == 'gru' 
            else SimpleRNN(units, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])),
            Dropout(dropout)
        ] +
        [
            LSTM(units, return_sequences=True) if model_type == 'lstm' else 
            GRU(units, return_sequences=True) if model_type == 'gru' else
            SimpleRNN(units, return_sequences=True),
            Dropout(dropout)
        ] * (num_lstm_layers - 2) +
        [
            LSTM(units) if model_type == 'lstm' else 
            GRU(units) if model_type == 'gru' else
            SimpleRNN(units),
            Dropout(dropout),
            Dense(1, activation = 'sigmoid')
        ])
    model.compile(optimizer=optimizers.RMSprop(lr=0.003),loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(x_train, y_train, epochs=epoch, batch_size=batch_size, verbose=0)
    return model, history

def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.show()
    

In [3]:
## Constants 

MIN_ROWS = 5
ZSCORE = 1.8

seed(700)
set_random_seed(700)

EPOCH = 100
BATCH_SIZE = 30
LSTM_UNITS = 60
DROPOUT = 0.2
NUM_LSTM_LAYERS = 4

# lstm, gru
# 42 - 60, 100
# 100 - 73, 67
# 200 - 74, 67
# 300 - 70, 16
# 400 - 73, 75 -- 65, 87
# 500 - 69, 12 -- 72, 75
# 600 - 75, 54 -- 76, 37
# 700 - 59, 100

# Import data and data processing

In [4]:
with open('data_with_ratios.pickle', 'rb') as fp:
    data = pickle.load(fp)

new_data = []
bankrupt = []

cols = data[0].keys()

for obj in data:
    if (len(obj.keys()) <= 28):
        continue
    cols = intersection(cols, obj.keys())
cols = sorted(cols)
print(cols)

ground_truth = []
ground_bankrupt = []

# Build a list of companies that are really bankrupt
# 'data' already contains labels of truly bankrupt companies

for obj in data:
    if not obj['bankrupt']:
        continue
    valid = True
    new_obj = []
    for i in range(MIN_ROWS):
        new_vals = []
        for key in cols:
            if key not in obj:
                valid = False
                break
            if (isinstance(obj[key], list) and key not in ['equity', 'roe', 'bad_solvency', 'debt_equity']):
                if len(obj[key]) < MIN_ROWS:
                    valid = False
                    break
                new_vals.append(obj[key][i]['value'])
        new_obj.append(new_vals)
    if (valid):
        ground_bankrupt.append(True)
        ground_truth.append(new_obj)

        
        
for obj in data:
    if obj['bankrupt']:
        continue
    valid = True
    new_obj = []
    for i in range(MIN_ROWS):
        new_vals = []
        for key in cols:
            if key not in obj:
                valid = False
                break
            if (isinstance(obj[key], list) and key not in ['equity', 'roe', 'bad_solvency', 'debt_equity']):
                if len(obj[key]) < MIN_ROWS:
                    valid = False
                    break
                new_vals.append(obj[key][i]['value'])
        new_obj.append(new_vals)
    if (valid):
#         ban = obj['bankrupt']
#         if 'zscore' in obj:
#             for zscore in obj['zscore']:
#                 if zscore['value'] < ZSCORE:
#                     ban = True
        neg_equity = False
        neg_roe = False
        bad_solvency = False
        bad_debt = False
        if 'equity' in obj:
            for equity in obj['equity']:
                if equity['value'] < 0:
                    neg_equity = True
        if 'roe' in obj:
            current_roe = None
            for roe in obj['roe']:
                if current_roe and roe['value'] < current_roe / 2:
                    neg_roe = True
                current_roe = roe['value']
        if 'bad_solvency' in obj:
            for solv in obj['bad_solvency']:
                if solv['value'] < 2:
                    bad_solvency = True
        if 'debt_equity' in obj:
            for de in obj['debt_equity']:
                if de['value'] > 2:
                    bad_debt = True
        bankrupt.append((neg_equity or neg_roe) and (bad_debt or bad_solvency))
        new_data.append(new_obj)

['bankrupt', 'company', 'debt_equity', 'equity', 'solvency', 'ticker', 'x10', 'x15', 'x16', 'x17', 'x2', 'x29', 'x3', 'x41', 'x50', 'x55', 'x7', 'x8']


# Altman Z-score performance evaluation on SimFin

Out of 747 with the ratios, only 20 are correct (i.e. in ground truth).
727 are false positives flagged by Altman Z-score


In [5]:
true_bankrupt = []
zscore_bankrupt = []
for obj in data:
    if obj['bankrupt']:
        true_bankrupt.append(obj['ticker'])
    if ('zscore' in obj):
        for zscore in obj['zscore']:
            if zscore['value'] < 1.8:
                zscore_bankrupt.append(obj['ticker'])
                break
intersect = set(true_bankrupt) & set(zscore_bankrupt)
print('True Bankrupt')
print(len(true_bankrupt))
print(true_bankrupt)
print('Bankrupt by Z score')
print(len(zscore_bankrupt))
print(zscore_bankrupt)
print('Intersect')
print(len(intersect))
print(intersect)

True Bankrupt
44
['NCSO', 'ARO', 'ACI', 'ASNA', 'BONT', 'BKU', 'CPN', 'CIT', 'CMLS', 'CHTR', 'CIE', 'DAL', 'DNKN', 'DYN', 'DEST', 'DDMG', 'EFH', 'FRP', 'CK00007861', 'GGP', 'GGP', 'GM', 'ACF', 'GMAN', 'HCOM', 'ZINCQ', 'LEA', 'LEE', 'LNGG', 'LYB', 'MFRM', 'ODP', 'PIR', 'RFP', 'SHLD', 'SHO', 'SMRT', 'SIX', 'C730', 'CK00015847', '', 'TRCO', 'TSQ', 'TROX']
Bankrupt by Z score
747
['FOX', 'ABT', 'AKR', 'ACCO', 'ARAY', 'ACW', 'ACRX', 'ACET', 'ADOM', 'ADT', 'ADRO', 'AMD', 'AEHR', 'ACY', 'AJRD', 'AES', 'AMG', 'SO', 'AIRI', 'AL', 'AKS', 'ALK', 'ALB', 'ALR', 'ALX', 'ALCO', 'ADS', 'LNT', 'AMOT', 'MDRX', 'ALNY', 'ANAV', 'ALPN', 'AEE', 'ABMC', 'AEP', 'AMSC', 'AMT', 'APU', 'AMP', 'FOLD', 'AMKR', 'AHS', 'APC', 'AVXL', 'AR', 'APA', 'AIV', 'ARCI', 'ARCW', 'ACI', 'PNW', 'AWI', 'ARTX', 'ARQL', 'ARRY', 'ASH', 'ASPN', 'ATRO', 'ASTC', 'T', 'AAWW', 'ATW', 'ADP', 'AVB', 'AVNW', 'AVA', 'ACLS', 'ACHC', 'ACAD', 'ACLZ', 'ADTM', 'ADPT', 'ASNB', 'AGHI', 'AA', 'ALEX', 'ARE', 'Y', 'ATI', 'AGN', 'ALE', 'ALSN', 'ALT', 

In [6]:
# Examples of (24) companies that really went bankrupt but not picked up by Altman Z-score

not_detected = [b for b in true_bankrupt if b not in zscore_bankrupt]
print(not_detected)

for obj in data:
    if obj['ticker'] in not_detected:
        if 'zscore' in obj:
            print(obj['ticker'])
            print(obj['zscore'])
#         else:
#             print(obj['ticker'])

['NCSO', 'ARO', 'ASNA', 'BONT', 'BKU', 'CIT', 'DEST', 'DDMG', 'CK00007861', 'ACF', 'GMAN', 'ZINCQ', 'LEA', 'LNGG', 'LYB', 'MFRM', 'ODP', 'PIR', 'SMRT', 'CK00015847', '', 'TRCO', 'TROX']
ARO
[{'date': '8/12/10', 'value': 9.42717009992146}, {'date': '3/6/11', 'value': 9.872457180713567}]
ASNA
[{'date': '9/6/11', 'value': 5.83933649983558}, {'date': '1/3/12', 'value': 6.97692340830925}, {'date': '4/3/13', 'value': 4.284513750136817}, {'date': '3/3/14', 'value': 4.790147416052633}, {'date': '3/3/15', 'value': 4.277927708003096}, {'date': '1/3/16', 'value': 2.2732138946707994}, {'date': '8/6/17', 'value': 2.9813499518295137}, {'date': '5/3/18', 'value': 3.031318404692856}]
BONT
[{'date': '7/12/11', 'value': 2.0314942241645726}, {'date': '11/4/12', 'value': 2.2940571223782764}, {'date': '17/4/13', 'value': 2.2748608137511175}, {'date': '16/4/14', 'value': 2.2829921108696043}, {'date': '15/4/15', 'value': 2.2215037249441414}, {'date': '13/4/16', 'value': 2.154344385539776}, {'date': '12/4/17'

# Deep learning models

In [None]:
# Data Prep

results_test = []
results_ground_truth = []

for i in range(30):

    arr = np.array(new_data)
    test_arr = np.array(ground_truth)
    print("Dataset Dimensions" + str(arr.shape))

    scalers = {}
    for i in range(arr.shape[2]):
        scalers[i] = MinMaxScaler(feature_range=(0,1))
        arr[:, :, i] = scalers[i].fit_transform(arr[:, :, i])
        test_arr[:, :, i] = scalers[i].transform(test_arr[:, :, i])

    x_train, x_test, y_train, y_test = train_test_split(arr, bankrupt, test_size=0.15, random_state=42)

    print('bankrupt:' + str(len([True for b in bankrupt if b])))
    print('not bankrupt:' + str(len([True for b in bankrupt if not b])))

    # Train model

    lstm_model, lstm_history = train_model(x_train, y_train, LSTM_UNITS, DROPOUT, NUM_LSTM_LAYERS, 'gru', EPOCH, BATCH_SIZE)
    # plot_loss(lstm_history)

    
    # Evaluate model and predict data on TEST 
    print("******Evaluating TEST set*********")

    scores = lstm_model.evaluate(x_test, y_test)
    print("model: \n%s: %.2f%%" % (lstm_model.metrics_names[1], scores[1]*100))
    results_test.append(scores[1]*100)

    y_predict = lstm_model.predict_classes(x_test)
    cm = confusion_matrix(y_test, y_predict)
    print(cm)

    try:
        tn, fp, fn, tp = cm.ravel()
        print(tn, fp, fn, tp)
    except ValueError:
        print("100% accuracy, no CM to print")

    fpr_BDmodel, tpr_BDmodel, thresholds_BDmodel = roc_curve(y_test, y_predict)
    auc_BDmodel = auc(fpr_BDmodel, tpr_BDmodel)
    print("AUC: " + str(auc_BDmodel))


    # Evaluate model and predict data on GROUND TRUTH
    print("******Evaluating GROUND TRUTH*********")
    scores = lstm_model.evaluate(test_arr, ground_bankrupt)
    print("model: \n%s: %.2f%%" % (lstm_model.metrics_names[1], scores[1]*100))
    results_ground_truth.append(scores[1]*100)

    y_predict = lstm_model.predict_classes(test_arr)
    cm = confusion_matrix(ground_bankrupt, y_predict)
    print(cm)
    
    try:
        tn, fp, fn, tp = cm.ravel()
        print(tn, fp, fn, tp)
    except ValueError:
        print("100% accuracy, no CM to print")

    from keras import backend
    backend.clear_session()

import statistics

print(results_test)
print("Average test accuracy: " + str(statistics.mean(results_test)))
print("Average test stdev: " + str(statistics.stdev(results_test)))

print(results_ground_truth)
print("Average ground truth accuracy: " + str(statistics.mean(results_ground_truth)))
print("Average ground stdev: " + str(statistics.stdev(results_ground_truth)))


W1011 12:55:49.540460 104876 deprecation_wrapper.py:119] From C:\Users\yihao001\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1011 12:55:49.542454 104876 deprecation_wrapper.py:119] From C:\Users\yihao001\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1011 12:55:49.545445 104876 deprecation_wrapper.py:119] From C:\Users\yihao001\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1011 12:55:49.727100 104876 deprecation_wrapper.py:119] From C:\Users\yihao001\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is depre

Dataset Dimensions(1362, 5, 13)
bankrupt:445
not bankrupt:917


W1011 12:55:49.735078 104876 deprecation.py:506] From C:\Users\yihao001\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W1011 12:55:50.188110 104876 deprecation_wrapper.py:119] From C:\Users\yihao001\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W1011 12:55:50.194094 104876 deprecation_wrapper.py:119] From C:\Users\yihao001\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:3376: The name tf.log is deprecated. Please use tf.math.log instead.

W1011 12:55:50.199081 104876 deprecation.py:323] From C:\Users\yihao001\AppData\Local\Continuum\anaconda3\lib

******Evaluating TEST set*********
model: 
acc: 68.78%
[[83 54]
 [10 58]]
83 54 10 58
AUC: 0.7293902962644914
******Evaluating GROUND TRUTH*********
model: 
acc: 83.33%
[[ 0  0]
 [ 4 20]]
0 0 4 20
Dataset Dimensions(1362, 5, 13)
bankrupt:445
not bankrupt:917
******Evaluating TEST set*********
model: 
acc: 71.71%
[[95 42]
 [16 52]]
95 42 16 52
AUC: 0.7290682696436237
******Evaluating GROUND TRUTH*********
model: 
acc: 66.67%
[[ 0  0]
 [ 8 16]]
0 0 8 16
Dataset Dimensions(1362, 5, 13)
bankrupt:445
not bankrupt:917
******Evaluating TEST set*********
model: 
acc: 75.61%
[[119  18]
 [ 32  36]]
119 18 32 36
AUC: 0.6990124516960067
******Evaluating GROUND TRUTH*********
model: 
acc: 62.50%
[[ 0  0]
 [ 9 15]]
0 0 9 15
Dataset Dimensions(1362, 5, 13)
bankrupt:445
not bankrupt:917
******Evaluating TEST set*********
model: 
acc: 59.02%
[[57 80]
 [ 4 64]]
57 80 4 64
AUC: 0.6786174323744096
******Evaluating GROUND TRUTH*********
model: 
acc: 100.00%
[[24]]
100% accuracy, no CM to print
Dataset Dime

model: 
acc: 74.63%
[[105  32]
 [ 20  48]]
105 32 20 48
AUC: 0.7361528553027051
******Evaluating GROUND TRUTH*********
model: 
acc: 66.67%
[[ 0  0]
 [ 8 16]]
0 0 8 16
Dataset Dimensions(1362, 5, 13)
bankrupt:445
not bankrupt:917
