In [29]:
import pandas as pd
import numpy as np
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

Use traditional machine-learning methods from scikit-learn to forecast bankruptcy.

# Bankruptcy prediction using sklearn

## Load and align data

SVD for text data

In [143]:
# Use truncated SVD to reduce the dimensionality of the 10-K tf-idf text features
def generate_save_SVD(n_components=80,
                      tfidf_path='data/10k/X_tfidf.npy',
                      out_path="data/10k/svd_X_80tfidf.npy",
                      random_state=0):
    """Reduce the saved tf-idf matrix with truncated SVD and save the result.

    Loads the matrix stored at ``tfidf_path``, projects it onto
    ``n_components`` SVD components, prints the summed explained-variance
    ratio of the kept components, and writes the projection to ``out_path``.

    :param n_components: number of SVD components to keep
    :type n_components: int
    :param tfidf_path: ``.npy`` file holding the tf-idf matrix
    :type tfidf_path: str
    :param out_path: ``.npy`` file the reduced matrix is written to; the
        default file name refers to 80 components, so pass another path when
        changing ``n_components``
    :type out_path: str
    :param random_state: seed forwarded to ``TruncatedSVD`` so that repeated
        runs write the same file
    :type random_state: int
    :return: None
    """
    tfidf = np.load(tfidf_path)
    # The array is fitted directly; wrapping it in a DataFrame first adds nothing.
    svd = TruncatedSVD(n_components=n_components, random_state=random_state)
    svd_text = svd.fit_transform(tfidf)
    print(svd.explained_variance_ratio_.sum())
    np.save(out_path, svd_text)

In [31]:
# Shape of the full tf-idf matrix. Read it from the file (memory-mapped, so the
# data itself is not loaded) instead of relying on a leftover kernel variable
# `X`, which is only ever a local inside generate_save_SVD.
np.load('data/10k/X_tfidf.npy', mmap_mode='r').shape

(121848, 5000)

Load Numerical Data

In [32]:
# all numerical data
final_variable = pd.read_csv('data/final_variables.csv')
# Pass `axis` by keyword: the positional form drop(label, 1) is rejected by pandas >= 2.0.
final_variable = final_variable.drop('Unnamed: 0', axis=1)
# Columns that are not predictors: the label `Y` plus what look like
# identifier/date fields (judging by their names).
drop_list = ['gvkey','datadate','fyear','cusip','PERMNO','PERMCO', 'Y']
# Every remaining column name is treated as a numerical predictor.
all_x_var = list(final_variable.drop(drop_list, axis=1))
print(all_x_var)
lasso_x = ['PRICE','OIADPAT','NIMTA','FAT','LCTCHAT','EXCESS_RETURN','LCTAT','EBITDPAT'] # lasso selected features

['NIAT', 'NISALE', 'OIADPAT', 'OIADPSALE', 'EBITAT', 'EBITDPAT', 'EBITSALE', 'SEQAT', 'REAT', 'LCTAT', 'LCTCHAT', 'LTAT', 'LOGSALE', 'CHAT', 'CHLCT', 'QALCT', 'ACTLCT', 'WCAPAT', 'LCTLT', 'INVTSALE', 'SALEAT', 'APSALE', 'LOGAT', 'INVCHINVT', 'CASHAT', 'LCTSALE', 'RELCT', 'FAT', 'SIGMA', 'NIMTA', 'LTMTA', 'CASHMTA', 'PRICE', 'RSIZE', 'EXCESS_RETURN', 'MBE']


Helper functions to load and align data

In [99]:
# Rows with fyear < test_train_split_year form the training set; rows with
# fyear >= test_train_split_year form the test set (see load_data).
test_train_split_year = 2011
# Number of years passed as `n` to n_year_before when load_data aligns
# features with the label Y.
forecast_year = 1

def n_year_before(df, n = 1):
    """Pair each row's features with the label observed ``n`` fiscal years later.

    The ``fyear`` of every feature row is moved forward by ``n`` and the rows
    are inner-joined, on (``fyear``, ``gvkey``), with the original ``Y``
    values. A returned row therefore carries the features recorded ``n`` years
    before the year stored in its ``fyear`` column, together with that year's
    ``Y``. Rows without a matching label are dropped; ``df`` is not modified.

    :param df: frame containing at least ``fyear``, ``gvkey`` and ``Y``
    :param n: number of years between the features and the label
    :return: frame with the shifted feature rows followed by a ``Y`` column
    """
    labels = df[['fyear', 'gvkey', 'Y']]
    # drop() returns a new frame and assign() keeps `fyear` in its original position.
    lagged_features = df.drop('Y', axis=1).assign(
        fyear=lambda frame: frame['fyear'] + n
    )
    return lagged_features.merge(labels, how='inner', on=['fyear', 'gvkey'])

def load_data(how='text', svd=True):
    """Build chronologically split train/test features and labels.

    Reads the accounting/market variables and the 10-K text features from
    disk, inner-joins them on ``gvkey``/``fyear``, aligns the features with
    the label ``Y`` through ``n_year_before(..., n=forecast_year)``, fills
    remaining missing values with 0 and splits on ``test_train_split_year``.

    Relies on the notebook-level names ``all_x_var``, ``forecast_year``,
    ``test_train_split_year`` and ``n_year_before``.

    :param how: feature set to return: ``'text'``, ``'num'`` (``'numerical'``
        is accepted as an alias) or ``'total'`` (text plus numerical)
    :type how: str
    :param svd: if True use the SVD-reduced tf-idf matrix, otherwise the full
        tf-idf matrix
    :type svd: boolean
    :return: train_x, train_y, test_x, test_y, where rows with
        ``fyear < test_train_split_year`` are train and the rest are test
    :rtype: pandas.DataFrame for the x parts, pandas.Series for the y parts
    :raises ValueError: if ``how`` is not one of the supported values
    """
    supported = ('text', 'num', 'numerical', 'total')
    if how not in supported:
        # Fail before any large file is read instead of hitting an
        # UnboundLocalError on train_x at the end.
        raise ValueError("how must be one of %r, got %r" % (supported, how))

    print('Loading data')

    # load accounting and market data
    final_variable = pd.read_csv('data/final_variables.csv')
    final_variable = final_variable.drop('Unnamed: 0', axis=1)
    final_variable = final_variable.replace([np.inf, -np.inf], 0)
    final_variable = final_variable[final_variable['fyear'] >= 1993]  # 10k data from 1993

    # Load only the text matrix that was requested; the full tf-idf matrix is
    # large and is not needed when the SVD-reduced version is used.
    text_path = 'data/10k/svd_X_80tfidf.npy' if svd else 'data/10k/X_tfidf.npy'
    text_features = pd.DataFrame(data=np.load(text_path))
    print("10k data shape: ")
    print(text_features.shape)
    # load text index (one gvkey/fyear pair per row of the text matrix)
    index_10k = pd.read_csv('data/10k/10k_index.csv', usecols=['gvkey', 'fyear'])
    print("10k index shape: ")
    print(index_10k.shape)

    text = pd.concat([text_features, index_10k], axis=1)
    # Text columns carry the integer labels 0..k-1 assigned by DataFrame(ndarray).
    text_idx = list(range(text_features.shape[1]))

    # combine text and numerical data
    text_num = pd.merge(left=final_variable, right=text, how='inner', on=['gvkey', 'fyear'])
    print("Total number of observations with no forecasting: ")
    print(text_num.shape)
    text_num_n_year = n_year_before(text_num, n=forecast_year)
    print("Total number of observations: ")
    print(text_num_n_year.shape)
    text_num_n_year = text_num_n_year.fillna(0)

    train = text_num_n_year[text_num_n_year['fyear'] < test_train_split_year]
    test = text_num_n_year[text_num_n_year['fyear'] >= test_train_split_year]
    train_y = train['Y']
    test_y = test['Y']

    # `.ix` no longer exists in pandas; the text columns are selected by label.
    if how == 'text':
        train_x = train.loc[:, text_idx]
        test_x = test.loc[:, text_idx]
    elif how in ('num', 'numerical'):
        # use every numerical feature listed in all_x_var
        train_x = train[all_x_var]
        test_x = test[all_x_var]
    else:  # how == 'total'
        train_x = pd.concat([train.loc[:, text_idx], train[all_x_var]], axis=1)
        test_x = pd.concat([test.loc[:, text_idx], test[all_x_var]], axis=1)
    print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)
    print(train_x.head(5))
    return train_x, train_y, test_x, test_y

def get_prob_auc(clf,x,y):
    """Return positive-class probabilities and the accuracy ratio of a classifier.

    :param clf: fitted classifier exposing ``predict_proba``
    :param x: feature matrix to score
    :param y: true binary labels for ``x``
    :return: tuple ``(probas_, accuracy_ratio)`` where ``probas_`` is the
        second column of ``clf.predict_proba(x)`` and ``accuracy_ratio`` is
        ``2 * (ROC AUC - 0.5)``
    """
    probas_ = clf.predict_proba(x)[:, 1]
    # Only `metrics` is imported in this notebook, so the scorer is reached
    # through it; the bare names roc_curve / roc_auc_score were undefined.
    roc_auc = metrics.roc_auc_score(y, probas_)
    accuracy_ratio = (roc_auc - 0.5) * 2
    return probas_, accuracy_ratio

def tencile_table(test,p):
    """Share of all positive labels captured by each decile of predicted risk.

    Observations are ranked by ``p`` in descending order and cut into ten
    consecutive groups of ``int(round(len / 10))`` rows each; the tenth group
    additionally receives any rows left over. For every group the number of
    positives (sum of ``test``) is divided by the total number of positives.

    :param test: true binary labels (1 = positive)
    :param p: predicted scores/probabilities, same length as ``test``
    :return: DataFrame with columns ``Group`` (1..10, highest scores first)
        and ``Rate``; ``Rate`` is NaN when ``test`` contains no positives
    """
    ranked = pd.DataFrame({'y_true': test, 'probability': p})
    # DataFrame.sort no longer exists; sort_values with a stable sort keeps
    # the ordering of tied scores deterministic.
    ranked = ranked.sort_values('probability', ascending=False, kind='mergesort')
    ranked = ranked.reset_index(drop=True)
    y = ranked['y_true']

    point = int(round(float(len(ranked)) / 10))
    num_of_bkr = []
    for i in range(10):
        start = i * point
        # The last group runs to the end so it also absorbs the remainder
        # (this replaces the removed Series.append call).
        stop = (i + 1) * point if i < 9 else None
        num_of_bkr.append(y.iloc[start:stop].sum())

    total = y.sum()
    rate = np.array(num_of_bkr).astype(float) / total
    tencile_result = pd.DataFrame({'Group': range(1, 11), 'Rate': rate})
    return tencile_result

## Create and Compare Models

In [175]:
def create_model(model_type):
    """Return a new, unfitted scikit-learn estimator for the given key.

    Supported keys: ``'MNB'``, ``'lg'``, ``'lg-text'``, ``'SVM'``, ``'GNB'``,
    ``'SVM_rbf'``, ``'SVM_poly'``, ``'random_forest'`` and ``'boost'``.

    :param model_type: key naming the estimator configuration
    :type model_type: str
    :return: unfitted estimator
    :raises ValueError: if ``model_type`` is not a supported key
    """
    # Factories are wrapped in lambdas so only the requested estimator is built.
    # The L1-penalised logistic regressions name the liblinear solver
    # explicitly because not every solver accepts penalty='l1'.
    # An alternative left commented out in the original cell was
    # LogisticRegressionCV(Cs=[1e-5, ..., 1000], solver='liblinear', penalty='l1').
    builders = {
        'MNB': lambda: MultinomialNB(),
        'lg': lambda: LogisticRegression(C=0.18, penalty='l1', solver='liblinear'),
        'lg-text': lambda: LogisticRegression(C=0.017, penalty='l1', solver='liblinear'),
        'SVM': lambda: LinearSVC(C=0.000004),
        'GNB': lambda: GaussianNB(),
        'SVM_rbf': lambda: SVC(kernel='rbf', C=0.005, class_weight='balanced'),
        'SVM_poly': lambda: SVC(kernel='poly', C=0.001),
        'random_forest': lambda: RandomForestClassifier(max_depth=8),
        'boost': lambda: GradientBoostingClassifier(),
    }
    if model_type not in builders:
        # Previously an unknown key fell through to `return model` and raised
        # an UnboundLocalError with no hint about the valid keys.
        raise ValueError("unknown model_type %r; expected one of %s"
                         % (model_type, sorted(builders)))
    return builders[model_type]()

Only look at single models

In [183]:
# (feature set, model key): method[0] is passed to load_data as `how`,
# method[1] is passed to create_model as `model_type`.
method = 'text', 'boost'
print('method is: ', method)

# svd=False selects the full tf-idf matrix rather than the SVD-reduced one.
train_x, train_y, test_x, test_y = load_data(method[0], svd=False)

method is:  ('text', 'boost')
Loading data
10k data shape: 
(121848, 5000)
10k index shape: 
(121848, 2)
Total number of observations with no forecasting: 
(79222, 5043)
Total number of observations: 
(64999, 5043)
(50286, 5000) (50286,) (14713, 5000) (14713,)
   0         1         2         3         4         5         6         7     \
4   0.0  5.143531  4.822726  4.754027  4.548103  4.644117  4.229583  4.115143   
5   0.0  5.143531  4.803432  4.771720  4.562860  4.696352  4.087589  4.094556   
6   0.0  5.161100  4.726078  4.664173  4.512507  4.689111  4.076383  4.104918   
7   0.0  5.174858  4.783604  4.669951  4.502021  4.632429  4.283316  3.982480   
8   0.0  4.649827  4.474391  4.472400  4.310765  4.244091  4.029873  3.791913   

       8         9     ...       4990      4991  4992  4993  4994  4995  \
4  5.074887  0.879722  ...   0.000000  3.251397   0.0   0.0   0.0   0.0   
5  5.175413  0.000000  ...   0.000000  3.251397   0.0   0.0   0.0   0.0   
6  5.026521  0.000000  ... 

In [184]:
def score_positive_class(fitted_model, features):
    """Return one score per row where larger means more likely positive.

    Uses the second column of ``predict_proba`` when the estimator has it;
    otherwise falls back to ``decision_function`` rescaled to [0, 1].
    """
    if hasattr(fitted_model, 'predict_proba'):
        return fitted_model.predict_proba(features)[:, 1]
    raw = fitted_model.decision_function(features)
    spread = raw.max() - raw.min()
    if spread == 0:
        # Constant scores: avoid dividing by zero in the rescaling.
        return np.zeros_like(raw, dtype=float)
    return (raw - raw.min()) / spread


# Fit once and score both splits with the same fitted model, so the in-sample
# and out-of-sample AUCs describe one model rather than two separate fits.
model = create_model(model_type=method[1])
model.fit(train_x, train_y)

print('in sample result:')
pred_yp = score_positive_class(model, train_x)
roc = metrics.roc_auc_score(train_y, pred_yp)
print(roc)

print('out sample result:')
pred_yp = score_positive_class(model, test_x)
roc = metrics.roc_auc_score(test_y, pred_yp)
print(roc)

in sample result:
0.898879230754
out sample result:
0.576911294166


## Benchmark multiple models

In [None]:
# Note kept from an earlier run: 80 tfidf result, max_depth=1, n_estimators=150
# (presumably the SVD-reduced text features and tree-ensemble settings - TODO confirm).
# Each entry is (feature set passed to load_data as `how`, model key passed to create_model).
methods = [('text','MNB'),('numerical','lg'),('total','random_forest')]
# The benchmark loop below is disabled (commented out); running this cell only
# defines `methods`.
# for method in methods:
#     print('method is: ', method)
#     train_x, train_y, test_x, test_y = load_data(how=method[0], svd=False)
#     print('in sample result:')
#     model = create_model(model_type=method[1])
#     model.fit(train_x, train_y)
#     if hasattr(model, 'predict_proba'):
#         pred_yp = model.predict_proba(train_x)[:,1]
#     else:
#         pred_yp = model.decision_function(train_x)
#         pred_yp = (pred_yp - pred_yp.min()) / (pred_yp.max() - pred_yp.min())
#     roc = metrics.roc_auc_score(train_y, pred_yp)
#     print(roc)
#     print('out sample result:')
#     model = create_model(model_type=method[1])
#     model.fit(train_x, train_y)
#     if hasattr(model, 'predict_proba'):
#         pred_yp = model.predict_proba(test_x)[:,1]
#     else:
#         pred_yp = model.decision_function(test_x)
#         pred_yp = (pred_yp - pred_yp.min()) / (pred_yp.max() - pred_yp.min())
#     roc = metrics.roc_auc_score(test_y, pred_yp)
#     print(roc)

In [None]:
# Total number of observations across the train and test splits.
len(train_x) + len(test_x)