   The dataset, collected over 2007-2013, describes 64 features of Polish companies together with their bankruptcy status 1 to 5 years later. Let us first take a look at the dataset.

# Part I
This part performs the necessary data analysis: confirming the data content, visualizing the data distribution, and cleaning and preprocessing the data.

In [None]:
# Loading necessary python module
import numpy as np 
import pandas as pd
from os.path import join
from matplotlib import pyplot
%matplotlib inline

In [None]:
# Read the five yearly CSV files ('1year.arff.csv' ... '5year.arff.csv') from
# ./DataSet/ into a list of DataFrames, one entry per file.
module_path = './DataSet/'
filename = 'year.arff.csv'
data = [pd.read_csv(join(module_path, '{}'.format(year) + filename))
        for year in range(1, 6)]

In [None]:
# Report how many rows (companies) each yearly file contains
for year, frame in enumerate(data, start=1):
    print('the number of companies in {}year: {}'.format(year, frame.shape[0]))

In [None]:
# Glancing at the content of the dataset: show the first 10 rows of the first yearly file
data[0].head(10)

In [None]:
# Checking the general summary of each column (describe() output of the second yearly file, data[1])
data[1].describe()

In [None]:
# Histogram of every column of the first yearly file; axes are not shared
# between the subplots. The figure size is changed globally through rcParams.
pyplot.rcParams.update({'figure.figsize': [24, 36]})
hist_options = dict(sharex=False, sharey=False, xlabelsize=12, ylabelsize=12)
data[0].hist(**hist_options)
pyplot.show()

In [None]:
# Box plot of every column of the first yearly file, one subplot per column
# arranged on an 18 x 4 grid. The figure size is changed globally through rcParams.
pyplot.rcParams.update({'figure.figsize': [24, 100]})
box_options = dict(kind='box', subplots=True, layout=(18, 4),
                   sharex=False, sharey=False, fontsize=12)
data[0].plot(**box_options)
pyplot.show()

In [None]:
# Features should be scaled using standard scaler before using them for training or testing.
# Every column except the last one is standardized per yearly file; the last
# column is left untouched (presumably the 'class' label -- verify against the CSV layout).
# NOTE(review): the scaler is fitted on each complete yearly file, i.e. before the
# train/test split done later, so test rows influence the scaling statistics.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaledData = []
for yearly in data:
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and
    # new versions. Copy so the source frame can never be modified in place.
    values = yearly.values.copy()
    values[:, :-1] = scaler.fit_transform(values[:, :-1])
    scaledData.append(pd.DataFrame(values, columns=yearly.columns))

In [None]:
# Separate every scaled yearly frame into its feature columns and its 'class' label column
features = [frame.drop('class', axis=1) for frame in scaledData]
labels = [frame['class'] for frame in scaledData]

In [None]:
# Print, per yearly file, how many rows carry label 1 (bankrupt) and label 0 (healthy)
# to confirm the class imbalance mentioned by the publisher of the dataset.
for year, year_labels in enumerate(labels, start=1):
    num_of_bkr = sum(year_labels == 1)
    num_of_hlt = sum(year_labels == 0)
    print('the number of bankruptcy companies in {}year is {}'.format(year, num_of_bkr),
          'the number of healthy companies in {}year is {}\n'.format(year, num_of_hlt),
          sep='\n')

To overcome the class-imbalance problem, I use the Synthetic Minority Oversampling Technique (SMOTE), implemented with the imbalanced-learn package from scikit-learn-contrib. To avoid information leakage, the dataset is split before the synthetic samples are generated, so the generated data is added to the training set only.

In [None]:
# Split every yearly feature/label pair 70/30 into training and test parts
# (fixed random_state) and collect the parts in four parallel lists.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = [], [], [], []
for year_features, year_labels in zip(features, labels):
    parts = train_test_split(year_features, year_labels, test_size=0.30, random_state=0)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)

In [None]:
# Synthetic Minority Oversampling Technique(SMOTE)
# Oversample every yearly training split; the test splits are not touched.
from imblearn.over_sampling import SMOTE
from collections import Counter
X_resampled = []; y_resampled = []
for x_part, y_part in zip(X_train, y_train):
    sampler = SMOTE()
    # fit_sample() was replaced by fit_resample() and no longer exists in current
    # imbalanced-learn releases; prefer the new name, fall back on legacy installs.
    resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
    x_resampled, resampled = resample(x_part, y_part)
    # Newer releases hand back pandas objects for pandas input. Later cells index
    # the result positionally and re-attach the column names themselves, so the
    # results are stored as plain NumPy arrays on every version.
    x_resampled = np.asarray(x_resampled)
    resampled = np.asarray(resampled)
    X_resampled.append(x_resampled); y_resampled.append(resampled)
    # Class counts after resampling
    print(sorted(Counter(resampled).items()))

# Part II
This part explores which intrinsic factors best predict bankruptcy.

The dataset provides the values of 64 features and the bankruptcy status of different companies after different numbers of years, so the impact of company-specific factors and of the economic environment on the bankruptcy forecast is ignored here. Therefore, I consider only the impact of the original features and calculate the importance of these features for the different forecast durations. Doing this may lose some information and decrease the prediction accuracy, but it lets us pay more attention to the most essential factors.

In [None]:
import operator
import xgboost as xgb
import scipy.stats as st
#from graphviz import Digraph
from xgboost import plot_tree
from xgboost import XGBClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_recall_fscore_support

In [None]:
#Feature Selecting, Taking 1year data as an example. First let us confirm best parameters for XGboost model.
# NOTE(review): index 4 selects the split built from '5year.arff.csv'; presumably
# that file corresponds to the 1-year forecast horizon -- confirm.
xtrain = X_resampled[4];ytrain = y_resampled[4]
xtest = X_test[4]; ytest = y_test[4]
# Re-attach the column names so training and test inputs share feature names
xtrain_named = pd.DataFrame(xtrain, columns=xtest.columns)
# dtrain / num_rounds are consumed by the feature-importance cell that follows
dtrain = xgb.DMatrix(xtrain_named, label=ytrain, missing=np.nan)
num_rounds = 200
folds = StratifiedKFold(n_splits=3,random_state=1,shuffle=True)
xgb_parameters = {
    "learning_rate":[0.05,0.1,0.15],
    "min_child_weight":[0.5, 1.0, 2.0, 3.0],
    "max_depth":[2,4,6,8],
    "gamma":st.uniform(0, 1.0),
    # XGBoost requires subsample in (0, 1]; uniform(0, 1) could draw values at or
    # next to 0 (trees grown on almost no rows), so sample from [0.5, 1.0] instead.
    "subsample":st.uniform(0.5, 0.5),
    "colsample_bytree":st.uniform(0.4, 0.6),
    "reg_lambda": [0.1, 1.0, 5.0, 10.0]}
# random_state makes the sampled candidates reproducible, like the seeded folds
clf = RandomizedSearchCV(xgb.XGBClassifier(objective='binary:logistic'),xgb_parameters,
                         n_iter=100,cv=folds,scoring='roc_auc',n_jobs=-1,random_state=1)
clf.fit(xtrain, ytrain)
best_params = clf.best_params_

# Refit the winning configuration on the named frame before predicting on the
# DataFrame test split
classifier_1 = clf.best_estimator_
classifier_1.fit(xtrain_named, ytrain)
ypred_1 = classifier_1.predict(xtest)

# Hold-out metrics; precision/recall/F-score are reported per class (average=None)
cm_1 = confusion_matrix(ytest,ypred_1)
accuracy_1 = (cm_1[0,0]+cm_1[1,1])/len(ytest)
precision_1, recall_1, f_score_1, support = precision_recall_fscore_support(ytest, ypred_1, average = None)
print("\nFor Model 1 - XGBoost:")
print("Precision:",precision_1)
print("Recall:",recall_1)
print("F-Score:",f_score_1)
print("Accuracy_XGBoost:",accuracy_1*100,'%')

In [None]:
#After best parameters being comfirmed, we can select top k important features
# xgb.train() knows nothing about the sklearn wrapper used for the search, so the
# objective has to be passed explicitly; otherwise the booster is trained with
# XGBoost's default regression objective instead of binary classification.
# A copy is made so that best_params itself stays untouched.
xgb_params = dict(best_params, objective='binary:logistic')
xgb_model = xgb.train(xgb_params, dtrain, num_rounds)
# get_fscore() maps feature name -> score; rank features from highest to lowest
importance = sorted(xgb_model.get_fscore().items(), key=operator.itemgetter(1), reverse=True)
Feature = pd.DataFrame(importance, columns=['feature', 'fscore'])

fig, ax = pyplot.subplots(figsize=(12,8))
xgb.plot_importance(xgb_model, height=0.8, ax=ax)
pyplot.show()

# Keep (at most) the 16 highest-ranked features
Feature = Feature[0:16]
Features = Feature['feature']

In [None]:
#Then We can take these features to predict the label of companies bankruptcy
# Restrict the resampled training data and the test split to the selected features
x_retrain = pd.DataFrame(xtrain, columns=xtest.columns)[Features];y_retrain = ytrain
x_retest = xtest[Features]; y_retest = ytest
dtrain = xgb.DMatrix(x_retrain, label=y_retrain, missing=np.nan)
num_rounds = 200
folds = StratifiedKFold(n_splits=3,random_state=1,shuffle=True)
xgb_parameters = {
    "learning_rate":[0.05,0.1,0.15],
    "min_child_weight":[0.5, 1.0, 2.0, 3.0],
    "max_depth":[2,4,6,8],
    "gamma":st.uniform(0, 1.0),
    # XGBoost requires subsample in (0, 1]; uniform(0, 1) could draw values at or
    # next to 0 (trees grown on almost no rows), so sample from [0.5, 1.0] instead.
    "subsample":st.uniform(0.5, 0.5),
    "colsample_bytree":st.uniform(0.4, 0.6),
    "reg_lambda": [0.1, 1.0, 5.0, 10.0]}
# This search optimizes F1; random_state makes the sampled candidates reproducible
clf = RandomizedSearchCV(xgb.XGBClassifier(objective='binary:logistic'),xgb_parameters,
                         n_iter=100,cv=folds,scoring='f1',n_jobs=-1,random_state=1)
clf.fit(x_retrain, y_retrain)
best_params = clf.best_params_

classifier_2 = clf.best_estimator_
classifier_2.fit(x_retrain, y_retrain)
ypred_2 = classifier_2.predict(x_retest)

# Hold-out metrics; precision/recall/F-score are reported per class (average=None)
cm_2 = confusion_matrix(y_retest,ypred_2)
accuracy_2 = (cm_2[0,0]+cm_2[1,1])/len(y_retest)
precision_2, recall_2, f_score_2, support = precision_recall_fscore_support(y_retest, ypred_2, average = None)
print("\nFor Model 2 - FeaturedXGBoost:")
print("Precision:",precision_2)
print("Recall:",recall_2)
print("F-Score:",f_score_2)
print("Accuracy_FeaturedXGBoost:",accuracy_2*100,'%')

In [None]:
#Better use BalancedBaggingClassifier method to predict the probability of companies bankruptcy
# Same selected-feature training and test data as for the featured XGBoost model
x_retrain = pd.DataFrame(xtrain, columns=xtest.columns)[Features];y_retrain = ytrain
x_retest = xtest[Features]; y_retest = ytest

# The wrapped estimator is passed positionally: its keyword is `base_estimator`
# in older imbalanced-learn releases and `estimator` in newer ones, while the
# first positional slot is the estimator in both.
classifier_3 = BalancedBaggingClassifier(RandomForestClassifier(criterion='entropy'),
                                         n_estimators = 5, bootstrap = True)
classifier_3.fit(x_retrain,y_retrain)
ypred_3 = classifier_3.predict(x_retest)

# Hold-out metrics; precision/recall/F-score are reported per class (average=None)
cm_3 = confusion_matrix(y_retest,ypred_3)
accuracy_3 = (cm_3[0,0]+cm_3[1,1])/len(y_retest)
precision_3, recall_3, f_score_3, support = precision_recall_fscore_support(y_retest, ypred_3, average = None)
print("\nFor Model 3 - Featured-BalancedBagging:")
print("Precision:",precision_3)
print("Recall:",recall_3)
print("F-Score:",f_score_3)
print("Accuracy_Featured-BalancedBagging:",accuracy_3*100,'%')

In [None]:
# Plotting Precision-Recall Curve and Giving best precision, recall and f1.
def cvClassifier(clf, X, y, color, name, confMat = False, confMatNormalize = True):
    """Plot the cross-validated precision-recall curve of ``clf`` and print its best F-score point.

    Out-of-fold probabilities (second column of ``predict_proba``) are collected
    with a 5-fold stratified split, the precision-recall curve is added to the
    current pyplot figure, and the precision/recall pair with the highest
    F-score along the curve is printed. Nothing is returned.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict_proba``; it is refitted on every fold.
    X : array-like feature matrix (NumPy array or DataFrame), one row per sample.
    y : array-like of binary labels, same length as ``X``.
    color : matplotlib colour used for the curve.
    name : legend label and prefix of the printed lines.
    confMat, confMatNormalize : unused; kept so existing calls keep working.
    """
    # Coerce to NumPy so the positional fold indices work for pandas input as well
    X = np.asarray(X)
    y = np.asarray(y)
    sfolds = StratifiedKFold(n_splits = 5)
    predicted_prob = np.zeros(len(y), dtype = float)
    for train, test in sfolds.split(X, y):
        clf.fit(X[train], y[train])
        predicted_prob[test] = clf.predict_proba(X[test])[:, 1]

    precision, recall, thresholds = precision_recall_curve(y, predicted_prob)
    pyplot.plot(recall,precision , color=color,label = name)
    pyplot.xlabel('Recall')
    pyplot.ylabel('Precision')
    pyplot.ylim([0.0, 1.05])
    pyplot.xlim([0.0, 1.0])
    pyplot.title('2-class Precision-Recall curve')
    pyplot.legend()

    # precision + recall can be 0 at some points (0/0 -> NaN); silence the
    # warning and let nanargmax skip those points.
    with np.errstate(divide='ignore', invalid='ignore'):
        fscore = 2*(precision*recall)/(precision + recall)
    maxFidx = np.nanargmax(fscore)
    # `thresholds` is one element shorter than precision/recall, so it is
    # deliberately not indexed with maxFidx (the value was never used).
    selP = precision[maxFidx]
    selRecall = recall[maxFidx]

    print(name + ' Best precision is : {}'.format(selP))
    print(name +' Best recall is : {}'.format(selRecall))
    print(name +' Best fscore is : {}\n'.format(fscore[maxFidx]))

# Compare the three models on the resampled training data.
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same NumPy array.
cvClassifier(classifier_1, xtrain, ytrain, 'r','XGBoost')
cvClassifier(classifier_2, x_retrain.values, y_retrain, 'g','FeaturedXGBoost')
cvClassifier(classifier_3, x_retrain.values, y_retrain, 'b','FeaturedBalancedBagging')

# Part III
This part calculates the probability of the bankruptcy prediction by weighted summation and gives an example.

In [None]:
# The function of calculating the probability of predicting bankruptcy
def bankyprob(result,gamma):
    """Combine yearly predictions into one label and a confidence by weighted voting.

    Each row of ``result`` is one prediction. Row ``i`` (0-based, by position,
    independent of the frame's index) votes for its label with the weight
    ``fscore * gamma ** (i + 1)``.

    Parameters
    ----------
    result : pandas.DataFrame
        Must provide the columns ``'fscore'`` (weight of the prediction) and
        ``'label'`` (a value of 1 counts as a bankruptcy vote, anything else
        as a vote against). The frame is not modified.
    gamma : float
        Base of the per-row time weight.

    Returns
    -------
    tuple
        ``(label, prob)``. For a single row this is that row's label and ``1``.
        Otherwise ``label`` is 1 when the weighted share of label-1 rows is at
        least 0.5 and 0 otherwise, and ``prob`` is the weighted share of the
        returned label.

    Raises
    ------
    ValueError
        If ``result`` has no rows.
    """
    years = len(result)
    if years == 0:
        raise ValueError('result must contain at least one prediction')

    labels = np.asarray(result['label'])
    if years == 1:
        # A single prediction is taken at face value
        return labels[0], 1

    # Time weights gamma**1, gamma**2, ... in row order
    decay = np.array([pow(gamma, k) for k in range(1, years + 1)])
    weights = np.asarray(result['fscore'], dtype=float) * decay
    prob = weights[labels == 1].sum() / weights.sum()
    label = int(prob >= 0.5)
    if label == 0:
        # Report the confidence of the label that is actually returned
        prob = 1 - prob
    return label,prob

In [None]:
# An example: a company is given in which the first three year near the predicted bankruptcy date
#predicting 0(Normal),1(bankruptcy) and 1(bankruptcy) respectively.
# One single-row slice is taken from each of the first three scaled yearly frames.
sample_rows = [(0, 100), (1, 10000), (2, 10200)]
company = [scaledData[year][row:row + 1] for year, row in sample_rows]

In [None]:
#Predicting based on BalancedBagging model
def _predict_company_label(year, feature_names):
    """Fit a balanced-bagging model for one year and classify ``company[year]``.

    The model is trained on ``X_resampled[year]`` / ``y_resampled[year]``
    restricted to ``feature_names`` (column names are taken from
    ``X_test[year]``) and then applied to the single-row frame ``company[year]``.

    Parameters
    ----------
    year : int
        Position of the yearly data set in the module-level lists.
    feature_names : list of str
        Columns used for training and prediction.

    Returns
    -------
    The output of ``predict`` for ``company[year]``.
    """
    train_frame = pd.DataFrame(X_resampled[year], columns=X_test[year].columns)
    # The wrapped estimator is passed positionally: its keyword is `base_estimator`
    # in older imbalanced-learn releases and `estimator` in newer ones, while the
    # first positional slot is the estimator in both.
    model = BalancedBaggingClassifier(RandomForestClassifier(criterion='entropy'),
                                      n_estimators = 5, bootstrap = True)
    model.fit(train_frame[feature_names], y_resampled[year])
    return model.predict(company[year][feature_names])


# One hard-coded 16-feature subset per year.
# NOTE(review): presumably the top-16 features from each year's importance
# ranking -- confirm. The scratch globals of the earlier cells (xtrain, Features,
# classifier, ...) are no longer rebound here.
ypred_0 = _predict_company_label(
    0, ['Attr21','Attr34' ,'Attr29','Attr27','Attr24' ,'Attr37' ,'Attr46',
        'Attr5','Attr15','Attr9','Attr6','Attr1','Attr2','Attr11','Attr58','Attr3'])

ypred_1 = _predict_company_label(
    1, ['Attr27','Attr29' ,'Attr46','Attr34','Attr6' ,'Attr21' ,'Attr9',
        'Attr24','Attr58','Attr15','Attr37','Attr5','Attr60','Attr25','Attr1','Attr61'])

ypred_2 = _predict_company_label(
    2, ['Attr5','Attr24' ,'Attr6','Attr46','Attr27' ,'Attr29' ,'Attr34',
        'Attr9','Attr59','Attr15','Attr58','Attr21','Attr37','Attr56','Attr39','Attr41'])

In [None]:
# Calculating the probability of bankrupcty based on time factor and model
# Each prediction is a one-element array; calling int() on such an array is
# deprecated in recent NumPy, so the scalar is extracted explicitly.
predicted_labels = [int(np.ravel(pred)[0]) for pred in (ypred_0, ypred_1, ypred_2)]
# The fscore values are hard-coded per-year weights for the three predictions
ypred = pd.DataFrame({'fscore':[0.52,0.53,0.41],'label':predicted_labels})
label, prob = bankyprob(ypred,0.8)
print('label: {}\n prob: {}'.format(label,prob))