In [24]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

from bpi_fr_algo_credit_scoring.conf import DATA_ROOT
from bpi_fr_algo_credit_scoring.reader import read_yearly_data
from bpi_fr_algo_credit_scoring.feature_engineering import feature_engineering

In [5]:
# Import of dataset
default_risk_dataset = read_yearly_data(
    path=DATA_ROOT, 
    default_year=1
)

# Feature engineering part
default_risk_dataset = feature_engineering(
    default_risk_dataset, 
    to_drop_feat=
        ['(current assets - inventories) / long - term liabilities',
        'sales(n) / sales(n - 1)',
        'total liabilities / total assets', 
        '(gross profit + depreciation) / sales', 
        'gross profit / sales',
        '(inventory * 365) / sales', 
        'net profit / sales',
        '(total liabilities - cash) / sales',
        '(gross profit + interest) / sales',
        'constant capital / total assets', 
        'profit on sales / sales',
        'profit on operating activities / sales',
        'rotation receivables + inventory turnover in days',
        '(receivables * 365) / sales',
        'net profit / inventory',
        'EBITDA(profit on operating activities - depreciation) / sales',
        '(sales - cost of products sold) / sales'
        ], 
    to_drop_companies=
        [5395, 1900, 5334, 279, 1232, 75, 4472, 644, 5787, 5986, 3908, 2434, 
        4422, 2499, 4516, 1815, 1814, 1677, 1715, 5913, 4556, 6293, 2616, 
        2259, 6182, 177, 6950, 238, 1411, 5600, 82, 5283, 7005, 1417, 5914, 
        6769, 267, 3261, 4605, 2679, 5432, 309, 5490, 1919, 1850, 3134, 4530, 
        6844, 6921, 307, 3680, 4169, 5438, 6075, 2486, 6684, 4911, 2357, 
        3898, 6346, 6683, 5223, 6102, 1155, 2939, 1179, 1412, 3455, 171, 2887, 
        5811, 4463, 222, 3332, 2202, 6922, 2835, 3592, 2930, 2932, 2305, 6812, 
        2938, 2615, 165, 6160, 2881, 2649, 1237, 1178, 6867, 1554, 1664, 4540, 
        1713, 1749, 3394, 7000, 4131, 325, 560, 6758, 4027, 6851, 3770
        ]
)

In [10]:
# Training and testing split
Y = default_risk_dataset['Default']
X = default_risk_dataset.drop('Default', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state =55)

In [11]:
# Lazy predict results
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models

  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
 97%|█████████▋| 28/29 [00:26<00:00,  1.24it/s]

[LightGBM] [Info] Number of positive: 210, number of negative: 5327
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11985
[LightGBM] [Info] Number of data points in the train set: 5537, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.037927 -> initscore=-3.233436
[LightGBM] [Info] Start training from score -3.233436


100%|██████████| 29/29 [00:27<00:00,  1.06it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.98,0.79,0.79,0.98,0.25
DecisionTreeClassifier,0.96,0.77,0.77,0.97,0.74
XGBClassifier,0.98,0.77,0.77,0.98,2.12
BaggingClassifier,0.98,0.75,0.75,0.98,6.1
RandomForestClassifier,0.98,0.73,0.73,0.98,7.04
AdaBoostClassifier,0.97,0.71,0.71,0.97,4.07
NearestCentroid,0.61,0.68,0.68,0.73,0.03
BernoulliNB,0.66,0.66,0.66,0.77,0.05
ExtraTreeClassifier,0.94,0.59,0.59,0.94,0.04
ExtraTreesClassifier,0.97,0.58,0.58,0.96,0.65


In [25]:
# Results with model selected
model_selected = DecisionTreeClassifier()
model_selected.fit(X_train, y_train)
y_pred = model_selected.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1335
           1       0.96      0.44      0.60        50

    accuracy                           0.98      1385
   macro avg       0.97      0.72      0.80      1385
weighted avg       0.98      0.98      0.98      1385

[[1334    1]
 [  28   22]]


In [None]:
# Model hypertuning