In [1]:
import numpy as np
import pandas as pd
from interpret.glassbox import ExplainableBoostingClassifier
from libraries.feature_selection import *
from data_preparation.data_preparation import transform_data
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from feature_engine.selection import SmartCorrelatedSelection
from sklearn.feature_selection import SequentialFeatureSelector

In [2]:
# Input files, named once so the two samples are easy to tell apart and to repoint.
IN_TIME_CSV = "datasets/in_time.csv"
OUT_OF_TIME_CSV = "datasets/out_of_time.csv"

# transform_data (project helper) turns each raw frame into a (features, target) pair.
# The in-time pair is what the selection pipeline and the model are fitted on below.
x_train, y_train = transform_data(pd.read_csv(IN_TIME_CSV))

# Same preparation applied to the out-of-time file.
x_oot, y_oot = transform_data(pd.read_csv(OUT_OF_TIME_CSV))

In [16]:
# Two estimators with different jobs:
#   * model_for_performance is handed to the correlation filter as its `estimator`;
#   * model (an EBM) is fitted on the selected features in a later cell.
model_for_performance = LogisticRegression()
model = ExplainableBoostingClassifier()

# Step 1 - GiniSelector comes from the star import of libraries.feature_selection;
# going by the step label it drops features whose gini is below 0.01.
gini_filter = GiniSelector(0.01)

# Step 2 - within each group of features whose Spearman correlation exceeds 0.8,
# keep one feature, chosen via the performance of `model_for_performance`.
# missing_values="raise" makes the step fail loudly on NaNs instead of ignoring them.
correlation_filter = SmartCorrelatedSelection(
    variables=None,
    method="spearman",
    threshold=0.8,
    missing_values="raise",
    selection_method="model_performance",
    estimator=model_for_performance,
).set_output(transform="pandas")

# Steps tried earlier and currently disabled (kept for reference only):
#   ('preprocessing', full_pipeline_logisitic)
#   ('droping not used', FunctionTransformer(lambda x: x.drop(['scale__remainder__customer_id', 'remainder__remainder__application_date', 'remainder__application_status_transform__Application_status', 'remainder__remainder__Application data: employment date (main applicant)'], axis=1)))
#   ('Cleaning semi-manually overly correlated features', OverCorrelatedDropper(0.6))
#   ('Logistic regression', model)
selection_steps = [
    ('Delete features with gini lower than 0.01', gini_filter),
    ('Smart Correlated Selection', correlation_filter),
]

# verbose=True prints the elapsed time of every step while fitting.
pipe = Pipeline(steps=selection_steps, verbose=True)
pipe.fit(x_train, y_train)

[Pipeline]  (step 1 of 2) Processing Delete features with gini lower than 0.01, total=  14.0s
[Pipeline]  (step 2 of 2) Processing Smart Correlated Selection, total= 1.3min


In [5]:
# Apply the fitted selection pipeline to the training features; only the
# columns that survived both selection steps remain.
x_transformed = pipe.transform(x_train)

In [13]:
# Sanity check after selection: list feature pairs that are still strongly
# rank-correlated. Printed as "<row index> <column index>" of the correlation matrix.
CORRELATION_ALERT_THRESHOLD = 0.6

corr = x_transformed.corr(method="spearman").values

# Compare on |rho|: a pair at -0.9 is just as redundant as one at +0.9, and the
# previous signed comparison silently skipped such pairs.
# np.tril(..., k=-1) keeps the strict lower triangle, so every unordered pair is
# visited exactly once and the diagonal is excluded (no separate i != j test needed).
# NaN correlations (e.g. constant columns) compare as False and are not reported.
strongly_correlated = np.tril(np.abs(corr), k=-1) >= CORRELATION_ALERT_THRESHOLD

# argwhere walks the matrix row by row, i.e. the same (i, j) order as a nested loop.
for i, j in np.argwhere(strongly_correlated):
    print(i, j)

16 15
18 17


In [17]:
# Fit the explainable boosting model on the features kept by the selection pipeline.
model.fit(x_transformed, y_train)

In [29]:
# Pair every selected column with the importance the fitted model assigns to it.
# NOTE(review): zip stops at the shorter input, so if the model reports more terms
# than there are columns (e.g. interaction terms) the extra importances are ignored;
# this also assumes the leading terms follow the column order - confirm.
importances = list(zip(x_transformed.columns, model.term_importances()))

# Highest importance first; the sort is stable, so ties keep their column order.
ranked_by_importance = sorted(importances, key=lambda pair: pair[1], reverse=True)

# Shortlist: names of the nine most important features.
variables = [name for name, _score in ranked_by_importance[:9]]
variables

['utilized_limit_in_revolving_loans_H0',
 'DPD_term_loan_H2',
 'Default_flag_H1',
 'Default_flag_H2',
 'Default_flag_H3',
 'DPD_term_loan_H3',
 'Default_flag_H4',
 'DPD_term_loan_H6',
 'Default_flag_H5']

In [36]:
# Put the shortlisted features and the target side by side (axis=1 aligns rows on
# the index) and save them; to_csv also writes the index as the first column.
selected_with_target = pd.concat([x_transformed[variables], y_train], axis=1)
selected_with_target.to_csv("variable.csv")

In [None]:
# Disabled pipeline step, parked inside a bare string literal so it is never executed:
# a SequentialFeatureSelector wrapping `model` that would pick 9 features using
# 5-fold cross-validated ROC AUC. Paste the tuple into the Pipeline steps to re-enable it.
""" 
('Sequential Feature Selector', SequentialFeatureSelector(model, 
                                                              n_features_to_select=9,
                                                              scoring='roc_auc', 
                                                              cv=5).set_output(transform="pandas")),

"""

In [3]:
# NOTE(review): `train` is not assigned in any cell visible in this notebook, so unless
# the star import happens to provide it this raises NameError on a fresh kernel
# (Restart & Run All). Presumably it was a raw DataFrame loaded in a since-deleted
# cell - TODO confirm and restore its definition before this cell.
train.describe()

Unnamed: 0,Customer_id,No_dependants,Time_in_address,Time_in_current_job,Credit_cards,Debit_cards,Active_accounts,Active_loans,Active_mortgages,Active_credit_card_lines,...,out_transactions_amt_H8,out_transactions_amt_H7,out_transactions_amt_H6,out_transactions_amt_H5,out_transactions_amt_H4,out_transactions_amt_H3,out_transactions_amt_H2,out_transactions_amt_H1,out_transactions_amt_H0,Target
count,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,...,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0,310000.0
mean,34995230.0,1.159529,8.472011,4.733869,0.246939,1.520745,1.220213,1.040158,-8200.258065,-8808.576742,...,4875.162629,4873.64986,4875.01298,4874.377914,4872.908508,4874.697344,4873.802552,4874.401424,4874.218386,0.068506
std,2886171.0,0.959115,8.490954,5.152743,0.457813,0.728501,0.460074,0.19633,3840.832513,3238.394342,...,1628.568628,1627.367891,1627.495556,1629.18031,1625.356039,1627.949687,1627.401824,1628.402842,1627.676323,0.252613
min,30000020.0,0.0,0.02,0.03,0.0,1.0,1.0,1.0,-9999.0,-9999.0,...,3112.43,3108.89,3108.69,3110.62,3104.11,3113.58,3104.13,3100.98,3102.91,0.0
25%,32493420.0,0.0,2.69,1.31,0.0,1.0,1.0,1.0,-9999.0,-9999.0,...,3820.85,3820.6,3822.3475,3820.6775,3820.1175,3822.2675,3821.2475,3820.895,3821.2,0.0
50%,34991990.0,1.0,5.54,2.97,0.0,1.0,1.0,1.0,-9999.0,-9999.0,...,4351.5,4348.85,4350.695,4349.075,4350.165,4350.735,4348.365,4348.95,4349.8,0.0
75%,37492240.0,2.0,11.16,6.23,0.0,2.0,1.0,1.0,-9999.0,-9999.0,...,5298.0225,5297.1875,5300.13,5294.4275,5299.13,5294.7825,5300.2025,5298.0625,5298.1275,0.0
max,39999940.0,4.0,67.97,49.41,2.0,4.0,3.0,2.0,1.0,2.0,...,20353.1,22575.64,19739.77,22748.87,20216.12,19710.56,20713.02,19914.48,20969.01,1.0
