In [6]:
import numpy as np
import pandas as pd
from interpret.glassbox import ExplainableBoostingClassifier
from libraries.feature_selection import *
from data_preparation.data_preparation import transform_data
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from feature_engine.selection import SmartCorrelatedSelection
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import roc_auc_score
from data_preparation import metrics_eval

In [2]:
x_train, y_train = transform_data(pd.read_csv("datasets/in_time.csv"))
x_oot, y_oot = transform_data(pd.read_csv("datasets/out_of_time.csv"))

In [5]:
x_train.dtypes

Ref_month                           object
Birth_date                          object
No_dependants                        int64
Time_in_address                    float64
Time_in_current_job                float64
                                    ...   
hasExternal_credit_card_balance       bool
hasExternal_term_loan_balance         bool
hasExternal_mortgage_balance          bool
hasActive_credit_card_lines           bool
hasActive_mortgages                   bool
Length: 279, dtype: object

In [3]:
model_for_performance = LogisticRegression()
model= ExplainableBoostingClassifier(interactions=0.0)

pipe = Pipeline([
    #('preprocessing' ,full_pipeline_logisitic),
    #('droping not used', FunctionTransformer(lambda x: x.drop(['scale__remainder__customer_id', 'remainder__remainder__application_date', 'remainder__application_status_transform__Application_status', 'remainder__remainder__Application data: employment date (main applicant)'], axis=1))),
    ('Delete features with gini lower than 0.01', GiniSelector(0.01)), 
    ('Smart Correlated Selection', SmartCorrelatedSelection( variables=None,
                                                             method="spearman",
                                                             threshold=0.8,
                                                             missing_values="raise",
                                                             selection_method="model_performance",
                                                             estimator=model_for_performance,).set_output(transform="pandas")),
    #('Cleaning semi-manualy overly correlated featuers',OverCorrelatedDropper(0.6)),
    #('Logistic regression', model)
], verbose = True)
pipe.fit(x_train, y_train)

ValueError: could not convert string to float: '11-2022'

In [5]:
x_transformed=pipe.transform(x_train)

In [13]:
corr=x_transformed.corr(method="spearman").values
for i in range(corr.shape[0]):
    for j in range(i):
        if(i!=j):
            if(corr[i,j]>=0.6):
                print(i,j)

16 15
18 17


In [17]:
model.fit(x_transformed,y_train)

In [29]:
importances=[(i,j) for i,j in zip(x_transformed.columns ,model.term_importances())]
variables=[i[0] for i in sorted(importances, key=lambda x: x[1],reverse=True)[:9]]
variables

['utilized_limit_in_revolving_loans_H0',
 'DPD_term_loan_H2',
 'Default_flag_H1',
 'Default_flag_H2',
 'Default_flag_H3',
 'DPD_term_loan_H3',
 'Default_flag_H4',
 'DPD_term_loan_H6',
 'Default_flag_H5']

In [37]:
x_selected=x_transformed[variables]
model.fit(x_selected,y_train)

In [36]:
pd.concat([x_selected, y_train], axis=1).to_csv("variable.csv")

In [None]:
""" 
('Sequential Feature Selector', SequentialFeatureSelector(model, 
                                                              n_features_to_select=9,
                                                              scoring='roc_auc', 
                                                              cv=5).set_output(transform="pandas")),

"""

In [46]:
y_train_predict=model.predict_proba(x_selected)[:,1]
y_oot_predict=model.predict_proba(x_oot[variables])[:,1]

In [45]:
roc_auc_score(y_train, y_train_predict)

0.8426342390174884

In [47]:
roc_auc_score(y_oot, y_oot_predict)

0.7906252133155331

In [None]:
metrics_eval.evaluate_model(model, x_train, y_train, x_oot, y_oot)