In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the prepared credit dataset. `data_org` keeps the values as read from
# disk; `data` is the working copy that the encoding cell modifies.
data_org = pd.read_csv('1_credit_prp.csv')
data = data_org.copy()
data.dtypes


Loan Status                   object
Current Loan Amount          float64
Term                          object
Credit Score                 float64
Annual Income                float64
Years in current job          object
Home Ownership                object
Purpose                       object
Monthly Debt                 float64
Years of Credit History      float64
Number of Open Accounts      float64
Number of Credit Problems     object
Current Credit Balance       float64
Maximum Open Credit          float64
Bankruptcies                  object
dtype: object

### Label Encoding

In [3]:
# Names of every object-dtype column in the working frame.
category_vars = data.select_dtypes(include='object').columns.tolist()
category_vars

['Loan Status',
 'Term',
 'Years in current job',
 'Home Ownership',
 'Purpose',
 'Number of Credit Problems',
 'Bankruptcies']

In [4]:
# Remove the target column by name rather than by position. Unlike the
# positional slice `category_vars[1:]`, this is idempotent: re-running the cell
# cannot strip further features from the list, and it does not rely on
# 'Loan Status' being the first object column.
category_vars = [col for col in category_vars if col != 'Loan Status']
category_vars

['Term',
 'Years in current job',
 'Home Ownership',
 'Purpose',
 'Number of Credit Problems',
 'Bankruptcies']

In [5]:
## Encode the categorical variables as float category codes.
# NOTE(review): pandas category codes use -1 for missing values, so any NaN in
# these columns becomes -1.0 here -- confirm that is the intended encoding.
for col in category_vars:
    data[col] = data[col].astype('category').cat.codes.astype('float64')

# Binarise the target: 'Fully Paid' -> 1, 'Charged Off' -> 0.
# The masked result is assigned back to the column instead of calling
# mask(..., inplace=True) on data['Loan Status']: that chained in-place update
# acts on an intermediate object and does not reach `data` when pandas
# Copy-on-Write is active, which would leave the strings in place and make the
# float conversion below fail.
loan_status = data['Loan Status']
loan_status = loan_status.mask(loan_status == 'Fully Paid', 1)
loan_status = loan_status.mask(loan_status == 'Charged Off', 0)
data['Loan Status'] = loan_status.astype('float64')

### Correlation 

In [6]:
# Pairwise absolute correlations between the predictors (target excluded),
# laid out as one row per (feature1, feature2) pair, strongest first.
data_cp = data.drop(columns='Loan Status')
corrmat = (
    data_cp.corr()
    .abs()                               # only the strength of the relation matters
    .unstack()                           # matrix -> Series indexed by feature pair
    .sort_values(ascending=False)
    .loc[lambda s: (s >= 0) & (s < 1)]   # drops NaN entries and exact 1.0 values (e.g. self-pairs)
    .to_frame()
    .reset_index()
)
corrmat.columns = ['feature1', 'feature2', 'correlation']
corrmat.head()

Unnamed: 0,feature1,feature2,correlation
0,Bankruptcies,Number of Credit Problems,0.802224
1,Number of Credit Problems,Bankruptcies,0.802224
2,Monthly Debt,Current Credit Balance,0.477801
3,Current Credit Balance,Monthly Debt,0.477801
4,Monthly Debt,Annual Income,0.472384


In [7]:
# Export the original (un-encoded) data without 'Number of Credit Problems',
# one half of the most strongly correlated feature pair found above.
corr_out_path = Path("Correlation/data.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout fails on this cell.
corr_out_path.parent.mkdir(parents=True, exist_ok=True)
data_corr = data_org.drop('Number of Credit Problems', axis=1)
data_corr.to_csv(corr_out_path, index=False)


### Univariate ROC_AUC

In [8]:
### Split the data into predictors (X) and target (Y)
Y = data['Loan Status']
X = data.drop(columns=['Loan Status'])


In [9]:
# For each feature: fit a single-feature decision tree under cross-validation,
# score the held-out fold and record the mean ROC-AUC.

In [10]:
# Univariate screening: for every feature, fit a one-feature decision tree
# under stratified 20-fold cross-validation and keep the mean held-out ROC-AUC.
roc_values = []
cv = StratifiedKFold(n_splits=20)
for feature in X.columns:
    # Fill missing values and build the 2-D single-column frame once per
    # feature instead of repeating both steps for the train and test slice of
    # every fold.
    X_ = X[feature].fillna(0).to_frame()
    roc_temp_list = []
    for train, test in cv.split(X_, Y):
        # random_state is pinned so that re-running the notebook refits the
        # same trees.
        clf = DecisionTreeClassifier(random_state=0)
        clf.fit(X_.iloc[train], Y.iloc[train])
        # Second predict_proba column: probability of the higher class label
        # (1.0, i.e. 'Fully Paid', given the 0/1 encoding of the target).
        y_scored = clf.predict_proba(X_.iloc[test])[:, 1]
        roc_temp_list.append(roc_auc_score(Y.iloc[test], y_scored))
    roc_values.append(np.array(roc_temp_list).mean())
    
    

In [11]:
# Pair each feature name with its mean cross-validated ROC-AUC and rank the
# features from most to least predictive.
roc_table = (
    pd.DataFrame({'features': X.columns, 'mean roc_auc_score': roc_values})
    .reset_index(drop=True)
    .sort_values(by='mean roc_auc_score', ascending=False)
)
roc_table


Unnamed: 0,features,mean roc_auc_score
2,Credit Score,0.694332
1,Term,0.571232
0,Current Loan Amount,0.568786
5,Home Ownership,0.534774
3,Annual Income,0.525823
7,Monthly Debt,0.524093
12,Maximum Open Credit,0.52195
8,Years of Credit History,0.520067
11,Current Credit Balance,0.51805
4,Years in current job,0.513932


In [12]:
# Export the original (un-encoded) data without the four columns listed below.
# NOTE(review): the list is hard-coded; presumably these are the weakest
# features of the univariate ROC-AUC ranking -- confirm against roc_table.
unv_out_path = Path("Univariante ROC_AUC/data.csv")
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout fails on this cell.
unv_out_path.parent.mkdir(parents=True, exist_ok=True)
data_unv = data_org.drop(
    ['Number of Credit Problems', 'Bankruptcies', 'Number of Open Accounts', 'Purpose'],
    axis=1,
)
data_unv.to_csv(unv_out_path, index=False)
   