# Run different Ensemble methods, Oversampling and Feature selection and reduction.

In [1]:
# 01. Data Preprocessing

import pandas as pd
from sklearn import preprocessing
from IPython.display import display, HTML

df_loan20K=pd.read_csv('Data_Loans_20K.csv', header=0)

print(df_loan20K.shape)

cols_loan20K = df_loan20K.columns

print('Column Name, DataTypes, MissingValues in Loan 20K CSV\n')
for i in cols_loan20K:
    print(i,',', df_loan20K[i].dtype , ',', df_loan20K[i].isnull().any())

##If Yes, fill in missing values by mean values or most frequent nominal values.

df_loan20K["Credit Score"].fillna(df_loan20K["Credit Score"].mean(), inplace=True)
df_loan20K["Annual Income"].fillna(df_loan20K["Annual Income"].mean(), inplace=True)
df_loan20K["Years in current job"].fillna(df_loan20K["Years in current job"].mode().iloc[0], inplace=True)
df_loan20K["Months since last delinquent"].fillna(df_loan20K["Months since last delinquent"].mean(), inplace=True)
df_loan20K["Bankruptcies"].fillna(df_loan20K["Bankruptcies"].mean(), inplace=True)
df_loan20K["Tax Liens"].fillna(df_loan20K["Tax Liens"].mean(), inplace=True)

print('\nColumn Name, DataTypes, MissingValues after filling with Mean and Frequent repeated nominal value in Loan 20K CSV\n')
for i in cols_loan20K:
    print(i,',', df_loan20K[i].dtype , ',', df_loan20K[i].isnull().any())    

# print out and display dataframe as table in HTML
display(HTML(df_loan20K.head(10).to_html()))

# encode labels
y = df_loan20K['Term'] # define label as nominal values
le = preprocessing.LabelEncoder()
le.fit(y)
y_encoded = le.transform(y) # encode nominal labels to integers #####################################
print(y_encoded)
df_loan20K['Term'] = y_encoded

# Data preprocessing ################################################################################
print('Column Datatypes:\n',df_loan20K.dtypes)

# convert all nominal variables to binary variables
df_num=df_loan20K.copy(deep=True) 
# create new binary columns
df_dummies=pd.get_dummies(df_num[['Loan Status','Years in current job','Home Ownership','Purpose']])
# add them to dataframe
df_num=df_num.join(df_dummies)
# drop original columns
df_num=df_num.drop('Loan Status',axis=1)
df_num=df_num.drop('Years in current job',axis=1)
df_num=df_num.drop('Home Ownership', axis=1)
df_num=df_num.drop('Purpose', axis=1)

# drop extra binary columns, since we only need N-1 binary columns
df_num=df_num.drop('Loan Status_Charged Off', axis=1)
df_num=df_num.drop('Years in current job_10+ years', axis=1)
df_num=df_num.drop('Home Ownership_Own Home', axis=1)
df_num=df_num.drop('Purpose_wedding', axis=1)

# print out and display dataframe as table in HTML
display(HTML(df_num.head(10).to_html()))

(20000, 17)
Column Name, DataTypes, MissingValues in Loan 20K CSV

Loan Status , object , False
Current Loan Amount , int64 , False
Term , object , False
Credit Score , float64 , True
Annual Income , float64 , True
Years in current job , object , True
Home Ownership , object , False
Purpose , object , False
Monthly Debt , float64 , False
Years of Credit History , float64 , False
Months since last delinquent , float64 , True
Number of Open Accounts , int64 , False
Number of Credit Problems , int64 , False
Current Credit Balance , int64 , False
Maximum Open Credit , int64 , False
Bankruptcies , float64 , True
Tax Liens , float64 , True

Column Name, DataTypes, MissingValues after filling with Mean and Frequent repeated nominal value in Loan 20K CSV

Loan Status , object , False
Current Loan Amount , int64 , False
Term , object , False
Credit Score , float64 , False
Annual Income , float64 , False
Years in current job , object , False
Home Ownership , object , False
Purpose , object , Fal

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,Fully Paid,445412,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,35.20179,6,1,228190,416746,1.0,0.0
1,Fully Paid,262328,Short Term,1094.310471,1376165.0,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35,0,229976,850784,0.0,0.0
2,Fully Paid,99999999,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18,1,297996,750090,0.0,0.0
3,Fully Paid,347666,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,35.20179,9,0,256329,386958,0.0,0.0
4,Fully Paid,176220,Short Term,1094.310471,1376165.0,5 years,Rent,Debt Consolidation,20639.7,6.1,35.20179,15,0,253460,427174,0.0,0.0
5,Charged Off,206602,Short Term,7290.0,896857.0,10+ years,Home Mortgage,Debt Consolidation,16367.74,17.3,35.20179,6,0,215308,272448,0.0,0.0
6,Fully Paid,217646,Short Term,730.0,1184194.0,< 1 year,Home Mortgage,Debt Consolidation,10855.08,19.6,10.0,13,1,122170,272052,1.0,0.0
7,Charged Off,648714,Long Term,1094.310471,1376165.0,< 1 year,Home Mortgage,Buy House,14806.13,8.2,8.0,15,0,193306,864204,0.0,0.0
8,Fully Paid,548746,Short Term,678.0,2559110.0,2 years,Rent,Debt Consolidation,18660.28,22.6,33.0,4,0,437171,555038,0.0,0.0
9,Fully Paid,215952,Short Term,739.0,1454735.0,< 1 year,Rent,Debt Consolidation,39277.75,13.9,35.20179,20,0,669560,1021460,0.0,0.0


[1 1 1 ... 0 1 1]
Column Datatypes:
 Loan Status                      object
Current Loan Amount               int64
Term                              int32
Credit Score                    float64
Annual Income                   float64
Years in current job             object
Home Ownership                   object
Purpose                          object
Monthly Debt                    float64
Years of Credit History         float64
Months since last delinquent    float64
Number of Open Accounts           int64
Number of Credit Problems         int64
Current Credit Balance            int64
Maximum Open Credit               int64
Bankruptcies                    float64
Tax Liens                       float64
dtype: object


Unnamed: 0,Current Loan Amount,Term,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens,Loan Status_Fully Paid,Years in current job_1 year,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Business Loan,Purpose_Buy House,Purpose_Buy a Car,Purpose_Debt Consolidation,Purpose_Educational Expenses,Purpose_Home Improvements,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation
0,445412,1,709.0,1167493.0,5214.74,17.2,35.20179,6,1,228190,416746,1.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,262328,1,1094.310471,1376165.0,33295.98,21.1,8.0,35,0,229976,850784,0.0,0.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,99999999,1,741.0,2231892.0,29200.53,14.9,29.0,18,1,297996,750090,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,347666,0,721.0,806949.0,8741.9,12.0,35.20179,9,0,256329,386958,0.0,0.0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,176220,1,1094.310471,1376165.0,20639.7,6.1,35.20179,15,0,253460,427174,0.0,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
5,206602,1,7290.0,896857.0,16367.74,17.3,35.20179,6,0,215308,272448,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,217646,1,730.0,1184194.0,10855.08,19.6,10.0,13,1,122170,272052,1.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,648714,0,1094.310471,1376165.0,14806.13,8.2,8.0,15,0,193306,864204,0.0,0.0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8,548746,1,678.0,2559110.0,18660.28,22.6,33.0,4,0,437171,555038,0.0,0.0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,215952,1,739.0,1454735.0,39277.75,13.9,35.20179,20,0,669560,1021460,0.0,0.0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [2]:
# 02. Bagging meta-estimator

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score

y = df_num['Term']
x = df_num.drop('Term', axis=1)

# by 10-fold cross validation Bagging meta-estimator
tree = DecisionTreeClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1)
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, random_state=1)
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(bag, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(bag, x, y, cv=10, scoring=precision).mean()
print("Bagging meta-estimator accuracy by 10-fold Cross Validation: acc = ",acc, "precision = ", pre)

# by 10-fold cross validation Bagging meta-estimator
tree = DecisionTreeClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1)
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, max_features=0.8)
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(bag, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(bag, x, y, cv=10, scoring=precision).mean()
print("Bagging meta-estimator accuracy by 10-fold Cross Validation: acc = ",acc, "precision = ", pre)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Bagging meta-estimator accuracy by 10-fold Cross Validation: acc =  0.71745 precision =  0.358725


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Bagging meta-estimator accuracy by 10-fold Cross Validation: acc =  0.71745 precision =  0.358725


  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
# 03. Random Forset 
from sklearn.ensemble import RandomForestClassifier

# by 10-fold cross validation Bagging meta-estimator
randForest = RandomForestClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1, n_estimators=100, max_samples=0.8, random_state=1)
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(randForest, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(randForest, x, y, cv=10, scoring=precision).mean()
print("Forests of randomized trees accuracy by 10-fold Cross Validation: acc = ",acc, "precision = ", pre)

# by 10-fold cross validation Bagging meta-estimator
randForest = RandomForestClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1, n_estimators=100, max_samples=0.8, max_features=0.8)
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(randForest, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(randForest, x, y, cv=10, scoring=precision).mean()
print("Forests of randomized trees accuracy by 10-fold Cross Validation: acc = ",acc, "precision = ", pre)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Forests of randomized trees accuracy by 10-fold Cross Validation: acc =  0.71745 precision =  0.358725


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Forests of randomized trees accuracy by 10-fold Cross Validation: acc =  0.71745 precision =  0.358725


  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
# 04. AdaBoosting

from sklearn.ensemble import AdaBoostClassifier

# by 10-fold cross validation Bagging meta-estimator
tree = DecisionTreeClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1)
adaBoost = AdaBoostClassifier(tree, n_estimators=100, learning_rate=1.0, algorithm='SAMME', random_state=1)
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(adaBoost, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(adaBoost, x, y, cv=10, scoring=precision).mean()
print("AdaBoosting accuracy by 10-fold Cross Validation: acc = ",acc, "precision = ", pre)

# by 10-fold cross validation Bagging meta-estimator
tree = DecisionTreeClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1)
adaBoost = AdaBoostClassifier(tree, n_estimators=100, learning_rate=1.0, algorithm='SAMME.R', random_state=1 )
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(adaBoost, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(adaBoost, x, y, cv=10, scoring=precision).mean()
print("AdaBoosting accuracy by 10-fold Cross Validation: acc = ",acc, "precision = ", pre)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AdaBoosting accuracy by 10-fold Cross Validation: acc =  0.71745 precision =  0.358725


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AdaBoosting accuracy by 10-fold Cross Validation: acc =  0.71745 precision =  0.358725


  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# 05. Gradient Boosting 
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=100, random_state=1, learning_rate=1.0, criterion='squared_error') 
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(clf, x, y, cv=10, scoring=precision).mean()
print("GradientBoosting Accuracy by N-fold Cross Validation: acc = ",acc, "precision = ", pre)

GradientBoosting Accuracy by N-fold Cross Validation: acc =  0.80495 precision =  0.7616084501918581


In [6]:
# 06. Histogram Gradient Boosting
from sklearn.ensemble import HistGradientBoostingClassifier

clf = HistGradientBoostingClassifier(random_state=1, learning_rate=1.0, max_depth=10) 
precision = make_scorer(precision_score, average='macro')
acc=cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(clf, x, y, cv=10, scoring=precision).mean()
print("HistGradientBoosting Accuracy by N-fold Cross Validation: acc = ",acc, "precision = ", pre)

HistGradientBoosting Accuracy by N-fold Cross Validation: acc =  0.79565 precision =  0.7485598109098744


In [7]:
# 07. Oversampling

from collections import Counter
from imblearn.over_sampling import RandomOverSampler



ros = RandomOverSampler(random_state=10)
ros.fit(x, y)
print('\nOriginal dataset shape {}'.format(Counter(y)))
# by 10-fold cross validation with Gini impurity criterion
clf=DecisionTreeClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1)
precision = make_scorer(precision_score, average='macro')
recall = make_scorer(recall_score, average='macro')
acc=cross_val_score(clf, x, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(clf, x, y, cv=10, scoring=precision).mean()
rec=cross_val_score(clf, x, y, cv=10, scoring=recall).mean()
print('By 10-fold Cross Validation: Original Decision tree accuracy = ',acc, ', precison = ', pre, ', recall = ', rec)

x_resampled, y_resampled = ros.fit_resample(x, y)
print('After oversampling dataset shape {}'.format(Counter(y_resampled)))

# by 10-fold cross validation with Gini impurity criterion
clf=DecisionTreeClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1)
precision = make_scorer(precision_score, average='macro')
recall = make_scorer(recall_score, average='macro')
acc=cross_val_score(clf, x_resampled, y_resampled, cv=10, scoring='accuracy').mean()
pre=cross_val_score(clf, x_resampled, y_resampled, cv=10, scoring=precision).mean()
rec=cross_val_score(clf, x_resampled, y_resampled, cv=10, scoring=recall).mean()
print('By 10-fold Cross Validation: Oversampling Decision tree accuracy = ',acc, ', precison = ', pre, ', recall = ', rec)


Original dataset shape Counter({1: 14349, 0: 5651})


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


By 10-fold Cross Validation: Original Decision tree accuracy =  0.71745 , precison =  0.358725 , recall =  0.5
After oversampling dataset shape Counter({1: 14349, 0: 14349})
By 10-fold Cross Validation: Oversampling Decision tree accuracy =  0.7022789448180295 , precison =  0.7515710463933435 , recall =  0.7022794843011192


In [8]:
# 08. Feature selection and reduction

from scipy.stats import f_oneway

X = df_num.drop('Term', axis=1)
print('\nSelected features by ANOVA:')
for col in X.columns:
    group1 = df_num[df_num['Term']==0][col]
    group2 = df_num[df_num['Term']==1][col]
    group3 = df_num[df_num['Term']==2][col]
    group4 = df_num[df_num['Term']==3][col]
    stat, p = f_oneway(group1, group2, group3, group4)
    if p < 0.01:
        print(col)
        
# ANOVA method

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

selector = SelectKBest(f_classif, k=10).fit(x, y)
print('\nSelected features by ANOVA:', selector.get_feature_names_out())

X_reduced = x[selector.get_feature_names_out()]

# print out and display dataframe as table in HTML
display(HTML(X_reduced.head(10).to_html()))


selector = SelectKBest(f_classif, k=10).fit(x_resampled, y_resampled)
print('\nSelected features by ANOVA:', selector.get_feature_names_out())

X_resampled_reduced = x_resampled[selector.get_feature_names_out()]

# print out and display dataframe as table in HTML
display(HTML(X_resampled_reduced.head(10).to_html()))



Selected features by ANOVA:





Selected features by ANOVA: ['Current Loan Amount' 'Annual Income' 'Monthly Debt'
 'Years of Credit History' 'Number of Open Accounts'
 'Current Credit Balance' 'Loan Status_Fully Paid'
 'Home Ownership_Home Mortgage' 'Home Ownership_Rent'
 'Purpose_Debt Consolidation']


Unnamed: 0,Current Loan Amount,Annual Income,Monthly Debt,Years of Credit History,Number of Open Accounts,Current Credit Balance,Loan Status_Fully Paid,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Debt Consolidation
0,445412,1167493.0,5214.74,17.2,6,228190,1,1,0,0
1,262328,1376165.0,33295.98,21.1,35,229976,1,1,0,1
2,99999999,2231892.0,29200.53,14.9,18,297996,1,0,0,1
3,347666,806949.0,8741.9,12.0,9,256329,1,0,0,1
4,176220,1376165.0,20639.7,6.1,15,253460,1,0,1,1
5,206602,896857.0,16367.74,17.3,6,215308,0,1,0,1
6,217646,1184194.0,10855.08,19.6,13,122170,1,1,0,1
7,648714,1376165.0,14806.13,8.2,15,193306,0,1,0,0
8,548746,2559110.0,18660.28,22.6,4,437171,1,0,1,1
9,215952,1454735.0,39277.75,13.9,20,669560,1,0,1,1



Selected features by ANOVA: ['Current Loan Amount' 'Annual Income' 'Monthly Debt'
 'Number of Open Accounts' 'Current Credit Balance'
 'Loan Status_Fully Paid' 'Home Ownership_Home Mortgage'
 'Home Ownership_Rent' 'Purpose_Debt Consolidation' 'Purpose_Take a Trip']


Unnamed: 0,Current Loan Amount,Annual Income,Monthly Debt,Number of Open Accounts,Current Credit Balance,Loan Status_Fully Paid,Home Ownership_Home Mortgage,Home Ownership_Rent,Purpose_Debt Consolidation,Purpose_Take a Trip
0,445412,1167493.0,5214.74,6,228190,1,1,0,0,0
1,262328,1376165.0,33295.98,35,229976,1,1,0,1,0
2,99999999,2231892.0,29200.53,18,297996,1,0,0,1,0
3,347666,806949.0,8741.9,9,256329,1,0,0,1,0
4,176220,1376165.0,20639.7,15,253460,1,0,1,1,0
5,206602,896857.0,16367.74,6,215308,0,1,0,1,0
6,217646,1184194.0,10855.08,13,122170,1,1,0,1,0
7,648714,1376165.0,14806.13,15,193306,0,1,0,0,0
8,548746,2559110.0,18660.28,4,437171,1,0,1,1,0
9,215952,1454735.0,39277.75,20,669560,1,0,1,1,0


In [9]:
# 09. Decision tree classifier after feature reduction.

# by 10-fold cross validation with Gini impurity criterion
clf=DecisionTreeClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1)
acc=cross_val_score(clf, X_reduced, y, cv=10, scoring='accuracy').mean()
pre=cross_val_score(clf, X_reduced, y, cv=10, scoring=precision).mean()
rec=cross_val_score(clf, X_reduced, y, cv=10, scoring=recall).mean()
print('By 10-fold Cross Validation: Original Decision tree accuracy = ',acc, ', precison = ', pre, ', recall = ', rec)

# by 10-fold cross validation with Gini impurity criterion
clf=DecisionTreeClassifier(criterion='gini', max_depth=10, ccp_alpha = 0.1)
acc=cross_val_score(clf, X_resampled_reduced, y_resampled, cv=10, scoring='accuracy').mean()
pre=cross_val_score(clf, X_resampled_reduced, y_resampled, cv=10, scoring=precision).mean()
rec=cross_val_score(clf, X_resampled_reduced, y_resampled, cv=10, scoring=recall).mean()
print('By 10-fold Cross Validation: Oversampling Decision tree accuracy = ',acc, ', precison = ', pre, ', recall = ', rec)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


By 10-fold Cross Validation: Original Decision tree accuracy =  0.71745 , precison =  0.358725 , recall =  0.5
By 10-fold Cross Validation: Oversampling Decision tree accuracy =  0.7022789448180295 , precison =  0.7515710463933435 , recall =  0.7022794843011192
