# Feature selection
Featurization produced 1770 features; however, not all of these features can be used in the machine learning model. It is necessary to identify the most important of these 1770 features, i.e. the ones that will be most useful for making good predictions.

This script describes the steps in feature selection such as data loading, data cleaning, statistical methods (Weight of evidence and Information Value) for feature selection and final feature selection using votes from multiple algorithms.

## Importing libraries, modules and data

In [None]:
# Importing the required libraries and modules
import os
import zipfile
import numpy as np
import pandas as pd
from datetime import datetime
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
from sklearn.preprocessing import MinMaxScaler
import imblearn
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Mounting google drive
# Colab-only step: the pickle and CSV paths used later in this notebook all
# live under /content/drive/MyDrive, so the drive has to be mounted first.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Defining the functions

In [None]:
#Function for reducing the memory usage of dataframe
def reduce_mem_usage(data, verbose = True):
    #Reference: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
    '''
    Downcast the numeric columns of a pandas DataFrame to the smallest NumPy
    dtype whose range still holds each column's observed minimum and maximum.

    Only plain NumPy signed-integer, unsigned-integer and floating columns are
    converted. Every other column (object, bool, datetime, category, pandas
    extension dtypes, ...) is left exactly as it is.

    Inputs=
    data= DataFrame to shrink; it is modified in place
    verbose= when True, print the memory usage before/after and the saving

    Outputs=
    The same DataFrame object with its numeric columns downcast

    Note: a float column is downcast whenever its values fit the *range* of
    float16/float32, which lowers its precision (as in the referenced kernel).
    '''
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    start_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        print('-'*100)
        print('Memory usage of dataframe: {:.2f} MB'.format(start_mem))

    for col in data.columns:
        col_type = data[col].dtype

        # Skip anything that is not a plain NumPy int/uint/float column; min/max
        # range checks are meaningless (or raise) for the other dtypes.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if col_type.kind in 'iu':
            # Unsigned columns stay unsigned so no value can change sign.
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127 fits int8.
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full width.
                data[col] = data[col].astype(np.float64)

    end_mem = data.memory_usage().sum() / 1024**2
    if verbose:
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage after optimization: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(saved_pct))
        print('-'*100)

    return data

In [None]:
# Function to plot bar plot of percentage NaN values in a dataframe
def plot_nan_pct(dataframe,title):
  '''
  Plot a bar chart of the percentage of NaN values per column of a dataframe.

  The percentage is rounded to 2 decimals; columns whose rounded percentage
  is 0 are left out. Bars are ordered from the highest percentage to the
  lowest. When no column qualifies (or the dataframe has no rows) a message
  is printed instead of a plot.

  Inputs=
  dataframe= DataFrame to inspect
  title= data frame name entered as string (used in the plot title and messages)

  Outputs=
  1. Bar plot of percentage of NaN values in the dataframe (nothing is returned)
  '''

  total_rows=dataframe.shape[0]
  nan_value_dict={}

  # preparing a dictionary of columns and corresponding NaN percentage
  if total_rows>0:
    # a single vectorised count instead of filtering the whole frame once per column
    nan_counts=dataframe.isna().sum().to_numpy()
    for col_name,nan_count in zip(dataframe.columns,nan_counts):
      pct=round(int(nan_count)/total_rows*100,2)
      if pct!=0:
        nan_value_dict[col_name]=pct

  # column names ordered by descending NaN percentage, and the matching percentages
  nan_col_name=sorted(nan_value_dict, key=nan_value_dict.get, reverse=True)
  nan_val_count=[nan_value_dict[name] for name in nan_col_name]

  if len(nan_val_count)>0:
    print("Number of variables having NaN samples are ",len(nan_col_name))
    # generating the plot
    plt.figure(figsize = (25, 5))

    # creating the bar plot
    plt.bar(nan_col_name, nan_val_count, color ='maroon')

    plt.xlabel("Variable Name")
    plt.ylabel("Percentage (%)")
    plt.title("Percentage of NaN values in "+title)
    plt.xticks(rotation = 90)
    plt.show()

  else:
    print("Dataframe {} does not have any NaN variable".format(title))

In [None]:
# Remove the columns in which a single value accounts for more than `freq_ratio` (default 99%) of the rows
def drop_constant_column(dataf,freq_ratio=0.99):
  '''
  Drop the (near-)constant columns of a dataframe.

  A column is dropped when its most frequent value covers strictly more than
  `freq_ratio` of the rows. The frequency comes from `value_counts()`, whose
  default excludes NaN from the count, while the denominator is the full row
  count.

  Inputs=
  dataf= DataFrame to clean; columns are dropped in place
  freq_ratio= share of rows (0-1) above which a column is considered constant

  Outputs=
  The same DataFrame object without the dropped columns
  '''
  total_rows=dataf.shape[0]
  remove_columns=[]

  # a frame without rows has no frequencies to compare (and would divide by zero)
  if total_rows>0:
    for position,col_name in enumerate(dataf.columns):
      item_freq=dataf.iloc[:,position].value_counts().max()/total_rows
      # an all-NaN column yields NaN here, which never exceeds the threshold
      if item_freq>freq_ratio:
        remove_columns.append(col_name)

  if remove_columns:
    print("Folowing columns are removed from the data: ")
    print(sorted(remove_columns))
  else:
    print("All columns were keptas no columns with frequent value > {} is found".format(freq_ratio))

  # lets delete the most repeated column from the dataf
  dataf.drop(remove_columns,axis=1,inplace=True)

  return(dataf)

In [None]:
# https://github.com/Sundar0989/WOE-and-IV?source=post_page-----6f05072e83eb--------------------------------
from numpy.lib import quantile
import pandas.core.algorithms as algos
from pandas import Series
import scipy.stats.stats as stats
import re
import traceback
import string

max_bin = 20    # number of quantile buckets mono_bin starts its search from
force_bin = 16  # number of quantile edges used when the search collapses to one bucket

# define a binning function
def mono_bin(Y, X, n = max_bin):
    '''
    Bin a numeric feature and compute its Weight of Evidence (WOE) and
    Information Value (IV).

    Starting from `n` quantile buckets, the bucket count is lowered by one
    until the per-bucket means of X and Y reach a Spearman correlation of +/-1
    (or the correlation is undefined). If that search ends with a single
    bucket, X is re-cut on `force_bin` quantile edges instead. Rows whose X is
    missing are summarised in one extra, last row with NaN MIN_VALUE/MAX_VALUE.

    Inputs=
    Y= target aligned with X; EVENT is computed as sum(Y), so 0/1 is expected
    X= numeric feature values
    n= initial number of quantile buckets

    Outputs=
    DataFrame with one row per bucket and the columns VAR_NAME, MIN_VALUE,
    MAX_VALUE, COUNT, EVENT, EVENT_RATE, NONEVENT, NON_EVENT_RATE, DIST_EVENT,
    DIST_NON_EVENT, WOE, IV. Infinite values are replaced by 0, IV holds the
    total information value on every row and VAR_NAME is the placeholder "VAR".

    Raises=
    ValueError when no observation is passed, or when the non-missing values
    cannot be cut into even one quantile bucket (e.g. a constant feature).
    '''
    # public location of spearmanr; the `scipy.stats.stats` alias is deprecated
    from scipy.stats import spearmanr

    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[df1.X.isnull()]
    notmiss = df1[df1.X.notnull()]

    parts = []
    if len(notmiss.index) > 0:
        r = 0
        d2 = None
        # `n >= 1` bounds the search: without it a feature for which qcut
        # fails at every bucket count would loop forever.
        while np.abs(r) < 1 and n >= 1:
            try:
                d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
                # observed=False keeps one row per bucket on every pandas version
                d2 = d1.groupby('Bucket', as_index=True, observed=False)
                bucket_means = d2.mean()
                r, _ = spearmanr(bucket_means.X, bucket_means.Y)
            except Exception:
                # best effort: this bucket count is not usable (typically
                # duplicated quantile edges), so just try the next smaller one
                pass
            n = n - 1
        if d2 is None:
            raise ValueError("mono_bin could not cut X into quantile buckets")
        if len(d2) == 1:
            # the monotonic search collapsed to one bucket: force a fixed cut
            n = force_bin
            bins = notmiss.X.quantile(np.linspace(0, 1, n))
            if len(np.unique(bins)) == 2:
                bins = np.insert(np.asarray(bins), 0, 1)
                bins[1] = bins[1]-(bins[1]/2)
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)})
            d2 = d1.groupby('Bucket', as_index=True, observed=False)
        binned = pd.DataFrame({"MIN_VALUE": d2["X"].min(),
                               "MAX_VALUE": d2["X"].max(),
                               "COUNT": d2["Y"].count(),
                               "EVENT": d2["Y"].sum()})
        binned["NONEVENT"] = binned["COUNT"] - binned["EVENT"]
        parts.append(binned.reset_index(drop=True))

    if len(justmiss.index) > 0:
        # one summary row for the observations whose X is missing
        miss_count = justmiss.Y.count()
        miss_event = justmiss.Y.sum()
        parts.append(pd.DataFrame({"MIN_VALUE": np.nan,
                                   "MAX_VALUE": np.nan,
                                   "COUNT": miss_count,
                                   "EVENT": miss_event,
                                   "NONEVENT": miss_count - miss_event}, index=[0]))

    if not parts:
        raise ValueError("mono_bin received no observations")
    d3 = pd.concat(parts, ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*d3["WOE"]
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # buckets without events or without non-events give +/-inf: neutralise them
    d3 = d3.replace([np.inf, -np.inf], 0)
    # every row carries the feature's total information value
    d3["IV"] = d3["IV"].sum()
    return(d3)

def char_bin(Y, X):
    '''
    Compute Weight of Evidence (WOE) and Information Value (IV) for a
    categorical (or low-cardinality) feature, using one bucket per distinct
    value of X. Rows whose X is missing are summarised in one extra, last row
    with NaN MIN_VALUE/MAX_VALUE.

    Inputs=
    Y= target aligned with X; EVENT is computed as sum(Y), so 0/1 is expected
    X= feature values, each distinct value forms its own bucket

    Outputs=
    DataFrame with one row per bucket and the columns VAR_NAME, MIN_VALUE,
    MAX_VALUE (both equal to the bucket's value), COUNT, EVENT, EVENT_RATE,
    NONEVENT, NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE, IV. Infinite
    values are replaced by 0, IV holds the total information value on every
    row and VAR_NAME is the placeholder "VAR".
    '''
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[df1.X.isnull()]
    notmiss = df1[df1.X.notnull()]
    # observed=False keeps one row per category on every pandas version
    by_value = notmiss.groupby('X', as_index=True, observed=False)["Y"]

    d3 = pd.DataFrame({"COUNT": by_value.count(), "EVENT": by_value.sum()})
    d3["MIN_VALUE"] = d3.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["NONEVENT"] = d3["COUNT"] - d3["EVENT"]
    d3 = d3.reset_index(drop=True)

    if len(justmiss.index) > 0:
        # one summary row for the observations whose X is missing; the sums
        # are taken on Y only so the non-numeric X column is never aggregated
        miss_count = justmiss.Y.count()
        miss_event = justmiss.Y.sum()
        d4 = pd.DataFrame({"MIN_VALUE": np.nan,
                           "MAX_VALUE": np.nan,
                           "COUNT": miss_count,
                           "EVENT": miss_event,
                           "NONEVENT": miss_count - miss_event}, index=[0])
        d3 = d4 if d3.empty else pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*d3["WOE"]
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # buckets without events or without non-events give +/-inf: neutralise them
    d3 = d3.replace([np.inf, -np.inf], 0)
    # every row carries the feature's total information value
    d3["IV"] = d3["IV"].sum()
    d3 = d3.reset_index(drop=True)

    return(d3)

def data_vars(df1, target):
    '''
    Compute the WOE/IV table of every feature column of a dataframe.

    The target column is left out of the features. Its name is taken from
    `target.name`; only when the target carries no name is it guessed, on a
    best-effort basis, from the source text of the calling line (the last
    identifier inside the call's parentheses). Names are compared
    case-insensitively.

    Numeric columns with more than 2 distinct values (NaN counts as one) are
    binned with `mono_bin`; every other column goes through `char_bin`.

    Inputs=
    df1= DataFrame holding the feature columns (it may also hold the target)
    target= target values aligned with df1, e.g. `df1.TARGET`

    Outputs=
    (iv_df, iv)
    iv_df= per-bucket WOE/IV rows of all features, stacked
    iv= one row per feature with the columns VAR_NAME and IV

    Raises=
    ValueError when df1 contains no feature column to evaluate.
    '''
    target_name = getattr(target, "name", None)
    if target_name is None:
        # Legacy fallback for unnamed targets: read the caller's source line.
        # It is unavailable in some contexts and ambiguous for multi-line
        # calls, in which case no column is excluded.
        try:
            code = traceback.extract_stack()[-2][3]
            call_args = re.search(r'\((.*?)\).*$', code).group(1)
            target_name = re.findall(r"[\w']+", call_args)[-1]
        except (AttributeError, IndexError, TypeError):
            target_name = None
    target_key = None if target_name is None else str(target_name).upper()

    frames = []
    for i in df1.columns:
        print('processing ',i)
        # exact (case-insensitive) match; a substring test would also skip
        # features whose name merely occurs inside the target's name
        if target_key is not None and str(i).upper() == target_key:
            continue

        column = df1[i]
        is_numeric = (pd.api.types.is_numeric_dtype(column)
                      and not pd.api.types.is_bool_dtype(column))
        if is_numeric and len(column.unique()) > 2:
            conv = mono_bin(target, column)
        else:
            conv = char_bin(target, column)
        conv["VAR_NAME"] = i
        frames.append(conv)

    if not frames:
        raise ValueError("data_vars found no feature column to evaluate")

    iv_df = pd.concat(frames, ignore_index=True)
    # IV is repeated on every bucket row of a feature, so max() picks that value
    iv = iv_df.groupby('VAR_NAME').IV.max().reset_index()
    return(iv_df,iv)

## Loading and cleaning of merged dataframe

In [None]:
# Importing the featurized dataframes
pickle_path = '/content/drive/MyDrive/Career/DS/Case Studies/Home Credit Default Risk/'

#1. Merged Application train dataframe
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# The `with` block closes the file even when loading fails.
with open(pickle_path+"application_train_merged.pickle","rb") as pickle_in:
    application_train_merged=pickle.load(pickle_in)
print("Shape of featurized Application train merged dataframe is",application_train_merged.shape)

Shape of featurized Application train merged dataframe is (307511, 1772)


In [None]:
# Lets check for missing values in the merged train dataframe
# (plots the NaN percentage per column, or prints a message when there is none)
plot_nan_pct(application_train_merged,"application_train_merged")

Dataframe application_train_merged does not have any NaN variable


In [None]:
# Lets remove the columns from merged dataframe which has >99% frequency of a single value
# For the 8% default rate we can keep 99% threshold for constant column drop
# The shape is printed before and after so the number of dropped columns is visible.
print("Shape of the application_train_merged dataframe is ",application_train_merged.shape)
application_train_merged=drop_constant_column(application_train_merged,0.99)
print("Shape of the application_train_merged after droping constant columns is ",application_train_merged.shape)

Shape of the application_train_merged dataframe is  (307511, 1772)
Folowing columns are removed from the data: 
['AMT_BALANCE_MAXCompleted', 'AMT_BALANCE_MAXRest', 'AMT_BALANCE_MEANCompleted', 'AMT_BALANCE_MEANRest', 'AMT_BALANCE_SUMCompleted', 'AMT_BALANCE_SUMRest', 'AMT_DRAWINGS_ATM_CURRENT_MAXCompleted', 'AMT_DRAWINGS_ATM_CURRENT_MAXRest', 'AMT_DRAWINGS_ATM_CURRENT_SUMCompleted', 'AMT_DRAWINGS_ATM_CURRENT_SUMRest', 'AMT_DRAWINGS_CURRENT_MAXCompleted', 'AMT_DRAWINGS_CURRENT_MAXRest', 'AMT_DRAWINGS_CURRENT_SUMCompleted', 'AMT_DRAWINGS_CURRENT_SUMRest', 'AMT_DRAWINGS_OTHER_CURRENT_MAXCompleted', 'AMT_DRAWINGS_OTHER_CURRENT_MAXRest', 'AMT_DRAWINGS_OTHER_CURRENT_MAX_Latest_year', 'AMT_DRAWINGS_OTHER_CURRENT_SUMCompleted', 'AMT_DRAWINGS_OTHER_CURRENT_SUMRest', 'AMT_DRAWINGS_OTHER_CURRENT_SUM_Latest_year', 'AMT_DRAWINGS_POS_CURRENT_MAXCompleted', 'AMT_DRAWINGS_POS_CURRENT_MAXRest', 'AMT_DRAWINGS_POS_CURRENT_SUMCompleted', 'AMT_DRAWINGS_POS_CURRENT_SUMRest', 'AMT_DRAWING_SUM_MAXCompleted', 

In [None]:
#2. Merged Application test dataframe
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
# The `with` block closes the file even when loading fails.
with open(pickle_path+"application_test_merged.pickle","rb") as pickle_in:
    application_test_merged=pickle.load(pickle_in)
print("Shape of featurized Application test merged dataframe is",application_test_merged.shape)

Shape of featurized Application test merged dataframe is (48744, 1771)


In [None]:
# Lets check for missing values in the merged test dataframe
# (plots the NaN percentage per column, or prints a message when there is none)
plot_nan_pct(application_test_merged,"application_test_merged")

Dataframe application_test_merged does not have any NaN variable


## Splitting merged dataframe in train, test and oot

In [None]:
# Shuffle once with a fixed seed, then cut the rows 80% / 10% / 10%.
# Plain positional slices are used instead of np.split, which relies on the
# deprecated DataFrame.swapaxes to split a DataFrame.
# NOTE(review): "oot" is a random 10% hold-out here, not a split by time.
shuffled_rows = application_train_merged.sample(frac=1, random_state=42)
train_end = int(.8*len(application_train_merged))
test_end = int(.9*len(application_train_merged))
train = shuffled_rows.iloc[:train_end]
test = shuffled_rows.iloc[train_end:test_end]
oot = shuffled_rows.iloc[test_end:]

print('Shape of the trin data is ', train.shape)
print('Shape of the test data is ', test.shape)
print('Shape of the oot data is ', oot.shape)

Shape of the trin data is  (246008, 1418)
Shape of the test data is  (30751, 1418)
Shape of the oot data is  (30752, 1418)


## Top 40 feature selection using Information Value

### Calculating Weight of evidence and Information value
The weight of evidence (WoE) measures the predictive power of a single feature with respect to the target (dependent) variable. If a category/bin of a feature holds a large proportion of the events compared to its proportion of the non-events, that bin gets a high WoE value, which means that this bin of the feature separates the events from the non-events well. The information value (IV) of a feature sums, over its bins, the difference between the two proportions multiplied by the bin's WoE.
https://www.analyticsvidhya.com/blog/2021/06/understand-weight-of-evidence-and-information-value/

In [None]:
# WoE/IV are computed on the train split only.
# final_iv: per-bin WoE/IV rows of every variable; IV: one information value per variable.
final_iv, IV = data_vars(train,train.TARGET)

In [None]:
# The index is written too; it comes back as the 'Unnamed: 0' column when the file is read again.
# NOTE(review): the next section reads a DECISION column from this same file, which
# presumably is added to the CSV by hand - re-running this cell would overwrite it.
IV.to_csv('/content/drive/MyDrive/Career/DS/Case Studies/Home Credit Default Risk/Information_Value.csv')
# ten variables with the highest information value
IV.sort_values(by=['IV'], ascending=False).head(10)

Unnamed: 0,VAR_NAME,IV
623,EXT_SOURCE_2,0.318135
624,EXT_SOURCE_3,0.246297
626,EXT_SOURCE_MEAN,0.199057
628,EXT_SOURCE_MUL,0.198053
1310,WEIGHTED_EXT_SOURCE,0.194873
627,EXT_SOURCE_MIN,0.177928
682,INCOME_EXT_RATIO,0.162892
625,EXT_SOURCE_MAX,0.137515
450,CREDIT_EXT_RATIO,0.130561
622,EXT_SOURCE_1,0.110553


#### Manual selection of variables based on Information Value:
38 variables are manually selected based on the information value. 0.04-0.5 information value threshold is used for selection.

In [None]:
# Read the information value file back; it is expected to carry a DECISION
# column in which the manually selected variables are marked 'KEEP'.
iv_selected_df=pd.read_csv('/content/drive/MyDrive/Career/DS/Case Studies/Home Credit Default Risk/Information_Value.csv')
# 'Unnamed: 0' is the index that was written to the CSV
iv_selected_df=iv_selected_df.drop(columns=['Unnamed: 0']).reset_index(drop=True)
iv_kept_df=iv_selected_df[iv_selected_df['DECISION']=='KEEP']
# report the real number of kept variables instead of a hard-coded count
print("Following are the manually selected {} variables :".format(len(iv_kept_df)))
iv_kept_df

Following are the manually selected 39 variables :


Unnamed: 0,VAR_NAME,IV,DECISION
41,EXT_SOURCE_2,0.318135,KEEP
63,EXT_SOURCE_3,0.246297,KEEP
173,EXT_SOURCE_MEAN,0.199057,KEEP
229,WEIGHTED_EXT_SOURCE,0.194873,KEEP
283,INCOME_EXT_RATIO,0.162892,KEEP
286,CREDIT_EXT_RATIO,0.130561,KEEP
316,EXT_SOURCE_1,0.110553,KEEP
318,DAYS_CREDIT_MEAN,0.108673,KEEP
319,CURRENT_DEBT_TO_CREDIT_RATIO_MEAN,0.10651,KEEP
342,DAYS_EMPLOYED,0.100511,KEEP


### Train, Test and OOT dataframes consisting of only selected variables from IV:

In [None]:
# Names of the variables marked KEEP in the information value file
keep_rows=iv_selected_df['DECISION']=='KEEP'
iv_selected_var=iv_selected_df.loc[keep_rows,'VAR_NAME'].to_list()
print('Total {} variables are selected based on IV value'.format(len(iv_selected_var)))

# Features: the same column subset is taken from each of the three splits
X_train, X_test, X_oot = (split[iv_selected_var] for split in (train, test, oot))
# this is the test dataframe formed from the test dataframe which home credit has provided
X_test_oot=application_test_merged[iv_selected_var]

# Labels of the three splits
y_train, y_test, y_oot = (split['TARGET'] for split in (train, test, oot))

print('Shape of X_train is {} and of y_train is {} '.format(X_train.shape,y_train.shape))
print('Shape of X_test is {} and of y_test is {} '.format(X_test.shape,y_test.shape))
print('Shape of X_oot is {} and of y_oot is {} '.format(X_oot.shape,y_oot.shape))
print('Shape of X_test_oot is {} '.format(X_test_oot.shape))

Total 38 variables are selected based on IV value
Shape of X_train is (246008, 38) and of y_train is (246008,) 
Shape of X_test is (30751, 38) and of y_test is (30751,) 
Shape of X_oot is (30752, 38) and of y_oot is (30752,) 
Shape of X_test_oot is (48744, 38) 


## Min Max Scaling and SMOTE train data
In this section min max scaler is used to scale the data. Scaled data converges faster on the solution.

### Min max scaling
Before using any algorithm lets use min max scaling to normalize the dataframes.

In [None]:
# The scaler learns each column's min and max from the train split only
scaler = MinMaxScaler()

def _as_scaled_frame(frame, scale):
  # apply `scale` and put the result back in a frame with the original labels
  return pd.DataFrame(scale(frame),columns=frame.columns, index=frame.index)

# train: fit and transform in one go
X_train=_as_scaled_frame(X_train,scaler.fit_transform)
# test, oot and the home credit test data: transform only, with the train min/max
X_test=_as_scaled_frame(X_test,scaler.transform)
X_oot=_as_scaled_frame(X_oot,scaler.transform)
X_test_oot=_as_scaled_frame(X_test_oot,scaler.transform)

### SMOTE:
As the data is imbalanced, let's use the SMOTE technique to balance it before applying any machine learning algorithm.
SMOTE is applied to the train samples only; the test and oot samples keep their original class distribution.

In [None]:
# summarize class distribution
counter = Counter(y_train)
print('Existing data target distribution is ',counter)
# define pipeline
# Both samplers are seeded so that the resampled train set is the same on every run.
# With SMOTE already balancing the classes 1:1, the under-sampler has nothing left to remove.
over = SMOTE(sampling_strategy=1, random_state=42)
under = RandomUnderSampler(sampling_strategy=1, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X_train_smote, y_train_smote = pipeline.fit_resample(X_train, y_train)
# summarize the new class distribution
counter = Counter(y_train_smote)
print('After SMOTE data target distribution is ',counter)

Existing data target distribution is  Counter({0: 226136, 1: 19872})
After SMOTE data target distribution is  Counter({0: 226136, 1: 226136})


Lets observe the dataframes distribution

In [None]:
# summary statistics of the scaled train features (before SMOTE)
X_train.describe()

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_MEAN,WEIGHTED_EXT_SOURCE,INCOME_EXT_RATIO,CREDIT_EXT_RATIO,EXT_SOURCE_1,DAYS_CREDIT_MEAN,CURRENT_DEBT_TO_CREDIT_RATIO_MEAN,DAYS_EMPLOYED,...,CODE_REJECT_REASON_FREQ_ENCODE_MEAN_ALL,OCCUPATION_TYPE,INTEREST_RATE_MIN_ALL,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,NAME_INCOME_TYPE_Working,DAYS_LAST_PHONE_CHANGE,PRODUCT_COMBINATION_FREQ_ENCODE_LAST_ALL,REGION_POPULATION_RELATIVE,NAME_PRODUCT_TYPE_WALK-IN_MEAN_ALL,ORGANIZATION_TYPE
count,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,...,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0,246008.0
mean,0.600253,0.457216,0.215458,0.215991,0.001046,0.001108,0.227318,0.317555,0.280989,0.786557,...,0.836649,0.23138,0.655566,0.007911,0.516199,0.22455,0.558004,0.285103,0.076433,0.167784
std,0.224838,0.299115,0.303893,0.304752,0.014875,0.017125,0.296267,0.220632,0.001704,0.368832,...,0.264733,0.192612,0.444884,0.03389,0.499739,0.192675,0.329579,0.191779,0.179693,0.190688
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.456882,0.200681,0.0,0.0,0.000107,7.6e-05,0.0,0.145192,0.28097,0.954009,...,0.752315,0.055556,0.0,0.0,0.0,0.063829,0.289918,0.134587,0.0,0.017544
50%,0.661336,0.513079,0.0,0.0,0.00022,0.00022,0.0,0.31742,0.28097,0.956422,...,1.0,0.222222,0.953309,0.0,1.0,0.176578,0.503193,0.25692,0.0,0.087719
75%,0.775557,0.708447,0.526559,0.527076,0.000385,0.000432,0.473884,0.466461,0.28097,0.960456,...,1.0,0.333333,0.958359,0.0,1.0,0.366113,0.921533,0.392774,0.0,0.210526
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# summary statistics of the scaled train features after SMOTE resampling
X_train_smote.describe()

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_MEAN,WEIGHTED_EXT_SOURCE,INCOME_EXT_RATIO,CREDIT_EXT_RATIO,EXT_SOURCE_1,DAYS_CREDIT_MEAN,CURRENT_DEBT_TO_CREDIT_RATIO_MEAN,DAYS_EMPLOYED,...,CODE_REJECT_REASON_FREQ_ENCODE_MEAN_ALL,OCCUPATION_TYPE,INTEREST_RATE_MIN_ALL,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,NAME_INCOME_TYPE_Working,DAYS_LAST_PHONE_CHANGE,PRODUCT_COMBINATION_FREQ_ENCODE_LAST_ALL,REGION_POPULATION_RELATIVE,NAME_PRODUCT_TYPE_WALK-IN_MEAN_ALL,ORGANIZATION_TYPE
count,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,...,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0,452272.0
mean,0.545419,0.400233,0.185811,0.186203,0.001218,0.001243,0.194695,0.285838,0.28099,0.811276,...,0.82553,0.228623,0.632391,0.010904,0.559469,0.2056,0.563741,0.271213,0.0885,0.164838
std,0.236487,0.288659,0.27685,0.277727,0.014507,0.016185,0.269694,0.211655,0.001282,0.345927,...,0.259781,0.187357,0.453976,0.037031,0.496451,0.179337,0.324774,0.170952,0.188463,0.181605
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.365778,0.126581,0.0,0.0,0.00012,9e-05,0.0,0.112218,0.28097,0.954371,...,0.709877,0.055556,0.0,0.0,0.0,0.06059,0.289918,0.136488,0.0,0.035088
50%,0.600228,0.417439,0.0,0.0,0.00025,0.000255,0.0,0.278405,0.28097,0.956367,...,1.0,0.222222,0.953327,0.0,1.0,0.156495,0.503193,0.256185,0.0,0.095456
75%,0.737794,0.649046,0.436865,0.435257,0.00046,0.000487,0.386156,0.428474,0.28097,0.959622,...,1.0,0.312129,0.959253,0.0,1.0,0.334744,0.921533,0.359761,0.094222,0.199991
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# summary statistics of the scaled test features; values can fall outside [0, 1]
# because the scaler was fitted on the train split only
X_test.describe()

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_MEAN,WEIGHTED_EXT_SOURCE,INCOME_EXT_RATIO,CREDIT_EXT_RATIO,EXT_SOURCE_1,DAYS_CREDIT_MEAN,CURRENT_DEBT_TO_CREDIT_RATIO_MEAN,DAYS_EMPLOYED,...,CODE_REJECT_REASON_FREQ_ENCODE_MEAN_ALL,OCCUPATION_TYPE,INTEREST_RATE_MIN_ALL,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,NAME_INCOME_TYPE_Working,DAYS_LAST_PHONE_CHANGE,PRODUCT_COMBINATION_FREQ_ENCODE_LAST_ALL,REGION_POPULATION_RELATIVE,NAME_PRODUCT_TYPE_WALK-IN_MEAN_ALL,ORGANIZATION_TYPE
count,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,...,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0,30751.0
mean,0.599866,0.455973,0.217253,0.217651,0.001034,0.001053,0.229716,0.316871,0.280975,0.788551,...,0.833924,0.229351,0.656647,0.007863,0.517739,0.223505,0.558957,0.285897,0.074662,0.166488
std,0.226577,0.299134,0.304788,0.305522,0.014173,0.015917,0.297475,0.219878,0.001957,0.367213,...,0.268009,0.191488,0.444493,0.033948,0.499693,0.192352,0.329992,0.192679,0.177168,0.188339
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.053056,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003367,0.0,0.0
25%,0.456882,0.196049,0.0,0.0,0.000108,7.6e-05,0.0,0.142668,0.28097,0.954016,...,0.751543,0.055556,0.0,0.0,0.0,0.063953,0.289918,0.134587,0.0,0.035088
50%,0.661907,0.513079,0.0,0.0,0.000222,0.00022,0.0,0.317591,0.28097,0.956436,...,1.0,0.222222,0.953309,0.0,1.0,0.175415,0.503193,0.25692,0.0,0.087719
75%,0.776414,0.708447,0.527425,0.527592,0.000388,0.000435,0.47642,0.46629,0.28097,0.960504,...,1.0,0.333333,0.958686,0.0,1.0,0.364784,0.921533,0.392774,0.0,0.210526
max,0.961165,0.988011,1.009238,1.013925,0.592593,0.856185,0.979716,0.999316,0.31606,0.999047,...,1.0,1.0,0.997002,0.931818,1.0,0.974751,1.0,1.0,1.0,1.0


In [None]:
# summary statistics of the scaled oot features; values can fall outside [0, 1]
# because the scaler was fitted on the train split only
X_oot.describe()

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_MEAN,WEIGHTED_EXT_SOURCE,INCOME_EXT_RATIO,CREDIT_EXT_RATIO,EXT_SOURCE_1,DAYS_CREDIT_MEAN,CURRENT_DEBT_TO_CREDIT_RATIO_MEAN,DAYS_EMPLOYED,...,CODE_REJECT_REASON_FREQ_ENCODE_MEAN_ALL,OCCUPATION_TYPE,INTEREST_RATE_MIN_ALL,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,NAME_INCOME_TYPE_Working,DAYS_LAST_PHONE_CHANGE,PRODUCT_COMBINATION_FREQ_ENCODE_LAST_ALL,REGION_POPULATION_RELATIVE,NAME_PRODUCT_TYPE_WALK-IN_MEAN_ALL,ORGANIZATION_TYPE
count,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,...,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0,30752.0
mean,0.601621,0.457463,0.213945,0.214491,0.000986,0.001091,0.22638,0.318629,0.280996,0.786024,...,0.836163,0.23283,0.657919,0.0079,0.515869,0.223675,0.557797,0.282859,0.076221,0.166071
std,0.224285,0.301016,0.303727,0.304704,0.013439,0.017224,0.2961,0.221822,0.000861,0.369317,...,0.265649,0.19345,0.443932,0.033852,0.499756,0.192827,0.329772,0.188501,0.179001,0.188953
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.261935,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003367,0.0,0.0
25%,0.459452,0.193733,0.0,0.0,0.000106,7.4e-05,0.0,0.144935,0.28097,0.954003,...,0.752604,0.055556,0.0,0.0,0.0,0.062666,0.289918,0.134587,0.0,0.017544
50%,0.662193,0.513079,0.0,0.0,0.000218,0.000215,0.0,0.31742,0.28097,0.956416,...,1.0,0.222222,0.953309,0.0,1.0,0.175914,0.503193,0.25692,0.0,0.087719
75%,0.777841,0.712262,0.523383,0.524304,0.000378,0.000424,0.47211,0.466119,0.28097,0.960449,...,1.0,0.333333,0.958323,0.0,1.0,0.364535,0.921533,0.392774,0.0,0.210526
max,1.0,0.984741,0.993649,0.988138,0.666667,0.875,0.97718,1.0,0.372885,0.999017,...,1.0,1.0,1.001576,0.795455,1.0,0.948173,1.0,1.0,1.0,1.0


Lets store the dataframes that has reduced variables based on IV value, has scaler transformed and also has train SMOTE.
Lets also store scaler for future reference.

In [None]:
# Lets store the dataframes to reuse
pickle_path = '/content/drive/MyDrive/Career/DS/Case Studies/Home Credit Default Risk/'

# (file name, object) pairs; the `with` block closes every file even when a dump fails
for file_name, artifact in (('X_train_iv.pickle', X_train),
                            ('X_train_smote_iv.pickle', X_train_smote),
                            ('X_test_iv.pickle', X_test),
                            ('X_oot_iv.pickle', X_oot),
                            ('y_train_iv.pickle', y_train),
                            ('y_train_smote_iv.pickle', y_train_smote),
                            ('y_test_iv.pickle', y_test),
                            ('y_oot_iv.pickle', y_oot)):
    with open(pickle_path+file_name, 'wb') as pickle_out:
        pickle.dump(artifact,pickle_out)

# drop the loop variables so they do not keep the last object alive
del file_name, artifact

In [None]:
# store the fitted scaler so new data can be scaled the same way later
with open(pickle_path+'MinMaxscaler.pickle', 'wb') as pickle_out:
    pickle.dump(scaler,pickle_out)

In [None]:
# store the scaled features of the test data provided by home credit
with open(pickle_path+'X_test_oot_iv.pickle', 'wb') as pickle_out:
    pickle.dump(X_test_oot,pickle_out)

In [None]:
# release the in-memory copies; they are read back from the pickles below
del X_train, y_train, X_train_smote, y_train_smote, X_test, y_test, X_oot, y_oot, X_test_oot

In [None]:
# Importing the train test oot and train_smote dataframes
pickle_path = '/content/drive/MyDrive/Career/DS/Case Studies/Home Credit Default Risk/'

def _load_iv_pickle(file_name):
    # NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
    # The `with` block closes the file even when loading fails.
    with open(pickle_path+file_name,"rb") as pickle_in:
        return pickle.load(pickle_in)

#1. X_train dataframe
X_train=_load_iv_pickle("X_train_iv.pickle")
print("Shape of X_train dataframe is",X_train.shape)

#2. y_train dataframe
y_train=_load_iv_pickle("y_train_iv.pickle")
print("Shape of y_train dataframe is",y_train.shape)

#3. X_train_smote dataframe
X_train_smote=_load_iv_pickle("X_train_smote_iv.pickle")
print("Shape of X_train_smote dataframe is",X_train_smote.shape)

#4. y_train_smote dataframe
y_train_smote=_load_iv_pickle("y_train_smote_iv.pickle")
print("Shape of y_train_smote dataframe is",y_train_smote.shape)

#5. X_test dataframe
X_test=_load_iv_pickle("X_test_iv.pickle")
print("Shape of X_test dataframe is",X_test.shape)

#6. y_test dataframe
y_test=_load_iv_pickle("y_test_iv.pickle")
print("Shape of y_test dataframe is",y_test.shape)

#7. X_oot dataframe
X_oot=_load_iv_pickle("X_oot_iv.pickle")
print("Shape of X_oot dataframe is",X_oot.shape)

#8. y_oot dataframe
y_oot=_load_iv_pickle("y_oot_iv.pickle")
print("Shape of y_oot dataframe is",y_oot.shape)

#9. X_test_oot dataframe
X_test_oot=_load_iv_pickle("X_test_oot_iv.pickle")
print("Shape of X_test_oot dataframe is",X_test_oot.shape)

Shape of X_train dataframe is (246008, 38)
Shape of y_train dataframe is (246008,)
Shape of X_train_smote dataframe is (452272, 38)
Shape of y_train_smote dataframe is (452272,)
Shape of X_test dataframe is (30751, 38)
Shape of y_test dataframe is (30751,)
Shape of X_oot dataframe is (30752, 38)
Shape of y_oot dataframe is (30752,)
Shape of X_test_oot dataframe is (48744, 38)


## Selection of top 15 variables by votes count:
Now let's select the top 15 variables out of the 38 variables that were manually selected based on the information value. Each of several algorithms votes for the variables it considers important, and the votes are counted per variable.

### 1. RFE with Logisticregression

In [None]:
def feature_selection_rfe_lr(train_x,train_y,feature_to_select=20,step=3):
  '''
  Rank the features with recursive feature elimination (RFE) around a
  default LogisticRegression.

  Inputs=
  train_x= DataFrame of features
  train_y= pandas object holding the target (its values are flattened to 1-D)
  feature_to_select= number of features RFE keeps
  step= number of features RFE removes per iteration

  Outputs=
  DataFrame with one row per feature: 'feature' (column name), 'ranking'
  (RFE rank) and 'RFE_LR_decisions' (True for the kept features)
  '''
  from sklearn.feature_selection import RFE
  from sklearn.linear_model import LogisticRegression

  selector=RFE(estimator=LogisticRegression(),
               n_features_to_select=feature_to_select,
               step=step,
               verbose=10)
  selector=selector.fit(train_x,train_y.values.ravel())

  summary={'feature':train_x.columns,
           'ranking':selector.ranking_,
           'RFE_LR_decisions':selector.support_}
  return(pd.DataFrame(summary))

In [None]:
# Vote 1: RFE with logistic regression on the SMOTE-balanced train data
# (default arguments: keep 20 features, remove 3 per iteration)
fs_1=feature_selection_rfe_lr(X_train_smote,y_train_smote)
print('Top 20 features are:')
# the lowest ranking values come first
fs_1=fs_1.sort_values(by=['ranking'])
fs_1

Fitting estimator with 38 features.
Fitting estimator with 35 features.
Fitting estimator with 32 features.
Fitting estimator with 29 features.
Fitting estimator with 26 features.
Fitting estimator with 23 features.
Top 20 features are:


Unnamed: 0,feature,ranking,RFE_LR_decisions
0,EXT_SOURCE_2,1,True
23,DAYS_DETAILS_CHANGE_MUL,1,True
24,AMT_CREDIT,1,True
16,NAME_CONTRACT_STATUS_MEAN_ALL,1,True
15,DAYS_PAYMENT_RATIO_MAX_Latest_year,1,True
14,CURRENT_CREDIT_DEBT_DIFF_MEAN,1,True
25,EMA_AMT_PAYMENT_DIFF_LAST_Latest_year,1,True
12,DAYS_BIRTH,1,True
26,AMT_PAYMENT_DIFF_MAX_Latest_year,1,True
22,INTEREST_SHARE_MEAN_LAST_5,1,True


In [None]:
# Lets store the fs_1
# the `with` block closes the file even when the dump fails
with open(pickle_path+'fs_1.pickle', 'wb') as pickle_out:
    pickle.dump(fs_1,pickle_out)

### 2. Correlation

In [None]:
def feature_selection_stats_corr(train_x,train_y):
  '''
  This function computes the correlation of every feature column with the target.

  Inputs=
  train_x= dataframe of features
  train_y= target, passed to DataFrame.corrwith

  Outputs=
  Dataframe with one row per feature and the columns 'feature' and
  'correlation' (signed correlation with train_y)
  '''
  per_feature_corr=train_x.corrwith(train_y)
  corr_table=per_feature_corr.reset_index()
  corr_table=corr_table.rename(columns={'index':'feature',0:'correlation'})
  return corr_table

In [None]:
# Vote 2: absolute correlation of each feature with the target.
fs_2=feature_selection_stats_corr(X_train_smote,y_train_smote)
print("Top features are:")
# Only the strength of the relationship matters here, so the sign is dropped
# before the 0.071 cut-off is applied.
fs_2['correlation']=fs_2['correlation'].abs()
fs_2['Correlation_decision']=fs_2['correlation']>=0.071
fs_2=fs_2.sort_values(by=['correlation'],ascending=False)
fs_2[fs_2['Correlation_decision']]

Top features are:


Unnamed: 0,feature,correlation,Correlation_decision
0,EXT_SOURCE_2,0.276816,True
1,EXT_SOURCE_3,0.234374,True
7,DAYS_CREDIT_MEAN,0.175907,True
12,DAYS_BIRTH,0.154611,True
11,CREDIT_ACTIVE_CLOSED_MEAN,0.148378,True
6,EXT_SOURCE_1,0.142267,True
23,DAYS_DETAILS_CHANGE_MUL,0.128967,True
3,WEIGHTED_EXT_SOURCE,0.127342,True
2,EXT_SOURCE_MEAN,0.127105,True
33,DAYS_LAST_PHONE_CHANGE,0.123175,True


In [None]:
# Lets store the fs_2
# `with` closes the file even if pickling fails part-way.
with open(pickle_path+'fs_2.pickle', 'wb') as pickle_out:
  pickle.dump(fs_2,pickle_out)

### 3. Lasso

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
# Vote 3: L1-penalised logistic regression wrapped in SelectFromModel.
# NOTE(review): this selector is fitted on X_train/y_train, whereas the other
# selectors in this notebook are fitted on X_train_smote/y_train_smote, and the
# feature names below are taken from X_train_smote. Confirm this is intended.
L1_test = SelectFromModel(LogisticRegression(penalty='l1',solver='liblinear'))
L1_selector=L1_test.fit(X_train,y_train)
np.set_printoptions(precision=2)
# One coefficient per feature, rounded to 2 decimals for the threshold step below
L1_coef=L1_selector.estimator_.coef_.ravel().round(2)
L1_support=L1_selector.get_support()
L1_feature=X_train_smote.columns[L1_support].tolist()
print(len(L1_feature),'selected feature')

34 selected feature


In [None]:
# Pair every feature with its rounded L1 coefficient and vote for the ones
# whose absolute coefficient is not below 0.59.
fs_3=pd.DataFrame({'feature':X_train_smote.columns,'Coeff_Lasso':L1_coef})
below_cutoff=fs_3['Coeff_Lasso'].abs()<0.59
fs_3['Lasso_decision']=~below_cutoff
lasso_selected=fs_3[fs_3['Lasso_decision']]
print('selected features are',len(lasso_selected['feature']))
lasso_selected

selected features are 21


Unnamed: 0,feature,Coeff_Lasso,Lasso_decision
0,EXT_SOURCE_2,-1.94,True
1,EXT_SOURCE_3,-1.24,True
2,EXT_SOURCE_MEAN,5.6,True
3,WEIGHTED_EXT_SOURCE,-4.23,True
4,INCOME_EXT_RATIO,0.9,True
6,EXT_SOURCE_1,-1.77,True
8,CURRENT_DEBT_TO_CREDIT_RATIO_MEAN,-1.47,True
10,AMT_GOODS_PRICE,-10.45,True
12,DAYS_BIRTH,-0.6,True
13,EMPLOYED_TO_AGE_RATIO,-1.12,True


In [None]:
# Lets store the fs_3
# `with` closes the file even if pickling fails part-way.
with open(pickle_path+'fs_3.pickle', 'wb') as pickle_out:
  pickle.dump(fs_3,pickle_out)

### 4. RFE with DecisionTree

In [None]:
def feature_selection_rfe_dt(train_x,train_y,feature_to_select=20,step=3):
  '''
  This function ranks the features with recursive feature elimination (RFE)
  wrapped around a default DecisionTreeClassifier.

  Inputs=
  train_x= dataframe of training features (its column names fill the 'feature' column)
  train_y= pandas object holding the target; it is flattened to 1-D before fitting
  feature_to_select= number of features RFE should keep
  step= value passed to RFE's `step` (with the default, 3 features are dropped per round)

  Outputs=
  Dataframe with one row per column of train_x and the columns 'feature',
  'ranking' (RFE rank, 1 = kept) and 'RFE_DT_decisions' (True when kept)
  '''
  from sklearn.feature_selection import RFE
  from sklearn.tree import DecisionTreeClassifier

  # NOTE(review): the tree is built without a random_state, so the ranking may
  # differ between runs.
  selector=RFE(estimator=DecisionTreeClassifier(),
               n_features_to_select=feature_to_select,
               step=step,
               verbose=10)
  selector=selector.fit(train_x,train_y.values.ravel())

  return pd.DataFrame({'feature':train_x.columns,
                       'ranking':selector.ranking_,
                       'RFE_DT_decisions':selector.support_})

In [None]:
# Vote 4: RFE around a decision tree, fitted on the SMOTE training data.
fs_4=feature_selection_rfe_dt(X_train_smote,y_train_smote)
print('Top 20 features are:')
fs_4=fs_4.sort_values(by=['ranking'])
# Display only the rows RFE kept; fs_4 itself still holds every feature.
fs_4.loc[fs_4['RFE_DT_decisions']]

Fitting estimator with 38 features.
Fitting estimator with 35 features.
Fitting estimator with 32 features.
Fitting estimator with 29 features.
Fitting estimator with 26 features.
Fitting estimator with 23 features.
Top 20 features are:


Unnamed: 0,feature,ranking,RFE_DT_decisions
0,EXT_SOURCE_2,1,True
35,REGION_POPULATION_RELATIVE,1,True
34,PRODUCT_COMBINATION_FREQ_ENCODE_LAST_ALL,1,True
33,DAYS_LAST_PHONE_CHANGE,1,True
31,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,1,True
30,INTEREST_RATE_MIN_ALL,1,True
29,OCCUPATION_TYPE,1,True
26,AMT_PAYMENT_DIFF_MAX_Latest_year,1,True
23,DAYS_DETAILS_CHANGE_MUL,1,True
22,INTEREST_SHARE_MEAN_LAST_5,1,True


In [None]:
# Lets store the fs_4
# `with` closes the file even if pickling fails part-way.
with open(pickle_path+'fs_4.pickle', 'wb') as pickle_out:
  pickle.dump(fs_4,pickle_out)

### 5. RFE with RandomForest

In [None]:
def feature_selection_rfe_rf(train_x,train_y,feature_to_select=20,step=3):
  '''
  This function ranks the features with recursive feature elimination (RFE)
  wrapped around a default RandomForestClassifier.

  Inputs=
  train_x= dataframe of training features (its column names fill the 'feature' column)
  train_y= pandas object holding the target; it is flattened to 1-D before fitting
  feature_to_select= number of features RFE should keep
  step= value passed to RFE's `step` (with the default, 3 features are dropped per round)

  Outputs=
  Dataframe with one row per column of train_x and the columns 'feature',
  'ranking' (RFE rank, 1 = kept) and 'RFE_RF_decisions' (True when kept)
  '''
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.feature_selection import RFE

  # NOTE(review): the forest is built without a random_state, so the ranking
  # may differ between runs.
  selector=RFE(estimator=RandomForestClassifier(),
               n_features_to_select=feature_to_select,
               step=step,
               verbose=10)
  selector=selector.fit(train_x,train_y.values.ravel())

  return pd.DataFrame({'feature':train_x.columns,
                       'ranking':selector.ranking_,
                       'RFE_RF_decisions':selector.support_})

In [None]:
# Vote 5: RFE around a random forest, fitted on the SMOTE training data.
fs_5=feature_selection_rfe_rf(X_train_smote,y_train_smote)
print('Top 20 features are:')
fs_5=fs_5.sort_values(by=['ranking'])
# Display only the rows RFE kept; fs_5 itself still holds every feature.
fs_5.loc[fs_5['RFE_RF_decisions']]

Fitting estimator with 38 features.
Fitting estimator with 35 features.
Fitting estimator with 32 features.
Fitting estimator with 29 features.
Fitting estimator with 26 features.
Fitting estimator with 23 features.
Top 20 features are:


Unnamed: 0,feature,ranking,RFE_RF_decisions
0,EXT_SOURCE_2,1,True
35,REGION_POPULATION_RELATIVE,1,True
34,PRODUCT_COMBINATION_FREQ_ENCODE_LAST_ALL,1,True
33,DAYS_LAST_PHONE_CHANGE,1,True
31,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,1,True
26,AMT_PAYMENT_DIFF_MAX_Latest_year,1,True
25,EMA_AMT_PAYMENT_DIFF_LAST_Latest_year,1,True
24,AMT_CREDIT,1,True
23,DAYS_DETAILS_CHANGE_MUL,1,True
22,INTEREST_SHARE_MEAN_LAST_5,1,True


In [None]:
# Lets store the fs_5
# `with` closes the file even if pickling fails part-way.
with open(pickle_path+'fs_5.pickle', 'wb') as pickle_out:
  pickle.dump(fs_5,pickle_out)

### 6. RFE with GradientBoostingClassifier

In [None]:
def feature_selection_rfe_gb(train_x,train_y,feature_to_select=20,step=3):
  '''
  This function ranks the features with recursive feature elimination (RFE)
  wrapped around a default GradientBoostingClassifier.

  Inputs=
  train_x= dataframe of training features (its column names fill the 'feature' column)
  train_y= pandas object holding the target; it is flattened to 1-D before fitting
  feature_to_select= number of features RFE should keep
  step= value passed to RFE's `step` (with the default, 3 features are dropped per round)

  Outputs=
  Dataframe with one row per column of train_x and the columns 'feature',
  'ranking' (RFE rank, 1 = kept) and 'RFE_GB_decisions' (True when kept)
  '''
  from sklearn.ensemble import GradientBoostingClassifier
  from sklearn.feature_selection import RFE

  # NOTE(review): the booster is built without a random_state, so the ranking
  # may differ between runs.
  selector=RFE(estimator=GradientBoostingClassifier(),
               n_features_to_select=feature_to_select,
               step=step,
               verbose=10)
  selector=selector.fit(train_x,train_y.values.ravel())

  return pd.DataFrame({'feature':train_x.columns,
                       'ranking':selector.ranking_,
                       'RFE_GB_decisions':selector.support_})

In [None]:
# Vote 6: RFE around gradient boosting, fitted on the SMOTE training data.
fs_6=feature_selection_rfe_gb(X_train_smote,y_train_smote)
print('Top 20 features are:')
fs_6=fs_6.sort_values(by=['ranking'])
# Display only the rows RFE kept; fs_6 itself still holds every feature.
fs_6.loc[fs_6['RFE_GB_decisions']]

Fitting estimator with 38 features.
Fitting estimator with 35 features.
Fitting estimator with 32 features.
Fitting estimator with 29 features.
Fitting estimator with 26 features.
Fitting estimator with 23 features.
Top 20 features are:


Unnamed: 0,feature,ranking,RFE_GB_decisions
0,EXT_SOURCE_2,1,True
35,REGION_POPULATION_RELATIVE,1,True
34,PRODUCT_COMBINATION_FREQ_ENCODE_LAST_ALL,1,True
33,DAYS_LAST_PHONE_CHANGE,1,True
31,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,1,True
30,INTEREST_RATE_MIN_ALL,1,True
29,OCCUPATION_TYPE,1,True
28,CODE_REJECT_REASON_FREQ_ENCODE_MEAN_ALL,1,True
27,NAME_EDUCATION_TYPE,1,True
26,AMT_PAYMENT_DIFF_MAX_Latest_year,1,True


In [None]:
# Lets store the fs_6
# `with` closes the file even if pickling fails part-way.
with open(pickle_path+'fs_6.pickle', 'wb') as pickle_out:
  pickle.dump(fs_6,pickle_out)

### 7. RFE with Perceptron

In [None]:
def feature_selection_rfe_perceptron(train_x,train_y,feature_to_select=20,step=3):
  '''
  This function ranks the features with recursive feature elimination (RFE)
  wrapped around a default Perceptron.

  Inputs=
  train_x= dataframe of training features (its column names fill the 'feature' column)
  train_y= pandas object holding the target; it is flattened to 1-D before fitting
  feature_to_select= number of features RFE should keep
  step= value passed to RFE's `step` (with the default, 3 features are dropped per round)

  Outputs=
  Dataframe with one row per column of train_x and the columns 'feature',
  'ranking' (RFE rank, 1 = kept) and 'RFE_Pe_decisions' (True when kept)
  '''
  from sklearn.feature_selection import RFE
  from sklearn.linear_model import Perceptron

  selector=RFE(estimator=Perceptron(),
               n_features_to_select=feature_to_select,
               step=step,
               verbose=10)
  selector=selector.fit(train_x,train_y.values.ravel())

  return pd.DataFrame({'feature':train_x.columns,
                       'ranking':selector.ranking_,
                       'RFE_Pe_decisions':selector.support_})

In [None]:
# Vote 7: RFE around a perceptron, fitted on the SMOTE training data.
fs_7=feature_selection_rfe_perceptron(X_train_smote,y_train_smote)
print('Top 20 features are:')
fs_7=fs_7.sort_values(by=['ranking'])
# Display only the rows RFE kept; fs_7 itself still holds every feature.
fs_7.loc[fs_7['RFE_Pe_decisions']]

Fitting estimator with 38 features.
Fitting estimator with 35 features.
Fitting estimator with 32 features.
Fitting estimator with 29 features.
Fitting estimator with 26 features.
Fitting estimator with 23 features.
Top 20 features are:


Unnamed: 0,feature,ranking,RFE_Pe_decisions
0,EXT_SOURCE_2,1,True
32,NAME_INCOME_TYPE_Working,1,True
31,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,1,True
26,AMT_PAYMENT_DIFF_MAX_Latest_year,1,True
25,EMA_AMT_PAYMENT_DIFF_LAST_Latest_year,1,True
24,AMT_CREDIT,1,True
22,INTEREST_SHARE_MEAN_LAST_5,1,True
20,ANNUITY_GOODS_MAX_ALL,1,True
19,EMA_BALANCE_LIMIT_RATIO_LAST,1,True
15,DAYS_PAYMENT_RATIO_MAX_Latest_year,1,True


In [None]:
# Lets store the fs_7
# `with` closes the file even if pickling fails part-way.
with open(pickle_path+'fs_7.pickle', 'wb') as pickle_out:
  pickle.dump(fs_7,pickle_out)

### 8. ExtraTreesClassifier

In [None]:
# Fit an extra-trees ensemble on the SMOTE training data. `model` is reused
# later for both the feature_importances_ vote and the permutation-importance vote.
from sklearn.ensemble import ExtraTreesClassifier
# NOTE(review): no random_state is passed, so the importances may differ between runs.
model=ExtraTreesClassifier()
model.fit(X_train_smote,y_train_smote)

In [None]:
# Vote 8: feature importances of the fitted extra-trees model; a feature gets a
# vote when its absolute importance is not below 0.026.
fs_8=pd.DataFrame({'feature':X_train_smote.columns,'F_imp':model.feature_importances_})
fs_8['ExtraTreeClf_decision']=~(fs_8['F_imp'].abs()<0.026)
print('Selected variables are ',len(fs_8[fs_8['ExtraTreeClf_decision']]))
# Ascending sort: the weakest of the selected features are listed first.
fs_8=fs_8.sort_values(by=['F_imp'])
fs_8.loc[fs_8['ExtraTreeClf_decision']]

Selected variables are  20


Unnamed: 0,feature,F_imp,ExtraTreeClf_decision
22,INTEREST_SHARE_MEAN_LAST_5,0.02688,True
9,DAYS_EMPLOYED,0.027108,True
29,OCCUPATION_TYPE,0.027347,True
37,ORGANIZATION_TYPE,0.027674,True
16,NAME_CONTRACT_STATUS_MEAN_ALL,0.028811,True
23,DAYS_DETAILS_CHANGE_MUL,0.028854,True
6,EXT_SOURCE_1,0.029141,True
13,EMPLOYED_TO_AGE_RATIO,0.029663,True
21,DEF_60_CREDIT_RATIO,0.030872,True
7,DAYS_CREDIT_MEAN,0.032443,True


In [None]:
# Lets store the fs_8
# `with` closes the file even if pickling fails part-way.
with open(pickle_path+'fs_8.pickle', 'wb') as pickle_out:
  pickle.dump(fs_8,pickle_out)

### 9. Permutation Importance

In [None]:
from sklearn.inspection import permutation_importance
# Permutation importance of `model`, with 5 repeats per feature and a fixed
# seed (random_state=0).
# NOTE(review): the importances are scored on X_train_smote/y_train_smote, the
# same data passed to model.fit; confirm that a held-out set is not wanted here.
fs_9=permutation_importance(model,X_train_smote,y_train_smote,n_repeats=5, random_state=0)

In [None]:
# Vote 9: mean permutation importance per feature; a feature gets a vote when
# its absolute mean importance is not below 0.00035.
# fs_9 on the right-hand side is still the permutation_importance result.
fs_9=pd.DataFrame({'feature':X_train_smote.columns,'F_imp':fs_9.importances_mean})
fs_9['Permutation_decision']=~(fs_9['F_imp'].abs()<0.00035)
print('Selected variables are ',len(fs_9[fs_9['Permutation_decision']]))
# Ascending sort: the weakest of the selected features are listed first.
fs_9=fs_9.sort_values(by=['F_imp'])
fs_9.loc[fs_9['Permutation_decision']]

Selected variables are  21


Unnamed: 0,feature,F_imp,Permutation_decision
31,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,0.000367,True
36,NAME_PRODUCT_TYPE_WALK-IN_MEAN_ALL,0.000382,True
2,EXT_SOURCE_MEAN,0.00055,True
3,WEIGHTED_EXT_SOURCE,0.000651,True
15,DAYS_PAYMENT_RATIO_MAX_Latest_year,0.00079,True
23,DAYS_DETAILS_CHANGE_MUL,0.001102,True
34,PRODUCT_COMBINATION_FREQ_ENCODE_LAST_ALL,0.001141,True
28,CODE_REJECT_REASON_FREQ_ENCODE_MEAN_ALL,0.001566,True
16,NAME_CONTRACT_STATUS_MEAN_ALL,0.002141,True
35,REGION_POPULATION_RELATIVE,0.002818,True


In [None]:
# Lets store the fs_9
# `with` closes the file even if pickling fails part-way.
with open(pickle_path+'fs_9.pickle', 'wb') as pickle_out:
  pickle.dump(fs_9,pickle_out)

## Votes Counting for feature selection

In [None]:
# Lets import all the votes dataframes
# The nine vote tables were stored above as fs_1.pickle ... fs_9.pickle.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files that
# this notebook wrote itself.
vote_frames=[]
for vote_number in range(1,10):
  # `with` closes each file even if unpickling fails.
  with open(pickle_path+"fs_"+str(vote_number)+".pickle","rb") as pickle_in:
    vote_frames.append(pickle.load(pickle_in))

# Same names, in the same order, as the cells that created the tables.
fs_1,fs_2,fs_3,fs_4,fs_5,fs_6,fs_7,fs_8,fs_9=vote_frames

In [None]:
# Put the nine vote tables side by side, aligned on the feature name.
# NOTE: the merged frame has repeated column labels ('ranking' comes from five
# tables and 'F_imp' from two); only the uniquely named *_decision(s) columns
# are used for the vote count.
dfs=[fs_1,fs_2,fs_3,fs_4,fs_5,fs_6,fs_7,fs_8,fs_9]
votes_by_feature=[vote_table.set_index('feature') for vote_table in dfs]
Final_feature_selection=pd.concat(votes_by_feature,axis=1).reset_index()
Final_feature_selection.head(5)

Unnamed: 0,feature,ranking,RFE_LR_decisions,correlation,Correlation_decision,Coeff_Lasso,Lasso_decision,ranking.1,RFE_DT_decisions,ranking.2,RFE_RF_decisions,ranking.3,RFE_GB_decisions,ranking.4,RFE_Pe_decisions,F_imp,ExtraTreeClf_decision,F_imp.1,Permutation_decision
0,EXT_SOURCE_2,1,True,0.276816,True,-1.94,True,1,True,1,True,1,True,1,True,0.069256,True,0.08072,True
1,DAYS_DETAILS_CHANGE_MUL,1,True,0.128967,True,0.59,True,1,True,1,True,5,False,2,False,0.028228,True,0.001102,True
2,AMT_CREDIT,1,True,0.083425,True,10.47,True,2,False,1,True,1,True,1,True,0.032414,True,6.7e-05,False
3,NAME_CONTRACT_STATUS_MEAN_ALL,1,True,0.113167,True,0.59,True,4,False,4,False,3,False,4,False,0.028669,True,0.002141,True
4,DAYS_PAYMENT_RATIO_MAX_Latest_year,1,True,0.038589,False,-2.67,True,1,True,1,True,1,True,1,True,0.038746,True,0.00079,True


In [None]:
# One boolean decision column per selection method; a feature's vote count is
# the number of methods that marked it True.
decision_columns=['RFE_LR_decisions',
                  'Correlation_decision',
                  'Lasso_decision',
                  'RFE_DT_decisions',
                  'RFE_RF_decisions',
                  'RFE_GB_decisions',
                  'RFE_Pe_decisions',
                  'ExtraTreeClf_decision',
                  'Permutation_decision']
Final_feature_selection['votes']=np.sum(
    [Final_feature_selection[decision_column] for decision_column in decision_columns],
    axis=0)

In [None]:
# Preview the first five rows of the merged table, including its 'votes' column.
Final_feature_selection.head(5)

Unnamed: 0,feature,ranking,RFE_LR_decisions,correlation,Correlation_decision,Coeff_Lasso,Lasso_decision,ranking.1,RFE_DT_decisions,ranking.2,RFE_RF_decisions,ranking.3,RFE_GB_decisions,ranking.4,RFE_Pe_decisions,F_imp,ExtraTreeClf_decision,F_imp.1,Permutation_decision,votes
0,EXT_SOURCE_2,1,True,0.276816,True,-1.94,True,1,True,1,True,1,True,1,True,0.069256,True,0.08072,True,9
1,DAYS_DETAILS_CHANGE_MUL,1,True,0.128967,True,0.59,True,1,True,1,True,5,False,2,False,0.028228,True,0.001102,True,7
2,AMT_CREDIT,1,True,0.083425,True,10.47,True,2,False,1,True,1,True,1,True,0.032414,True,6.7e-05,False,7
3,NAME_CONTRACT_STATUS_MEAN_ALL,1,True,0.113167,True,0.59,True,4,False,4,False,3,False,4,False,0.028669,True,0.002141,True,5
4,DAYS_PAYMENT_RATIO_MAX_Latest_year,1,True,0.038589,False,-2.67,True,1,True,1,True,1,True,1,True,0.038746,True,0.00079,True,8


In [None]:
# Keep the features with more than 5 of the 9 votes, most-voted first.
# The row count comes from this threshold; head(15) below only limits the display.
has_enough_votes=Final_feature_selection['votes']>5
selected_feature_df=Final_feature_selection.loc[has_enough_votes].sort_values(by=['votes'],ascending=False)
print('shape of selected feature dataframe is ',selected_feature_df.shape)
selected_feature_df.head(15)

shape of selected feature dataframe is  (15, 20)


Unnamed: 0,feature,ranking,RFE_LR_decisions,correlation,Correlation_decision,Coeff_Lasso,Lasso_decision,ranking.1,RFE_DT_decisions,ranking.2,RFE_RF_decisions,ranking.3,RFE_GB_decisions,ranking.4,RFE_Pe_decisions,F_imp,ExtraTreeClf_decision,F_imp.1,Permutation_decision,votes
0,EXT_SOURCE_2,1,True,0.276816,True,-1.94,True,1,True,1,True,1,True,1,True,0.069256,True,0.08072,True,9
17,EXT_SOURCE_3,1,True,0.234374,True,-1.24,True,1,True,1,True,1,True,1,True,0.0609,True,0.078976,True,9
4,DAYS_PAYMENT_RATIO_MAX_Latest_year,1,True,0.038589,False,-2.67,True,1,True,1,True,1,True,1,True,0.038746,True,0.00079,True,8
10,AMT_GOODS_PRICE,1,True,0.102654,True,-10.45,True,1,True,1,True,1,True,1,True,0.034352,True,9.4e-05,False,8
12,CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year,1,True,0.099438,True,3.45,True,1,True,1,True,1,True,1,True,0.015394,False,0.000367,True,8
13,EXT_SOURCE_1,1,True,0.142267,True,-1.77,True,1,True,3,False,1,True,1,True,0.029823,True,0.017723,True,8
1,DAYS_DETAILS_CHANGE_MUL,1,True,0.128967,True,0.59,True,1,True,1,True,5,False,2,False,0.028228,True,0.001102,True,7
2,AMT_CREDIT,1,True,0.083425,True,10.47,True,2,False,1,True,1,True,1,True,0.032414,True,6.7e-05,False,7
7,DAYS_BIRTH,1,True,0.154611,True,-0.6,True,1,True,1,True,2,False,5,False,0.037041,True,0.016498,True,7
19,DEF_60_CREDIT_RATIO,1,True,0.087042,True,-0.97,True,1,True,1,True,1,True,7,False,0.030408,True,0.000148,False,7


In [None]:
# Names of the selected features, in descending vote order.
selected_variable=selected_feature_df['feature'].tolist()
print('Number of finally selected variables are',len(selected_variable))
selected_variable

Number of finally selected variables are 15


['EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'DAYS_PAYMENT_RATIO_MAX_Latest_year',
 'AMT_GOODS_PRICE',
 'CNT_DRAWINGS_ATM_CURRENT_MAX_Latest_year',
 'EXT_SOURCE_1',
 'DAYS_DETAILS_CHANGE_MUL',
 'AMT_CREDIT',
 'DAYS_BIRTH',
 'DEF_60_CREDIT_RATIO',
 'AMT_PAYMENT_DIFF_MAX_Latest_year',
 'INTEREST_SHARE_MEAN_LAST_5',
 'REGION_POPULATION_RELATIVE',
 'DAYS_LAST_PHONE_CHANGE',
 'EMPLOYED_TO_AGE_RATIO']

## Creating Final dataframes ready for modeling

In [None]:
# creating final dataframes
# Every feature frame is cut down to the selected columns; targets are copied as-is.

# Training data (original and SMOTE versions)
X_train_final=X_train.loc[:,selected_variable]
y_train_final=y_train.copy()
X_train_smote_final=X_train_smote.loc[:,selected_variable]
y_train_smote_final=y_train_smote.copy()

# Test and out-of-time (oot) data
X_test_final=X_test.loc[:,selected_variable]
y_test_final=y_test.copy()
X_oot_final=X_oot.loc[:,selected_variable]
y_oot_final=y_oot.copy()

# X_test_oot has no target frame in this cell
X_test_oot_final=X_test_oot.loc[:,selected_variable]

print('Shape of X_train_final is {} and of y_train_final is {} '.format(X_train_final.shape,y_train_final.shape))
print('Shape of X_train_smote_final is {} and of y_train_smote_final is {} '.format(X_train_smote_final.shape,y_train_smote_final.shape))
print('Shape of X_test_final is {} and of y_test_final is {} '.format(X_test_final.shape,y_test_final.shape))
print('Shape of X_oot_final is {} and of y_oot_final is {} '.format(X_oot_final.shape,y_oot_final.shape))
print('Shape of X_test_oot_final is {} '.format(X_test_oot_final.shape))

Shape of X_train_final is (246008, 15) and of y_train_final is (246008,) 
Shape of X_train_smote_final is (452272, 15) and of y_train_smote_final is (452272,) 
Shape of X_test_final is (30751, 15) and of y_test_final is (30751,) 
Shape of X_oot_final is (30752, 15) and of y_oot_final is (30752,) 
Shape of X_test_oot_final is (48744, 15) 


In [None]:
# Lets store the dataframes to reuse
pickle_path = '/content/drive/MyDrive/Career/DS/Case Studies/Home Credit Default Risk/'

# Each object is written to '<pickle_path><name>.pickle'; the dict keeps the
# original write order.
final_objects={'X_train_final':X_train_final,
               'X_train_smote_final':X_train_smote_final,
               'X_test_final':X_test_final,
               'X_oot_final':X_oot_final,
               'X_test_oot_final':X_test_oot_final,
               'y_train_final':y_train_final,
               'y_train_smote_final':y_train_smote_final,
               'y_test_final':y_test_final,
               'y_oot_final':y_oot_final}

for object_name,final_object in final_objects.items():
  # `with` closes the file even if pickling fails part-way.
  with open(pickle_path+object_name+'.pickle', 'wb') as pickle_out:
    pickle.dump(final_object,pickle_out)

## Summary:
The following steps were performed to select the top 15 features from the set of 1770 features:
1. The information value of every feature is calculated, and 40 features with high information value are selected.
2. Nine different algorithms are then run on the SMOTE training data, and roughly the 20 most important features according to each algorithm are marked as selected.
3. For each feature, the number of votes it receives across the nine algorithms is counted, and the 15 features with the most votes are kept.