In [None]:
# Prerequisite (run once if BorutaPy is not installed): %pip install boruta

In [1]:
# Importing libraries

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
import numpy as np

In [2]:
# Read the credit-risk training set into a DataFrame

df = pd.read_csv('Credit_Risk_Train_data.csv')

# Report the dimensions first, then the column labels

for summary in (df.shape, df.columns):
    print(summary)

(614, 13)
Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')


In [3]:
# Set the label column aside as the target before it is removed from df
tar_var = df.loc[:, "Loan_Status"]

In [4]:
# Remove the row identifier and the target so only predictors remain in df.
# Rebinding `df` (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: executing it a second time no longer raises
# KeyError because the columns are already gone.
df = df.drop(columns=["Loan_ID", "Loan_Status"], errors="ignore")


In [5]:
# One-hot encode the categorical predictors; numeric columns pass through.
# The indicator dtype is pinned because the pandas default depends on the
# version (uint8 before 2.0, bool from 2.0 on); bool indicators mixed with the
# float columns would turn `train[features].values` into an object array.
train = pd.get_dummies(df, drop_first=False, dummy_na=False, dtype="uint8")
train.columns

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Gender_Female', 'Gender_Male',
       'Married_No', 'Married_Yes', 'Dependents_0', 'Dependents_1',
       'Dependents_2', 'Dependents_3+', 'Education_Graduate',
       'Education_Not Graduate', 'Self_Employed_No', 'Self_Employed_Yes',
       'Property_Area_Rural', 'Property_Area_Semiurban',
       'Property_Area_Urban'],
      dtype='object')

In [6]:
# Use every column label of the encoded frame as a candidate feature
features = list(train.columns)
len(features)

20

In [7]:
# Count the missing values in each column (axis=0 sums down the rows)
train.isnull().sum(axis=0)

ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Gender_Female               0
Gender_Male                 0
Married_No                  0
Married_Yes                 0
Dependents_0                0
Dependents_1                0
Dependents_2                0
Dependents_3+               0
Education_Graduate          0
Education_Not Graduate      0
Self_Employed_No            0
Self_Employed_Yes           0
Property_Area_Rural         0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [8]:
# Replace each missing entry with the mean of its own column
column_means = train[features].mean()
train[features] = train[features].fillna(column_means)

In [9]:
# Build the estimator inputs: feature matrix X and flattened label vector Y
X, Y = train[features].values, tar_var.values.ravel()

In [10]:
# Boruta feature selection wrapped around a class-balanced random forest
# whose trees are capped at depth 5
rf = RandomForestClassifier(
    max_depth=5,
    class_weight='balanced',
    n_jobs=-1,
)
boruta_feature_selector = BorutaPy(
    estimator=rf,
    n_estimators='auto',  # let Boruta choose the number of trees itself
    perc=90,
    max_iter=100,
    random_state=4242,    # fixed seed so repeated runs select the same features
)
boruta_feature_selector.fit(X, Y)

BorutaPy(estimator=RandomForestClassifier(class_weight='balanced', max_depth=5,
                                          n_estimators=48, n_jobs=-1,
                                          random_state=RandomState(MT19937) at 0x14BBBD98D40),
         n_estimators='auto', perc=90,
         random_state=RandomState(MT19937) at 0x14BBBD98D40)

In [11]:
# Reduce X to the columns Boruta confirmed (tentative ones stay excluded),
# then show the shape of the reduced matrix
X_filtered = boruta_feature_selector.transform(X, weak=False)
X_filtered.shape


(614, 2)

In [12]:
# Names of the features Boruta confirmed.
# support_ is a boolean mask aligned with `features`, so np.where on the mask
# itself yields the selected positions. Indexing through a comprehension also
# copes with an empty selection, whereas np.nditer raises ValueError on a
# zero-sized operand.
indexes = np.where(boruta_feature_selector.support_)
final_features = [features[i] for i in indexes[0]]
print(final_features)

['LoanAmount', 'Credit_History']
