# <center> End-to-End Project- Credit Risk

### Task- 2

### Develop the ML model(s) to predict the credit risk(low or high) for a given applicant.

## Importing Libraries 

In [1]:
import os                                                                 # For reading the optional data-directory override
from pathlib import Path                                                  # For building data file paths

import numpy as np                                                        # For pre-preocessing data
import pandas as pd                                                       # For pre-preocessing data
import matplotlib.pyplot as plt                                           # For visualisation
%matplotlib inline
import seaborn as sns                                                     # For visualisation

pd.set_option('display.max_rows', 250)                                    # show up to 250 rows in displayed output
pd.set_option('display.max_colwidth',250)                                 # show up to 250 characters per cell (column width, not the number of columns)
pd.set_option('display.float_format', lambda x: '%.5f' % x)               # display floats with 5 decimal places


import warnings
warnings.filterwarnings('ignore')                                        # suppress all warnings (no category filter is given)

from sklearn.linear_model import LogisticRegression                       # For training our Logistic Regression model
from sklearn.ensemble import RandomForestClassifier
                                                                          
from sklearn.metrics import confusion_matrix                              # For Performance metrics 
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

from sklearn.model_selection import train_test_split                      # For train test split
from sklearn.metrics import roc_auc_score                                 # For ROC AUC 
from sklearn.metrics import roc_curve                                     # For plotting ROC 
from sklearn.metrics import precision_recall_curve                        # For plotting Precision and Recall 
from sklearn.preprocessing import LabelEncoder


## Loading Data

In [2]:
# Importing data into pandas dataframes.
# The data directory defaults to the original local folder, but can be overridden
# with the CREDIT_RISK_DATA_DIR environment variable so the notebook can be run on
# another machine without editing this cell.
DATA_DIR = Path(os.environ.get('CREDIT_RISK_DATA_DIR',
                               r'G:\Downloads\1_Relevel_DS\Data Science Assignment'))

path1 = DATA_DIR / 'applicant.csv'          # personal data about the primary applicant
path2 = DATA_DIR / 'loan.csv'               # loan application data, including the target

applicant_data = pd.read_csv(path1)
loan_data = pd.read_csv(path2)

In [3]:

applicant_data.head()

Unnamed: 0,applicant_id,Primary_applicant_age_in_years,Gender,Marital_status,Number_of_dependents,Housing,Years_at_current_residence,Employment_status,Has_been_employed_for_at_least,Has_been_employed_for_at_most,Telephone,Foreign_worker,Savings_account_balance,Balance_in_existing_bank_account_(lower_limit_of_bucket),Balance_in_existing_bank_account_(upper_limit_of_bucket)
0,1469590,67,male,single,1,own,4,skilled employee / official,7 years,,Registered under the applicant's name,1,,,0
1,1203873,22,female,divorced/separated/married,1,own,2,skilled employee / official,1 year,4 years,,1,Low,0.0,2 lac
2,1432761,49,male,single,2,own,3,unskilled - resident,4 years,7 years,,1,Low,,
3,1207582,45,male,single,2,for free,4,skilled employee / official,4 years,7 years,,1,Low,,0
4,1674436,53,male,single,2,for free,4,skilled employee / official,1 year,4 years,,1,Low,,0


In [4]:

loan_data.head()

Unnamed: 0,loan_application_id,applicant_id,Months_loan_taken_for,Purpose,Principal_loan_amount,EMI_rate_in_percentage_of_disposable_income,Property,Has_coapplicant,Has_guarantor,Other_EMI_plans,Number_of_existing_loans_at_this_bank,Loan_history,high_risk_applicant
0,d68d975e-edad-11ea-8761-1d6f9c1ff461,1469590,6,electronic equipment,1169000,4,real estate,0,0,,2,critical/pending loans at other banks,0
1,d68d989e-edad-11ea-b1d5-2bcf65006448,1203873,48,electronic equipment,5951000,2,real estate,0,0,,1,existing loans paid back duly till now,1
2,d68d995c-edad-11ea-814a-1b6716782575,1432761,12,education,2096000,2,real estate,0,0,,1,critical/pending loans at other banks,0
3,d68d99fc-edad-11ea-8841-17e8848060ae,1207582,42,FF&E,7882000,2,building society savings agreement/life insurance,0,1,,1,existing loans paid back duly till now,0
4,d68d9a92-edad-11ea-9f3d-1f8682db006a,1674436,24,new vehicle,4870000,3,,0,0,,2,delay in paying off loans in the past,1


## Data Dictionary

- 0 = Low credit risk i.e high chance of paying back the loan amount
- 1 = High credit risk i.e low chance of paying back the loan amount

1. applicant_data : This file contains personal data about the (primary) applicant

    - Unique ID: applicant_id (string)

    - Other fields:

        - Primary_applicant_age_in_years (numeric)
        - Gender (string)
        - Marital_status (string)
        - Number_of_dependents (numeric)
        - Housing (string)
        - Years_at_current_residence (numeric)
        - Employment_status (string)
        - Has_been_employed_for_at_least (string)
        - Has_been_employed_for_at_most (string)
        - Telephone (string)
        - Foreign_worker (numeric)
        - Savings_account_balance (string)
        - Balance_in_existing_bank_account_(lower_limit_of_bucket) (string)
        - Balance_in_existing_bank_account_(upper_limit_of_bucket) (string)


2. loan_data : This file contains data more specific to the loan application
    - Target: high_risk_applicant (numeric)
    - Other fields:
        - applicant_id (string)
        - Months_loan_taken_for (numeric)
        - Purpose (string)
        - Principal_loan_amount (numeric)
        - EMI_rate_in_percentage_of_disposable_income (numeric)
        - Property (string)
        - Has_coapplicant (numeric)
        - Has_guarantor (numeric)
        - Other_EMI_plans (string)
        - Number_of_existing_loans_at_this_bank (numeric)
        - Loan_history (string)


## Preprocessing Data 

### applicant_data

In [5]:
# Checking for missing values
applicant_data .isnull().sum()

applicant_id                                                  0
Primary_applicant_age_in_years                                0
Gender                                                        0
Marital_status                                                0
Number_of_dependents                                          0
Housing                                                       0
Years_at_current_residence                                    0
Employment_status                                             0
Has_been_employed_for_at_least                               62
Has_been_employed_for_at_most                               253
Telephone                                                   596
Foreign_worker                                                0
Savings_account_balance                                     183
Balance_in_existing_bank_account_(lower_limit_of_bucket)    668
Balance_in_existing_bank_account_(upper_limit_of_bucket)    457
dtype: int64

**Observation:** There are many null values in the applicant data. We check the percentage of null values in each column and drop the columns that have more than 20% null values.

Note: There is no fixed criterion for dropping columns that contain null values. The threshold depends entirely on the business requirements.

In [6]:
100*applicant_data.isnull().sum()/len(applicant_data)

applicant_id                                                0.00000
Primary_applicant_age_in_years                              0.00000
Gender                                                      0.00000
Marital_status                                              0.00000
Number_of_dependents                                        0.00000
Housing                                                     0.00000
Years_at_current_residence                                  0.00000
Employment_status                                           0.00000
Has_been_employed_for_at_least                              6.20000
Has_been_employed_for_at_most                              25.30000
Telephone                                                  59.60000
Foreign_worker                                              0.00000
Savings_account_balance                                    18.30000
Balance_in_existing_bank_account_(lower_limit_of_bucket)   66.80000
Balance_in_existing_bank_account_(upper_limit_of

In [7]:
# Keep only the columns whose share of null values is at most MAX_NULL_PCT percent;
# the next cell builds the working frame from the surviving column names.
MAX_NULL_PCT = 20
null_data = pd.DataFrame({'% of null data':100*applicant_data.isnull().sum()/len(applicant_data)} )
null_data = null_data[null_data['% of null data']<=MAX_NULL_PCT]
null_data.index

Index(['applicant_id', 'Primary_applicant_age_in_years', 'Gender',
       'Marital_status', 'Number_of_dependents', 'Housing',
       'Years_at_current_residence', 'Employment_status',
       'Has_been_employed_for_at_least', 'Foreign_worker',
       'Savings_account_balance'],
      dtype='object')

In [8]:
# new applicant data
applicant_data_1 = applicant_data[null_data.index].copy()
applicant_data_1.head()

Unnamed: 0,applicant_id,Primary_applicant_age_in_years,Gender,Marital_status,Number_of_dependents,Housing,Years_at_current_residence,Employment_status,Has_been_employed_for_at_least,Foreign_worker,Savings_account_balance
0,1469590,67,male,single,1,own,4,skilled employee / official,7 years,1,
1,1203873,22,female,divorced/separated/married,1,own,2,skilled employee / official,1 year,1,Low
2,1432761,49,male,single,2,own,3,unskilled - resident,4 years,1,Low
3,1207582,45,male,single,2,for free,4,skilled employee / official,4 years,1,Low
4,1674436,53,male,single,2,for free,4,skilled employee / official,1 year,1,Low


In [9]:
# Chaking duplicate values

applicant_data_1.duplicated().sum()

0

We can see that there is no duplicate data present in applicant data.

#### Imputing null values

In [10]:
# let's check data having null values 
null_data[null_data['% of null data']>0]

Unnamed: 0,% of null data
Has_been_employed_for_at_least,6.2
Savings_account_balance,18.3


The table above shows that 2 columns have null values. We can impute these null values with the mode of each column.

In [11]:
def fill_nan(col, data=None):
    """Impute the missing values of one column with that column's mode.

    Parameters
    ----------
    col : str
        Name of the column to impute.
    data : pandas.DataFrame, optional
        Frame whose column is overwritten. When omitted, the notebook-level
        ``applicant_data_1`` frame is used, as in the original one-argument call.

    Returns
    -------
    int
        Number of null values remaining in the column after imputation.
    """
    if data is None:
        data = applicant_data_1
    modes = data[col].mode()                           # most frequent value(s); empty if the column is all-null
    if not modes.empty:                                # nothing to impute with when there is no mode
        data[col] = data[col].fillna(modes.iloc[0])    # impute null values with the first mode
    return data[col].isnull().sum()

In [12]:
fill_nan('Has_been_employed_for_at_least')

0

In [13]:
fill_nan('Savings_account_balance')

0

#### Data Cleaning

In [14]:
applicant_data_1.columns

Index(['applicant_id', 'Primary_applicant_age_in_years', 'Gender',
       'Marital_status', 'Number_of_dependents', 'Housing',
       'Years_at_current_residence', 'Employment_status',
       'Has_been_employed_for_at_least', 'Foreign_worker',
       'Savings_account_balance'],
      dtype='object')

##### 'Marital_status' Column


In [15]:
# Collapse the raw marital-status labels into broader groups; any label that is
# not listed in the mapping is kept unchanged.
marital_status_groups = {
    'divorced/separated/married': 'separated',
    'divorced/separated': 'separated',
    'married/widowed': 'married',
}
l = [marital_status_groups.get(status, status)
     for status in applicant_data_1['Marital_status']]
l[:5]

['single', 'separated', 'single', 'single', 'single']

In [16]:
applicant_data_1['Marital_status'] = l                       # update old Marital_status with new values store in list l

##### "Housing" Column

In [17]:
applicant_data_1['Housing'].unique()                         # check unique value present in a column

array(['own', 'for free', 'rent'], dtype=object)

In [18]:
# Rename the 'for free' housing category to 'free'.
# The vectorised .str accessor does a literal (non-regex) substring replacement and
# passes missing values through, instead of raising inside a per-element lambda.
applicant_data_1['Housing'] = applicant_data_1['Housing'].str.replace(
    'for free', 'free', regex=False)

applicant_data_1['Housing'].unique()                       # check unique value present in a column after cleaning

array(['own', 'free', 'rent'], dtype=object)

##### "Employment_status" column

In [19]:
applicant_data_1['Employment_status'].unique()

array(['skilled employee / official', 'unskilled - resident',
       'management / self-employed / highly qualified employee / officer',
       'unemployed / unskilled - non-resident'], dtype=object)

In [24]:
# Shorten the verbose employment-status labels; labels without an entry in the
# mapping (e.g. 'unskilled - resident') are kept as they are.
employment_labels = {
    'skilled employee / official': 'employee',
    'management / self-employed / highly qualified employee / officer': 'qualified employee',
    'unemployed / unskilled - non-resident': 'unemployed',
}
x = [employment_labels.get(status, status)
     for status in applicant_data_1['Employment_status']]
applicant_data_1['Employment_status'] = x
applicant_data_1['Employment_status'].unique()

array(['employee', 'unskilled - resident', 'qualified employee',
       'unemployed'], dtype=object)

##### "Has_been_employed_for_at_least" column

In [25]:
applicant_data_1['Has_been_employed_for_at_least'].unique()

array(['7 years', '1 year', '4 years', '0 year'], dtype=object)

In [26]:
# Keep only the number of years, e.g. '7 years' -> '7' and '1 year' -> '1'.
# str.strip(' years') removes any of the characters ' ', 'y', 'e', 'a', 'r', 's' from
# both ends rather than the literal suffix, so the digits are extracted explicitly.
applicant_data_1['Has_been_employed_for_at_least (year)'] = applicant_data_1[
    'Has_been_employed_for_at_least'].str.extract(r'(\d+)', expand=False)

applicant_data_1['Has_been_employed_for_at_least (year)'].unique()

array(['7', '1', '4', '0'], dtype=object)

In [27]:
# Convert data type in int64
applicant_data_1['Has_been_employed_for_at_least (year)'] = applicant_data_1[
    'Has_been_employed_for_at_least (year)'].astype(int)

type(applicant_data_1['Has_been_employed_for_at_least (year)'][0])              # check data type of data

numpy.int32

In [28]:
applicant_data_1.drop(['Has_been_employed_for_at_least'], axis=1,inplace=True )

In [29]:
# final data looks loke

applicant_data_1.sample(10)

Unnamed: 0,applicant_id,Primary_applicant_age_in_years,Gender,Marital_status,Number_of_dependents,Housing,Years_at_current_residence,Employment_status,Foreign_worker,Savings_account_balance,Has_been_employed_for_at_least (year)
715,1314021,63,male,single,1,own,4,employee,1,Low,7
268,1310758,45,male,separated,1,own,4,qualified employee,0,Low,7
901,1621154,44,male,separated,1,own,4,employee,1,Low,0
793,1414229,51,male,separated,1,free,4,employee,1,Low,7
214,1529476,51,male,single,1,own,2,employee,1,Low,7
727,1695402,25,female,separated,1,rent,4,employee,1,Low,1
690,1345875,25,male,separated,1,own,3,employee,1,Low,1
810,1109861,26,male,married,1,own,2,employee,1,Low,0
717,1849511,32,male,single,1,free,2,qualified employee,1,Low,1
970,1662480,22,male,single,1,own,2,employee,1,Medium,1


In [30]:
applicant_data_1.shape

(1000, 11)

In [31]:
applicant_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                                 Non-Null Count  Dtype 
---  ------                                 --------------  ----- 
 0   applicant_id                           1000 non-null   int64 
 1   Primary_applicant_age_in_years         1000 non-null   int64 
 2   Gender                                 1000 non-null   object
 3   Marital_status                         1000 non-null   object
 4   Number_of_dependents                   1000 non-null   int64 
 5   Housing                                1000 non-null   object
 6   Years_at_current_residence             1000 non-null   int64 
 7   Employment_status                      1000 non-null   object
 8   Foreign_worker                         1000 non-null   int64 
 9   Savings_account_balance                1000 non-null   object
 10  Has_been_employed_for_at_least (year)  1000 non-null   int32 
dtypes: int32(1), int64

<center>=//=

### loan_data:

In [32]:
loan_data.sample(20)

Unnamed: 0,loan_application_id,applicant_id,Months_loan_taken_for,Purpose,Principal_loan_amount,EMI_rate_in_percentage_of_disposable_income,Property,Has_coapplicant,Has_guarantor,Other_EMI_plans,Number_of_existing_loans_at_this_bank,Loan_history,high_risk_applicant
851,d68f7236-edad-11ea-8fe0-5087c4a61be6,1785162,24,used vehicle,4042000,3,building society savings agreement/life insurance,0,0,,2,critical/pending loans at other banks,0
759,d68f3c80-edad-11ea-ac64-5a2530ff8e11,1239673,12,new vehicle,691000,4,building society savings agreement/life insurance,0,0,,2,critical/pending loans at other banks,1
877,d68f7f74-edad-11ea-aa5f-477c81b7c402,1624943,36,electronic equipment,3595000,4,car or other,0,0,,1,existing loans paid back duly till now,0
96,d68dcbfc-edad-11ea-ad44-06ed8ffa90db,1610826,12,education,2012000,4,car or other,0,0,,1,critical/pending loans at other banks,0
798,d68f55d0-edad-11ea-84a8-2800d7327005,1258889,24,new vehicle,717000,4,car or other,0,0,,2,delay in paying off loans in the past,0
660,d68f07ec-edad-11ea-b6db-3496b8dbd5c8,1352279,12,electronic equipment,1297000,3,real estate,0,0,,1,existing loans paid back duly till now,0
756,d68f3ae6-edad-11ea-98f0-22b178508d26,1868234,6,new vehicle,1299000,1,real estate,0,0,,3,critical/pending loans at other banks,0
653,d68f042c-edad-11ea-b98f-12dc3733eac6,1476071,36,new vehicle,8086000,2,car or other,0,0,,4,delay in paying off loans in the past,1
217,d68e0c66-edad-11ea-b0b3-1fa0e1d66548,1698484,36,electronic equipment,3913000,2,real estate,0,0,,1,existing loans paid back duly till now,0
101,d68dcea4-edad-11ea-bb1a-384a200d31e4,1466716,36,electronic equipment,2323000,4,car or other,0,0,,1,existing loans paid back duly till now,0


#### Imputing Null Value

In [33]:
loan_data.shape

(1000, 13)

In [34]:
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                                       Non-Null Count  Dtype 
---  ------                                       --------------  ----- 
 0   loan_application_id                          1000 non-null   object
 1   applicant_id                                 1000 non-null   int64 
 2   Months_loan_taken_for                        1000 non-null   int64 
 3   Purpose                                      988 non-null    object
 4   Principal_loan_amount                        1000 non-null   int64 
 5   EMI_rate_in_percentage_of_disposable_income  1000 non-null   int64 
 6   Property                                     846 non-null    object
 7   Has_coapplicant                              1000 non-null   int64 
 8   Has_guarantor                                1000 non-null   int64 
 9   Other_EMI_plans                              186 non-null    object
 10  Number_of_exi

In [35]:
100 * loan_data.isnull().sum()/len(loan_data)

loan_application_id                            0.00000
applicant_id                                   0.00000
Months_loan_taken_for                          0.00000
Purpose                                        1.20000
Principal_loan_amount                          0.00000
EMI_rate_in_percentage_of_disposable_income    0.00000
Property                                      15.40000
Has_coapplicant                                0.00000
Has_guarantor                                  0.00000
Other_EMI_plans                               81.40000
Number_of_existing_loans_at_this_bank          0.00000
Loan_history                                   0.00000
high_risk_applicant                            0.00000
dtype: float64

In [36]:
loan_data.drop('Other_EMI_plans', axis=1, inplace=True)          # Other_EMI_plans has more than 80% null data. droping it.

In [37]:
loan_data.isnull().sum()

loan_application_id                              0
applicant_id                                     0
Months_loan_taken_for                            0
Purpose                                         12
Principal_loan_amount                            0
EMI_rate_in_percentage_of_disposable_income      0
Property                                       154
Has_coapplicant                                  0
Has_guarantor                                    0
Number_of_existing_loans_at_this_bank            0
Loan_history                                     0
high_risk_applicant                              0
dtype: int64

In [38]:
loan_data['Purpose'].mode()

0    electronic equipment
Name: Purpose, dtype: object

In [39]:
loan_data['Property'].mode()

0    car or other
Name: Property, dtype: object

In [40]:
def fill_nan(col, data=None):
    """Impute the missing values of one column with that column's mode.

    This definition replaces the earlier ``fill_nan`` that targeted the applicant
    data; pass ``data`` explicitly to impute a column of any other frame.

    Parameters
    ----------
    col : str
        Name of the column to impute.
    data : pandas.DataFrame, optional
        Frame whose column is overwritten. When omitted, the notebook-level
        ``loan_data`` frame is used, as in the original one-argument call.

    Returns
    -------
    int
        Number of null values remaining in the column after imputation.
    """
    if data is None:
        data = loan_data
    modes = data[col].mode()                           # most frequent value(s); empty if the column is all-null
    if not modes.empty:                                # nothing to impute with when there is no mode
        data[col] = data[col].fillna(modes.iloc[0])    # impute null values with the first mode
    return data[col].isnull().sum()

In [41]:
fill_nan('Purpose')

0

In [42]:
fill_nan('Property')

0

In [43]:
loan_data.isnull().sum() 

loan_application_id                            0
applicant_id                                   0
Months_loan_taken_for                          0
Purpose                                        0
Principal_loan_amount                          0
EMI_rate_in_percentage_of_disposable_income    0
Property                                       0
Has_coapplicant                                0
Has_guarantor                                  0
Number_of_existing_loans_at_this_bank          0
Loan_history                                   0
high_risk_applicant                            0
dtype: int64

In [44]:
loan_data.columns

Index(['loan_application_id', 'applicant_id', 'Months_loan_taken_for',
       'Purpose', 'Principal_loan_amount',
       'EMI_rate_in_percentage_of_disposable_income', 'Property',
       'Has_coapplicant', 'Has_guarantor',
       'Number_of_existing_loans_at_this_bank', 'Loan_history',
       'high_risk_applicant'],
      dtype='object')

In [45]:
loan_data['Loan_history'].value_counts()

existing loans paid back duly till now     530
critical/pending loans at other banks      293
delay in paying off loans in the past       88
all loans at this bank paid back duly       49
no loans taken/all loans paid back duly     40
Name: Loan_history, dtype: int64

In [46]:
# Final data: inner-join the cleaned applicant records with the loan records on applicant_id.
# validate='one_to_many' makes pandas raise a MergeError if applicant_id is duplicated in the
# applicant data, instead of silently multiplying rows in the merged frame.
df = applicant_data_1.merge(loan_data, on='applicant_id', how='inner', validate='one_to_many')
df.sample(10)

Unnamed: 0,applicant_id,Primary_applicant_age_in_years,Gender,Marital_status,Number_of_dependents,Housing,Years_at_current_residence,Employment_status,Foreign_worker,Savings_account_balance,...,Months_loan_taken_for,Purpose,Principal_loan_amount,EMI_rate_in_percentage_of_disposable_income,Property,Has_coapplicant,Has_guarantor,Number_of_existing_loans_at_this_bank,Loan_history,high_risk_applicant
148,1901178,28,male,single,1,own,2,employee,1,Low,...,36,FF&E,5371000,3,building society savings agreement/life insurance,0,1,2,critical/pending loans at other banks,0
294,1563134,46,male,separated,2,own,2,qualified employee,1,Low,...,48,business,7629000,4,car or other,0,0,2,critical/pending loans at other banks,0
781,1129775,60,male,single,1,own,4,employee,1,Very high,...,24,new vehicle,1940000,4,real estate,0,0,1,critical/pending loans at other banks,0
893,1851827,34,male,single,1,own,4,employee,1,Low,...,36,used vehicle,5800000,3,car or other,0,0,2,critical/pending loans at other banks,0
131,1817517,29,male,single,1,own,3,employee,1,Low,...,36,education,6887000,4,building society savings agreement/life insurance,0,0,1,delay in paying off loans in the past,1
930,1629759,24,male,single,1,own,1,unskilled - resident,0,Low,...,24,FF&E,1747000,4,building society savings agreement/life insurance,1,0,1,existing loans paid back duly till now,0
755,1621486,32,female,separated,1,rent,4,employee,1,Low,...,24,new vehicle,1285000,4,car or other,0,0,1,existing loans paid back duly till now,1
209,1241829,55,male,single,1,own,2,employee,0,Very high,...,12,used vehicle,1413000,3,building society savings agreement/life insurance,0,0,1,existing loans paid back duly till now,0
568,1754897,41,male,single,2,own,1,employee,1,Low,...,48,electronic equipment,3979000,4,car or other,0,0,2,existing loans paid back duly till now,0
791,1372040,41,male,single,2,own,4,qualified employee,1,Low,...,24,used vehicle,6313000,3,car or other,0,0,1,existing loans paid back duly till now,0


In [47]:
df.shape

(1000, 22)

In [48]:
df.columns

Index(['applicant_id', 'Primary_applicant_age_in_years', 'Gender',
       'Marital_status', 'Number_of_dependents', 'Housing',
       'Years_at_current_residence', 'Employment_status', 'Foreign_worker',
       'Savings_account_balance', 'Has_been_employed_for_at_least (year)',
       'loan_application_id', 'Months_loan_taken_for', 'Purpose',
       'Principal_loan_amount', 'EMI_rate_in_percentage_of_disposable_income',
       'Property', 'Has_coapplicant', 'Has_guarantor',
       'Number_of_existing_loans_at_this_bank', 'Loan_history',
       'high_risk_applicant'],
      dtype='object')

In [49]:
df.drop(['applicant_id','loan_application_id'], axis=1, inplace=True)

In [50]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Primary_applicant_age_in_years,1000.0,35.546,11.37547,19.0,27.0,33.0,42.0,75.0
Number_of_dependents,1000.0,1.155,0.36209,1.0,1.0,1.0,1.0,2.0
Years_at_current_residence,1000.0,2.845,1.10372,1.0,2.0,3.0,4.0,4.0
Foreign_worker,1000.0,0.963,0.18886,0.0,1.0,1.0,1.0,1.0
Has_been_employed_for_at_least (year),1000.0,2.868,2.71366,0.0,1.0,1.0,7.0,7.0
Months_loan_taken_for,1000.0,20.903,12.05881,4.0,12.0,18.0,24.0,72.0
Principal_loan_amount,1000.0,3271258.0,2822736.87596,250000.0,1365500.0,2319500.0,3972250.0,18424000.0
EMI_rate_in_percentage_of_disposable_income,1000.0,2.973,1.11871,1.0,2.0,3.0,4.0,4.0
Has_coapplicant,1000.0,0.041,0.19839,0.0,0.0,0.0,0.0,1.0
Has_guarantor,1000.0,0.052,0.22214,0.0,0.0,0.0,0.0,1.0


## <center> Feature Engineering

### Feature Engineering | Dropping Unnecessary or duplicate features and deleting Outliers

In [51]:
# Checking the features available in our data before dropping
df1 = df.copy()
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                                       Non-Null Count  Dtype 
---  ------                                       --------------  ----- 
 0   Primary_applicant_age_in_years               1000 non-null   int64 
 1   Gender                                       1000 non-null   object
 2   Marital_status                               1000 non-null   object
 3   Number_of_dependents                         1000 non-null   int64 
 4   Housing                                      1000 non-null   object
 5   Years_at_current_residence                   1000 non-null   int64 
 6   Employment_status                            1000 non-null   object
 7   Foreign_worker                               1000 non-null   int64 
 8   Savings_account_balance                      1000 non-null   object
 9   Has_been_employed_for_at_least (year)        1000 non-null   int32 
 10  Months_loan_t

In [52]:
# As per EDA, dropping columns which are either correlated with some feature or are duplicates.
clm_to_drop = ['Principal_loan_amount','Months_loan_taken_for','Primary_applicant_age_in_years']
df1.drop(clm_to_drop, axis=1, inplace=True)

In [53]:
df1.sample(20)

Unnamed: 0,Gender,Marital_status,Number_of_dependents,Housing,Years_at_current_residence,Employment_status,Foreign_worker,Savings_account_balance,Has_been_employed_for_at_least (year),Purpose,EMI_rate_in_percentage_of_disposable_income,Property,Has_coapplicant,Has_guarantor,Number_of_existing_loans_at_this_bank,Loan_history,high_risk_applicant
53,male,single,1,own,1,employee,1,Low,1,used vehicle,2,building society savings agreement/life insurance,0,0,1,existing loans paid back duly till now,0
35,male,single,1,own,2,unskilled - resident,1,Low,0,electronic equipment,4,building society savings agreement/life insurance,0,0,2,critical/pending loans at other banks,1
31,male,single,1,own,2,employee,1,Low,1,FF&E,2,car or other,0,0,1,existing loans paid back duly till now,0
786,male,single,1,own,4,employee,1,High,7,electronic equipment,3,car or other,0,0,1,existing loans paid back duly till now,0
177,male,single,1,own,4,employee,1,High,7,electronic equipment,4,car or other,0,0,2,critical/pending loans at other banks,0
658,female,separated,1,own,1,employee,1,Low,1,business,2,car or other,0,0,2,no loans taken/all loans paid back duly,0
957,male,single,1,own,4,unskilled - resident,1,Low,7,electronic equipment,2,real estate,0,0,3,critical/pending loans at other banks,0
542,male,single,1,own,4,employee,1,Low,7,FF&E,4,building society savings agreement/life insurance,0,0,1,existing loans paid back duly till now,1
94,male,single,1,own,4,employee,1,Very high,7,new vehicle,4,real estate,0,0,1,existing loans paid back duly till now,0
323,male,single,1,own,2,employee,1,Very high,1,FF&E,4,car or other,0,0,1,existing loans paid back duly till now,0


### Feature Engineering | Encoding categorical features

In [54]:
df1.columns


Index(['Gender', 'Marital_status', 'Number_of_dependents', 'Housing',
       'Years_at_current_residence', 'Employment_status', 'Foreign_worker',
       'Savings_account_balance', 'Has_been_employed_for_at_least (year)',
       'Purpose', 'EMI_rate_in_percentage_of_disposable_income', 'Property',
       'Has_coapplicant', 'Has_guarantor',
       'Number_of_existing_loans_at_this_bank', 'Loan_history',
       'high_risk_applicant'],
      dtype='object')

In [55]:
df1['Loan_history'].unique()

array(['critical/pending loans at other banks',
       'existing loans paid back duly till now',
       'delay in paying off loans in the past',
       'no loans taken/all loans paid back duly',
       'all loans at this bank paid back duly'], dtype=object)

In [56]:
# Dropping target to create Features set an Target set seperately as X & Y
X = df1.drop(['high_risk_applicant'], axis=1)
Y = df1['high_risk_applicant']

In [57]:
X.select_dtypes(include='object').columns

Index(['Gender', 'Marital_status', 'Housing', 'Employment_status',
       'Savings_account_balance', 'Purpose', 'Property', 'Loan_history'],
      dtype='object')

In [58]:
# Label-encode every categorical (object) column of X.
# A separate fitted encoder is kept per column in `label_encoders`, so the same
# category -> integer mapping can be re-applied (or inverted) on new data at prediction
# time; re-fitting a single shared encoder would retain only the last column's mapping.
label_encoders = {}
for i in (X.select_dtypes(include='object').columns):
    labelencoder_X = LabelEncoder()
    X[i] = labelencoder_X.fit_transform(X[i])
    label_encoders[i] = labelencoder_X

In [59]:
X.head(10)

Unnamed: 0,Gender,Marital_status,Number_of_dependents,Housing,Years_at_current_residence,Employment_status,Foreign_worker,Savings_account_balance,Has_been_employed_for_at_least (year),Purpose,EMI_rate_in_percentage_of_disposable_income,Property,Has_coapplicant,Has_guarantor,Number_of_existing_loans_at_this_bank,Loan_history
0,1,2,1,1,4,0,1,1,7,5,4,2,0,0,2,1
1,0,1,1,1,2,0,1,1,1,5,2,2,0,0,1,3
2,1,2,2,1,3,3,1,1,4,4,2,2,0,0,1,1
3,1,2,2,0,4,0,1,1,4,0,2,0,0,1,1,3
4,1,2,2,0,4,0,1,1,1,6,3,1,0,0,2,2
5,1,2,2,0,4,3,1,1,1,4,2,1,0,0,1,3
6,1,2,1,1,4,0,1,0,7,0,3,0,0,0,1,3
7,1,2,1,2,2,1,1,1,1,8,2,1,0,0,1,3
8,1,1,1,1,4,3,1,3,4,5,2,2,0,0,1,3
9,1,0,1,1,2,1,1,1,1,6,4,1,0,0,2,1


In [60]:
X.shape

(1000, 16)

###  Feature Engineering | Train Test Split

In [62]:
# Train tst split

X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.20, random_state = 1,stratify=Y)

## <center>  Logistic Regression 

In [64]:
# Baseline model: fit a logistic regression on the training split and evaluate it
# on the held-out test split.
# classification_report is imported only here; the other names on this line are
# also imported in the first cell.
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, classification_report

m1 = 'LogisticRegression'

lr = LogisticRegression(random_state=1, max_iter=1000)

# `model` holds the value returned by fit(); predictions below are made through `lr`.
model = lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)

# No `labels` argument is passed here, whereas the random-forest cell passes
# labels=(1,0), so the two printed matrices are not laid out in the same order.
lr_conf_matrix = confusion_matrix(y_test, lr_predict)
lr_acc_score = accuracy_score(y_test, lr_predict)

print("confussion matrix")
print(lr_conf_matrix)
print("\n")
# Accuracy is printed as a percentage.
print("Accuracy of Logistic Regression:",lr_acc_score*100,'\n')
print(classification_report(y_test,lr_predict))

confussion matrix
[[134   6]
 [ 57   3]]


Accuracy of Logistic Regression: 68.5 

              precision    recall  f1-score   support

           0       0.70      0.96      0.81       140
           1       0.33      0.05      0.09        60

    accuracy                           0.69       200
   macro avg       0.52      0.50      0.45       200
weighted avg       0.59      0.69      0.59       200



## <center>Random Forest

In [75]:
# Fit a random forest on the training split and evaluate it on the held-out test split.
# RandomForestClassifier is already imported in the first cell; this import repeats it.
from sklearn.ensemble import RandomForestClassifier
print('Random Forest Classfier')

rf_model = RandomForestClassifier(n_estimators=150, random_state=0,max_depth=8, 
                            n_jobs=-1, max_features=10)
rf_model.fit(X_train,y_train)

rf_predicted = rf_model.predict(X_test)
# labels=(1,0) puts class 1 (high risk) in the first row/column, so this matrix is
# laid out differently from the logistic-regression one, which passes no `labels`.
rf_conf_matrix = confusion_matrix(y_test, rf_predicted,labels=(1,0))
rf_acc_score = accuracy_score(y_test, rf_predicted)

print("confussion matrix")
print(rf_conf_matrix)
print("\n")
# Accuracy is printed as a percentage.
print("Accuracy of Random Forest:",rf_acc_score*100,'\n')
print(classification_report(y_test,rf_predicted))

Random Forest Classfier
confussion matrix
[[ 15  45]
 [  9 131]]


Accuracy of Random Forest: 73.0 

              precision    recall  f1-score   support

           0       0.74      0.94      0.83       140
           1       0.62      0.25      0.36        60

    accuracy                           0.73       200
   macro avg       0.68      0.59      0.59       200
weighted avg       0.71      0.73      0.69       200



In [76]:
rf_model.feature_importances_

array([0.03416757, 0.05247769, 0.02911435, 0.04680883, 0.07133124,
       0.0631739 , 0.01132881, 0.06374291, 0.09557558, 0.13169478,
       0.0827777 , 0.06584328, 0.03251147, 0.0239286 , 0.04378322,
       0.15174006])

In [77]:
# Rank the features by the importance the fitted random forest assigned to them
# (highest first), index the table by feature name and show the top ten.
Feature = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_model.feature_importances_})
    .sort_values('Importance', ascending=False)
    .set_index('Feature')
)
Feature.head(10)

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
Loan_history,0.15174
Purpose,0.13169
Has_been_employed_for_at_least (year),0.09558
EMI_rate_in_percentage_of_disposable_income,0.08278
Years_at_current_residence,0.07133
Property,0.06584
Savings_account_balance,0.06374
Employment_status,0.06317
Marital_status,0.05248
Housing,0.04681


In [162]:
# exporting Random Forest for new data prediction
import pickle
# The context manager closes the file (flushing the pickle to disk) even if dump() fails;
# the original passed an open() handle that was never closed.
with open('rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

In [78]:
# Thank You