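# Introduction
The goal of this project is to predict whether a loan application will be approved or rejected, based on various applicant factors. This notebook loads the raw data, cleans and preprocesses it, and exports the processed features and target.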

# importing relevant packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

## importing data

In [2]:
# Load the raw loan-approval dataset from a CSV path relative to the working directory.
# NOTE(review): the headers and string values in this file carry a leading space
# (see the df.columns output further down); they are stripped in later cells.
df = pd.read_csv("../loan_approval_dataset.csv")

In [3]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,4,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected


In [4]:
# (number of rows, number of columns) of the raw frame.
df.shape    

(4269, 13)

## number of unique items in each column

In [5]:
# Count distinct values per column; a constant column would show 1.
df.nunique()

loan_id                      4269
 no_of_dependents               6
 education                      2
 self_employed                  2
 income_annum                  98
 loan_amount                  378
 loan_term                     10
 cibil_score                  601
 residential_assets_value     278
 commercial_assets_value      188
 luxury_assets_value          379
 bank_asset_value             146
 loan_status                    2
dtype: int64

Every column has at least 2 unique values.
* If a column had only 1 unique value it would be constant and of no use to us, so no column needs to be dropped for that reason.

## duplicate check

In [6]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

* there are no duplicates in our data set

## columns check

In [7]:
# Inspect the raw header names exactly as read from the file.
df.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

* Just by looking at the column names, we can see an inconsistency in the naming: every column except `loan_id` has a leading space in its name. We need to address this, as it would make referring to columns by name error-prone later on.

## refining column names
* removing the leading/trailing spaces from the column names

In [8]:
# Strip leading and trailing whitespace from every header name, using the
# vectorised string accessor on the column Index.
df.columns = df.columns.str.strip()

In [9]:
# Confirm the header names no longer carry surrounding whitespace.
df.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status'],
      dtype='object')

## checking for missing values

In [10]:
# Count missing values per column.
df.isna().sum()

loan_id                     0
no_of_dependents            0
education                   0
self_employed               0
income_annum                0
loan_amount                 0
loan_term                   0
cibil_score                 0
residential_assets_value    0
commercial_assets_value     0
luxury_assets_value         0
bank_asset_value            0
loan_status                 0
dtype: int64

There are no missing values in any column.
* Hence, there is no need to impute anything.

## splitting the data
Here we separate the independent variables (the features, `X`) from the dependent variable (the target, `y`, i.e. `loan_status`).

In [11]:
# Features: every column except the target. drop() returns a new DataFrame,
# so later changes to X do not touch df.
X = df.drop(columns = ["loan_status"])
# Target: the loan_status column.
y = df["loan_status"]

In [12]:
# Column dtypes and non-null counts of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   loan_id                   4269 non-null   int64 
 1   no_of_dependents          4269 non-null   int64 
 2   education                 4269 non-null   object
 3   self_employed             4269 non-null   object
 4   income_annum              4269 non-null   int64 
 5   loan_amount               4269 non-null   int64 
 6   loan_term                 4269 non-null   int64 
 7   cibil_score               4269 non-null   int64 
 8   residential_assets_value  4269 non-null   int64 
 9   commercial_assets_value   4269 non-null   int64 
 10  luxury_assets_value       4269 non-null   int64 
 11  bank_asset_value          4269 non-null   int64 
dtypes: int64(10), object(2)
memory usage: 400.3+ KB


### applying domain knowledge
* The columns `residential_assets_value`, `commercial_assets_value`, `luxury_assets_value`, and `bank_asset_value`
  are added together into a single feature, `total_assets_value`, which is the sum of all asset values.

In [13]:
# New feature: row-wise sum of the four asset-value columns.
X["total_assets_value"] = X["residential_assets_value"] + X["bank_asset_value"] + X["commercial_assets_value"] + X["luxury_assets_value"]


In [14]:
# Check that total_assets_value was appended to the feature columns.
X.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'total_assets_value'],
      dtype='object')

In [15]:
# Drop the four individual asset columns, which are now summarised by
# total_assets_value. The result is reassigned instead of using inplace=True:
# drop() returns a new frame either way, and reassignment keeps the data
# lineage explicit and chainable.
X = X.drop(columns = ['residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value'])

In [16]:
# Check that the individual asset columns are gone.
X.columns

Index(['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'total_assets_value'],
      dtype='object')

In [17]:
# Re-check dtypes and non-null counts after the feature engineering step.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   loan_id             4269 non-null   int64 
 1   no_of_dependents    4269 non-null   int64 
 2   education           4269 non-null   object
 3   self_employed       4269 non-null   object
 4   income_annum        4269 non-null   int64 
 5   loan_amount         4269 non-null   int64 
 6   loan_term           4269 non-null   int64 
 7   cibil_score         4269 non-null   int64 
 8   total_assets_value  4269 non-null   int64 
dtypes: int64(7), object(2)
memory usage: 300.3+ KB


## handling categorical data

In [18]:
# Distinct raw categories in the education column.
X["education"].unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [19]:
# Distinct raw categories in the self_employed column.
X["self_employed"].unique()

array([' No', ' Yes'], dtype=object)

* It looks like the "education" and "self_employed" columns each have only two values.
* There also seems to be a leading space at the beginning of each value; let's remove it for the sake of convenience.

In [20]:
# Distinct raw labels in the target.
y.unique()

array([' Approved', ' Rejected'], dtype=object)

* Even the target variable has a leading space at the beginning of each value; let's remove these too.

In [21]:
# Strip surrounding whitespace from the two string feature columns ...
X["education"] = X["education"].str.strip()
X["self_employed"] = X["self_employed"].str.strip()
# ... and from the target labels, so later exact-match lookups work.
y = y.str.strip()

In [22]:
# Display the feature frame after stripping (the rendered output is truncated in the middle).
X

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,1,2,Graduate,No,9600000,29900000,12,778,50700000
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,17000000
2,3,3,Graduate,No,9100000,29700000,20,506,57700000
3,4,3,Graduate,No,8200000,30700000,8,467,52700000
4,5,5,Not Graduate,Yes,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...,...
4264,4265,5,Graduate,Yes,1000000,2300000,12,317,7400000
4265,4266,0,Not Graduate,Yes,3300000,11300000,20,559,20000000
4266,4267,2,Not Graduate,No,6500000,23900000,18,457,39000000
4267,4268,1,Not Graduate,No,4100000,12800000,8,780,28800000


In [23]:
# Confirm the education categories no longer have a leading space.
X["education"].unique()

array(['Graduate', 'Not Graduate'], dtype=object)

## encoding the categorical data
Converting the categorical columns such that:
* where we have `Graduate` we should have 1, and where we have `Not Graduate` we should have 0
* where we have `Yes` we should have 1, and where we have `No` we should have 0

Since we want explicit control over which category is mapped to 1, let's build a small function to do so.

In [24]:
## categorical data transform function

In [25]:
def transform_categorical_to_binary(df , columns_to_transform):
    '''
    Encode categorical columns as 0/1 integers.

    For every column named in ``columns_to_transform``, values that are among
    that column's "positive" values become 1 and every other value becomes 0.

    Note: ``df`` is modified in place and the very same object is returned,
    so a caller that keeps using its original variable sees the encoded
    columns as well.

    Args:
        df: the DataFrame to be modified.
        columns_to_transform: a dictionary whose keys are column names and
            whose values are the categorical value(s) to map to 1. A value
            may be a list/tuple/set of categories or a single string; a
            single string is treated as one category (exact match), not as
            a sequence of characters.

    Returns:
        The same DataFrame, with the listed columns replaced by int64 0/1.

    Raises:
        KeyError: if any requested column is missing from ``df``. The check
            runs before any column is modified, so ``df`` is left untouched
            in that case.
    '''
    # Validate up-front so a typo in a later key cannot leave df half-encoded.
    missing = [name for name in columns_to_transform if name not in df.columns]
    if missing:
        raise KeyError(f"columns not found in DataFrame: {missing}")

    for column_name, positive_value in columns_to_transform.items():
        # A bare string would otherwise be matched by substring
        # (e.g. "Grad" in "Graduate"), so wrap it into a one-element list.
        if isinstance(positive_value, (str, bytes)):
            positive_value = [positive_value]
        # Vectorised membership test instead of a per-row Python lambda.
        df[column_name] = df[column_name].isin(list(positive_value)).astype("int64")

    return df


In [26]:
# For each categorical column, the value(s) that should become 1; every other value becomes 0.
columns_to_transform = {
    'education' :['Graduate'],
    'self_employed' : ["Yes"]
}

# NOTE: the function modifies X in place and returns it, so transformed_X and X
# refer to the same DataFrame from here on.
transformed_X = transform_categorical_to_binary(X , columns_to_transform)
transformed_X

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,1,2,1,0,9600000,29900000,12,778,50700000
1,2,0,0,1,4100000,12200000,8,417,17000000
2,3,3,1,0,9100000,29700000,20,506,57700000
3,4,3,1,0,8200000,30700000,8,467,52700000
4,5,5,0,1,9800000,24200000,20,382,55000000
...,...,...,...,...,...,...,...,...,...
4264,4265,5,1,1,1000000,2300000,12,317,7400000
4265,4266,0,0,1,3300000,11300000,20,559,20000000
4266,4267,2,0,0,6500000,23900000,18,457,39000000
4267,4268,1,0,0,4100000,12800000,8,780,28800000


## Scaling the value


### log transformation

In [27]:
# First row before the log transform, for comparison with the row shown after it.
transformed_X.head(1)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,1,2,1,0,9600000,29900000,12,778,50700000


In [28]:
# Columns to which the natural logarithm is applied.
log_cols = ["income_annum", "loan_amount", "total_assets_value"]
# np.log is the natural log. NOTE(review): non-positive inputs would produce
# -inf/NaN, and re-running this cell applies the log a second time.
X[log_cols]  = np.log(X[log_cols])

In [29]:
# First row after the log transform.
X.head(1)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,1,2,1,0,16.077274,17.213369,12,778,17.741436


In [30]:
# encoding the target variables to binary using map method
y = y.map({"Approved":1,
           "Rejected":0})

In [31]:
# First five encoded target values.
y[:5]

0    1
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [32]:
# Preview the features after encoding and log transform.
X.head()

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,1,2,1,0,16.077274,17.213369,12,778,17.741436
1,2,0,0,1,15.226498,16.316947,8,417,16.648724
2,3,3,1,0,16.023785,17.206658,20,506,17.870768
3,4,3,1,0,15.919645,17.239773,8,467,17.780126
4,5,5,0,1,16.097893,17.001863,20,382,17.822844


### dropping the loan_id column as it is irrelevant to our prediction

In [37]:
# loan_id has one distinct value per row (4269 unique values in 4269 rows), so it is
# removed from the features.
# NOTE: inplace=True mutates the object that transformed_X also refers to, so
# the column disappears from transformed_X as well.
X.drop(columns = "loan_id", inplace = True)
X.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,total_assets_value
0,2,1,0,16.077274,17.213369,12,778,17.741436
1,0,0,1,15.226498,16.316947,8,417,16.648724
2,3,1,0,16.023785,17.206658,20,506,17.870768
3,3,1,0,15.919645,17.239773,8,467,17.780126
4,5,0,1,16.097893,17.001863,20,382,17.822844


## exporting processed data

In [38]:
# Write the processed features to CSV in the working directory, without the index column.
X.to_csv("X_processed.csv", index = False)

In [39]:
# Write the encoded target to CSV in the working directory, without the index column.
y.to_csv("y_processed.csv", index = False)