## Import libraries

In [1]:
import numpy as np
import pandas as pd 

## Loading Dataset

In [2]:
data = pd.read_csv("Dataset/loan_data.csv")
## 
df = data.copy()

## Data Preprocessing
- Normalize the column names to lower case
- Drop the ID column
- Remove the (+) sign on the Dependants column
- Fill the NaN in the (Dependants, Credit_History, Loan_Amount, Gender, Self_Employed) columns
- Change the dtypes for (Gender, Married, Self_Employed) to categorical
- Replace categorical column(Gender, Married, Self_Employed and Loan-Status) with integers


In [3]:
df.columns = df.columns.str.lower()
df.columns

Index(['loan_id', 'gender', 'married', 'dependents', 'education',
       'self_employed', 'applicantincome', 'coapplicantincome', 'loanamount',
       'loan_amount_term', 'credit_history', 'property_area', 'loan_status'],
      dtype='object')

In [4]:
df = df.drop(['loan_id'], axis=1)

In [5]:
## remove + from the dependents column
df['dependents'] = df['dependents'].str.replace('+', '')

In [6]:
## convert the dependents column type to float
df['dependents'] = df['dependents'].astype('float')

In [7]:
df['dependents'].isnull().sum()

8

In [8]:
## fill the null values in the dependents column
df['dependents'] = df['dependents'].fillna(df['dependents'].mean())

In [9]:
## fill the null values in the credit_history column
df['credit_history'] = df['credit_history'].fillna(df['credit_history'].mean())

In [10]:
## fill the null values in the loan_amount_term column
df['loan_amount_term'] = df['loan_amount_term'].fillna(df['loan_amount_term'].mean())

In [11]:
## convert the dependents column type to integers
df['dependents'] = df['dependents'].astype('int64')

In [12]:
## convert the loan_amount_term column type to integer
df['loan_amount_term'] = df['loan_amount_term'].astype('int64')

In [13]:
## convert the applicantincome column type to float
df['applicantincome'] = df['applicantincome'].astype('float')

In [14]:
## convert the credit history column type to integer
df['credit_history'] = df['credit_history'].astype('int64')

In [15]:
## convert the gender column type to category
df['gender'] = df['gender'].astype('category')

In [16]:
## convert the astype of the gender from category to integer
df.gender = (df.gender == 'Male').astype(int)

In [17]:
## convert the married column type to category
df['married'] = df['married'].astype('category')

In [18]:
## convert the astype of the married from category to integer
df.married = (df.married == 'Yes').astype(int)

In [19]:
## convert the self employed column type to category
df['self_employed'] = df['self_employed'].astype('category')

In [20]:
## convert the astype of the self employed from category to integer
df.self_employed = (df.self_employed == 'No').astype(int)

In [21]:
## target variable 
df.gender.value_counts()

gender
1    291
0     90
Name: count, dtype: int64

In [22]:
## fill the missing value of the gender column to Male
df.gender = df.gender.fillna("Male")

In [23]:
df.self_employed.value_counts()

self_employed
1    325
0     56
Name: count, dtype: int64

In [24]:
## fill the missing value of the self employed column to No
df.self_employed = df.self_employed.fillna("No")

In [25]:
## confirming the null values
df.isnull().sum()

gender               0
married              0
dependents           0
education            0
self_employed        0
applicantincome      0
coapplicantincome    0
loanamount           0
loan_amount_term     0
credit_history       0
property_area        0
loan_status          0
dtype: int64

In [26]:
## confirming the dtypes
df.dtypes

gender                 int32
married                int32
dependents             int64
education             object
self_employed          int32
applicantincome      float64
coapplicantincome    float64
loanamount           float64
loan_amount_term       int64
credit_history         int64
property_area         object
loan_status           object
dtype: object

In [27]:
df.loan_status.head()

0    N
1    Y
2    Y
3    Y
4    Y
Name: loan_status, dtype: object

In [28]:
## convert the astype of the loan status from category to integer
df.loan_status = (df.loan_status == 'N').astype(int)

In [29]:
df.loan_status.head()

0    1
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int32

In [30]:
df.head()

Unnamed: 0,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,1,1,1,Graduate,1,4583.0,1508.0,128.0,360,1,Rural,1
1,1,1,0,Graduate,0,3000.0,0.0,66.0,360,1,Urban,0
2,1,1,0,Not Graduate,1,2583.0,2358.0,120.0,360,1,Urban,0
3,1,0,0,Graduate,1,6000.0,0.0,141.0,360,1,Urban,0
4,1,1,0,Not Graduate,1,2333.0,1516.0,95.0,360,1,Urban,0


In [31]:
## saving the cleaned loan dataset
df.to_csv("Dataset/cleaned_loan_status_dataset.csv")