# Project Overview

* In this project, we will try to predict whether a loan application will be approved, using Decision Tree and Random Forest algorithms. The dataset includes applicant details such as income, education, credit history, loan amount, and other relevant features.

In [1]:
import pandas as pd
import numpy as npS
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw loan-application records (path is relative to the working directory).
DATA_PATH = 'LoanApprovalPrediction.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to get a feel for the columns.
df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Data Preprocessing

In [4]:
# Summarise column dtypes and non-null counts to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 598 entries, 0 to 597
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            598 non-null    object 
 1   Gender             598 non-null    object 
 2   Married            598 non-null    object 
 3   Dependents         586 non-null    float64
 4   Education          598 non-null    object 
 5   Self_Employed      598 non-null    object 
 6   ApplicantIncome    598 non-null    int64  
 7   CoapplicantIncome  598 non-null    float64
 8   LoanAmount         577 non-null    float64
 9   Loan_Amount_Term   584 non-null    float64
 10  Credit_History     549 non-null    float64
 11  Property_Area      598 non-null    object 
 12  Loan_Status        598 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 60.9+ KB


In [5]:
# Count the missing values in each column before imputation.
df.isna().sum()

Loan_ID               0
Gender                0
Married               0
Dependents           12
Education             0
Self_Employed         0
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           21
Loan_Amount_Term     14
Credit_History       49
Property_Area         0
Loan_Status           0
dtype: int64

#### Fill missing values

In [6]:
# Impute each incomplete column with a single representative value:
# the mode for the discrete-valued columns and the median for LoanAmount.
# NOTE(review): these statistics are computed on the full frame, i.e. before
# the train/test split further down, so test rows influence the fill values.
fill_values = {
    'Dependents': df['Dependents'].mode()[0],
    'LoanAmount': df['LoanAmount'].median(),
    'Loan_Amount_Term': df['Loan_Amount_Term'].mode()[0],
    'Credit_History': df['Credit_History'].mode()[0],
}
for column_name, fill_value in fill_values.items():
    df[column_name] = df[column_name].fillna(fill_value)

In [7]:
# Re-count missing values per column to confirm the imputation left none.
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

#### Check duplicated values

In [8]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [9]:
# Look at the first three rows after imputation.
df.head(n=3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,127.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y


In [10]:
# Loan_ID is a per-row identifier with no predictive value, so remove it.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='Loan_ID', errors='ignore')

In [11]:
# List the remaining column labels.
df.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

### Convert categorical variables into indicators

* Since our categorical variables have only a few unique values, we will use `pd.get_dummies` (one-hot encoding).

In [12]:
# One-hot encode the categorical columns (the target included). drop_first=True
# drops one level per column, so each binary column becomes a single 0/1 indicator.
categorical_columns = [
    'Gender',
    'Married',
    'Education',
    'Self_Employed',
    'Property_Area',
    'Loan_Status',
]
df = pd.get_dummies(
    df,
    columns=categorical_columns,
    drop_first=True,
    dtype=int,
)

In [13]:
df.head(3)

Unnamed: 0,Dependents,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,0.0,5849,0.0,127.0,360.0,1.0,1,0,0,0,0,1,1
1,1.0,4583,1508.0,128.0,360.0,1.0,1,1,0,0,0,0,0
2,0.0,3000,0.0,66.0,360.0,1.0,1,1,0,1,0,1,1


### Let's split our dataset

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Separate the encoded target (1 = approved) from the feature matrix.
target_column = 'Loan_Status_Y'
y = df[target_column]
X = df.drop(columns=target_column)

In [16]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

## Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

# Fit a single decision tree on the training split and predict the held-out rows.
# random_state is fixed because scikit-learn randomly permutes candidate features
# at each split; without a seed the tree (and its metrics) can change between runs.
dt_model = DecisionTreeClassifier(random_state=101)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

### Evaluate Model

In [18]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the decision tree's predictions with the true test labels:
# raw confusion counts first, then per-class precision/recall/F1.
dt_confusion = confusion_matrix(y_test, dt_pred)
dt_report = classification_report(y_test, dt_pred)
print(dt_confusion)
print(dt_report)

[[30 26]
 [28 96]]
              precision    recall  f1-score   support

           0       0.52      0.54      0.53        56
           1       0.79      0.77      0.78       124

    accuracy                           0.70       180
   macro avg       0.65      0.65      0.65       180
weighted avg       0.70      0.70      0.70       180



## Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split and predict the held-out rows.
# random_state is fixed because the forest draws bootstrap samples and random
# feature subsets; without a seed the reported metrics vary from run to run.
rf_model = RandomForestClassifier(random_state=101)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

### Evaluate Model

In [20]:
from sklearn.metrics import classification_report, confusion_matrix

# Compare the random forest's predictions with the true test labels:
# raw confusion counts first, then per-class precision/recall/F1.
rf_confusion = confusion_matrix(y_test, rf_pred)
rf_report = classification_report(y_test, rf_pred)
print(rf_confusion)
print(rf_report)

[[ 25  31]
 [  7 117]]
              precision    recall  f1-score   support

           0       0.78      0.45      0.57        56
           1       0.79      0.94      0.86       124

    accuracy                           0.79       180
   macro avg       0.79      0.69      0.71       180
weighted avg       0.79      0.79      0.77       180



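* We built and compared two models to predict loan approval. In the run shown above, Random Forest reached higher overall accuracy than the single Decision Tree (0.79 vs. 0.70), which is consistent with the variance reduction an ensemble provides. Note, however, that its recall on rejected loans (class 0) was lower (0.45 vs. 0.54).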

## Decision Tree vs Random Forest

* A DecisionTreeClassifier is a single, tree-like model that uses a hierarchical decision-making process to classify data. In contrast, a RandomForestClassifier is an ensemble method that combines multiple DecisionTrees, each trained on different subsets of the data and features. This makes Random Forests generally more accurate,