# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Importing Dataset

In [17]:
# Set a random seed for reproducibility
np.random.seed(0)

# Generate random data for the features
data = {
    'MonthlyIncome': np.random.randint(2000, 10000, 1000),
    'Age': np.random.randint(20, 60, 1000),
    'CreditScore': np.random.randint(300, 850, 1000), # Standard range of credit score is 300 to 850
    'LoanAmountRequested': np.random.randint(5000, 20000, 1000),
    'LoanTermMonths': np.random.randint(12, 60, 1000),
    'EmploymentStatus': np.random.choice(['Employed', 'Unemployed', 'Self-Employed'], 1000),
    'MaritalStatus': np.random.choice(['Single', 'Married', 'Divorced'], 1000),
    'NumDependents': np.random.randint(0, 5, 1000),
    'LoanApproved': np.random.choice([0, 1], 1000),  # Binary: 0 for Not Approved, 1 for Approved
}

# Create a DataFrame
df = pd.DataFrame(data)
df.head()

Unnamed: 0,MonthlyIncome,Age,CreditScore,LoanAmountRequested,LoanTermMonths,EmploymentStatus,MaritalStatus,NumDependents,LoanApproved
0,4732,58,472,11572,50,Self-Employed,Divorced,1,0
1,4607,40,540,13194,24,Self-Employed,Single,4,0
2,3653,20,667,16230,27,Self-Employed,Divorced,0,1
3,5264,56,498,6093,40,Employed,Divorced,1,1
4,6931,22,562,14775,37,Self-Employed,Single,0,0


In [88]:
# Saving dataframe into 'csv' file
df.to_csv('loan_approval_data.csv', index=False)

In [18]:
# Saving the synthetic dataset
df.to_csv('loan_approval_dataset.csv', index=False)

In [19]:
df.shape

(1000, 9)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   MonthlyIncome        1000 non-null   int32 
 1   Age                  1000 non-null   int32 
 2   CreditScore          1000 non-null   int32 
 3   LoanAmountRequested  1000 non-null   int32 
 4   LoanTermMonths       1000 non-null   int32 
 5   EmploymentStatus     1000 non-null   object
 6   MaritalStatus        1000 non-null   object
 7   NumDependents        1000 non-null   int32 
 8   LoanApproved         1000 non-null   int32 
dtypes: int32(7), object(2)
memory usage: 43.1+ KB


In [21]:
# Checking the missing value
df.isnull().sum()

MonthlyIncome          0
Age                    0
CreditScore            0
LoanAmountRequested    0
LoanTermMonths         0
EmploymentStatus       0
MaritalStatus          0
NumDependents          0
LoanApproved           0
dtype: int64

# Data Preprocessing

### Replacing Catogorical value to Numerical value

In [22]:
df.MaritalStatus=df.MaritalStatus.map({'Married':0,'Divorced':1,'Single':2})
df['MaritalStatus'].value_counts()

0    345
2    343
1    312
Name: MaritalStatus, dtype: int64

In [24]:
df.EmploymentStatus=df.EmploymentStatus.map({'Employed':0,'Unemployed':1,'Self-Employed':2})
df['EmploymentStatus'].value_counts()

1    362
0    333
2    305
Name: EmploymentStatus, dtype: int64

In [27]:
df['LoanApproved'].value_counts()

1    515
0    485
Name: LoanApproved, dtype: int64

In [29]:
# Final Dataframe
df.head()

Unnamed: 0,MonthlyIncome,Age,CreditScore,LoanAmountRequested,LoanTermMonths,EmploymentStatus,MaritalStatus,NumDependents,LoanApproved
0,4732,58,472,11572,50,2,1,1,0
1,4607,40,540,13194,24,2,2,4,0
2,3653,20,667,16230,27,2,1,0,1
3,5264,56,498,6093,40,0,1,1,1
4,6931,22,562,14775,37,2,2,0,0


# Model Building

In [30]:
# Importing packeges for classification algorithms

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [34]:
X = df.drop('LoanApproved', axis = 1)
y = df['LoanApproved']
print('shape of X =', X.shape)
print('shape of y =', y.shape)

shape of X = (1000, 8)
shape of y = (1000,)


In [35]:
# Spliting the data into train and test set

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

print('Shape of X_train', X_train.shape)
print('Shape of X_test', X_test.shape)
print('Shape of y_train', y_train.shape)
print('Shape of y_test', y_test.shape)

Shape of X_train (800, 8)
Shape of X_test (200, 8)
Shape of y_train (800,)
Shape of y_test (200,)


In [36]:
model = LogisticRegression()

In [37]:
model.fit(X_train, y_train)

In [38]:
# Predicting target variable on the X_test data
y_pred = model.predict(X_test)
y_pred

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1])

# Model Evaluation

In [39]:
# importing metrics for model evaluation
from sklearn import metrics

In [40]:
accuracy=metrics.accuracy_score(y_test,y_pred)
accuracy

0.465

In [41]:
precision=metrics.precision_score(y_test,y_pred)
precision

0.4909090909090909

In [42]:
recall=metrics.recall_score(y_test,y_pred)
recall

0.5142857142857142

In [44]:
F1_score = metrics.f1_score(y_test, y_pred)
F1_score

0.5023255813953488

In [45]:
# accuracy of the model
model.score(X_test, y_test)

0.465

# Feature Engineering

In [71]:
# Normalization
from sklearn.preprocessing import MinMaxScaler

In [59]:
# To make our dataset values between 0 and 1
mmc = MinMaxScaler()
mmc.fit(X_train)

In [60]:
X_train_mmc = mmc.transform(X_train)
X_test_mmc = mmc.transform(X_test)

In [61]:
X_train_mmc

array([[0.62836064, 0.56410256, 0.96357013, ..., 0.5       , 0.5       ,
        0.        ],
       [0.07565337, 0.79487179, 0.20218579, ..., 1.        , 1.        ,
        1.        ],
       [0.549331  , 0.30769231, 0.0564663 , ..., 1.        , 0.        ,
        1.        ],
       ...,
       [0.91134175, 1.        , 0.97632058, ..., 0.5       , 0.        ,
        0.75      ],
       [0.78554458, 0.23076923, 0.47176685, ..., 0.5       , 0.        ,
        0.        ],
       [0.83106165, 0.48717949, 0.41165756, ..., 0.5       , 0.5       ,
        0.5       ]])

In [62]:
# Creating dataframe of transformed MinMaxScaler X_train

X_train_mmc = pd.DataFrame(X_train_mmc, columns = ['MonthlyIncome','Age','CreditScore','LoanAmountRequested','LoanTermMonths'
                                                ,'EmploymentStatus','MaritalStatus','NumDependents'])

X_test_mmc = pd.DataFrame(X_test_mmc, columns = ['MonthlyIncome','Age','CreditScore','LoanAmountRequested','LoanTermMonths'
                                                ,'EmploymentStatus','MaritalStatus','NumDependents'])

In [65]:
X_train_mmc.describe()       # You can see after after transforming MinMaxScalar --> min value(min) = 0 and max value(max) = 1

Unnamed: 0,MonthlyIncome,Age,CreditScore,LoanAmountRequested,LoanTermMonths,EmploymentStatus,MaritalStatus,NumDependents
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,0.495535,0.507756,0.493689,0.502647,0.49609,0.489375,0.4925,0.494375
std,0.28797,0.295901,0.29121,0.28833,0.294943,0.398934,0.417027,0.349279
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.250907,0.25641,0.23816,0.245994,0.25,0.0,0.0,0.25
50%,0.493122,0.487179,0.495446,0.512485,0.510638,0.5,0.5,0.5
75%,0.733681,0.769231,0.748634,0.750083,0.765957,1.0,1.0,0.75
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [68]:
# Predicted value of y after applying MinMaxScaler
y_pred_mmc = model.predict(X_test_mmc)
y_pred_mmc

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [70]:
# accuracy of model
accuracy=metrics.accuracy_score(y_test,y_pred_mmc)
accuracy

0.49

In [72]:
model.score(X_test_mmc, y_test)

0.49

#### Utilizing feature engineering techniques out model got improved by 46% to 49%.

In [81]:
# Predicting Loan Approval

In [82]:
model.predict([[5743,36,459,18990,24,1,2,2]])[0]

0

In [83]:
model.predict([[3653,20,667,16230,27,2,1,0]])[0]

1

#### Here,
#### 1 = Loan Approved
#### 0 = Loan Not Approved

### Comparing Logistic Regression model with other classification model

In [84]:
# importing SVM model
from sklearn.svm import SVC

In [85]:
svm = SVC()
svm.fit(X_train,y_train)

In [86]:
#accuracy
svm.score(X_test,y_test)

0.525

#### Accuracy of model (using Logistic Regression) = 54%
#### Accuracy of model (using Support Vector Machine) = 41%
#### Thus, we can  see that Support Vector Machine is good model for this dataset.