In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
train_data = pd.read_csv(r"C:\Users\Pratikk\OneDrive\Desktop\Loan Approval\Training Dataset.csv")
test_data = pd.read_csv(r"C:\Users\Pratikk\OneDrive\Desktop\Loan Approval\Test Dataset.csv")

# Printing Unprocessed Data

In [3]:
print(train_data.head)
print(test_data.head)

<bound method NDFrame.head of       Loan_ID  Gender Married Dependents     Education Self_Employed  \
0    LP001002    Male      No          0      Graduate            No   
1    LP001003    Male     Yes          1      Graduate            No   
2    LP001005    Male     Yes          0      Graduate           Yes   
3    LP001006    Male     Yes          0  Not Graduate            No   
4    LP001008    Male      No          0      Graduate            No   
..        ...     ...     ...        ...           ...           ...   
609  LP002978  Female      No          0      Graduate            No   
610  LP002979    Male     Yes         3+      Graduate            No   
611  LP002983    Male     Yes          1      Graduate            No   
612  LP002984    Male     Yes          2      Graduate            No   
613  LP002990  Female      No          0      Graduate           Yes   

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0               5849            

# Selection of Features

In [4]:
features = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term', 'Credit_History']
encoded_features = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']

# Feature Extraction

In [5]:
train_data = train_data[features+encoded_features+['Loan_Status']]
test_data = test_data[features+encoded_features]

In [6]:
for column in encoded_features:
    le = LabelEncoder()
    train_data[column] = le.fit_transform(train_data[column].astype(str))
    test_data[column] = le.fit_transform(test_data[column].astype(str))

# Handling Missing Values

In [7]:
train_data = train_data.dropna()
test_data = test_data.dropna()

In [8]:
X = train_data.drop('Loan_Status', axis=1)
label = LabelEncoder()
Y  = label.fit_transform(train_data['Loan_Status'])

# Proccessed Data

In [9]:
print(X)
print(Y, len(Y))
print(test_data)

     ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
1               4583             1508.0       128.0             360.0   
2               3000                0.0        66.0             360.0   
3               2583             2358.0       120.0             360.0   
4               6000                0.0       141.0             360.0   
5               5417             4196.0       267.0             360.0   
..               ...                ...         ...               ...   
609             2900                0.0        71.0             360.0   
610             4106                0.0        40.0             180.0   
611             8072              240.0       253.0             360.0   
612             7583                0.0       187.0             360.0   
613             4583                0.0       133.0             360.0   

     Credit_History  Gender  Married  Education  Self_Employed  Property_Area  \
1               1.0       1        1      

# Data Splitting

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

print(f"Train size : {len(X_train)}")
print(f"Test size : {len(X_test)}")

Train size : 423
Test size : 106


In [11]:
# Data Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train)
print(X_test)

[[ 0.57433706 -0.60600209  1.11261895 ... -0.42297682  1.2378509
  -0.77755637]
 [ 1.6396026  -0.60600209  1.2964439  ...  1.54316817  1.2378509
  -0.77755637]
 [-0.10872579  0.13234255  0.00966928 ... -0.42297682 -0.0518312
   1.81907267]
 ...
 [-0.33957928 -0.34667382 -0.88647733 ... -0.42297682  1.2378509
   0.95352965]
 [-0.67276199  0.42130833 -0.40393685 ... -0.42297682 -1.34151329
  -0.77755637]
 [ 0.12764245  0.0219428   0.72199094 ... -0.42297682 -0.0518312
   0.95352965]]
[[-0.50348985  1.86354399 -0.88647733 ... -0.42297682 -0.0518312
  -0.77755637]
 [ 0.06192503 -0.60600209 -0.03628696 ... -0.42297682  1.2378509
  -0.77755637]
 [ 1.70164352 -0.23034656  0.0326474  ...  1.54316817 -1.34151329
   0.95352965]
 ...
 [-0.67659167  0.5135551  -0.42691497 ...  1.54316817  1.2378509
   0.08798664]
 [ 0.01765386  1.11593763  0.17051611 ... -0.42297682 -1.34151329
   0.95352965]
 [ 0.13269763  1.02998883  0.71050188 ... -0.42297682 -1.34151329
  -0.77755637]]


# Logistic Regression Model

In [12]:
model = LogisticRegression()

model.fit(X_train, Y_train)

# Prediction

In [13]:
# Making Predictions
y_pred = model.predict(X_test)

# Performance Matrix

In [14]:
accuracy = accuracy_score(Y_test, y_pred)
conf_matrix = confusion_matrix(Y_test, y_pred)
class_report = classification_report(Y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Accuracy: 0.8018867924528302
Confusion Matrix:
 [[15 21]
 [ 0 70]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.42      0.59        36
           1       0.77      1.00      0.87        70

    accuracy                           0.80       106
   macro avg       0.88      0.71      0.73       106
weighted avg       0.85      0.80      0.77       106

