In [17]:
import pandas as pd

In [18]:
data=pd.read_csv('data\loan_prediction.csv')

In [19]:
#drop Loan_ID column as its insignificant
data=data.drop(labels=['Loan_ID'],axis=1)


In [21]:
data.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [22]:
# Handling missing values in Dependents column
data['Dependents'].fillna(0, inplace=True)  # Replace missing values with 0
data['Dependents'] = data['Dependents'].replace('3+', 3) #replacing 3+ with 3 
data['Dependents'] = data['Dependents'].astype(int) #changing its datatype from object to int

# if the coapplicant's income is zero, the person is married otherwise unmarried.
data['Married'] = data.Married.fillna(data.apply(lambda x: 'Yes' if x['CoapplicantIncome'] > 0 else 'No', axis=1))


# if income of an applicant is less than the the average income of all the applicants, then the applicant is considered female, otherwise male 
data['Gender'] = data.Gender.fillna(data.apply(lambda x: 'Female' if x['ApplicantIncome'] < data['ApplicantIncome'].mean() else 'Male', axis=1))



#if the income of co-applicant is 0 then the person is self-employed, otherwise considered not.
data['Self_Employed'] = data.Self_Employed.fillna(data.apply(lambda x: 'Yes' if x['CoapplicantIncome'] == 0 else 'No', axis=1)) 


data['Loan_Status'] = data['Loan_Status'].astype(str) #converting datatype of loan_status to string


#replacing 'Y' with 1.0 and 'N' with 0.0 for null values in credit_history
data['Credit_History'] = data.apply(lambda row: 1.0 if row['Loan_Status'] == 'Y' else 0.0 if pd.isnull(row['Credit_History']) else row['Credit_History'], axis=1)


#replacing null values with their mean in LoanAmount and Loan_Amount_Term columns
data['LoanAmount'] = data['LoanAmount'].fillna(data['LoanAmount'].mean()) 
data['Loan_Amount_Term'] = data['Loan_Amount_Term'].fillna(data['Loan_Amount_Term'].mean())




In [24]:
data.isnull().sum()

Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             614 non-null    object 
 1   Married            614 non-null    object 
 2   Dependents         614 non-null    int32  
 3   Education          614 non-null    object 
 4   Self_Employed      614 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         614 non-null    float64
 8   Loan_Amount_Term   614 non-null    float64
 9   Credit_History     614 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int32(1), int64(1), object(6)
memory usage: 55.3+ KB


In [25]:
#Encoding non-numerical features
data.replace({'Gender':{'Female':0,'Male':1}},inplace=True)
data.replace({'Married':{'Yes':1,'No':0}},inplace=True)
data.replace({'Dependents':{'0':0,'1':1,'2':2,'3+':3}},inplace=True)
data.replace({'Education':{'Graduate':0,'Not Graduate':1}},inplace=True)
data.replace({'Self_Employed':{'No':0,'Yes':1}},inplace=True)
data.replace({'Credit_History':{'1.0':1,'0.0':0}},inplace=True)
data.replace({'Property_Area':{'Semiurban':0,'Urban':1,'Rural':2}},inplace=True)
data.replace({'Loan_Status':{'Y':1,'N':0}},inplace=True)


In [26]:
data

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,1,1
1,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,2,0
2,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,1,1
3,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,1,1
4,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,2,1
610,1,1,3,0,0,4106,0.0,40.000000,180.0,1.0,2,1
611,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,1,1
612,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,1,1


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [28]:
X = data.drop(['Loan_Status'], axis=1)
y = data['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [30]:

models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'SupportVectorClassifier': SVC()
}

trained_model_list = []
model_list = []
r2_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make Predictions
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:", rmse)
    print("MAE:", mae)
    print("R2 score:", r2_square * 100)
    print("Accuracy:", accuracy * 100)

    r2_list.append(r2_square)
    accuracy_list.append(accuracy)

    print('=' * 35)
    print('\n')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression
Model Training Performance
RMSE: 0.4026936331284146
MAE: 0.16216216216216217
R2 score: 28.846153846153843
Accuracy: 83.78378378378379


DecisionTreeClassifier
Model Training Performance
RMSE: 0.4931969619160719
MAE: 0.24324324324324326
R2 score: -6.730769230769229
Accuracy: 75.67567567567568


RandomForestClassifier
Model Training Performance
RMSE: 0.4093501771925445
MAE: 0.16756756756756758
R2 score: 26.47435897435897
Accuracy: 83.24324324324324


SupportVectorClassifier
Model Training Performance
RMSE: 0.5927489783638191
MAE: 0.35135135135135137
R2 score: -54.16666666666667
Accuracy: 64.86486486486487




In [31]:
max_accuracy_index = accuracy_list.index(max(accuracy_list))
best_model_name = model_list[max_accuracy_index]
best_accuracy = accuracy_list[max_accuracy_index]

print("Model with the highest accuracy:")
print("Model Name:", best_model_name)
print("Accuracy:", best_accuracy)


Model with the highest accuracy:
Model Name: LogisticRegression
Accuracy: 0.8378378378378378
