Loan Eligibility Prediction

In [None]:
# Import the data-analysis and visualization libraries (numpy, pandas, seaborn, matplotlib.pyplot) and load the dataset.


import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Read the loan dataset from a CSV file located in the working directory.
data = pd.read_csv('LoanApprovalPrediction.csv')
# Last expression of the cell: the (rows, columns) tuple of the loaded frame.
data.shape

In [None]:
# Print a summary of the DataFrame loaded from LoanApprovalPrediction.csv
# (columns, non-null counts and dtypes).

data.info()

In [None]:
# Data preprocessing: count the missing (NaN) values in each column of the dataset.

data.isna().sum()

In [None]:
# Count how many distinct Loan_ID values the dataset contains.

data['Loan_ID'].nunique()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
# Note: duplicated() compares all columns, not only Loan_ID.

data.duplicated().sum()

In [None]:
# Remove the Loan_ID identifier column from the DataFrame.

data = data.drop(columns=['Loan_ID'])

In [None]:
# Total number of missing values in 'data': the per-column NaN counts summed
# over all columns.

data.isna().sum().sum()

In [None]:
# Handle the null values: pick one replacement value per column, then apply
# each of them.
#   Dependents       -> most frequent value (mode)
#   LoanAmount       -> column mean
#   Loan_Amount_Term -> column mean
#   Credit_History   -> constant 1
replacement_values = {
    'Dependents': data['Dependents'].mode()[0],
    'LoanAmount': data['LoanAmount'].mean(),
    'Loan_Amount_Term': data['Loan_Amount_Term'].mean(),
    'Credit_History': 1,
}

for column_name, replacement in replacement_values.items():
    data[column_name] = data[column_name].fillna(replacement)


In [None]:
# Convert the "Gender" column from its string labels to numeric codes.
# Values that are not keys of the mapping become NaN.

gender_codes = {'Male': 0, 'Female': 1}
data['Gender'] = data['Gender'].map(gender_codes)

In [None]:
# Count plot of Gender, with bars split by marital status (hue='Married').
# Gender was mapped to numeric codes earlier: 0 = Male, 1 = Female.

sns.countplot(data=data,x='Gender' ,hue='Married')
plt.show()

In [None]:
# Count plot showing how many rows fall into each Dependents value.

sns.countplot(data=data,x='Dependents')
plt.show()

In [None]:
# Count plot showing how many rows fall into each Education category.

sns.countplot(data=data, x='Education')
plt.show()

In [None]:
# Count plot of Loan_Status, with bars split by marital status (hue='Married').

sns.countplot(data=data,x='Loan_Status',hue='Married')
plt.show()

In [None]:
# Count plot showing how many rows fall into each Self_Employed category.

sns.countplot(data=data,x='Self_Employed')
plt.show()

In [None]:
# Reset the index of the DataFrame 'data' to a fresh 0..n-1 range.
# drop=True discards the old index instead of inserting it as a new 'index'
# column; such a column would otherwise be carried into the correlation
# matrix and into the feature matrix used for training, acting as a
# meaningless row-number feature.

data = data.reset_index(drop=True)

In [None]:
# Add a 'Total_Income' column to 'data': the element-wise sum of
# 'ApplicantIncome' and 'CoapplicantIncome'.

data['Total_Income'] = data['ApplicantIncome'].add(data['CoapplicantIncome'])

In [None]:
# Distribution plot (seaborn displot) of the Total_Income column of 'data'.

sns.displot(data=data, x='Total_Income')
plt.show()

In [None]:
# Create 'model_data' as a copy of 'data'.
# The log transforms for skewed numeric columns are applied to this copy,
# so the columns of 'data' keep their original values.

model_data = data.copy()

In [None]:
# Replace Total_Income with its natural logarithm to reduce the impact of
# extreme values and make the distribution less skewed.
# NOTE(review): assumes Total_Income is strictly positive - TODO confirm.

model_data['Total_Income'] = np.log(model_data['Total_Income'])

In [None]:
# Distribution plot (seaborn displot) of the log-transformed Total_Income
# column of 'model_data'.

sns.displot(data=model_data, x='Total_Income')
plt.show()

In [None]:
# Show the column labels of the 'model_data' DataFrame.

model_data.columns

In [None]:
# Replace LoanAmount in 'model_data' with its natural logarithm to reduce the
# impact of extreme values, then draw a distribution plot (seaborn displot)
# of the transformed column.


model_data['LoanAmount'] = np.log(model_data['LoanAmount'])
sns.displot(data=model_data, x='LoanAmount')
plt.show()

In [None]:
# List the distinct loan terms present in the Loan_Amount_Term column of 'model_data'.

model_data['Loan_Amount_Term'].unique()

In [None]:
# Distribution plot (seaborn displot) of the Loan_Amount_Term column of 'model_data'.

sns.displot(data=model_data,x='Loan_Amount_Term')
plt.show()

In [None]:
# Replace Loan_Amount_Term in 'model_data' with its natural logarithm to
# reduce the impact of extreme values, then draw a distribution plot
# (seaborn displot) of the transformed column.


model_data['Loan_Amount_Term'] = np.log(model_data['Loan_Amount_Term'])
sns.displot(data=model_data, x='Loan_Amount_Term')
plt.show()

In [None]:
# Retrieve all distinct values of the 'Credit_History' column.
# NOTE(review): expected to contain only the two values 0 and 1 - confirm
# against the actual data.

model_data['Credit_History'].unique()

In [None]:
# Count plot showing how many rows of 'model_data' have each Credit_History value.

sns.countplot(data=model_data, x='Credit_History')
plt.show()

In [None]:
# Count plot showing how many rows of 'data' fall into each Property_Area category.

sns.countplot(data=data,x='Property_Area')
plt.show()

In [None]:
# Create a heatmap visualization of the correlation matrix.
# At this point 'data' may still contain string (object) columns, and recent
# pandas versions raise when corr() is asked to correlate them, so the
# matrix is restricted to the numeric columns. It is computed once and reused
# for both the heatmap and the table displayed as the cell output.

corr_matrix = data.select_dtypes(include='number').corr()
sns.heatmap(corr_matrix, cmap='BuPu', annot=True)
corr_matrix

In [None]:
# Replace every object-dtype column of 'data' with integer codes.

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# One encoder instance is re-fitted for each column in turn, so after the loop
# it only holds the classes of the last column processed.
object_columns = [name for name in data.columns if data[name].dtype == 'object']
for name in object_columns:
    data[name] = label_encoder.fit_transform(data[name])

In [None]:
# Fill any remaining missing values in 'data' with the mean of their column.

column_means = {name: data[name].mean() for name in data.columns}
for name, mean_value in column_means.items():
    data[name] = data[name].fillna(mean_value)

In [None]:
# Prepare the supervised-learning inputs: 'y' is the Loan_Status target and
# 'x' is every other column of 'data'.

y = data['Loan_Status']
x = data.drop(columns=['Loan_Status'])

In [None]:
# Split the data into a training set (70%) and a testing set (30%) with a
# fixed seed, then report the sizes of the resulting frames.

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=7
)

# The "original" figures are read from model_data, whereas the split itself
# is made from x and y.
original_rows, original_columns = model_data.shape
train_rows, train_columns = x_train.shape
test_rows, test_columns = x_test.shape

print("Number of Original Data:", original_rows)
print("Number of Original Variable Data:", original_columns)
print("\n")

print("Training Data:", train_rows)
print("Variables of Training Data:", train_columns)

print("\n")
print("Testing Data:", test_rows)
print("Variables of Testing Data:", test_columns)



In [None]:
# Train and evaluate several machine learning models (Logistic Regression, Decision Tree Classifier, Random Forest Classifier) on the train/test split and report accuracy, F1 score, precision and recall.

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score 
from sklearn.metrics import recall_score
from tabulate import tabulate


# Define the models as (display name, estimator) pairs.
# The tree-based estimators draw random numbers while fitting; seeding them
# (with the same seed used for the train/test split) makes the reported
# metrics reproducible from one run to the next.
models = [
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('Decision Tree Classifier', DecisionTreeClassifier(random_state=7)),
    ('Random Forest Classifier', RandomForestClassifier(random_state=7)),
]


def modeling(model):
    """Fit ``model`` on the training split and score it on the test split.

    Uses the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.

    Args:
        model: An estimator exposing ``fit`` and ``predict``.

    Returns:
        A tuple ``(accuracy, f1, precision, recall)`` expressed as
        percentages; the last three use weighted averaging.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    accuracy = accuracy_score(y_test, predictions) * 100
    weighted_scores = [
        metric(y_test, predictions, average='weighted') * 100
        for metric in (f1_score, precision_score, recall_score)
    ]
    return (accuracy, *weighted_scores)
    
    
# Evaluate the machine learning models.
# modeling() fits the estimator on the training split and returns
# (accuracy, f1, precision, recall) as percentages, so the loop only has to
# collect one table row per model instead of repeating that logic.
results = []
for name, model in models:
    acc, f1, prec, rec = modeling(model)
    results.append([name, acc, f1, prec, rec])

    
# Print the results in a table: one row per model, one column per metric,
# rendered by tabulate in its 'orgtbl' format.
headers = ['Model', 'Accuracy', 'F1 Score', 'Precision', 'Recall']
table = tabulate(results, headers=headers, tablefmt='orgtbl')
print(table) 
    

In [None]:
# Confusion matrix on the test split for the last estimator in `models`
# (the same object the evaluation loop's `model` variable refers to once the
# loop has finished).
last_model = models[-1][1]
y_pred = last_model.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Draw the confusion matrix 'cm' as a heatmap with the cell counts annotated.
sns.heatmap(cm,annot=True)
plt.show()