# MIS 637 - Final Project (Group 3)

In [33]:
################################
# Import the libraries
################################

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.metrics import classification_report
from scipy.stats import f_oneway

In [34]:
# Loan_ID					Unique Loan ID
# Gender					Male/ Female
# Married					Applicant married (Y/N)
# Dependents				Number of dependents
# Education					Applicant Education (Graduate/ Under Graduate)
# Self_Employed				Self employed (Y/N)
# ApplicantIncome			Applicant income
# CoapplicantIncome			Coapplicant income
# LoanAmount				Loan amount in thousands
# Loan_Amount_Term			Term of loan in months
# Credit_History			(1- has all debts paid, 0- not paid)
# Property_Area				Urban/ Semi Urban/ Rural
# Loan_Status				(Target) Loan approved (Y/N)


In [35]:
# Load the data
main_loan_df = pd.read_csv('Loan_Data.csv')

# Make sure that the result is as expected
main_loan_df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [36]:
# copying data to new variables 
loan_df = main_loan_df.copy()

In [37]:
loan_df.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [38]:
# drop unwanted column 
loan_df = loan_df.drop(columns = ['Loan_ID'], axis=1)
# loan_df = loan_df.drop(columns = ['Gender'], axis=1)
# loan_df = loan_df.drop(columns = ['Property_Area'], axis=1)

In [39]:
loan_df.shape

(614, 12)

In [40]:
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 57.7+ KB


In [41]:
# check the loan approval status 
loan_df['Loan_Status'].value_counts()

Y    422
N    192
Name: Loan_Status, dtype: int64

In [None]:
# get a % of approved vs non-approved
loan_df['Loan_Status'].value_counts(normalize=True)

Looks like 422 people (68.72%) out of 614 people got approved for a loan

In [None]:
loan_df['Loan_Status'].value_counts().plot.bar()

In [None]:
loan_df['Gender'].value_counts(normalize=True)

In [None]:
loan_df['Married'].value_counts(normalize=True)

In [None]:
loan_df['Self_Employed'].value_counts(normalize=True)

In [None]:
loan_df['Credit_History'].value_counts(normalize=True)

In [42]:
# Checking for null values in the columns
loan_df.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

# Option 1 (Filling the Null values with random values)

In [None]:
loan_df.Gender.value_counts()

In [None]:
# get the number of missing values in the Gender column
num_missing = loan_df["Gender"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
random_values = np.random.choice(["Male", "Female"], size=num_missing)

# fill in the missing values with the random values
loan_df.loc[loan_df["Gender"].isnull(), "Gender"] = random_values

In [None]:
loan_df.Gender.value_counts()

In [None]:
loan_df.Married.value_counts()

In [None]:
# get the number of missing values in the Gender column
num_missing = loan_df["Married"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
random_values = np.random.choice(["Yes", "No"], size=num_missing)

# fill in the missing values with the random values
loan_df.loc[loan_df["Married"].isnull(), "Married"] = random_values

In [None]:
loan_df.Married.value_counts()

In [None]:
loan_df.Dependents.value_counts()

In [None]:
# get the number of missing values in the Gender column
num_missing = loan_df["Dependents"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
random_values = np.random.choice(["0", "1", "2", "3+"], size=num_missing)

# fill in the missing values with the random values
loan_df.loc[loan_df["Dependents"].isnull(), "Dependents"] = random_values

In [None]:
loan_df.Dependents.value_counts()

In [None]:
loan_df.Self_Employed.value_counts()

In [None]:
# get the number of missing values in the Gender column
num_missing = loan_df["Self_Employed"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
random_values = np.random.choice(["Yes", "No"], size=num_missing)

# fill in the missing values with the random values
loan_df.loc[loan_df["Self_Employed"].isnull(), "Self_Employed"] = random_values

In [None]:
loan_df.Self_Employed.value_counts()

In [None]:
loan_df.LoanAmount.value_counts()

In [None]:
loan_df.LoanAmount.fillna(loan_df.LoanAmount.median(),inplace=True)

In [None]:
loan_df.LoanAmount.value_counts()

In [None]:
loan_df.Loan_Amount_Term.value_counts()

In [None]:
loan_df.Loan_Amount_Term.fillna(loan_df.Loan_Amount_Term.mean(),inplace=True)

In [None]:
loan_df.Loan_Amount_Term.value_counts()

In [None]:
loan_df.Credit_History.value_counts()

In [None]:
# get the number of missing values in the Gender column
num_missing = loan_df["Credit_History"].isnull().sum()

# generate an array of random values with the same length as the number of missing values
random_values = np.random.choice([1, 0], size=num_missing)

# fill in the missing values with the random values
loan_df.loc[loan_df["Credit_History"].isnull(), "Credit_History"] = random_values

In [None]:
loan_df.Credit_History.value_counts()

# Option 2 (Dropping the null values)

In [6]:
# drop rows with missing values
    # since we have NaN values the regression will fail
loan_df.dropna(inplace=True) 

# Logistic Regression

In [None]:
loan_df.shape

In [None]:
# extract the feature matrix X and target variable y
X = loan_df.drop("Loan_Status", axis=1)
X = X.drop("ApplicantIncome", axis = 1)
y = loan_df["Loan_Status"]

In [None]:
X.head(5)

In [None]:
# The different categories

# Categorical features: These features have categories 
# => Gender, Married, Self_Employed, Credit_History   (This is the final reasult = Loan_Status)

# Ordinal features: Variables in categorical features having some order involved 
# => Dependents, Education, Property_Area

# Numerical features: These features have numerical values 
# => ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term

In [None]:
X = pd.get_dummies(X, drop_first=True)

In [None]:
X.head(5)

In [None]:
# Training the dataset : training (60%) and validation (40%)
    # What should be the split ?????
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

In [None]:
# Logistic Regression model
model = LogisticRegression()

# Fit it into the training data
model.fit(X_train, y_train)

# Predictions on the data
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

print(classification_report(y_test, y_pred))

# with null values filled with random (Option 1)
# Accuracy: 0.7357723577235772

# with dropping the null values (Option 2)
# Accuracy: 0.8020833333333334

In [None]:
# .score() takes the input and output as arguments and returns the ratio 
# of the number of correct predictions to the number of observations.
model.score(X_train, y_train)

# with null values filled with random (Option 1)
# 0.7907608695652174

# with dropping the null values (Option 2)
# 0.8263888888888888


# Testing things

In [None]:
# print(loan_df['LoanAmount'].value_counts())

counts = loan_df['LoanAmount'].value_counts().sort_values(ascending=False)
# counts = loan_df['LoanAmount'].value_counts().sort_index()

for val, cnt in counts.iteritems():
    print(f"{val}: {cnt}")

In [None]:
loan_df['LoanAmount_log'].value_counts()

In [None]:
loan_df['LoanAmount'].hist(bins=20) 

In [None]:
loan_df['LoanAmount_log'] = np.log(loan_df['LoanAmount']) 
loan_df['LoanAmount_log'].hist(bins=20) 

In [None]:
sns.histplot(loan_df['ApplicantIncome'])
plt.show()
loan_df['ApplicantIncome'].plot.box(figsize=(16,5))
plt.show()
     

In [None]:
sns.histplot(loan_df['LoanAmount'])
plt.show()
loan_df['LoanAmount'].plot.box(figsize=(16,5))
plt.show()
     

In [None]:
# print(X.corr())
f, ax = plt.subplots(figsize=(22, 20))

# plotting correlation heatmap
dataplot = sns.heatmap(X.corr(), cmap="YlGnBu", annot=True)

# displaying heatmap
plt.show()

In [None]:
# X.columns = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome',
#                    'CoapplicantIncome','LoanAmount','Loan_Amount_Term','Credit_History','Property_Area',
#                    'Loan_Status']

# Correlation between different variables
corr = X.corr()

# Set up the matplotlib plot configuration
f, ax = plt.subplots(figsize=(12, 10))

# Generate a mask for upper traingle
mask = np.triu(np.ones_like(corr, dtype=bool))

# Configure a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap
sns.heatmap(corr, annot=True, mask = mask, cmap=cmap)

In [None]:
from sklearn.metrics import classification_report

# Preprocess the data
X = loan_df[['LoanAmount', 'Loan_Amount_Term']]  # Extract the income feature
y = loan_df['Loan_Status']  # Extract the loan status target variable

# X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict loan status for the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

In [9]:
CategoryGroupLists = loan_df.groupby('Loan_Status')['ApplicantIncome'].apply(list)
AnnovaResults = f_oneway(*CategoryGroupLists)
print('P-value for anova is : ', AnnovaResults[1])

P-value for anova is :  0.907287812130378


In [14]:
loan_df.replace({'Loan_Status':{'N':0, 'Y':1}},inplace=True)

group1 = loan_df['Loan_Status']
group2 = loan_df['ApplicantIncome']

f_statistic, p_value = f_oneway(group1, group2)

# Print the results
print("F-statistic:", f_statistic)
print("p-value:", p_value)

F-statistic: 480.2368392788594
p-value: 4.330702635909282e-90


In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


X = loan_df.drop('Loan_Status', axis=1)  # Features (excluding loan_status)
y = loan_df['Loan_Status']  # Target variable

X = pd.get_dummies(X, drop_first=True)
# print(X.head(5))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

# Train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict loan status for the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='Y')
recall = recall_score(y_test, y_pred, pos_label='Y')
f1 = f1_score(y_test, y_pred, pos_label='Y')
classification_report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Classification Report:\n", classification_report)

# Precision : out of all the applicants the model predicted would get approved only 74% actually did
# Recall : Out of all the applicants that actually got approved, the model only predicted this outcome 
#          correctly for 98%
# F1 Score : The closer to 1 the better. The model did a good job if its closer to 1

Accuracy: 0.8020833333333334
Precision: 0.7987421383647799
Recall: 0.9548872180451128
F1-Score: 0.8698630136986302
Classification Report:
               precision    recall  f1-score   support

           N       0.82      0.46      0.59        59
           Y       0.80      0.95      0.87       133

    accuracy                           0.80       192
   macro avg       0.81      0.71      0.73       192
weighted avg       0.80      0.80      0.78       192



In [None]:
# FYI things for final project
# Create a ppt
    # Mention whats the prob
    # what are we working on
    # graphs
    # normal presentation (voice over)
# no need to submit the code
