### **Data Collection and Preparation**

In [2]:
# Importing the necessary libraries

import pandas as pd
import numpy as np

##### **Loading datasets directly from GitHub**

In [None]:
# Read the file named home_loan_test.csv straight from the raw GitHub URL.
# (No backslashes in the URL, so a plain string literal is sufficient.)
url = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_test.csv"
data = pd.read_csv(filepath_or_buffer=url)

In [6]:
# (rows, columns) of the frame read from home_loan_test.csv
data.shape

(367, 12)

In [None]:
# Read the file named home_loan_train.csv from the same repository,
# then preview its first five rows.
url2 = "https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv"
data2 = pd.read_csv(filepath_or_buffer=url2)
data2.head(5)


In [7]:
# (rows, columns) of the frame read from home_loan_train.csv
data2.shape

(614, 13)

In [None]:
# Stack the two frames row-wise into a single dataset.
# ignore_index=True discards the original row labels and renumbers from 0.
comb_data = pd.concat([data, data2], ignore_index=True)

# Persist a local copy so later cells can work without network access.
# NOTE: the row index is written as well, so it comes back as an extra
# "Unnamed: 0" column when the file is read with pd.read_csv.
comb_data.to_csv(r"Home_loan_dataset")

# Keep the preview as the LAST expression: Jupyter only renders the final
# expression of a cell, so a head() call in the middle is silently discarded.
comb_data.head()

In [10]:
# (rows, columns) of the concatenated frame
comb_data.shape

(981, 13)

In [5]:
# Work from the locally saved copy of the combined dataset so the analysis
# does not depend on network availability.
df = pd.read_csv("Home_loan_dataset")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,
1,1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,
2,2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,
3,3,LP001035,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360.0,,Urban,
4,4,LP001051,Male,No,0,Not Graduate,No,3276,0.0,78.0,360.0,1.0,Urban,


#### **Inspecting for missing values, duplicates, and data type inconsistencies**

In [None]:
# Count the missing (NaN) entries in each column.
df.isnull().sum()

Unnamed: 0             0
Loan_ID                0
Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
dtype: int64

In [31]:
# Count fully duplicated rows.
# "Unnamed: 0" is the row counter that came back from the saved CSV; it is
# different on every row, so leaving it in would make it impossible for any
# two rows to compare equal and the check would always report 0.
# errors="ignore" keeps this cell working even after that column is dropped.
df.drop(columns="Unnamed: 0", errors="ignore").duplicated().sum()

np.int64(0)

No duplicate values in the data frame.

In [33]:
# Inspect each column's dtype to spot type inconsistencies
df.dtypes

Unnamed: 0             int64
Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

#### **Cleaning the dataset**

In [26]:
# Fill missing values in the non-numerical columns with their respective modes.
# Series.mode() returns a Series of the most frequent value(s); [0] takes the first.
mode_gender = df["Gender"].mode()[0]
mode_married = df["Married"].mode()[0]
mode_dependents = df["Dependents"].mode()[0]
mode_employed = df["Self_Employed"].mode()[0]
mode_loan = df["Loan_Status"].mode()[0]

# Fill every column in one call and assign the result back to `df`.
# The previous `df[col].fillna(value, inplace=True)` pattern is chained
# in-place assignment: it acts on an intermediate Series, raises a
# FutureWarning on pandas 2.x, and under Copy-on-Write leaves `df` unchanged.
# NOTE(review): the 367 missing Loan_Status values match the row count of the
# first file loaded, so they look like unlabeled rows rather than random gaps;
# filling the target column with its mode fabricates labels — confirm this is
# intended before any modelling.
df = df.fillna(
    {
        "Gender": mode_gender,
        "Married": mode_married,
        "Dependents": mode_dependents,
        "Self_Employed": mode_employed,
        "Loan_Status": mode_loan,
    }
)



In [38]:
# Impute the remaining gaps in the numerical columns with each column's median.
column_medians = df.median(numeric_only=True)
df.fillna(value=column_medians, inplace=True)

In [39]:
# Re-count missing values per column to confirm the imputation left no gaps.
df.isnull().sum()

Unnamed: 0           0
Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### **Exploratory Data Analysis (EDA)**

In [41]:
# Preview the first three rows of the frame (this cell only shows rows; it does not compute statistics)
df.head(3)

Unnamed: 0.1,Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,Y
1,1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,Y
2,2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,Y


In [None]:
# Remove the leftover row-counter column that came back from the saved CSV.
# errors="ignore" makes the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
# Setting Loan_ID as index.
# Guarded so the cell can be re-run safely: once Loan_ID has become the index
# it is no longer a column, and an unguarded set_index would raise KeyError.
if "Loan_ID" in df.columns:
    df = df.set_index("Loan_ID")

In [53]:
# Preview the first three rows of the frame
df.head(3)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,Y
LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,Y
LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,Y


In [54]:
# Collect the names of the float64/int64 columns, then show their summary
# statistics with one row per feature.
numerical_features = df.select_dtypes(include=["float64", "int64"]).columns
df.loc[:, numerical_features].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,981.0,5179.795107,5695.104533,0.0,2875.0,3800.0,5516.0,81000.0
CoapplicantIncome,981.0,1601.91633,2718.772806,0.0,0.0,1110.0,2365.0,41667.0
LoanAmount,981.0,142.057085,76.395592,9.0,101.0,126.0,160.0,700.0
Loan_Amount_Term,981.0,342.56473,64.482011,6.0,360.0,360.0,360.0,480.0
Credit_History,981.0,0.849134,0.358101,0.0,1.0,1.0,1.0,1.0


In [57]:
# Number of loan applicants per gender, shown as a one-column table.
df["Gender"].value_counts().to_frame()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,799
Female,182


In [60]:
# Number of loan applicants per education level, shown as a one-column table.
df["Education"].value_counts().to_frame()

Unnamed: 0_level_0,count
Education,Unnamed: 1_level_1
Graduate,763
Not Graduate,218


In [61]:
# Number of loan applicants per property area, shown as a one-column table.
df["Property_Area"].value_counts().to_frame()

Unnamed: 0_level_0,count
Property_Area,Unnamed: 1_level_1
Semiurban,349
Urban,342
Rural,290
