## **Project Overview**

### **Data Collection and Preparation**

In [1]:
# Importing the necessary libraries

import pandas as pd
import numpy as np

##### **Loading datasets directly from GitHub**

In [4]:
# Raw GitHub locations of the two CSV files: `url` points at
# home_loan_test.csv and `url2` at home_loan_train.csv.
url = r"https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_test.csv"
url2 = r"https://raw.githubusercontent.com/ek-chris/Practice_datasets/refs/heads/main/home_loan_train.csv"

# Download each file into its own DataFrame, in the same order as above:
# `data` <- url (test file), `data2` <- url2 (train file).
data, data2 = (pd.read_csv(link) for link in (url, url2))


In [None]:
# Stack the two downloaded frames (`data` first, then `data2`) into a single
# DataFrame. ignore_index=True renumbers the rows 0..n-1 so row labels from
# the two inputs do not repeat.
comb_data = pd.concat([data, data2], ignore_index=True)

# Save a local copy for future use. index=False keeps the auto-generated row
# index out of the file, so reading it back with pd.read_csv does not produce
# an extra "Unnamed: 0" column.
comb_data.to_csv(r"Home_loan_dataset", index=False)

# Preview the combined frame; this must be the last expression of the cell
# for the notebook to display it.
comb_data.head()

In [8]:
# A bare expression is only displayed when it is the last line of a cell,
# so print the (rows, columns) tuple explicitly before the preview.
print(comb_data.shape)

# First five rows of the combined frame.
comb_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,
1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,
2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,
3,LP001035,Male,Yes,2,Graduate,No,2340,2546.0,100.0,360.0,,Urban,
4,LP001051,Male,No,0,Not Graduate,No,3276,0.0,78.0,360.0,1.0,Urban,


In [None]:
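# Offline fallback: if the GitHub download fails because of network issues,
# uncomment the lines below to load the locally saved copy of the dataset.
# Note: the cells that follow work on `comb_data`, not on `df`.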

# df = pd.read_csv(r"Home_loan_dataset")

# df.head()

#### **Inspecting for missing values, duplicates, and data type inconsistencies**

In [7]:
# Count the missing (NaN) entries in every column
missing_per_column = comb_data.isna().sum()
missing_per_column

Loan_ID                0
Gender                24
Married                3
Dependents            25
Education              0
Self_Employed         55
ApplicantIncome        0
CoapplicantIncome      0
LoanAmount            27
Loan_Amount_Term      20
Credit_History        79
Property_Area          0
Loan_Status          367
dtype: int64

In [9]:
# Count the rows that are exact duplicates of an earlier row
duplicate_row_count = comb_data.duplicated().sum()
duplicate_row_count

np.int64(0)

No duplicate rows were found in the data frame.

In [10]:
# Checking for data type inconsistencies: list the dtype pandas inferred for
# each column so that mis-typed columns can be spotted.
comb_data.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

#### **Cleaning the dataset**

In [None]:
# Fill missing values in all non-numerical columns with their respective modes.
# Series.mode() returns the most frequent value(s), most common first; [0]
# picks a single value even when several values tie.
mode_gender = comb_data["Gender"].mode()[0]
mode_married = comb_data["Married"].mode()[0]
mode_dependents = comb_data["Dependents"].mode()[0]
mode_employed = comb_data["Self_Employed"].mode()[0]
mode_loan = comb_data["Loan_Status"].mode()[0]

# NOTE(review): Loan_Status looks like the target column, and its missing
# values appear to come from the rows loaded from home_loan_test.csv, which
# carry no label. Filling them with the mode invents labels for those rows --
# confirm this is intended before using the column for modelling.

# Apply all fills in one DataFrame-level call. The previous pattern
# `comb_data[col].fillna(value, inplace=True)` modifies a Series selected from
# the frame (chained assignment): pandas 2.2 warns about it and with
# Copy-on-Write enabled it leaves `comb_data` unchanged.
comb_data = comb_data.fillna(
    {
        "Gender": mode_gender,
        "Married": mode_married,
        "Dependents": mode_dependents,
        "Self_Employed": mode_employed,
        "Loan_Status": mode_loan,
    }
)



In [12]:
# Impute the numerical columns: compute one median per numeric column, then
# fill every remaining NaN in a column with that column's median.
numeric_medians = comb_data.median(numeric_only=True)
comb_data.fillna(value=numeric_medians, inplace=True)

In [13]:
# Re-count the missing values per column to confirm the imputation above
# left no NaN behind.
comb_data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

### **Exploratory Data Analysis (EDA)**

In [14]:
# Preview the first three rows of the cleaned data before computing statistics
comb_data.head(3)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,Y
1,LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,Y
2,LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,Y


In [15]:
# Setting Loan_ID as index.
# Guarded so the cell can be re-run safely: once Loan_ID has been moved into
# the index it is no longer a column, and calling set_index("Loan_ID") again
# would raise a KeyError.
if "Loan_ID" in comb_data.columns:
    comb_data.set_index("Loan_ID", inplace=True)

In [16]:
# Preview the first three rows to confirm Loan_ID is now the row index
comb_data.head(3)

Unnamed: 0_level_0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
Loan_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
LP001015,Male,Yes,0,Graduate,No,5720,0.0,110.0,360.0,1.0,Urban,Y
LP001022,Male,Yes,1,Graduate,No,3076,1500.0,126.0,360.0,1.0,Urban,Y
LP001031,Male,Yes,2,Graduate,No,5000,1800.0,208.0,360.0,1.0,Urban,Y


In [17]:
# Collect the names of the float64/int64 columns, then summarise them with
# describe(); the result is flipped so each feature is a row and each
# statistic (count, mean, std, ...) is a column.
numerical_features = comb_data.select_dtypes(include=['float64', 'int64']).columns
numeric_summary = comb_data[numerical_features].describe()
numeric_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ApplicantIncome,981.0,5179.795107,5695.104533,0.0,2875.0,3800.0,5516.0,81000.0
CoapplicantIncome,981.0,1601.91633,2718.772806,0.0,0.0,1110.0,2365.0,41667.0
LoanAmount,981.0,142.057085,76.395592,9.0,101.0,126.0,160.0,700.0
Loan_Amount_Term,981.0,342.56473,64.482011,6.0,360.0,360.0,360.0,480.0
Credit_History,981.0,0.849134,0.358101,0.0,1.0,1.0,1.0,1.0


In [18]:
# Number of loan applicants per gender, shown as a one-column table
comb_data["Gender"].value_counts().to_frame()

Unnamed: 0_level_0,count
Gender,Unnamed: 1_level_1
Male,799
Female,182


In [19]:
# Number of loan applicants per education level, shown as a one-column table
comb_data["Education"].value_counts().to_frame()

Unnamed: 0_level_0,count
Education,Unnamed: 1_level_1
Graduate,763
Not Graduate,218


In [20]:
# Number of loan applicants per property area, shown as a one-column table
comb_data["Property_Area"].value_counts().to_frame()

Unnamed: 0_level_0,count
Property_Area,Unnamed: 1_level_1
Semiurban,349
Urban,342
Rural,290
