In [29]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import warnings
# Install a blanket "ignore" filter so no warning messages appear in cell output.
# NOTE(review): this also hides pandas deprecation/future warnings - consider
# narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings(action="ignore")

print("Libraries imported successfully!")

Libraries imported successfully!


In [None]:
# Locations of the train / test CSV files in the practice-datasets repository.
train_url = (
    "https://raw.githubusercontent.com/ek-chris/Practice_datasets/"
    "refs/heads/main/home_loan_train.csv"
)
test_url = (
    "https://raw.githubusercontent.com/ek-chris/Practice_datasets/"
    "refs/heads/main/home_loan_test.csv"
)

# Read both splits directly from their URLs (train first, then test).
df_train, df_test = (pd.read_csv(url) for url in (train_url, test_url))
print("Dataset loaded sucessfuly")

Dataset loaded sucessfuly


In [31]:
# Summarise the training frame (columns, non-null counts, dtypes) to see which
# columns contain missing values before any cleaning is applied.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [32]:
def data_cleaning(df):
    """Return a cleaned copy of a home-loan DataFrame.

    The input frame is never modified, so the calling cell can be re-run
    safely and the raw data stays available for inspection.

    Steps (progress is printed for each one):
      1. Use ``Loan_ID`` as the index (skipped if it is not a column, e.g.
         because the frame was already cleaned).
      2. Convert ``Dependents`` to float, stripping the ``+`` of values such
         as ``"3+"`` when the column is still text.
      3. Fill missing values: numeric columns with their median, object
         columns with their most frequent value. Columns that have no
         observed value at all are left as they are.
      4. Drop duplicated rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a ``Dependents`` column.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame.
    """
    # Work on a copy: mutating the caller's frame made the calling cell fail
    # on a second run (Loan_ID had already been moved into the index).
    df = df.copy()

    print("\n1. Set Index:")
    if "Loan_ID" in df.columns:
        df = df.set_index("Loan_ID")

    print("\n2. Column Cleaning and Handling data types")
    dependents = df["Dependents"]
    # The .str accessor only exists for text columns; a numeric column needs
    # nothing stripped and can be cast directly.
    if not pd.api.types.is_numeric_dtype(dependents):
        dependents = dependents.str.replace("+", "", regex=False)
    df["Dependents"] = dependents.astype(float)

    print("\n3. Missing Values:")
    missing_counts = df.isna().sum()
    if missing_counts.sum() > 0:
        print(missing_counts[missing_counts > 0])

    # Numeric columns (any width) are imputed with the median, object columns
    # with the mode.
    num_col = df.select_dtypes(include="number").columns
    cat_col = df.select_dtypes(include="object").columns

    # Medians of all-NaN columns are NaN themselves and are dropped.
    fill_values = df[num_col].median().dropna().to_dict()
    for col in cat_col:
        counts = df[col].value_counts()
        # An all-missing column has no mode; indexing [0] would raise.
        if not counts.empty:
            fill_values[col] = counts.index[0]
    # A dict fills each listed column with its own value and leaves every
    # other column untouched.
    df = df.fillna(fill_values)

    # Re-check for missing values.
    print("Missing Values after Handling")
    print(df.isna().sum())

    print("\n4. Duplicate Rows:")
    # NOTE: duplicated() ignores the index, so rows are compared on their
    # attribute columns only, not on Loan_ID.
    duplicates = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicates}")
    if duplicates > 0:
        print(f"Percentage of duplicates: {(duplicates/len(df))*100:.2f}%")
        df = df.drop_duplicates()
    else:
        print("No duplicated values")
    return df

In [33]:
# Run the cleaning pipeline on the training split.
cleaned_data = data_cleaning(df_train)
# data_cleaning moves Loan_ID into the index, so the index has to be written
# out; with index=False the saved file would lose every loan identifier.
cleaned_data.to_csv("cleaned_data.csv", index=True)


1. Set Index:

2. Column Cleaning and Handling data types

3. Missing Values:
Gender              13
Married              3
Dependents          15
Self_Employed       32
LoanAmount          22
Loan_Amount_Term    14
Credit_History      50
dtype: int64
Missing Values after Handling
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

4. Duplicate Rows:
Number of duplicate rows: 0
No duplicated values
