In [4]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import warnings

warnings.filterwarnings("ignore")
from datetime import datetime, timedelta

In [5]:
def random_date(start, end):
    """Return ``start`` moved forward by a random whole number of days.

    The offset is drawn with ``random.randint`` from ``0`` up to the number of
    whole days between ``start`` and ``end`` (both ends inclusive), so the
    result is never earlier than ``start``.

    Parameters
    ----------
    start, end : datetime
        Bounds of the range; ``end - start`` must yield a ``timedelta``.

    Returns
    -------
    datetime
        ``start`` plus the drawn number of days.

    Raises
    ------
    ValueError
        Raised by ``random.randint`` when ``end`` is at least one whole day
        before ``start``.
    """
    span_days = int((end - start).days)
    offset_days = random.randint(0, span_days)
    return start + timedelta(days=offset_days)


# Seed BOTH generators: the date columns below are drawn with the stdlib
# `random` module, every other column with NumPy's global generator. Seeding
# only NumPy would leave the dates different on every run.
random.seed(42)
np.random.seed(42)
num_records = 10000


def _sample_with_nan(pool):
    """Draw ``len(pool)`` values, with replacement, from ``pool`` plus one NaN.

    NaN is a single candidate among ``len(pool) + 1``; because of it the
    candidates (and therefore the result) are float, even for integer pools.
    """
    return np.random.choice([np.nan, *pool], len(pool))


loan_ids = np.arange(1, num_records + 1)

# All disbursement dates are drawn first, then one expiry offset per loan.
disbursement_dates = [
    random_date(datetime(2020, 1, 1), datetime(2023, 1, 1)) for _ in range(num_records)
]
expire_dates = [d + timedelta(days=random.randint(30, 365)) for d in disbursement_dates]

# NumPy coerces [True, False, nan] to float64, so this column holds
# 1.0 / 0.0 / NaN rather than Python booleans.
is_employed = np.random.choice([True, False, np.nan], num_records, p=[0.7, 0.25, 0.05])

# NOTE(review): with one NaN among num_records + 1 equally likely candidates,
# each of the columns below gets roughly one NaN on average. If a larger share
# of missing values was intended, pass explicit probabilities -- confirm.
loan_amounts = _sample_with_nan(np.random.uniform(1000, 50000, num_records))
number_of_defaults = _sample_with_nan(np.random.randint(0, 10, num_records))
outstanding_balances = _sample_with_nan(np.random.uniform(0, 30000, num_records))
interest_rates = _sample_with_nan(np.random.uniform(0.01, 0.25, num_records))
ages = _sample_with_nan(np.random.randint(18, 65, num_records))
remaining_terms = _sample_with_nan(np.random.randint(1, 60, num_records))
salaries = _sample_with_nan(np.random.uniform(1000, 20000, num_records))

# dtype=object keeps np.nan a real missing value. A plain list of two strings
# and a NaN is coerced to a string array, which turns the NaN into the literal
# text "nan" that isna()/fillna() cannot detect.
loan_statuses = np.random.choice(
    np.array(["Default", "Non-Default", np.nan], dtype=object),
    num_records,
    p=[0.3, 0.65, 0.05],
)

# Categorical columns: every listed label is equally likely.
sectors = np.random.choice(
    ["Agriculture", "Manufacturing", "Services", "IT", "Retail"], num_records
)
currencies = np.random.choice(["USD", "EUR", "ZWL", "GBP", "AUD"], num_records)
employee_sectors = np.random.choice(
    ["Public", "Private", "Self-employed", "Unemployed"], num_records
)
statuses = np.random.choice(["Active", "Inactive"], num_records)

# Assemble the generated columns into a single frame. The order of the
# (name, values) pairs is the column order of the resulting DataFrame.
loan_data = pd.DataFrame(
    dict(
        [
            ("loan_id", loan_ids),
            ("disbursement_date", disbursement_dates),
            ("expire_date", expire_dates),
            ("is_employed", is_employed),
            ("loan_amount", loan_amounts),
            ("number_of_defaults", number_of_defaults),
            ("outstanding_balance", outstanding_balances),
            ("interest_rate", interest_rates),
            ("age", ages),
            ("remaining_term", remaining_terms),
            ("salary", salaries),
            ("sector", sectors),
            ("currency", currencies),
            ("employee_sector", employee_sectors),
            ("status", statuses),
            ("loan_status", loan_statuses),
        ]
    )
)

In [6]:
def fill_missing_values(df):
    """Impute missing values in selected loan columns, modifying ``df`` in place.

    ``is_employed`` is filled with its most frequent value. ``loan_amount``,
    ``number_of_defaults``, ``outstanding_balance``, ``interest_rate``, ``age``
    and ``remaining_term`` are each filled with their own median. Every other
    column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the seven columns named above.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after filling.

    Raises
    ------
    KeyError
        If one of the seven columns is missing from ``df``.
    """
    median_filled = (
        "loan_amount",
        "number_of_defaults",
        "outstanding_balance",
        "interest_rate",
        "age",
        "remaining_term",
    )

    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that form works on a temporary Series
    # and leaves df unchanged when pandas Copy-on-Write is active.
    employed_mode = df["is_employed"].mode()
    if not employed_mode.empty:  # mode() is empty when the column is all-NaN
        df["is_employed"] = df["is_employed"].fillna(employed_mode.iloc[0])

    for column in median_filled:
        # An all-NaN column has a NaN median, which makes this a no-op.
        df[column] = df[column].fillna(df[column].median())

    return df

In [7]:
# fill_missing_values modifies the frame it receives, so hand it a copy and
# keep loan_data with its original missing values.
data = fill_missing_values(loan_data.copy())