In [23]:
import pandas as pd
import numpy as np 
from faker import Faker
import random
from datetime import datetime, timedelta

# **Generate Fake Data**

In [24]:
def random_date(start, end):
    return start + timedelta(days=random.randint(0, int((end - start).days)))
np.random.seed(42)
num_records = 10000

loan_ids = np.arange(1, num_records + 1)
disbursement_dates = [random_date(datetime(2020, 1, 1), datetime(2023, 1, 1)) for _ in range(num_records)]
expire_dates = [d + timedelta(days=random.randint(30, 365)) for d in disbursement_dates]

is_employed = np.random.choice([True, False, np.nan], num_records, p=[0.7, 0.25, 0.05])
loan_amounts = np.random.choice([np.nan, *np.random.uniform(1000, 50000, num_records)], num_records)
number_of_defaults = np.random.choice([np.nan, *np.random.randint(0, 10, num_records)], num_records)
outstanding_balances = np.random.choice([np.nan, *np.random.uniform(0, 30000, num_records)], num_records)
interest_rates = np.random.choice([np.nan, *np.random.uniform(0.01, 0.25, num_records)], num_records)
ages = np.random.choice([np.nan, *np.random.randint(18, 65, num_records)], num_records)
remaining_terms = np.random.choice([np.nan, *np.random.randint(1, 60, num_records)], num_records)
salaries = np.random.choice([np.nan, *np.random.uniform(1000, 20000, num_records)], num_records)
loan_statuses = np.random.choice(['Default', 'Non-Default', np.nan], num_records, p=[0.3, 0.65, 0.05])

loan_data = pd.DataFrame({
    'loan_id': loan_ids,
    'disbursement_date': disbursement_dates,
    'expire_date': expire_dates,
    'is_employed': is_employed,
    'loan_amount': loan_amounts,
    'number_of_defaults': number_of_defaults,
    'outstanding_balance': outstanding_balances,
    'interest_rate': interest_rates,
    'age': ages,
    'remaining_term': remaining_terms,
    'salary': salaries,
    'loan_status': loan_statuses
})



In [26]:
loan_data.head(3)

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,loan_status
0,1,2021-10-20,2022-02-22,1.0,16582.170274,9.0,25154.422913,0.052483,43.0,49.0,19450.106658,Non-Default
1,2,2020-05-10,2021-02-06,,12098.292985,2.0,27739.299907,0.201858,30.0,14.0,15597.194863,Non-Default
2,3,2022-02-14,2022-04-27,0.0,34200.65373,4.0,6381.836851,0.029256,56.0,23.0,7056.297556,Non-Default


# Data Cleanign steps

#### Check missing values 

In [10]:
loan_data.isnull().sum()

loan_id                0
disbursement_date      0
expire_date            0
is_employed            0
loan_amount            0
number_of_defaults     0
outstanding_balance    0
interest_rate          0
age                    0
remaining_term         0
salary                 0
loan_status            0
dtype: int64

In [13]:
loan_data.loc[loan_data.isna().any(axis=1)]

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,loan_status
