In [1]:
import pandas as pd
import numpy as np 
from faker import Faker
import random
from datetime import datetime, timedelta

# **Generate Fake Data**

In [2]:
def random_date(start, end):
    return start + timedelta(days=random.randint(0, int((end - start).days)))
np.random.seed(42)
num_records = 10000

loan_ids = np.arange(1, num_records + 1)
disbursement_dates = [random_date(datetime(2020, 1, 1), datetime(2023, 1, 1)) for _ in range(num_records)]
expire_dates = [d + timedelta(days=random.randint(30, 365)) for d in disbursement_dates]

is_employed = np.random.choice([True, False, np.nan], num_records, p=[0.7, 0.25, 0.05])
loan_amounts = np.random.choice([np.nan, *np.random.uniform(1000, 50000, num_records)], num_records)
number_of_defaults = np.random.choice([np.nan, *np.random.randint(0, 10, num_records)], num_records)
outstanding_balances = np.random.choice([np.nan, *np.random.uniform(0, 30000, num_records)], num_records)
interest_rates = np.random.choice([np.nan, *np.random.uniform(0.01, 0.25, num_records)], num_records)
ages = np.random.choice([np.nan, *np.random.randint(18, 65, num_records)], num_records)
remaining_terms = np.random.choice([np.nan, *np.random.randint(1, 60, num_records)], num_records)
salaries = np.random.choice([np.nan, *np.random.uniform(1000, 20000, num_records)], num_records)
loan_statuses = np.random.choice(['Default', 'Non-Default', np.nan], num_records, p=[0.3, 0.65, 0.05])

loan_data = pd.DataFrame({
    'loan_id': loan_ids,
    'disbursement_date': disbursement_dates,
    'expire_date': expire_dates,
    'is_employed': is_employed,
    'loan_amount': loan_amounts,
    'number_of_defaults': number_of_defaults,
    'outstanding_balance': outstanding_balances,
    'interest_rate': interest_rates,
    'age': ages,
    'remaining_term': remaining_terms,
    'salary': salaries,
    'loan_status': loan_statuses
})



In [3]:
loan_data.head(3)

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,loan_status
0,1,2022-10-03,2023-01-12,1.0,29806.398905,4.0,29869.899084,0.11234,38.0,49.0,10442.103846,Non-Default
1,2,2022-12-22,2023-08-16,,28348.573748,5.0,27408.000182,0.128941,30.0,30.0,8779.630547,Non-Default
2,3,2020-04-20,2020-12-14,0.0,25470.384359,2.0,19078.8504,0.224119,63.0,17.0,9925.697326,Non-Default


In [12]:
loan_data.shape

(10000, 12)

# Data Cleanign steps

- Data cleaning is the process of ensuring that data is in the proper format, *making it suitable for analysis and modeling*

### 1.Check missing values 

In [4]:
loan_data.isnull().sum()

loan_id                  0
disbursement_date        0
expire_date              0
is_employed            474
loan_amount              1
number_of_defaults       1
outstanding_balance      2
interest_rate            1
age                      1
remaining_term           2
salary                   0
loan_status              0
dtype: int64

In [10]:
def check_missing_values(data:pd.DataFrame)->pd.DataFrame:
     return data.isnull().sum()

In [11]:
check_missing = check_missing_values(data=loan_data)
check_missing

loan_id                  0
disbursement_date        0
expire_date              0
is_employed            474
loan_amount              1
number_of_defaults       1
outstanding_balance      2
interest_rate            1
age                      1
remaining_term           2
salary                   0
loan_status              0
dtype: int64

- As we can see from the data above, there are missing values in the following columns: `is_employed`, `loan_amount`, `number_of_defaults`, `outstanding_balance`, `interest_rate`, `age`, and `remaining_term`.


- We can also return DataFrame with missing values 

In [13]:
def check_missing_values(data:pd.DataFrame)->pd.DataFrame:
     return data.loc[data.isnull().any(axis=1)]

In [14]:
missing_values= check_missing_values(loan_data)
missing_values

Unnamed: 0,loan_id,disbursement_date,expire_date,is_employed,loan_amount,number_of_defaults,outstanding_balance,interest_rate,age,remaining_term,salary,loan_status
1,2,2022-12-22,2023-08-16,,28348.573748,5.0,27408.000182,0.128941,30.0,30.0,8779.630547,Non-Default
11,12,2020-02-22,2021-02-12,,11471.935768,1.0,4842.795849,0.192387,27.0,38.0,5981.078707,Non-Default
34,35,2021-06-07,2022-02-23,,12458.570085,3.0,14616.721703,0.189646,24.0,24.0,8985.982774,Non-Default
50,51,2022-04-11,2022-08-18,,34742.835205,0.0,9980.403155,0.243796,39.0,2.0,19021.746320,Default
69,70,2020-05-05,2020-09-24,,24528.498651,7.0,15711.060298,0.209259,27.0,14.0,3521.346822,Non-Default
...,...,...,...,...,...,...,...,...,...,...,...,...
9851,9852,2022-06-15,2023-02-28,,6735.032652,5.0,26000.942849,0.073461,44.0,59.0,8351.716821,Non-Default
9927,9928,2022-09-26,2023-04-10,,37890.602781,6.0,2531.808990,0.090598,55.0,13.0,16555.449232,
9971,9972,2022-10-10,2023-06-03,,15641.350140,9.0,10288.396845,0.160507,36.0,40.0,5218.878111,Non-Default
9972,9973,2020-07-12,2020-12-21,,28699.159038,4.0,12616.283408,0.231205,23.0,32.0,19874.948375,Non-Default
