In [16]:
import pandas as pd
import numpy as np

# Seed NumPy's global (legacy) random state so that the np.random.* draws made
# by generate_credit_default_data below are reproducible from run to run.
np.random.seed(42)

def generate_credit_default_data(n: int = 1000) -> pd.DataFrame:
    """Generate a synthetic, deliberately messy credit-default dataset.

    All randomness comes from NumPy's global ``np.random`` state, so seed it
    (``np.random.seed(...)``) before calling if reproducible output is needed.
    The order of the random draws below is significant: reordering them would
    change the data produced for a given seed.

    Columns of the returned frame:

    * ``age`` - normal(40, 10) truncated to int, with up to 20 positions
      overwritten by uniform integers in [18, 90).
    * ``income`` - exponential with scale 50000, with 30 values set to NaN.
    * ``loan_amount`` - 10%-50% of income plus normal(0, 10000) noise, with 25
      further values set to NaN. It is also NaN wherever ``income`` is NaN,
      and the noise can push it below zero.
    * ``education``, ``marital_status``, ``employment_status`` - categorical
      draws; ``marital_status`` contains the inconsistent label ``'married'``
      and ``employment_status`` contains ``None``.
    * ``credit_score`` - normal(650, 50) clipped to [300, 850].
    * ``income_2`` - a noisy near-copy of ``income`` (NaN where income is NaN).
    * ``default`` - Bernoulli(0.2) flag drawn independently of every feature.

    Args:
        n: Number of rows to generate. The injected outlier / missing-value
            counts are capped at ``n`` so that small values of ``n`` work
            instead of raising ``ValueError``.

    Returns:
        A ``pandas.DataFrame`` with ``n`` rows and the nine columns above.
    """
    # Cap the injection counts at n: sampling more than n positions without
    # replacement raises ValueError. For n >= 30 these are the original
    # constants, so the random stream (and the output) is unchanged.
    n_age_overrides = min(20, n)
    n_missing_income = min(30, n)
    n_missing_loan = min(25, n)

    age = np.random.normal(40, 10, n).astype(int)
    # Draw the replacement values BEFORE the positions: the original one-line
    # assignment evaluated its right-hand side first, and keeping that order
    # keeps seeded output identical. Positions are drawn with replacement, so
    # fewer than n_age_overrides distinct rows may end up overwritten.
    replacement_ages = np.random.randint(18, 90, n_age_overrides)
    age_positions = np.random.choice(n, size=n_age_overrides)
    age[age_positions] = replacement_ages

    income = np.random.exponential(50000, n)
    income[np.random.choice(n, size=n_missing_income, replace=False)] = np.nan

    # Loan is a random 10%-50% share of income plus Gaussian noise; NaN income
    # propagates into NaN loan amounts on top of the explicitly injected ones.
    loan_amount = income * np.random.uniform(0.1, 0.5, n)
    loan_amount += np.random.normal(0, 10000, n)
    loan_amount[np.random.choice(n, size=n_missing_loan, replace=False)] = np.nan

    # NOTE(review): 'PostGradute' looks like a typo for 'PostGraduate'; it is
    # kept as-is because consumers of the generated data may depend on it.
    education = np.random.choice(
        ['HighSchool', 'Graduate', 'PostGradute'],
        size=n,
        p=[0.5, 0.35, 0.15],
    )

    # 'married' (lower case) is a second spelling of 'Married', giving the
    # column an inconsistent category label.
    marital_status = np.random.choice(
        ['Single', 'Married', 'Divorced', 'married'],
        size=n,
        p=[0.4, 0.4, 0.15, 0.05],
    )

    # None acts as a missing category for roughly 5% of rows.
    employment_status = np.random.choice(
        ['Salaried', 'Self-Employed', 'Unemployed', None],
        size=n,
        p=[0.5, 0.3, 0.15, 0.05],
    )

    credit_score = np.clip(np.random.normal(650, 50, n), 300, 850)

    # Near-duplicate of income: scaled by 0.95 with added Gaussian noise.
    income_2 = income * 0.95 + np.random.normal(0, 5000, n)

    # 20% default rate, drawn independently of all features above.
    default = np.random.binomial(1, 0.2, n)

    return pd.DataFrame({
        'age': age,
        'income': income,
        'loan_amount': loan_amount,
        'education': education,
        'marital_status': marital_status,
        'employment_status': employment_status,
        'credit_score': credit_score,
        'income_2': income_2,
        'default': default,
    })

# Build the raw 1000-row dataset; its contents depend on the current state of
# NumPy's global random generator at the time of this call.
df_raw = generate_credit_default_data(1000)

In [17]:
# Write the raw dataset to CSV without the DataFrame index column. The path is
# relative to the current working directory.
# NOTE(review): assumes ../data/raw already exists - to_csv is not expected to
# create missing directories; confirm or create it beforehand.
df_raw.to_csv('../data/raw/credit_default_data.csv', index=False)