In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's legacy global generator so every run reproduces the same draws.
# Every np.random call below consumes this single stream, so call order matters.
np.random.seed(42)

# -----------------------
# PARAMETERS
# -----------------------
N = 50000  # number of synthetic customers to generate

# -----------------------
# BASIC DEMOGRAPHICS
# -----------------------
# Consecutive identifiers: 100000, 100001, ..., 100000 + N - 1.
customer_id = 100000 + np.arange(N)

# Integer ages in [21, 64] (randint excludes the upper bound).
age = np.random.randint(low=21, high=65, size=N)

# Years since age 21, reduced by a random 0-9 year gap, then bounded to [0, 40].
employment_length_years = (
    age - 21 - np.random.randint(low=0, high=10, size=N)
).clip(0, 40)

# Uniform pick between the two labels.
gender = np.random.choice(["Male", "Female"], size=N)

# Normal(65000, 20000) income, truncated to the [20000, 200000] band.
annual_income = np.random.normal(loc=65000, scale=20000, size=N).clip(20000, 200000)

# Income growth rate ~ Normal(0.04, 0.03); left unclipped, so it can be negative.
income_growth_rate = np.random.normal(loc=0.04, scale=0.03, size=N)

# -----------------------
# CREDIT LIMIT & SPENDING
# -----------------------
# Limit = income times a random 15%-35% share, bounded to [2000, 50000].
current_credit_limit = (
    annual_income * np.random.uniform(low=0.15, high=0.35, size=N)
).clip(2000, 50000)

# Utilization ~ Beta(2, 3): values in (0, 1) with mean 0.4.
credit_utilization_ratio = np.random.beta(a=2, b=3, size=N)

# Balance carried on the card = limit x utilization.
revolving_balance = current_credit_limit * credit_utilization_ratio

# Monthly spend is the balance divided by a random factor in [1.2, 2.5).
avg_monthly_spend_6m = revolving_balance / np.random.uniform(low=1.2, high=2.5, size=N)

# Payment ratio ~ Beta(5, 1.5): concentrated near 1, i.e. most customers pay well.
payment_ratio = np.random.beta(a=5, b=1.5, size=N)

# 1 when the payment ratio falls below 0.5, otherwise 0.
min_payment_flag = np.less(payment_ratio, 0.5).astype(int)

# -----------------------
# CREDIT RISK VARIABLES
# -----------------------
# Score ~ Normal(700, 60), truncated to the [300, 850] range.
credit_score = np.random.normal(loc=700, scale=60, size=N).clip(300, 850)

# Six-month score movement ~ Normal(5, 20).
credit_score_trend_6m = np.random.normal(loc=5, scale=20, size=N)

# Debt-to-income ~ Beta(2, 4): values in (0, 1).
debt_to_income_ratio = np.random.beta(a=2, b=4, size=N)

# Count features drawn from Poisson distributions with fixed rates.
external_loans_count = np.random.poisson(lam=2, size=N)
hard_inquiries_6m = np.random.poisson(lam=1, size=N)

# Past-due features scale with utilization. Despite the "days" in the names,
# the 30-day column is a Poisson count and the 60-day column is a 0/1 flag.
days_past_due_30_6m = np.random.poisson(lam=credit_utilization_ratio * 2)
days_past_due_60_6m = np.random.binomial(n=1, p=credit_utilization_ratio * 0.3)

# NOTE(review): the 12-month figure is the sum of the two 6-month counters
# above -- confirm that the window mismatch is intended.
delinquencies_12m = days_past_due_30_6m + days_past_due_60_6m

# Account age in whole months, in [6, 179].
account_age_months = np.random.randint(low=6, high=180, size=N)

# -----------------------
# BANK REVENUE
# -----------------------
# Random inputs first (same draw order as before: rate, then fee tier).
# Interest rate is in percent, ~ Normal(19, 3).
interest_rate = np.random.normal(loc=19, scale=3, size=N)

# Annual fee tier with probabilities 40% / 30% / 20% / 10%.
annual_fee = np.random.choice([0, 95, 120, 150], size=N, p=[0.4, 0.3, 0.2, 0.1])

# Half a year of interest on the revolving balance.
interest_income_6m = revolving_balance * (interest_rate / 100) / 2

# 1.5% of monthly spend, accumulated over six months.
interchange_income_6m = avg_monthly_spend_6m * 0.015 * 6

# Six-month revenue = interest + interchange + half of the annual fee.
total_revenue_6m = interest_income_6m + interchange_income_6m + (annual_fee / 2)

# -----------------------
# DEFAULT PROBABILITY (REALISTIC RISK LOGIC)
# -----------------------
# Weighted linear risk score, accumulated one driver at a time
# (terms are added in the same left-to-right order as a single sum).
risk_score = 0.6 * credit_utilization_ratio             # high utilization
risk_score = risk_score + 0.5 * (1 - credit_score / 850)  # distance below the top score
risk_score = risk_score + 0.4 * debt_to_income_ratio      # leverage
risk_score = risk_score + 0.3 * min_payment_flag          # low payment ratio
risk_score = risk_score + 0.2 * days_past_due_30_6m       # recent past-due count

# Logistic link centred at a risk score of 0.6 with slope 5.
default_probability = 1 / (1 + np.exp(-5 * (risk_score - 0.6)))

# One Bernoulli trial per customer using that probability.
default_next_6m = np.random.binomial(n=1, p=default_probability)

# -----------------------
# OPTIMAL LIMIT CHANGE LOGIC
# -----------------------
# Increase: no default, utilization above 0.6 and score above 700.
qualifies_for_increase = (
    (default_next_6m == 0)
    & (credit_utilization_ratio > 0.6)
    & (credit_score > 700)
)
# Decrease: defaulted or utilization above 0.85. Only applies to customers
# who did not already qualify for an increase.
needs_decrease = (default_next_6m == 1) | (credit_utilization_ratio > 0.85)

# Candidate percentage changes for every customer. The three draws must stay
# in this order (increase, decrease, keep) to reproduce the seeded stream.
increase_pct = np.random.uniform(low=0.1, high=0.3, size=N)
decrease_pct = np.random.uniform(low=-0.35, high=-0.1, size=N)
keep_pct = np.random.uniform(low=-0.05, high=0.05, size=N)

optimal_limit_change_pct = np.where(
    qualifies_for_increase,
    increase_pct,
    np.where(needs_decrease, decrease_pct, keep_pct),
)

# Decision code: 0 = change below -5%, 2 = change above +5%, 1 = anything between.
limit_adjustment_decision = np.where(
    optimal_limit_change_pct < -0.05,
    0,
    np.where(optimal_limit_change_pct > 0.05, 2, 1),
)

# -----------------------
# CREATE DATAFRAME
# -----------------------
# One column per generated array; keyword order fixes the column order.
df = pd.DataFrame(
    dict(
        # identity and demographics
        customer_id=customer_id,
        age=age,
        gender=gender,
        employment_length_years=employment_length_years,
        annual_income=annual_income,
        income_growth_rate=income_growth_rate,
        # credit line and usage
        current_credit_limit=current_credit_limit,
        avg_monthly_spend_6m=avg_monthly_spend_6m,
        credit_utilization_ratio=credit_utilization_ratio,
        revolving_balance=revolving_balance,
        payment_ratio=payment_ratio,
        min_payment_flag=min_payment_flag,
        # delinquency and bureau-style features
        days_past_due_30_6m=days_past_due_30_6m,
        days_past_due_60_6m=days_past_due_60_6m,
        delinquencies_12m=delinquencies_12m,
        credit_score=credit_score,
        credit_score_trend_6m=credit_score_trend_6m,
        debt_to_income_ratio=debt_to_income_ratio,
        external_loans_count=external_loans_count,
        hard_inquiries_6m=hard_inquiries_6m,
        account_age_months=account_age_months,
        # revenue components
        interest_rate=interest_rate,
        interest_income_6m=interest_income_6m,
        interchange_income_6m=interchange_income_6m,
        annual_fee=annual_fee,
        total_revenue_6m=total_revenue_6m,
        # targets
        default_next_6m=default_next_6m,
        optimal_limit_change_pct=optimal_limit_change_pct,
        limit_adjustment_decision=limit_adjustment_decision,
    )
)

# -----------------------
# SAVE DATASET
# -----------------------
# Write the full table to CSV in the working directory, without the index column.
df.to_csv("credit_limit_optimization_dataset.csv", index=False)

print("Dataset generated successfully!")

# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a rich table; print() produced a wrapped plain-text dump
# that hid most of the columns.
df.head()


Dataset generated successfully!
   customer_id  age  gender  employment_length_years  annual_income  \
0       100000   59  Female                       34   80865.181517   
1       100001   49    Male                       25   77567.846835   
2       100002   35    Male                       13   64554.910432   
3       100003   63  Female                       35   54337.771843   
4       100004   28  Female                        6   37028.105446   

   income_growth_rate  current_credit_limit  avg_monthly_spend_6m  \
0            0.021488          24883.703325           4106.802434   
1           -0.046969          12833.847851           2763.661604   
2            0.051170          14081.263009           2896.184939   
3           -0.005225           9474.257177           1350.919301   
4            0.066225           5921.536132           1166.402921   

   credit_utilization_ratio  revolving_balance  ...  hard_inquiries_6m  \
0                  0.281690        7009.500683  ... 