In [4]:
# Synthetic dataset generator (Indonesia-ready) - Tuned for IDR & realistic banking context
import numpy as np
import pandas as pd

# Seed NumPy's global generator so every run reproduces the same dataset.
np.random.seed(42)
N = 12000  # number of synthetic customers

# =============================
# 1. DEMOGRAPHICS
# =============================
# Integer ages in [18, 64]; randint's upper bound is exclusive.
usia = np.random.randint(low=18, high=65, size=N)

job_labels = ['karyawan', 'wiraswasta', 'pelajar', 'pns', 'buruh', 'tidak_bekerja']
job_weights = [0.36, 0.26, 0.03, 0.12, 0.12, 0.11]  # students ('pelajar') are a 3% minority
pekerjaan = np.random.choice(job_labels, size=N, p=job_weights)

# Draw one full-length candidate column per age group (under-25 first, then 25+),
# then pick per row by age. Customers under 25 can never be 'cerai' (divorced).
status_under_25 = np.random.choice(['lajang', 'menikah'], N, p=[0.88, 0.12])
status_25_plus = np.random.choice(['lajang', 'menikah', 'cerai'], N, p=[0.22, 0.68, 0.10])
status_pernikahan = np.where(usia < 25, status_under_25, status_25_plus)

education_labels = ['sd', 'smp', 'sma', 'd3', 's1', 's2']
education_weights = [0.08, 0.15, 0.40, 0.12, 0.22, 0.03]
pendidikan = np.random.choice(education_labels, size=N, p=education_weights)

# =============================
# 2. FINANCIAL CONDITION (DRIVEN BY OCCUPATION & AGE)
# =============================
# Balances are log-normal per occupation. The (mu, sigma) pairs are on the log scale and
# deliberately small; SCALE further down lifts the draws into IDR magnitudes.
SALDO_LOGNORMAL_PARAMS = {
    'pelajar': (10.5, 0.6),
    'buruh': (11.5, 0.8),
    'karyawan': (12.5, 0.9),
    'pns': (12.6, 0.8),
    'wiraswasta': (12.8, 1.0),
}
SALDO_LOGNORMAL_DEFAULT = (11.0, 0.9)  # 'tidak_bekerja' and any other label

# Look up each customer's (mu, sigma) once and draw the whole column with a single
# vectorised call (one draw per customer, in row order) instead of a Python loop.
# reshape(-1, 2) keeps the column slicing valid even when there are no rows.
saldo_params = np.array(
    [SALDO_LOGNORMAL_PARAMS.get(job, SALDO_LOGNORMAL_DEFAULT) for job in pekerjaan],
    dtype=float,
).reshape(-1, 2)
saldo = np.random.lognormal(mean=saldo_params[:, 0], sigma=saldo_params[:, 1])

# Age effect: older customers get a modest multiplier on their balance.
age_multiplier = 1 + ((usia - 30) / 80)
age_multiplier = np.clip(age_multiplier, 0.8, 1.6)
saldo = saldo * age_multiplier

# Scale factor that brings the distribution to IDR: 50 * exp(12.5) is roughly
# 13.4 million, so typical balances land in the millions to tens of millions.
SCALE = 50
# Cap while the values are still floats and only then cast to an explicit 64-bit
# integer. Casting first (with the platform-dependent `int`) lets a value above
# 2**31 wrap around on builds where `int` is 32-bit, before the cap is applied.
saldo = np.clip(saldo * SCALE, 0, 2_000_000_000).astype(np.int64)

# Business buckets: rendah (low) <= 5M < menengah (medium) <= 50M < tinggi (high).
saldo_bucket = pd.cut(
    saldo,
    bins=[-1, 5_000_000, 50_000_000, np.inf],
    labels=['rendah', 'menengah', 'tinggi']
)

# Every flag in this section is a 'ya' (yes) / 'tidak' (no) label.
ya_tidak = ['ya', 'tidak']

# punya_kpr (has a mortgage): 55% 'ya' for customers aged 28-55 with a balance above
# 20M IDR, 20% for everyone else. The favourable group is drawn first.
mask_kpr = (usia >= 28) & (usia <= 55) & (saldo > 20_000_000)
n_kpr_likely = int(mask_kpr.sum())
punya_kpr = np.empty(N, dtype=object)
punya_kpr[mask_kpr] = np.random.choice(ya_tidak, n_kpr_likely, p=[0.55, 0.45])
punya_kpr[~mask_kpr] = np.random.choice(ya_tidak, N - n_kpr_likely, p=[0.20, 0.80])

# punya_pinjaman (has a loan): 65% 'ya' for mortgage holders, 30% otherwise.
mask_pk = (punya_kpr == 'ya')
n_with_kpr = int(mask_pk.sum())
punya_pinjaman = np.empty(N, dtype=object)
punya_pinjaman[mask_pk] = np.random.choice(ya_tidak, n_with_kpr, p=[0.65, 0.35])
punya_pinjaman[~mask_pk] = np.random.choice(ya_tidak, N - n_with_kpr, p=[0.30, 0.70])

# pernah_gagal_bayar (ever defaulted): 12% 'ya' with a loan, 3% without. Both candidate
# columns are drawn at full length and the matching one is chosen per row.
default_if_loan = np.random.choice(ya_tidak, N, p=[0.12, 0.88])
default_if_no_loan = np.random.choice(ya_tidak, N, p=[0.03, 0.97])
pernah_gagal_bayar = np.where(punya_pinjaman == 'ya', default_if_loan, default_if_no_loan)

# =============================
# 3. CUSTOMER METADATA
# =============================
# Skewed tenure: weights fall linearly from 1.0 (0 years) to 0.15 (20 years), so new
# customers are common and very long-standing ones are rare.
tenure_years = np.arange(0, 21)
probs = np.linspace(1.0, 0.15, 21)
probs = probs / probs.sum()
lama_nasabah_tahun = np.random.choice(tenure_years, size=N, p=probs)

# Segment by tenure: >= 5 years 'loyal', 1-4 years 'existing', under 1 year 'baru' (new).
# The first matching condition wins, so the order of the conditions matters.
segmen_nasabah = np.select(
    [lama_nasabah_tahun >= 5, lama_nasabah_tahun >= 1],
    ['loyal', 'existing'],
    default='baru',
)

# =============================
# 4. OFFER CHANNEL (Indonesia-ready)
# =============================
channel_labels = ['whatsapp', 'telepon', 'email']
channel_weights = [0.55, 0.25, 0.20]
metode_penawaran = np.random.choice(channel_labels, size=N, p=channel_weights)

# Indonesian month abbreviations; the offer month is drawn uniformly.
bulan_list = [
    'Jan', 'Feb', 'Mar', 'Apr', 'Mei', 'Jun',
    'Jul', 'Agu', 'Sep', 'Okt', 'Nov', 'Des',
]
periode_penawaran = np.random.choice(bulan_list, size=N)

# =============================
# 5. HISTORY: previously offered & previous outcome
# =============================
# Whether a customer was offered before is an independent 30/70 draw; only the
# previous outcome depends on tenure and balance bucket.
pernah_ditawari_sebelumnya = np.random.choice(['ya', 'tidak'], N, p=[0.30, 0.70])
mask_prev = pernah_ditawari_sebelumnya == 'ya'

# Previous-success probability: 10% base, +2 points per tenure year (capped at
# 10 years), +12 points for a high ('tinggi') balance, limited to [0.03, 0.8].
tenure_term = 0.02 * np.minimum(lama_nasabah_tahun[mask_prev], 10)
high_balance_term = 0.12 * (saldo_bucket[mask_prev] == 'tinggi').astype(int)
prob_success_prev = np.clip(0.10 + tenure_term + high_balance_term, 0.03, 0.8)

# One uniform draw per previously-offered customer decides 'berhasil' (success)
# versus 'gagal' (failed).
r = np.random.rand(mask_prev.sum())
hasil_prev_for_mask = np.where(r < prob_success_prev, 'berhasil', 'gagal')

# Customers never offered before keep the 'tidak_ada' (none) placeholder.
hasil_penawaran_sebelumnya = np.full(N, 'tidak_ada', dtype=object)
hasil_penawaran_sebelumnya[mask_prev] = hasil_prev_for_mask

# =============================
# 6. CONTACT FREQUENCY & INTERACTION DURATION
# =============================
# Default contact count: Poisson(2.0) shifted by one so it is at least 1.
base_lambda = 2.0
frekuensi_kontak = np.random.poisson(lam=base_lambda, size=N) + 1

# Customers with a known previous outcome get a fresh draw instead, floored at 1:
# fewer contacts after a previous success, more after a previous failure.
mask_prev_success = hasil_penawaran_sebelumnya == 'berhasil'
mask_prev_failed = hasil_penawaran_sebelumnya == 'gagal'
freq_arr = frekuensi_kontak.copy()
for prev_mask, lam in ((mask_prev_success, 1.4), (mask_prev_failed, 2.3)):
    n_rows = prev_mask.sum()
    if n_rows > 0:
        freq_arr[prev_mask] = np.maximum(1, np.random.poisson(lam=lam, size=n_rows))
frekuensi_kontak = freq_arr.astype(int)

# Interaction duration in seconds: gamma base (shape 2, scale 80, mean 160) multiplied
# by balance-bucket, channel and age factors.
base_durasi = np.random.gamma(shape=2.0, scale=80.0, size=N)

saldo_mult = np.select(
    [saldo_bucket == 'tinggi', saldo_bucket == 'menengah'],
    [1.45, 1.15],
    default=0.95,
)
metode_mult = np.select(
    [metode_penawaran == 'telepon', metode_penawaran == 'whatsapp'],
    [1.45, 1.0],
    default=0.6,
)
age_mult = np.clip(1 + ((usia - 30) / 120), 0.85, 1.5)

# Truncate to whole seconds first, then bound to [5 s, 2 h].
durasi_raw = base_durasi * saldo_mult * metode_mult * age_mult
durasi_interaksi_detik = np.clip(durasi_raw.astype(int), 5, 7200)

# =============================
# 7. TARGET: respon_penawaran_deposito
# =============================
# 0/1 indicator columns for the categorical drivers of the response.
is_saldo_menengah = (saldo_bucket == 'menengah').astype(int)
is_saldo_tinggi = (saldo_bucket == 'tinggi').astype(int)
is_prev_contact = (pernah_ditawari_sebelumnya == 'ya').astype(int)
is_prev_success = (hasil_penawaran_sebelumnya == 'berhasil').astype(int)
is_telepon = (metode_penawaran == 'telepon').astype(int)
is_whatsapp = (metode_penawaran == 'whatsapp').astype(int)
is_email = (metode_penawaran == 'email').astype(int)
is_pernah_gagal = (pernah_gagal_bayar == 'ya').astype(int)
is_punya_kpr = (punya_kpr == 'ya').astype(int)

# Linear predictor WITHOUT intercept, as a (coefficient, feature) table. The intercept
# is calibrated separately against the cold-prospect baseline.
logit_terms = [
    (0.15, is_saldo_menengah),         # modest uplift for a medium balance
    (0.60, is_saldo_tinggi),           # larger uplift for a high balance
    (1.05, is_prev_success),           # strong lift after a previous success
    (0.35, is_prev_contact),           # moderate lift if contacted before
    (0.60, is_telepon),                # phone lift
    (0.25, is_whatsapp),               # smaller WhatsApp lift
    (-0.20, is_email),                 # email slightly worse
    (0.02, lama_nasabah_tahun),        # each tenure year raises the odds a bit
    (-0.80, is_pernah_gagal),          # penalty for a past payment default
    (-0.20, is_punya_kpr),             # slight negative for mortgage holders
    (0.0012, durasi_interaksi_detik),  # small duration effect
]

# Accumulate strictly in table order so the floating-point sum is reproducible.
first_weight, first_feature = logit_terms[0]
logit_no_intercept = first_weight * first_feature
for weight, feature in logit_terms[1:]:
    logit_no_intercept = logit_no_intercept + weight * feature

# Calibrate intercept so that COLD prospects (never contacted before) have baseline ~12%
def calibrate_intercept_for_mask(logit_no_intercept, mask, target=0.12, iters=40):
    """Find the intercept that gives the masked rows a mean probability of ``target``.

    Bisects the intercept on [-10, 10]. At each step the mean of
    ``sigmoid(logit_no_intercept + intercept)`` over the calibration rows is compared
    with ``target`` and the bracket is halved accordingly.

    Parameters
    ----------
    logit_no_intercept : array-like of float
        Linear predictor without its intercept, one value per row.
    mask : array-like of bool
        Rows to calibrate on. If no row is selected, all rows are used instead.
    target : float, default 0.12
        Desired mean probability on the calibration rows.
    iters : int, default 40
        Number of bisection steps; the bracket width shrinks to ``20 / 2**iters``.

    Returns
    -------
    float
        Midpoint of the final bracket. A target that cannot be reached inside
        [-10, 10] yields a value at the corresponding end of that interval.
    """
    logits = np.asarray(logit_no_intercept, dtype=float)
    selector = np.asarray(mask, dtype=bool)

    # Only the calibration rows influence the mean, and the selection never changes
    # between steps, so slice once instead of re-evaluating every row per iteration.
    calib_logits = logits[selector] if selector.any() else logits

    low, high = -10.0, 10.0
    for _ in range(iters):
        mid = (low + high) / 2
        mean_prob = (1 / (1 + np.exp(-(calib_logits + mid)))).mean()
        if mean_prob > target:
            high = mid
        else:
            low = mid
    return (low + high) / 2

# Cold prospects are customers who were never offered the product before ('tidak').
mask_cold = (pernah_ditawari_sebelumnya == 'tidak')
intercept = calibrate_intercept_for_mask(logit_no_intercept, mask_cold, target=0.12)
logit = logit_no_intercept + intercept

# Sigmoid of the calibrated logit, kept away from exactly 0 and 1.
prob_respon = np.clip(1 / (1 + np.exp(-logit)), 0.001, 0.98)

# Seasonality is layered on AFTER calibration, so the realised cold-prospect rate can
# drift slightly from the 12% target: Mar/Mei are damped, Agu/Nov/Des are boosted.
in_slow_month = np.isin(periode_penawaran, ['Mar', 'Mei'])
in_peak_month = np.isin(periode_penawaran, ['Agu', 'Nov', 'Des'])
seasonal_multiplier = np.where(in_peak_month, 1.06, np.where(in_slow_month, 0.92, 1.0))

# One Bernoulli draw per customer with the seasonally adjusted probability.
prob_respon_seasonal = np.clip(prob_respon * seasonal_multiplier, 0.001, 0.99)
respon_penawaran_deposito = np.random.binomial(1, prob_respon_seasonal, size=N)

# =============================
# 8. BUILD DATAFRAME
# =============================
# Insertion order here is the column order of the exported CSV.
columns = {
    'usia': usia,
    'pekerjaan': pekerjaan,
    'status_pernikahan': status_pernikahan,
    'pendidikan': pendidikan,
    'saldo_rekening': saldo,
    'kategori_saldo': saldo_bucket.astype(str),
    'punya_kpr': punya_kpr,
    'punya_pinjaman': punya_pinjaman,
    'pernah_gagal_bayar': pernah_gagal_bayar,
    'metode_penawaran': metode_penawaran,
    'periode_penawaran': periode_penawaran,
    'frekuensi_kontak': frekuensi_kontak,
    'durasi_interaksi_detik': durasi_interaksi_detik,
    'pernah_ditawari_sebelumnya': pernah_ditawari_sebelumnya,
    'hasil_penawaran_sebelumnya': hasil_penawaran_sebelumnya,
    'respon_penawaran_deposito': respon_penawaran_deposito,
    'lama_nasabah_tahun': lama_nasabah_tahun,
    'segmen_nasabah': segmen_nasabah,
}
df = pd.DataFrame(columns)

# Blank out the education level for a random 1% of rows (sampled without replacement)
# so the dataset has a little realistic missingness.
n_missing = int(0.01 * N)
idx_nan = np.random.choice(df.index, size=n_missing, replace=False)
df.loc[idx_nan, 'pendidikan'] = np.nan

# =============================
# 9. QUICK CHECKS (printable summary untuk interview)
# =============================
def quick_checks(df):
    """Print a plain-text sanity summary of the generated dataset.

    Reports the row count, the overall response rate, the channel mix, response
    rates grouped by channel / balance bucket / previous outcome, the share of
    'pelajar' rows, balance quantiles, and (when the low-bucket rate is positive)
    the ratio of the high-bucket to the low-bucket response rate.
    """
    target_col = 'respon_penawaran_deposito'
    responses = df[target_col]

    print("=== QUICK SANITY CHECKS ===")
    print("N:", len(df))
    print("Overall response rate: {:.2%}".format(responses.mean()))

    print("\nMetode penawaran (proporsi):")
    print(df['metode_penawaran'].value_counts(normalize=True).round(3))

    # Same grouped-mean report for each categorical driver, in a fixed order.
    breakdowns = (
        ("\nResponse by metode_penawaran:", 'metode_penawaran'),
        ("\nResponse by kategori_saldo:", 'kategori_saldo'),
        ("\nResponse by hasil_penawaran_sebelumnya:", 'hasil_penawaran_sebelumnya'),
    )
    for heading, group_col in breakdowns:
        print(heading)
        print(df.groupby(group_col)[target_col].mean().round(3))

    pelajar_share = (df['pekerjaan'] == 'pelajar').mean()
    print("\nPelajar proportion: {:.2%}".format(pelajar_share))

    print("Saldo quantiles (IDR):")
    quantile_levels = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
    print(df['saldo_rekening'].quantile(quantile_levels).astype(int))

    # Ratio between the extreme balance buckets; skipped when the low-bucket rate
    # is zero or undefined.
    bucket = df['kategori_saldo']
    low_mean = responses[bucket == 'rendah'].mean()
    high_mean = responses[bucket == 'tinggi'].mean()
    if low_mean > 0:
        print("\nHigh / Low response ratio: {:.2f}".format(high_mean / low_mean))
    print("============================")

# Print the sanity-check summary for the freshly generated frame.
quick_checks(df)

# =============================
# 10. EXPORT
# =============================
# Write the dataset as a UTF-8 CSV (relative path, no index column).
out_fn = 'dataset_simulasi_perbankan_indo_v3.csv'
df.to_csv(out_fn, index=False, encoding='utf-8')
# The message is Indonesian for "CSV created successfully".
print(f"\nCSV berhasil dibuat: {out_fn}")


=== QUICK SANITY CHECKS ===
N: 12000
Overall response rate: 14.53%

Metode penawaran (proporsi):
metode_penawaran
whatsapp    0.542
telepon     0.253
email       0.205
Name: proportion, dtype: float64

Response by metode_penawaran:
metode_penawaran
email       0.077
telepon     0.207
whatsapp    0.143
Name: respon_penawaran_deposito, dtype: float64

Response by kategori_saldo:
kategori_saldo
menengah    0.142
rendah      0.120
tinggi      0.229
Name: respon_penawaran_deposito, dtype: float64

Response by hasil_penawaran_sebelumnya:
hasil_penawaran_sebelumnya
berhasil     0.348
gagal        0.160
tidak_ada    0.121
Name: respon_penawaran_deposito, dtype: float64

Pelajar proportion: 2.72%
Saldo quantiles (IDR):
0.01       751820
0.10      2610955
0.25      5535736
0.50     12237946
0.75     25877163
0.90     48197925
0.99    145275353
Name: saldo_rekening, dtype: int64

High / Low response ratio: 1.92

CSV berhasil dibuat: dataset_simulasi_perbankan_indo_v3.csv


In [5]:
# Reload the CSV that the generator cell exported (`out_fn`). This cell previously
# read "dataset_perbankan_indo.csv", which no visible cell in this notebook writes,
# so a fresh Restart & Run All would either fail or analyse a stale file.
# NOTE(review): if that other file is the intended input, restore the path and
# document where the file comes from.
df = pd.read_csv(out_fn)

print("=== BASIC INFO ===")
print("Shape:", df.shape)
print("\nColumns:")
print(df.columns.tolist())
print("\nData types:")
print(df.dtypes)

df

=== BASIC INFO ===
Shape: (12000, 18)

Columns:
['usia', 'pekerjaan', 'status_pernikahan', 'pendidikan', 'saldo_rekening', 'kategori_saldo', 'punya_kpr', 'punya_pinjaman', 'pernah_gagal_bayar', 'metode_penawaran', 'periode_penawaran', 'frekuensi_kontak', 'durasi_interaksi_detik', 'pernah_ditawari_sebelumnya', 'hasil_penawaran_sebelumnya', 'respon_penawaran_deposito', 'lama_nasabah_tahun', 'segmen_nasabah']

Data types:
usia                           int64
pekerjaan                     object
status_pernikahan             object
pendidikan                    object
saldo_rekening                 int64
kategori_saldo                object
punya_kpr                     object
punya_pinjaman                object
pernah_gagal_bayar            object
metode_penawaran              object
periode_penawaran             object
frekuensi_kontak               int64
durasi_interaksi_detik         int64
pernah_ditawari_sebelumnya    object
hasil_penawaran_sebelumnya    object
respon_penawaran_depos

Unnamed: 0,usia,pekerjaan,status_pernikahan,pendidikan,saldo_rekening,kategori_saldo,punya_kpr,punya_pinjaman,pernah_gagal_bayar,metode_penawaran,periode_penawaran,frekuensi_kontak,durasi_interaksi_detik,pernah_ditawari_sebelumnya,hasil_penawaran_sebelumnya,respon_penawaran_deposito,lama_nasabah_tahun,segmen_nasabah
0,56,tidak_bekerja,menikah,sd,2757691,rendah,tidak,tidak,tidak,whatsapp,Jul,2,75,ya,berhasil,0,8,loyal
1,46,wiraswasta,lajang,sma,5531440,menengah,tidak,tidak,tidak,telepon,Des,1,149,tidak,tidak_ada,0,19,loyal
2,32,pns,menikah,sma,17308773,menengah,tidak,tidak,tidak,whatsapp,Apr,2,99,tidak,tidak_ada,1,4,existing
3,60,karyawan,menikah,d3,3844860,rendah,tidak,tidak,tidak,telepon,Jan,2,212,ya,gagal,0,11,loyal
4,25,karyawan,cerai,smp,38608410,menengah,tidak,ya,tidak,email,Agu,3,78,tidak,tidak_ada,0,12,loyal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,28,karyawan,menikah,d3,10635133,menengah,tidak,tidak,tidak,telepon,Jun,3,260,tidak,tidak_ada,0,15,loyal
11996,49,buruh,menikah,smp,8327229,menengah,tidak,ya,tidak,telepon,Jan,1,367,tidak,tidak_ada,0,4,existing
11997,23,tidak_bekerja,lajang,s1,6447967,menengah,tidak,tidak,tidak,telepon,Jun,4,344,tidak,tidak_ada,0,1,existing
11998,52,karyawan,menikah,d3,28580825,menengah,tidak,tidak,tidak,whatsapp,Okt,1,59,tidak,tidak_ada,0,10,loyal


# Show how often each distinct value occurs in every object-dtype (categorical) column
categorical_cols = df.select_dtypes(include='object').columns

for col in categorical_cols:
    # Blank line plus the column name as a header, then that column's value counts.
    print(f"\n{col}")
    print(df[col].value_counts())
