In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from google.colab import files

# Upload the datasets.
# The user is prompted twice through the Colab upload widget: first for the
# training CSV, then for the validation CSV.
print("Silakan unggah file training_dataset.csv")
uploaded_train = files.upload()

# Take the first key yielded by the upload result and use it as the file name.
# NOTE(review): presumably `files.upload()` returns a dict keyed by file name
# and also stores the file in the working directory - confirm.
train_file_name = next(iter(uploaded_train))
print(f"\nFile {train_file_name} berhasil diunggah.")

print("\nSilakan unggah file validation_set.csv")
uploaded_validation = files.upload()

validation_file_name = next(iter(uploaded_validation))
print(f"\nFile {validation_file_name} berhasil diunggah.")

# Load both CSV files into DataFrames.
try:
    train_df = pd.read_csv(train_file_name)
    validation_df = pd.read_csv(validation_file_name)
    print("\nDataset berhasil dimuat.")
except Exception as e:
    print(f"Error saat memuat dataset: {e}")
    # Re-raise so execution stops when a file cannot be loaded.
    raise

# 3. Pre-processing dan Rekayasa Fitur
def preprocess(df):
    """Add the binary ``pernah_dikontak_sebelumnya`` column to ``df`` in place.

    The new column is 0 where ``hari_sejak_kontak_sebelumnya`` equals 999
    (the value the notebook treats as "never contacted before") and 1 for
    every other value.

    Args:
        df: DataFrame containing a ``hari_sejak_kontak_sebelumnya`` column.
            It is modified in place, so pass a copy to keep the original.

    Returns:
        The same DataFrame object, with the extra column added.
    """
    days_since_contact = df['hari_sejak_kontak_sebelumnya']
    # Vectorised comparison: True (-> 1) wherever the value is not 999.
    df['pernah_dikontak_sebelumnya'] = days_since_contact.ne(999).astype(int)
    return df

# Run the feature engineering on copies so the raw frames stay untouched
# (`preprocess` adds its column in place).
train_df_processed = preprocess(train_df.copy())
validation_df_processed = preprocess(validation_df.copy())
print("\nPra-pemrosesan dan rekayasa fitur selesai.")

# Separate the target from the features; the customer id is dropped from both
# feature matrices.
X = train_df_processed.drop(["customer_number", "berlangganan_deposito"], axis=1)
y = train_df_processed["berlangganan_deposito"]
X_test = validation_df_processed.drop("customer_number", axis=1)

# One-hot encode the categorical features.
# Categorical columns are identified as those with dtype 'object'; the lists
# are computed independently for the training and validation frames.
categorical_cols_train = X.select_dtypes(include='object').columns
categorical_cols_test = X_test.select_dtypes(include='object').columns

X_encoded = pd.get_dummies(X, columns=categorical_cols_train, dummy_na=False)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols_test, dummy_na=False)
print("One-hot encoding selesai.")

# Align the columns after one-hot encoding: both frames are reindexed to the
# union of their columns, and any column missing on one side is filled with 0.
all_cols = X_encoded.columns.union(X_test_encoded.columns)

X_encoded = X_encoded.reindex(columns=all_cols, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=all_cols, fill_value=0)

# Make sure the column order of the validation matrix matches the training one.
X_test_encoded = X_test_encoded[X_encoded.columns]
print("Penyelarasan kolom selesai.")

# Train the model: LightGBM with stratified K-fold cross-validation.
# Each fold trains on K-1 parts, early-stops on the held-out part, fills the
# out-of-fold (OOF) predictions for that part and contributes 1/K of the
# averaged prediction for the validation set.
NFOLDS = 5  # number of folds for K-fold cross-validation
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
oof_preds = np.zeros(X_encoded.shape[0])
sub_preds = np.zeros(X_test_encoded.shape[0])

# LightGBM parameters (can be tuned further).
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 10000,       # high cap; early stopping picks the real count
    'learning_rate': 0.01,       # small learning rate
    'num_leaves': 20,            # leaves per tree
    'max_depth': 5,              # maximum tree depth
    'seed': 42 + 1,              # seed for reproducibility
    'n_jobs': -1,                # use all cores
    'verbose': -1,               # reduce log output
    'colsample_bytree': 0.7,     # fraction of features sampled per tree
    'subsample': 0.7,            # fraction of rows sampled per tree
    'subsample_freq': 1,         # LightGBM ignores `subsample` unless the bagging frequency is > 0
    'reg_alpha': 0.1,            # L1 regularisation
    'reg_lambda': 0.1,           # L2 regularisation
}

# Weight the positive class by the negative/positive ratio to compensate for
# class imbalance.
counts = y.value_counts()
if counts.get(1, 0) > 0:  # only when at least one positive sample exists
    scale_pos_weight_val = counts[0] / counts[1]
    lgb_params['scale_pos_weight'] = scale_pos_weight_val
    print(f"Menggunakan scale_pos_weight: {scale_pos_weight_val:.2f}")
else:
    print("Peringatan: Tidak ada sampel kelas positif (1) dalam data latih. scale_pos_weight tidak diatur.")


print("\nMemulai pelatihan model dengan cross-validation...")
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_encoded, y)):
    X_train_fold, y_train_fold = X_encoded.iloc[train_idx], y.iloc[train_idx]
    X_valid_fold, y_valid_fold = X_encoded.iloc[valid_idx], y.iloc[valid_idx]

    model = lgb.LGBMClassifier(**lgb_params)

    # `verbose` of the callback is a boolean: the previous value -1 is truthy
    # and therefore still printed the early-stopping log for every fold.
    model.fit(X_train_fold, y_train_fold,
              eval_set=[(X_valid_fold, y_valid_fold)],
              eval_metric='auc',
              callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)])

    oof_preds[valid_idx] = model.predict_proba(X_valid_fold)[:, 1]
    sub_preds += model.predict_proba(X_test_encoded)[:, 1] / folds.n_splits
    print(f"Fold {n_fold+1} selesai. AUC: {roc_auc_score(y_valid_fold, oof_preds[valid_idx]):.4f}")

# AUC over the pooled out-of-fold predictions of all folds.
overall_oof_auc = roc_auc_score(y, oof_preds)
print(f"\nPelatihan selesai. Skor CV (OOF AUC) Keseluruhan: {overall_oof_auc:.4f}")

# Build the submission file: one row per validation customer with the
# fold-averaged predicted probability held in `sub_preds`.
submission_df = pd.DataFrame({
    "customer_number": validation_df["customer_number"],
    "berlangganan_deposito": sub_preds
})

submission_file_path = "Validation_Result_Submission.csv"
submission_df.to_csv(submission_file_path, index=False)
print(f"\nFile prediksi disimpan sebagai: {submission_file_path}")

# Offer the prediction file for download through Colab. A failure here is
# only reported, because the CSV has already been written to disk.
try:
    files.download(submission_file_path)
    print(f"\nFile {submission_file_path} Downloading... ")
except Exception as e:
    print(f"Gagal mengunduh file secara otomatis: {e}")
    print("Anda dapat mengunduhnya secara manual dari panel file di sebelah kiri Colab.")

print("\n Selesai :) ")

Silakan unggah file training_dataset.csv


Saving training_dataset.csv to training_dataset.csv

File training_dataset.csv berhasil diunggah.

Silakan unggah file validation_set.csv


Saving validation_set.csv to validation_set.csv

File validation_set.csv berhasil diunggah.

Dataset berhasil dimuat.

Pra-pemrosesan dan rekayasa fitur selesai.
One-hot encoding selesai.
Penyelarasan kolom selesai.
Menggunakan scale_pos_weight: 7.77

Memulai pelatihan model dengan cross-validation...
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[973]	valid_0's auc: 0.787233
Fold 1 selesai. AUC: 0.7872
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[8]	valid_0's auc: 0.798826
Fold 2 selesai. AUC: 0.7988
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.814237
Fold 3 selesai. AUC: 0.8142
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[168]	valid_0's auc: 0.795858
Fold 4 selesai. AUC: 0.7959
Training until validation scores don't improve for 200 rounds
Early stopping, bes

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


File Validation_Result_Submission.csv Downloading... 

 Selesai :) 


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb # Mengganti lightgbm dengan xgboost
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from google.colab import files

# ---------------------------------------------------------------------------
# 1. UPLOAD THE DATASET FILES
# ---------------------------------------------------------------------------
# The user is prompted twice through the Colab upload widget: first for the
# training CSV, then for the validation CSV.
print("Silakan unggah file training_dataset.csv")
uploaded_train = files.upload()

# Take the first key yielded by the upload result and use it as the file name.
# NOTE(review): presumably `files.upload()` returns a dict keyed by file name
# and also stores the file in the working directory - confirm.
train_file_name = next(iter(uploaded_train))
print(f"\nFile {train_file_name} berhasil diunggah.")

print("\nSilakan unggah file validation_set.csv")
uploaded_validation = files.upload()

validation_file_name = next(iter(uploaded_validation))
print(f"\nFile {validation_file_name} berhasil diunggah.")

# ---------------------------------------------------------------------------
# 2. LOAD THE DATASETS
# ---------------------------------------------------------------------------
try:
    train_df = pd.read_csv(train_file_name)
    validation_df = pd.read_csv(validation_file_name)
    print("\nDataset berhasil dimuat.")
except Exception as e:
    print(f"Error saat memuat dataset: {e}")
    # Re-raise so execution stops when a file cannot be loaded.
    raise

# ---------------------------------------------------------------------------
# 3. PRA-PEMROSESAN DAN REKAYASA FITUR
# ---------------------------------------------------------------------------
def preprocess(df):
    """Add the binary ``pernah_dikontak_sebelumnya`` column to ``df`` in place.

    The new column is 0 where ``hari_sejak_kontak_sebelumnya`` equals 999
    (the value the notebook treats as "never contacted before") and 1 for
    every other value.

    Args:
        df: DataFrame containing a ``hari_sejak_kontak_sebelumnya`` column.
            It is modified in place, so pass a copy to keep the original.

    Returns:
        The same DataFrame object, with the extra column added.
    """
    days_since_contact = df['hari_sejak_kontak_sebelumnya']
    # Vectorised comparison: True (-> 1) wherever the value is not 999.
    df['pernah_dikontak_sebelumnya'] = days_since_contact.ne(999).astype(int)
    return df

# Run the feature engineering on copies so the raw frames stay untouched
# (`preprocess` adds its column in place).
train_df_processed = preprocess(train_df.copy())
validation_df_processed = preprocess(validation_df.copy())
print("\nPra-pemrosesan dan rekayasa fitur selesai.")

# Separate the target from the features; the customer id is dropped from both
# feature matrices.
X = train_df_processed.drop(["customer_number", "berlangganan_deposito"], axis=1)
y = train_df_processed["berlangganan_deposito"]
X_test = validation_df_processed.drop("customer_number", axis=1)

# One-hot encode the categorical features.
# Categorical columns are identified as those with dtype 'object'; the lists
# are computed independently for the training and validation frames.
categorical_cols_train = X.select_dtypes(include='object').columns
categorical_cols_test = X_test.select_dtypes(include='object').columns

X_encoded = pd.get_dummies(X, columns=categorical_cols_train, dummy_na=False)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols_test, dummy_na=False)
print("One-hot encoding selesai.")

# Align the columns after one-hot encoding: both frames are reindexed to the
# union of their columns, and any column missing on one side is filled with 0.
all_cols = X_encoded.columns.union(X_test_encoded.columns)

X_encoded = X_encoded.reindex(columns=all_cols, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=all_cols, fill_value=0)

# Make sure the column order of the validation matrix matches the training one.
X_test_encoded = X_test_encoded[X_encoded.columns]
print("Penyelarasan kolom selesai.")


# 4. MODEL TRAINING (XGBoost with stratified K-fold cross-validation)
# ---------------------------------------------------------------------------
# Each fold trains on K-1 parts, early-stops on the held-out part, fills the
# out-of-fold (OOF) predictions for that part and contributes 1/K of the
# averaged prediction for the validation set.
NFOLDS = 5  # number of folds for K-fold cross-validation
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)
oof_preds_xgb = np.zeros(X_encoded.shape[0])
sub_preds_xgb = np.zeros(X_test_encoded.shape[0])

# XGBoost parameters (can be tuned further).
# `eval_metric` and `early_stopping_rounds` are passed to the constructor
# below. The deprecated `use_label_encoder` flag is intentionally not set:
# constructor-level early stopping already requires a release in which the
# label encoder is disabled, and newer releases no longer know the parameter.
xgb_params = {
    'objective': 'binary:logistic', # binary classification
    'n_estimators': 10000,          # high cap; early stopping picks the real count
    'learning_rate': 0.01,          # small learning rate
    'max_depth': 5,                 # maximum tree depth
    'random_state': 42 + 1,         # seed for reproducibility
    'n_jobs': -1,                   # use all cores
    'verbosity': 0,                 # 0 (silent), 1 (warning), 2 (info), 3 (debug)
    'colsample_bytree': 0.7,        # fraction of features sampled per tree
    'subsample': 0.7,               # fraction of rows sampled per tree
    'reg_alpha': 0.1,               # L1 regularisation
    'reg_lambda': 0.1,              # L2 regularisation
}

# Weight the positive class by the negative/positive ratio to compensate for
# class imbalance.
counts = y.value_counts()
if counts.get(1, 0) > 0:  # only when at least one positive sample exists
    scale_pos_weight_val = counts[0] / counts[1]
    xgb_params['scale_pos_weight'] = scale_pos_weight_val
    print(f"Menggunakan scale_pos_weight untuk XGBoost: {scale_pos_weight_val:.2f}")
else:
    print("Peringatan: Tidak ada sampel kelas positif (1) dalam data latih. scale_pos_weight tidak diatur.")

# Early-stopping patience and evaluation metric, both given to the constructor.
early_stopping_rounds = 200
eval_metric = 'auc'

print("\nMemulai pelatihan model XGBoost dengan cross-validation...")
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_encoded, y)):
    X_train_fold, y_train_fold = X_encoded.iloc[train_idx], y.iloc[train_idx]
    X_valid_fold, y_valid_fold = X_encoded.iloc[valid_idx], y.iloc[valid_idx]

    # A fresh model per fold, so no state leaks between folds.
    model_xgb = xgb.XGBClassifier(
        **xgb_params,
        early_stopping_rounds=early_stopping_rounds,
        eval_metric=eval_metric,
    )

    # `verbose=False` silences the per-iteration evaluation log of this fit.
    model_xgb.fit(X_train_fold, y_train_fold,
                  eval_set=[(X_valid_fold, y_valid_fold)],
                  verbose=False)

    oof_preds_xgb[valid_idx] = model_xgb.predict_proba(X_valid_fold)[:, 1]
    sub_preds_xgb += model_xgb.predict_proba(X_test_encoded)[:, 1] / folds.n_splits
    # The fold metric reported here is the model's own best early-stopping score.
    print(f"Fold {n_fold+1} selesai. AUC: {model_xgb.best_score:.4f}")


# AUC over the pooled out-of-fold predictions of all folds.
overall_oof_auc_xgb = roc_auc_score(y, oof_preds_xgb)
print(f"\nPelatihan XGBoost selesai. Skor CV (OOF AUC) Keseluruhan: {overall_oof_auc_xgb:.4f}")

# ---------------------------------------------------------------------------
# 5. BUILD THE SUBMISSION FILE
# ---------------------------------------------------------------------------
# One row per validation customer with the fold-averaged predicted probability.
submission_df_xgb = pd.DataFrame({
    "customer_number": validation_df["customer_number"],
    "berlangganan_deposito": sub_preds_xgb # predictions from the XGBoost models
})

submission_file_path_xgb = "Validation_Result_Submission_XGBoost.csv" # distinct name for the XGBoost run
submission_df_xgb.to_csv(submission_file_path_xgb, index=False)
print(f"\nFile prediksi XGBoost disimpan sebagai: {submission_file_path_xgb}")

# ---------------------------------------------------------------------------
# 6. DOWNLOAD THE PREDICTION FILE
# ---------------------------------------------------------------------------
# A failure here is only reported, because the CSV has already been written
# to disk.
try:
    files.download(submission_file_path_xgb)
    print(f"\nFile {submission_file_path_xgb} sedang diunduh...")
except Exception as e:
    print(f"Gagal mengunduh file secara otomatis: {e}")
    print("Anda dapat mengunduhnya secara manual dari panel file di sebelah kiri Colab.")

Silakan unggah file training_dataset.csv


Saving training_dataset.csv to training_dataset.csv

File training_dataset.csv berhasil diunggah.

Silakan unggah file validation_set.csv


Saving validation_set.csv to validation_set.csv

File validation_set.csv berhasil diunggah.

Dataset berhasil dimuat.

Pra-pemrosesan dan rekayasa fitur selesai.
One-hot encoding selesai.
Penyelarasan kolom selesai.
Menggunakan scale_pos_weight untuk XGBoost: 7.77

Memulai pelatihan model XGBoost dengan cross-validation...
Fold 1 selesai. AUC: 0.7854
Fold 2 selesai. AUC: 0.8020
Fold 3 selesai. AUC: 0.8068
Fold 4 selesai. AUC: 0.7949
Fold 5 selesai. AUC: 0.7956

Pelatihan XGBoost selesai. Skor CV (OOF AUC) Keseluruhan: 0.7733

File prediksi XGBoost disimpan sebagai: Validation_Result_Submission_XGBoost.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


File Validation_Result_Submission_XGBoost.csv sedang diunduh...


In [None]:
!pip install optuna
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna # Untuk optimasi hyperparameter

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, KBinsDiscretizer
from google.colab import files

# ---------------------------------------------------------------------------
# 1. UPLOAD THE DATASET FILES
# ---------------------------------------------------------------------------
# The user is prompted twice through the Colab upload widget: first for the
# training CSV, then for the validation CSV. The first key yielded by each
# upload result is used as the file name.
# NOTE(review): presumably `files.upload()` returns a dict keyed by file name
# and also stores the file in the working directory - confirm.
print("Silakan unggah file training_dataset.csv")
uploaded_train = files.upload()
train_file_name = next(iter(uploaded_train))
print(f"\nFile {train_file_name} berhasil diunggah.")

print("\nSilakan unggah file validation_set.csv")
uploaded_validation = files.upload()
validation_file_name = next(iter(uploaded_validation))
print(f"\nFile {validation_file_name} berhasil diunggah.")

# ---------------------------------------------------------------------------
# 2. LOAD THE DATASETS
# ---------------------------------------------------------------------------
# The raw frames are kept under `*_orig` names; the error is reported and then
# re-raised so execution stops when a file cannot be loaded.
try:
    train_df_orig = pd.read_csv(train_file_name)
    validation_df_orig = pd.read_csv(validation_file_name)
    print("\nDataset berhasil dimuat.")
except Exception as e:
    print(f"Error saat memuat dataset: {e}")
    raise

# ---------------------------------------------------------------------------
# 3. REKAYASA FITUR (FEATURE ENGINEERING) LANJUTAN
# ---------------------------------------------------------------------------
def feature_engineer(df_input, age_binner=None):
    """Return a copy of ``df_input`` with the engineered feature columns added.

    Added columns:
      * ``pernah_dikontak_sebelumnya`` - 0 where ``hari_sejak_kontak_sebelumnya``
        is 999 (treated by the notebook as "never contacted before"), else 1.
      * ``hari_sejak_kontak_valid`` - the same column with 999 replaced by -1.
      * ``usia_bin`` - ordinal quantile bin (5 bins) of ``usia``.
      * ``usia_x_jumlah_kontak_kampanye``, ``kontak_sebelumnya_x_kampanye_ini``
        - simple pairwise products.
      * degree-2 polynomial terms of ``usia``, ``jumlah_kontak_kampanye_ini``
        and ``suku_bunga_euribor_3bln`` (whichever of them are present),
        named as produced by ``PolynomialFeatures.get_feature_names_out``.
      * ``rasio_harga_kepercayaan_konsumen`` - consumer price index divided by
        the negated consumer confidence index.

    Args:
        df_input: Source DataFrame; it is not modified.
        age_binner: Optional, already fitted ``KBinsDiscretizer`` used to bin
            ``usia``. Pass the binner fitted on the training data when
            transforming any other frame, so that a given bin code means the
            same age range everywhere. When omitted, a new binner is fitted on
            ``df_input`` itself (the original behaviour).

    Returns:
        A new DataFrame with the original columns plus the engineered ones.
    """
    df = df_input.copy()

    # 1. Handling of 'hari_sejak_kontak_sebelumnya' (999 is the sentinel value).
    days_since_contact = df['hari_sejak_kontak_sebelumnya']
    df['pernah_dikontak_sebelumnya'] = days_since_contact.ne(999).astype(int)
    df['hari_sejak_kontak_valid'] = days_since_contact.replace(999, -1)

    # 2. Age binning. KBinsDiscretizer expects 2-D input, hence df[['usia']].
    if age_binner is None:
        # subsample=None: use every row when computing the quantile edges.
        age_binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile', subsample=None)
        age_binner.fit(df[['usia']])
    df['usia_bin'] = age_binner.transform(df[['usia']]).astype(int)

    # 3. Simple interaction features.
    df['usia_x_jumlah_kontak_kampanye'] = df['usia'] * df['jumlah_kontak_kampanye_ini']
    df['kontak_sebelumnya_x_kampanye_ini'] = df['jumlah_kontak_sebelumnya'] * df['jumlah_kontak_kampanye_ini']

    # 4. Degree-2 polynomial features for a few numeric columns. Only columns
    #    that actually exist are used, to keep the feature count under control.
    poly_candidates = ['usia', 'jumlah_kontak_kampanye_ini', 'suku_bunga_euribor_3bln']
    poly_inputs = [col for col in poly_candidates if col in df.columns]

    if poly_inputs:
        poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
        poly_values = poly.fit_transform(df[poly_inputs])
        poly_names = poly.get_feature_names_out(poly_inputs)
        df_poly = pd.DataFrame(poly_values, columns=poly_names, index=df.index)
        # The degree-1 terms carry the original column names and are already in
        # df, so only the new squared / interaction columns are appended.
        new_cols = [col for col in poly_names if col not in df.columns]
        df = pd.concat([df, df_poly[new_cols]], axis=1)

    # Economic indicators: price index over the negated confidence index; the
    # small epsilon guards against division by zero.
    df['rasio_harga_kepercayaan_konsumen'] = df['indeks_harga_konsumen'] / (df['indeks_kepercayaan_konsumen'] * -1 + 1e-6)

    # Job-related features are left to one-hot encoding; frequency or target
    # encoding would be possible refinements. The original
    # 'hari_sejak_kontak_sebelumnya' column is deliberately kept.

    return df

print("\nMenerapkan rekayasa fitur...")
# Fit the age bins once, on the training data, and reuse them for the
# validation set so 'usia_bin' has the same meaning in both frames.
age_binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile', subsample=None)
age_binner.fit(train_df_orig[['usia']])
train_df_fe = feature_engineer(train_df_orig, age_binner=age_binner)
validation_df_fe = feature_engineer(validation_df_orig, age_binner=age_binner)

# Separate the target from the features; the customer id is dropped from both
# feature matrices.
X = train_df_fe.drop(["customer_number", "berlangganan_deposito"], axis=1)
y = train_df_fe["berlangganan_deposito"]
X_test = validation_df_fe.drop("customer_number", axis=1)

# One-hot encode the categorical features.
# Categorical columns are those with dtype 'object' in the training features;
# the same list is applied to the validation features.
categorical_cols = X.select_dtypes(include='object').columns.tolist()

X_encoded = pd.get_dummies(X, columns=categorical_cols, dummy_na=False, dtype=int)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_cols, dummy_na=False, dtype=int)
print("One-hot encoding selesai.")

# Align the columns after one-hot encoding: both frames are reindexed to the
# union of their columns, and any column missing on one side is filled with 0.
all_cols = X_encoded.columns.union(X_test_encoded.columns)
X_encoded = X_encoded.reindex(columns=all_cols, fill_value=0)
X_test_encoded = X_test_encoded.reindex(columns=all_cols, fill_value=0)
X_test_encoded = X_test_encoded[X_encoded.columns]  # make sure the column order matches
print("Penyelarasan kolom selesai.")


# ---------------------------------------------------------------------------
# 4. OPTIMASI HYPERPARAMETER DENGAN OPTUNA UNTUK LIGHTGBM
# ---------------------------------------------------------------------------
# LightGBM settings that Optuna does not tune. They are defined once and
# shared by the tuning objective and by the final parameter set, so the two
# cannot drift apart.
LGBM_FIXED_PARAMS = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'subsample_freq': 1,  # LightGBM ignores `subsample` unless the bagging frequency is > 0
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1,
}

# Weight the positive class by the negative/positive ratio to compensate for
# class imbalance (only when at least one positive sample exists).
counts = y.value_counts()
if counts.get(1, 0) > 0:
    LGBM_FIXED_PARAMS['scale_pos_weight'] = counts[0] / counts[1]


def objective_lgbm(trial):
    """Optuna objective: mean 5-fold validation AUC of one LightGBM configuration.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean of the per-fold validation AUC scores.
    """
    # Search space sampled by Optuna, on top of the fixed settings.
    lgbm_params = {
        **LGBM_FIXED_PARAMS,
        'n_estimators': trial.suggest_int('n_estimators', 800, 2500),  # upper cap; early stopping may stop sooner
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),  # L1 regularisation
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),  # L2 regularisation
    }

    cv_scores = []
    # The fold seed varies with the trial number, so every trial is scored on
    # a different split of the training data.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=trial.number + 42)

    for train_idx, valid_idx in cv.split(X_encoded, y):
        X_train_fold, y_train_fold = X_encoded.iloc[train_idx], y.iloc[train_idx]
        X_valid_fold, y_valid_fold = X_encoded.iloc[valid_idx], y.iloc[valid_idx]

        model = lgb.LGBMClassifier(**lgbm_params)
        # Shorter patience (100 rounds) than in the baseline keeps trials fast.
        model.fit(X_train_fold, y_train_fold,
                  eval_set=[(X_valid_fold, y_valid_fold)],
                  eval_metric='auc',
                  callbacks=[lgb.early_stopping(100, verbose=False)])

        preds_valid = model.predict_proba(X_valid_fold)[:, 1]
        cv_scores.append(roc_auc_score(y_valid_fold, preds_valid))

    return np.mean(cv_scores)

print("\nMemulai optimasi hyperparameter dengan Optuna...")
# Create the Optuna study and run the optimisation.
# n_trials is the number of hyperparameter combinations tried: more trials
# take longer but may find better values (20-50 for a first experiment, 100+
# for serious results). The sampler is seeded so a re-run proposes the same
# sequence of trials.
study_lgbm = optuna.create_study(
    direction='maximize',
    study_name='LGBM_AUC_Optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgbm.optimize(objective_lgbm, n_trials=100)

print(f"\nOptimasi selesai. Jumlah trial: {len(study_lgbm.trials)}")
print(f"Best AUC (dari CV Optuna): {study_lgbm.best_value:.5f}")
print("Best hyperparameters:")
for key, value in study_lgbm.best_params.items():
    print(f"  {key}: {value}")

# Final parameter set: the tuned values plus the fixed settings used while tuning.
best_lgbm_params = {**study_lgbm.best_params, **LGBM_FIXED_PARAMS}


# ---------------------------------------------------------------------------
# 5. TRAIN THE FINAL MODEL WITH THE BEST PARAMETERS AND PREDICT
# ---------------------------------------------------------------------------
# One model is trained per fold with `best_lgbm_params`; each fills the
# out-of-fold (OOF) predictions for its held-out part and contributes
# 1/NFOLDS_final of the averaged prediction for the validation set.
print("\nMelatih model LightGBM final dengan parameter terbaik menggunakan Cross-Validation...")
NFOLDS_final = 5 # may be the same as or different from the CV used in Optuna
final_folds = StratifiedKFold(n_splits=NFOLDS_final, shuffle=True, random_state=101) # different seed for the final CV

oof_preds_final_lgbm = np.zeros(X_encoded.shape[0])
sub_preds_final_lgbm = np.zeros(X_test_encoded.shape[0])
models_final_lgbm = [] # keep the model of every fold in case it is needed later

for n_fold, (train_idx, valid_idx) in enumerate(final_folds.split(X_encoded, y)):
    X_train_fold, y_train_fold = X_encoded.iloc[train_idx], y.iloc[train_idx]
    X_valid_fold, y_valid_fold = X_encoded.iloc[valid_idx], y.iloc[valid_idx]

    model_final = lgb.LGBMClassifier(**best_lgbm_params) # use the best parameters
    model_final.fit(X_train_fold, y_train_fold,
                    eval_set=[(X_valid_fold, y_valid_fold)],
                    eval_metric='auc',
                    callbacks=[lgb.early_stopping(100, verbose=False)])

    oof_preds_final_lgbm[valid_idx] = model_final.predict_proba(X_valid_fold)[:, 1]
    sub_preds_final_lgbm += model_final.predict_proba(X_test_encoded)[:, 1] / NFOLDS_final
    models_final_lgbm.append(model_final)
    print(f"Fold {n_fold+1} (Final Model) selesai. OOF AUC Fold: {roc_auc_score(y_valid_fold, oof_preds_final_lgbm[valid_idx]):.5f}")

# AUC over the pooled out-of-fold predictions of all folds.
# NOTE(review): in the recorded run this pooled score (0.75012) is lower than
# every per-fold score (0.793-0.803); presumably the fold models stop at
# different iterations and their probability scales differ, which hurts a
# ranking metric computed across folds - confirm before comparing models on it.
overall_oof_auc_final_lgbm = roc_auc_score(y, oof_preds_final_lgbm)
print(f"\nPelatihan model final LightGBM selesai. Skor CV (OOF AUC) Keseluruhan: {overall_oof_auc_final_lgbm:.5f}")

# ---------------------------------------------------------------------------
# 6. BUILD THE SUBMISSION FILE
# ---------------------------------------------------------------------------
# One row per validation customer with the fold-averaged predicted probability.
submission_df_final_lgbm = pd.DataFrame({
    "customer_number": validation_df_orig["customer_number"],
    "berlangganan_deposito": sub_preds_final_lgbm
})

submission_file_path_final_lgbm = "Validation_Result_Submission_LGBM_Tuned.csv"
submission_df_final_lgbm.to_csv(submission_file_path_final_lgbm, index=False)
print(f"\nFile prediksi LGBM Tuned disimpan sebagai: {submission_file_path_final_lgbm}")

# ---------------------------------------------------------------------------
# 7. DOWNLOAD THE PREDICTION FILE
# ---------------------------------------------------------------------------
# A failure here is only reported, because the CSV has already been written
# to disk.
try:
    files.download(submission_file_path_final_lgbm)
    print(f"\nFile {submission_file_path_final_lgbm} sedang diunduh...")
except Exception as e:
    print(f"Gagal mengunduh file secara otomatis: {e}")
    print("Anda dapat mengunduhnya secara manual dari panel file di sebelah kiri Colab.")

print("\n--- Proses Selesai (LGBM Tuned dengan Optuna dan FE Lanjutan) ---")

# If the overall OOF AUC is already above 0.8, this is a good result.
# If not, consider:
# 1. Increasing n_trials in Optuna.
# 2. Deeper feature engineering (domain-specific features, target encoding, etc.).
# 3. Tuning another model with Optuna (for example XGBoost or CatBoost).
# 4. Ensembling: combining the predictions of several strong models (for example, averaging the tuned LGBM and XGBoost).

Silakan unggah file training_dataset.csv


Saving training_dataset.csv to training_dataset.csv

File training_dataset.csv berhasil diunggah.

Silakan unggah file validation_set.csv


Saving validation_set.csv to validation_set.csv

File validation_set.csv berhasil diunggah.

Dataset berhasil dimuat.

Menerapkan rekayasa fitur...
One-hot encoding selesai.


[I 2025-05-31 08:19:08,624] A new study created in memory with name: LGBM_AUC_Optimization


Penyelarasan kolom selesai.

Memulai optimasi hyperparameter dengan Optuna...


[I 2025-05-31 08:19:13,102] Trial 0 finished with value: 0.7957698338956748 and parameters: {'n_estimators': 1782, 'learning_rate': 0.032667805206895946, 'num_leaves': 34, 'max_depth': 5, 'min_child_samples': 12, 'subsample': 0.817581544607243, 'colsample_bytree': 0.5201015549766175, 'reg_alpha': 0.9980776925205994, 'reg_lambda': 0.8898119786015879}. Best is trial 0 with value: 0.7957698338956748.
[I 2025-05-31 08:19:20,070] Trial 1 finished with value: 0.7960244900705019 and parameters: {'n_estimators': 2439, 'learning_rate': 0.015396314129590732, 'num_leaves': 86, 'max_depth': 4, 'min_child_samples': 12, 'subsample': 0.6385357652020964, 'colsample_bytree': 0.7438322181697076, 'reg_alpha': 0.15501897218317562, 'reg_lambda': 0.6823965371969085}. Best is trial 1 with value: 0.7960244900705019.
[I 2025-05-31 08:19:24,412] Trial 2 finished with value: 0.7967500265833488 and parameters: {'n_estimators': 1181, 'learning_rate': 0.04542893889395177, 'num_leaves': 90, 'max_depth': 4, 'min_chil


Optimasi selesai. Jumlah trial: 100
Best AUC (dari CV Optuna): 0.80090
Best hyperparameters:
  n_estimators: 1768
  learning_rate: 0.007604962028317158
  num_leaves: 69
  max_depth: 6
  min_child_samples: 85
  subsample: 0.7472739461260062
  colsample_bytree: 0.9319063519359018
  reg_alpha: 0.973084128465977
  reg_lambda: 0.6824010817994266

Melatih model LightGBM final dengan parameter terbaik menggunakan Cross-Validation...
Fold 1 (Final Model) selesai. OOF AUC Fold: 0.80285
Fold 2 (Final Model) selesai. OOF AUC Fold: 0.79866
Fold 3 (Final Model) selesai. OOF AUC Fold: 0.79976
Fold 4 (Final Model) selesai. OOF AUC Fold: 0.79418
Fold 5 (Final Model) selesai. OOF AUC Fold: 0.79289

Pelatihan model final LightGBM selesai. Skor CV (OOF AUC) Keseluruhan: 0.75012

File prediksi LGBM Tuned disimpan sebagai: Validation_Result_Submission_LGBM_Tuned.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


File Validation_Result_Submission_LGBM_Tuned.csv sedang diunduh...

--- Proses Selesai (LGBM Tuned dengan Optuna dan FE Lanjutan) ---
