In [1]:
!pip install catboost -q
!pip install optuna -q

import pandas as pd
import numpy as np
import catboost
from catboost import CatBoostClassifier, Pool, metrics, EShapCalcType, EFeaturesSelectionAlgorithm
import optuna # Untuk optimasi hyperparameter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import KBinsDiscretizer, PolynomialFeatures # Hanya untuk FE
from google.colab import files

# 1. Upload the datasets
def _first_uploaded_name(uploaded, expected_name):
    """Return the first file name in an upload result, failing with a clear message.

    Args:
        uploaded: Mapping returned by ``files.upload()``; its keys are the
            uploaded file names.
        expected_name: File name the user was asked for (used in the error text).

    Raises:
        ValueError: If ``uploaded`` is empty. Without this check an empty
            result (presumably what a dismissed upload dialog produces)
            surfaces as a bare ``StopIteration`` from ``next(iter(...))``.
    """
    if not uploaded:
        raise ValueError(f"Tidak ada file yang diunggah untuk {expected_name}.")
    return next(iter(uploaded))


print("Silakan unggah file training_dataset.csv")
uploaded_train = files.upload()
train_file_name = _first_uploaded_name(uploaded_train, "training_dataset.csv")
print(f"\nFile {train_file_name} berhasil diunggah.")

print("\nSilakan unggah file validation_set.csv")
uploaded_validation = files.upload()
validation_file_name = _first_uploaded_name(uploaded_validation, "validation_set.csv")
print(f"\nFile {validation_file_name} berhasil diunggah.")

# 2. Load the datasets
# The CSVs are read by the names reported by the upload step. Any failure is
# reported to the user and then re-raised so the run stops here.
try:
    train_df_orig = pd.read_csv(train_file_name)
    validation_df_orig = pd.read_csv(validation_file_name)
    print("\nDataset berhasil dimuat.")
except Exception as e:
    print(f"Error saat memuat dataset: {e}")
    raise
# 3. Feature Engineering

def feature_engineer(df_input):
    """Return a copy of ``df_input`` with the engineered feature columns added.

    Added columns:
      * ``pernah_dikontak_sebelumnya``: 0 where ``hari_sejak_kontak_sebelumnya``
        equals 999, otherwise 1.
      * ``hari_sejak_kontak_valid``: the same day count with 999 replaced by -1.
      * ``usia_bin``: ordinal index of a 5-bin quantile discretisation of ``usia``.
      * ``usia_x_jumlah_kontak_kampanye`` and ``kontak_sebelumnya_x_kampanye_ini``:
        element-wise products of the columns named in the code below.
      * Degree-2 polynomial terms of ``usia``, ``jumlah_kontak_kampanye_ini`` and
        ``suku_bunga_euribor_3bln`` (whichever of them exist), added under the
        names reported by ``PolynomialFeatures.get_feature_names_out``. Terms
        whose name already exists in the frame are skipped.
      * ``rasio_harga_kepercayaan_konsumen``: ``indeks_harga_konsumen`` divided
        by the negated ``indeks_kepercayaan_konsumen`` plus ``1e-6``.

    Args:
        df_input: DataFrame containing the raw columns referenced above. It is
            not modified.

    Returns:
        A new DataFrame with the original columns followed by the added ones.

    Note:
        The age discretizer is fitted on the frame passed in, so two frames
        processed by separate calls get bin edges derived from their own
        ``usia`` values. NOTE(review): consider fitting once on the training
        frame and reusing the edges if ``usia_bin`` must mean the same thing
        everywhere.
    """
    df = df_input.copy()

    # 999 is treated as the "no previous contact" marker in the day-count column.
    days_since_contact = df['hari_sejak_kontak_sebelumnya']
    df['pernah_dikontak_sebelumnya'] = (days_since_contact != 999).astype(int)
    df['hari_sejak_kontak_valid'] = days_since_contact.replace(999, -1)

    age_bins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile', subsample=None)
    df['usia_bin'] = age_bins.fit_transform(df[['usia']]).astype(int)

    df['usia_x_jumlah_kontak_kampanye'] = df['usia'] * df['jumlah_kontak_kampanye_ini']
    df['kontak_sebelumnya_x_kampanye_ini'] = df['jumlah_kontak_sebelumnya'] * df['jumlah_kontak_kampanye_ini']

    poly_features_cols = ['usia', 'jumlah_kontak_kampanye_ini', 'suku_bunga_euribor_3bln']
    poly_features_cols_exist = [col for col in poly_features_cols if col in df.columns]
    if poly_features_cols_exist:
        poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
        poly_values = poly.fit_transform(df[poly_features_cols_exist])
        poly_feature_names = poly.get_feature_names_out(poly_features_cols_exist)
        df_poly = pd.DataFrame(poly_values, columns=poly_feature_names, index=df.index)
        # The degree-1 terms repeat columns already in the frame, so only the
        # new terms are copied over. (The former renaming branch could never
        # run: it required an *input* column name containing '^2' or a space.)
        for col in poly_feature_names:
            if col not in df.columns:
                df[col] = df_poly[col]

    # The sign of the confidence index is flipped before dividing; 1e-6 keeps
    # the denominator away from an exact zero when the index itself is 0.
    df['rasio_harga_kepercayaan_konsumen'] = df['indeks_harga_konsumen'] / (df['indeks_kepercayaan_konsumen'] * -1 + 1e-6)
    return df

print("\nMenerapkan rekayasa fitur...")
train_df_fe = feature_engineer(train_df_orig)
validation_df_fe = feature_engineer(validation_df_orig)

# Separate the target from the predictors; the customer id is not a feature.
X = train_df_fe.drop(["customer_number", "berlangganan_deposito"], axis=1)
y = train_df_fe["berlangganan_deposito"]
X_test = validation_df_fe.drop("customer_number", axis=1)

# Restrict both frames to their shared columns first (training-frame order),
# so the categorical handling below never touches a column that exists on
# only one side. `.copy()` gives independent frames that are safe to assign into.
common_cols = X.columns.intersection(X_test.columns).tolist()
X = X[common_cols].copy()
X_test = X_test[common_cols].copy()

# Identify the categorical features for CatBoost (AFTER feature engineering).
categorical_features_names = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Replace missing categorical values with a placeholder, then cast to string.
# The fill must come before the cast: `astype(str)` turns NaN into the literal
# string 'nan', after which `fillna` has nothing left to fill. Going through
# `object` first lets the placeholder be inserted into `category` columns too.
for col in categorical_features_names:
    X[col] = X[col].astype(object).fillna('NA_CAT').astype(str)
    X_test[col] = X_test[col].astype(object).fillna('NA_CAT').astype(str)

# Positional indices of the categorical features, passed to CatBoost in `fit`.
cat_features_indices = [X.columns.get_loc(col) for col in categorical_features_names]

print(f"Fitur kategorikal yang akan digunakan CatBoost (indeks): {cat_features_indices}")
print(f"Nama fitur kategorikal: {[X.columns[i] for i in cat_features_indices]}")

# 4. OPTIMASI HYPERPARAMETER DENGAN OPTUNA UNTUK CATBOOST
def objective_catboost(trial):
    """Optuna objective: mean ROC AUC of CatBoost over 5 stratified folds.

    Reads the module-level ``X``, ``y`` and ``cat_features_indices``.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to derive
            the fold-shuffling seed (``trial.number + 42``).

    Returns:
        The mean of the per-fold validation ROC AUC scores.
    """
    # Values sampled by the trial; the order of the suggest calls is kept
    # stable on purpose.
    sampled_params = {
        'iterations': trial.suggest_int('iterations', 500, 2500),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10.0),  # L2 regularisation
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),  # splits for numeric features
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'od_wait': trial.suggest_int('od_wait', 20, 100),  # early-stopping patience
    }
    # Settings that are identical for every trial.
    fixed_params = {
        'od_type': 'Iter',  # early-stopping type
        'random_seed': 42,
        'verbose': 0,  # silent mode
    }
    cb_params = {**sampled_params, **fixed_params}

    # Weight the positive class by the class-0 / class-1 count ratio of `y`
    # (an alternative would be CatBoost's `auto_class_weights`).
    class_counts = y.value_counts()
    if class_counts.get(1, 0) > 0:
        cb_params['scale_pos_weight'] = class_counts[0] / class_counts[1]

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=trial.number + 42)

    def _fold_auc(fit_rows, holdout_rows):
        """Fit on one fold's training rows and return the AUC on its held-out rows."""
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        fold_model = CatBoostClassifier(**cb_params)
        fold_model.fit(
            X_fit, y_fit,
            eval_set=[(X_holdout, y_holdout)],
            cat_features=cat_features_indices,
            early_stopping_rounds=cb_params['od_wait'],  # same patience the trial sampled
            verbose=0,
        )
        positive_scores = fold_model.predict_proba(X_holdout)[:, 1]
        return roc_auc_score(y_holdout, positive_scores)

    fold_aucs = [_fold_auc(fit_rows, holdout_rows) for fit_rows, holdout_rows in splitter.split(X, y)]
    return np.mean(fold_aucs)

print("\nMemulai optimasi hyperparameter CatBoost dengan Optuna...")
study_catboost = optuna.create_study(direction='maximize', study_name='CatBoost_AUC_Optimization')
# 10 trials is a quick first pass; raise n_trials for a broader search.
study_catboost.optimize(objective_catboost, n_trials=10)

print(f"\nOptimasi CatBoost selesai. Jumlah trial: {len(study_catboost.trials)}")
print(f"Best AUC (dari CV Optuna): {study_catboost.best_value:.5f}")
print("Best hyperparameters CatBoost:")
best_catboost_params = study_catboost.best_params
for key, value in best_catboost_params.items():
    print(f"  {key}: {value}")

# Re-attach the settings that were fixed (not sampled) during tuning.
# `best_params` only holds the sampled values, so they have to be added back.
# `od_type='Iter'` matters: the objective trained with iteration-based early
# stopping, and per the CatBoost documentation the default detector
# (`IncToDec` with `od_pval=0`) is switched off, which would leave the tuned
# `od_wait` without the effect it had during tuning.
best_catboost_params['od_type'] = 'Iter'
best_catboost_params['random_seed'] = 42
best_catboost_params['verbose'] = 0
# Same positive-class weighting as in the objective: class-0 / class-1 counts.
counts = y.value_counts()
if counts.get(1, 0) > 0:
    best_catboost_params['scale_pos_weight'] = counts[0] / counts[1]

# 5. Train the final CatBoost models with the best parameters

print("\nMelatih model CatBoost final dengan parameter terbaik menggunakan Cross-Validation...")
NFOLDS_final = 5
final_folds = StratifiedKFold(n_splits=NFOLDS_final, shuffle=True, random_state=101)

# Out-of-fold predictions for the training rows, fold-averaged predictions for
# the validation set, and the fitted model of every fold.
oof_preds_final_cb = np.zeros(X.shape[0])
sub_preds_final_cb = np.zeros(X_test.shape[0])
models_final_cb = []

for n_fold, (train_idx, valid_idx) in enumerate(final_folds.split(X, y)):
    X_train_fold = X.iloc[train_idx]
    y_train_fold = y.iloc[train_idx]
    X_valid_fold = X.iloc[valid_idx]
    y_valid_fold = y.iloc[valid_idx]

    model_final = CatBoostClassifier(**best_catboost_params)
    model_final.fit(
        X_train_fold,
        y_train_fold,
        eval_set=[(X_valid_fold, y_valid_fold)],
        cat_features=cat_features_indices,
        verbose=0,
    )

    # Positive-class probabilities for the held-out rows of this fold.
    oof_preds_final_cb[valid_idx] = model_final.predict_proba(X_valid_fold)[:, 1]

    # Every fold model scores the whole X_test and contributes 1/NFOLDS_final
    # of the final prediction.
    fold_test_preds = model_final.predict_proba(X_test)[:, 1]
    sub_preds_final_cb += fold_test_preds / NFOLDS_final

    models_final_cb.append(model_final)

    fold_auc = roc_auc_score(y_valid_fold, oof_preds_final_cb[valid_idx])
    print(f"Fold {n_fold + 1} (Final Model CatBoost) selesai. OOF AUC Fold: {fold_auc:.5f}")

# AUC over all out-of-fold predictions combined.
overall_oof_auc_final_cb = roc_auc_score(y, oof_preds_final_cb)
print(f"\nPelatihan model final CatBoost selesai. Skor CV (OOF AUC) Keseluruhan: {overall_oof_auc_final_cb:.5f}")

# 6. Build the submission file

submission_file_path_final_cb = "DCM_DMU_2025_Model_Aryakk_Team.csv"

# One row per validation customer: the id column from the uploaded validation
# set next to the fold-averaged predicted probability.
DCM_DMU_2025_Model_Aryakk_Team = validation_df_orig[["customer_number"]].copy()
DCM_DMU_2025_Model_Aryakk_Team["berlangganan_deposito"] = sub_preds_final_cb

DCM_DMU_2025_Model_Aryakk_Team.to_csv(submission_file_path_final_cb, index=False)
print(f"\nFile prediksi CatBoost Tuned disimpan sebagai: {submission_file_path_final_cb}")

# 7. Download the file

# The download is best-effort: if it fails, the user is pointed to the Colab
# file panel instead of the run being aborted.
try:
    files.download(submission_file_path_final_cb)
except Exception as e:
    print(f"Gagal mengunduh file secara otomatis: {e}")
    print("Anda dapat mengunduhnya secara manual dari panel file di sebelah kiri Colab.")
else:
    print(f"\nFile {submission_file_path_final_cb} sedang diunduh...")

print("\n Proses Selesai ")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hSilakan unggah file training_dataset.csv


Saving training_dataset.csv to training_dataset.csv

File training_dataset.csv berhasil diunggah.

Silakan unggah file validation_set.csv


[I 2025-06-02 03:42:01,257] A new study created in memory with name: CatBoost_AUC_Optimization


Saving validation_set.csv to validation_set.csv

File validation_set.csv berhasil diunggah.

Dataset berhasil dimuat.

Menerapkan rekayasa fitur...
Fitur kategorikal yang akan digunakan CatBoost (indeks): [1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 19]
Nama fitur kategorikal: ['pekerjaan', 'status_perkawinan', 'pendidikan', 'gagal_bayar_sebelumnya', 'pinjaman_rumah', 'pinjaman_pribadi', 'jenis_kontak', 'bulan_kontak_terakhir', 'hari_kontak_terakhir', 'hasil_kampanye_sebelumnya', 'pulau']

Memulai optimasi hyperparameter CatBoost dengan Optuna...


[I 2025-06-02 03:43:02,669] Trial 0 finished with value: 0.7959242799406081 and parameters: {'iterations': 893, 'learning_rate': 0.022999871411426905, 'depth': 5, 'l2_leaf_reg': 6.721464103114385, 'random_strength': 0.003327382222409437, 'border_count': 118, 'bagging_temperature': 0.9477353106914485, 'od_wait': 57}. Best is trial 0 with value: 0.7959242799406081.
[I 2025-06-02 03:44:16,624] Trial 1 finished with value: 0.795727248266794 and parameters: {'iterations': 2021, 'learning_rate': 0.03607831015914568, 'depth': 6, 'l2_leaf_reg': 8.523140809915933, 'random_strength': 3.0451751826837063, 'border_count': 232, 'bagging_temperature': 0.12999745614657687, 'od_wait': 21}. Best is trial 0 with value: 0.7959242799406081.
[I 2025-06-02 03:47:42,103] Trial 2 finished with value: 0.7945757404748379 and parameters: {'iterations': 1961, 'learning_rate': 0.0065743747850618055, 'depth': 4, 'l2_leaf_reg': 7.539475442632465, 'random_strength': 0.3815052458461703, 'border_count': 255, 'bagging_te


Optimasi CatBoost selesai. Jumlah trial: 10
Best AUC (dari CV Optuna): 0.79832
Best hyperparameters CatBoost:
  iterations: 2411
  learning_rate: 0.020884780704458226
  depth: 8
  l2_leaf_reg: 0.6829142059046605
  random_strength: 0.003672521164924202
  border_count: 215
  bagging_temperature: 0.29700182132750363
  od_wait: 83

Melatih model CatBoost final dengan parameter terbaik menggunakan Cross-Validation...
Fold 1 (Final Model CatBoost) selesai. OOF AUC Fold: 0.80224
Fold 2 (Final Model CatBoost) selesai. OOF AUC Fold: 0.79356
Fold 3 (Final Model CatBoost) selesai. OOF AUC Fold: 0.80289
Fold 4 (Final Model CatBoost) selesai. OOF AUC Fold: 0.79665
Fold 5 (Final Model CatBoost) selesai. OOF AUC Fold: 0.79103

Pelatihan model final CatBoost selesai. Skor CV (OOF AUC) Keseluruhan: 0.79715

File prediksi CatBoost Tuned disimpan sebagai: DCM_DMU_2025_Model_Aryakk_Team.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


File DCM_DMU_2025_Model_Aryakk_Team.csv sedang diunduh...

 Proses Selesai 
