In [112]:
from __future__ import annotations
import pandas as pd
import importlib
import Alpha
importlib.reload(Alpha)
from Alpha import merge_df, completed

import numpy as np
import optuna

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from optuna.samplers import TPESampler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# --------------------
# Load
# --------------------
train_raw = pd.read_csv("C:\\Users\\solba\\dacon-project\\data\\raw\\train.csv")
test_raw  = pd.read_csv("C:\\Users\\solba\\dacon-project\\data\\raw\\test.csv")

train_df = merge_df(train_raw)
test_df  = merge_df(test_raw)
train_df = completed(train_raw, train_df)

train_df = train_df.sort_values("ID").reset_index(drop=True)
test_df  = test_df.sort_values("ID").reset_index(drop=True)

In [18]:
total_df1 = train_df.drop(columns=["ID", "completed"])
total_df2 = test_df.drop(columns=["ID"])

In [27]:
train_data = total_df1
test_data = total_df2

In [20]:
target = train_df["completed"]

In [21]:
cat_cols = [
    "school1","job","nationality","High Tech","Data Friendly","Others",
    "hope_for_group","desired_career_path",
    "incumbents_level","incumbents_lecture","incumbents_company_level",
    "incumbents_lecture_type","incumbents_lecture_scale",
]
num_cols = ["count","time_input","want_count"]

In [22]:
len(cat_cols) + len(num_cols)

16

In [23]:
for col in num_cols:
    q3 = train_data[col].quantile(0.75)
    q1 = train_data[col].quantile(0.25)
    iqr = q3 - q1
    outlier_idx = train_data[col][(train_data[col]<q1 - 1.5*iqr)|(train_data[col]>q3 + 1.5*iqr)].index
    train_data.drop(outlier_idx, inplace=True)
    target.drop(outlier_idx, inplace=True)

In [25]:
print(train_data.shape, target.shape)

(700, 16) (700,)


In [26]:
column_names_to_normalize = num_cols
x = train_data[column_names_to_normalize].values
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)
train_data_temp = pd.DataFrame(x_scaled, columns = column_names_to_normalize, index = train_data.index)
train_data[column_names_to_normalize] = train_data_temp

In [28]:
column_names_to_normalize = num_cols
x = test_data[column_names_to_normalize].values

test_scaled =  scaler.transform(x)
test_temp = pd.DataFrame(test_scaled, columns = column_names_to_normalize, index = test_data.index)
test_data[column_names_to_normalize] = test_temp

In [31]:
X_train, X_validation, y_train, y_validation = train_test_split(train_data, target, train_size = 0.7, random_state = 42)

In [32]:
train_pool  = Pool(X_train, y_train, cat_features=cat_cols)
eval_pool = Pool(X_validation, y_validation, cat_features=cat_cols)
test_pool = Pool(data = test_data, cat_features=cat_cols)

In [35]:
sampler = TPESampler(seed = 10)

In [230]:
# 함수 정의
def objective(trial):

    param = {
      "random_state" : 42,
      'learning_rate' : trial.suggest_uniform('learning_rate', 0.01, 0.2),
    } 
    model = CatBoostClassifier(**param)
    f1_list = []
    kf = KFold(n_splits=10)
    for tr_index,val_index in kf.split(train_data):
        X_train, y_train = train_data.iloc[tr_index], target.iloc[tr_index]
        X_valid , y_valid = train_data.iloc[val_index], target.iloc[val_index]
        model = model.fit(X_train,y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)],
                           verbose=False, early_stopping_rounds=35)                         
        f1_list.append(f1_score(y_valid, model.predict(X_valid),average='macro'))
    return np.mean(f1_list)

In [231]:
optuna_cbrm = optuna.create_study(direction="maximize", sampler=sampler)
optuna_cbrm.optimize(objective, n_trials = 30)

[32m[I 2026-01-27 14:27:24,962][0m A new study created in memory with name: no-name-eaa1c9df-1ae6-4099-82d0-471f584b8063[0m
  'learning_rate' : trial.suggest_uniform('learning_rate', 0.01, 0.2),
[32m[I 2026-01-27 14:27:25,966][0m Trial 0 finished with value: 0.4979507932070786 and parameters: {'learning_rate': 0.1550157115572994}. Best is trial 0 with value: 0.4979507932070786.[0m
  'learning_rate' : trial.suggest_uniform('learning_rate', 0.01, 0.2),
[32m[I 2026-01-27 14:27:26,916][0m Trial 1 finished with value: 0.4854137698640318 and parameters: {'learning_rate': 0.17683836211772308}. Best is trial 0 with value: 0.4979507932070786.[0m
  'learning_rate' : trial.suggest_uniform('learning_rate', 0.01, 0.2),
[32m[I 2026-01-27 14:27:27,983][0m Trial 2 finished with value: 0.45216673318641504 and parameters: {'learning_rate': 0.08932673732946071}. Best is trial 0 with value: 0.4979507932070786.[0m
  'learning_rate' : trial.suggest_uniform('learning_rate', 0.01, 0.2),
[32m[I 20

In [232]:
cbrm_trial = optuna_cbrm.best_trial
cbrm_trial_params = cbrm_trial.params

In [233]:
cbrm_trial_params

{'learning_rate': 0.1550157115572994}

In [234]:
#Optuna에서 가져온 최적의 파라미터들로 모델 학습
params = {
          'learning_rate': 0.1550157115572994,
          'eval_metric':'AUC',
          'early_stopping_rounds':50,
          'use_best_model': True,
          'random_seed': 42,
          'auto_class_weights':'Balanced',
          'verbose':200}
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=eval_pool,use_best_model=True)

0:	test: 0.5672049	best: 0.5672049 (0)	total: 8.5ms	remaining: 8.49s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.5864870424
bestIteration = 1

Shrink model to first 2 iterations.


<catboost.core.CatBoostClassifier at 0x1ec4e2ab510>

In [235]:
pred = model.predict(eval_pool)
print(classification_report(y_validation,pred,digits=5))

              precision    recall  f1-score   support

           0    0.74227   0.50350   0.60000       143
           1    0.37719   0.63235   0.47253        68

    accuracy                        0.54502       211
   macro avg    0.55973   0.56792   0.53626       211
weighted avg    0.62461   0.54502   0.55892       211



In [248]:
# ----------------제출용-----------------------------
mypredictions = model.predict(test_data)

In [249]:
ss = pd.read_csv('C:\\Users\\solba\\dacon-project\\result\\sample_submission.csv',header=0)
ss['completed'] = mypredictions

In [250]:
ss

Unnamed: 0,ID,completed
0,TEST_000,0
1,TEST_001,0
2,TEST_002,0
3,TEST_003,1
4,TEST_004,0
...,...,...
809,TEST_809,0
810,TEST_810,0
811,TEST_811,0
812,TEST_812,1


In [251]:
ss.to_csv('My_submission.csv',index=False)