In [1]:
import pandas as pd
import numpy as np
import warnings
import gc

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from utils.reduce_memory import trainform_columns_type

from catboost import CatBoostClassifier
import optuna
from optuna.samplers import TPESampler

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### 단일 값 가지는 컬럼 & NULL 값 가지는 컬럼 모두 제거.

In [2]:
train_df = pd.read_parquet('./data/train/train_filtered.parquet')
test_df = pd.read_parquet('./data/test/test_filtered.parquet')

# reduce memory by chaing data types of columns
train_df = trainform_columns_type(train_df)
test_df = trainform_columns_type(test_df)

print('shape of train dataset(drop colums)', train_df.shape)
print('shape of test dataset(drop colums)', test_df.shape)

shape of train dataset(drop colums) (2400000, 743)
shape of test dataset(drop colums) (600000, 742)


In [3]:
# 추가
train_df = train_df[train_df['기준년월'] == 201812]
test_df = test_df[test_df['기준년월'] == 201812]

In [4]:
train_df

Unnamed: 0,기준년월,ID,남녀구분코드,연령,Segment,회원여부_이용가능,회원여부_이용가능_CA,회원여부_이용가능_카드론,소지여부_신용,소지카드수_유효_신용,...,변동률_RV일시불평잔,변동률_할부평잔,변동률_CA평잔,변동률_RVCA평잔,변동률_카드론평잔,변동률_잔액_B1M,변동률_잔액_일시불_B1M,변동률_잔액_CA_B1M,혜택수혜율_R3M,혜택수혜율_B0M
2000000,201812,TRAIN_000000,2,40대,D,1,1,0,1,1,...,0.999998,0.591302,1.001020,0.999998,0.999998,-0.057150,0.061048,0.000000,0.878859,1.398627
2000001,201812,TRAIN_000001,1,30대,E,1,1,1,1,1,...,0.965251,0.901252,0.999998,0.999998,0.999998,-0.033906,-0.020131,0.000000,0.000000,0.000000
2000002,201812,TRAIN_000002,1,30대,C,1,1,0,1,1,...,1.005795,0.585823,0.997353,0.000000,0.999998,-0.097278,-0.076351,-0.115879,0.187467,-1.198788
2000003,201812,TRAIN_000003,2,40대,D,1,1,0,1,2,...,0.999998,0.774731,1.003519,0.999998,0.999998,0.142766,0.090599,0.000000,0.781401,1.282494
2000004,201812,TRAIN_000004,2,40대,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,2,70대이상,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,
2399996,201812,TRAIN_399996,2,50대,D,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.921733,-0.203251,-0.159143,0.000000,1.377071,2.533815
2399997,201812,TRAIN_399997,1,30대,C,1,1,0,1,1,...,0.999998,0.345027,0.999998,0.999998,0.999998,0.027319,0.126581,0.000000,0.000000,0.000000
2399998,201812,TRAIN_399998,1,40대,E,1,1,1,1,1,...,0.999998,0.999998,0.999998,0.999998,0.999998,0.000000,0.000000,0.000000,,


In [5]:
# remove columns to have null values
null_cols = train_df.columns[train_df.isnull().any()]
train_df = train_df.drop(columns=null_cols)          
test_df = test_df.drop(columns=null_cols, errors='ignore')
print('shape of train dataset(drop colums to have null values)', train_df.shape)
print('shape of test dataset(drop colums to have null values)', test_df.shape)

shape of train dataset(drop colums to have null values) (400000, 712)
shape of test dataset(drop colums to have null values) (100000, 711)


In [6]:
train_df.dtypes

기준년월                int32
ID                 object
남녀구분코드              int32
연령                 object
Segment            object
                   ...   
변동률_RVCA평잔        float32
변동률_카드론평잔         float32
변동률_잔액_B1M        float32
변동률_잔액_일시불_B1M    float32
변동률_잔액_CA_B1M     float32
Length: 712, dtype: object

## Split X, y in train datasets 분리

In [7]:
# X: ID, Segement 제거
# y: Segment 추출
feature_cols = [col for col in train_df.columns if col not in ["ID", "Segment", '기준년월']]

X = train_df[feature_cols].copy() 
y = train_df["Segment"].copy() 
X_test = test_df.copy()

In [8]:
# Target Label Encoding
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
continous_features = X.select_dtypes(include=['float32', 'int32']).columns.tolist()


encoders = {}  # categorical encoders 저장
scalers = {}  #  continous encoders 저장
for col in categorical_features:
    le_train = LabelEncoder()
    X[col] = le_train.fit_transform(X[col])
    encoders[col] = le_train
    unseen_labels_val = set(X_test[col]) - set(le_train.classes_)
    if unseen_labels_val:
        le_train.classes_ = np.append(le_train.classes_, list(unseen_labels_val))
    X_test[col] = le_train.transform(X_test[col])
    
for col in continous_features:
    std_train = StandardScaler()
    X[col] = std_train.fit_transform(X[[col]])
    scalers[col] = std_train
    X_test[col] = std_train.transform(X_test[[col]])

In [9]:
gc.collect()

0

In [10]:
classes = np.unique(y_encoded)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_encoded)
class_weight_dict = dict(zip(classes, weights))

In [11]:
class_weight_dict

{0: 493.82716049382714,
 1: 3333.3333333333335,
 2: 3.7620503174229953,
 3: 1.3744051402752246,
 4: 0.2497330977517778}

In [12]:
sample_weights = np.array([class_weight_dict[y] for y in y_encoded])
sample_weights

array([1.37440514, 0.2497331 , 3.76205032, ..., 3.76205032, 0.2497331 ,
       0.2497331 ])

In [13]:

kf = KFold(n_splits=5, shuffle=True, random_state=42)

def catboost_objective(trial, kfold=None):
    params = {
        "iterations": trial.suggest_int("iterations", 1000, 10000, step=1000),  # 전체 학습 횟수
        "depth": trial.suggest_int("depth", 4, 10),                             # 트리 깊이
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 1.0),      # 학습률
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 1e2, log=True),# L2 정규화
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli"]),  # 부트스트랩 방식
        "grow_policy": trial.suggest_categorical("grow_policy", ["SymmetricTree", "Depthwise", "Lossguide"]),  # 트리 확장 방식
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100)    # 하나의 리프에 들어갈 최소 데이터 수
    }

    # 부트스트랩 방식에 따른 추가 조건
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1.0)

    scores = []
    
    # K-Fold
    if kfold != None:
        for train_idx, valid_idx in kf.split(X):
            X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
            y_train, y_valid = y_encoded[train_idx], y_encoded[valid_idx]

            model = CatBoostClassifier(**params,
                                        task_type="GPU",
                                         verbose=0,
                                         cat_features=categorical_features,
                                         random_seed=0)
            
            model.fit(X_train, y_train,
                      sample_weight=sample_weights[train_idx], 
                      early_stopping_rounds=40)
            preds = model.predict(X_valid)
            f1 = f1_score(y_valid, preds, average='macro')
            scores.append(f1)
            
        return np.mean(scores)

# optuna    
cat_study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
cat_study.optimize(lambda trial: catboost_objective(trial, kf), n_trials=30)

# best hyperparameter
best_cat_params = cat_study.best_params
print(f"Best CatBoost Parameters: {best_cat_params}")

[I 2025-04-07 17:08:10,117] A new study created in memory with name: no-name-8ce7adcd-7d9f-4dae-a2c8-70e8298648d0
[I 2025-04-07 17:17:26,811] Trial 0 finished with value: 0.5275858029262611 and parameters: {'iterations': 4000, 'depth': 10, 'learning_rate': 0.7322619478695936, 'l2_leaf_reg': 0.9846738873614566, 'bootstrap_type': 'Bayesian', 'grow_policy': 'Depthwise', 'min_data_in_leaf': 71, 'bagging_temperature': 0.20584494295802447}. Best is trial 0 with value: 0.5275858029262611.
[I 2025-04-07 18:03:10,874] Trial 1 finished with value: 0.5197287567184719 and parameters: {'iterations': 10000, 'depth': 9, 'learning_rate': 0.21312677156759788, 'l2_leaf_reg': 0.008111941985431923, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 62, 'subsample': 0.22554447458683766}. Best is trial 0 with value: 0.5275858029262611.
[I 2025-04-07 18:09:00,082] Trial 2 finished with value: 0.5348131842596902 and parameters: {'iterations': 3000, 'depth': 6, 'learning_rate': 

KeyboardInterrupt: 

In [13]:
X_test.drop(columns=['ID'],inplace=True)

In [14]:
n_classes = len(np.unique(y_encoded))
test_probabilities = np.zeros((len(X_test), n_classes)) # (600000, 5)

for model in models:
    test_probabilities += model.predict_proba(X_test) # (600000, 5)

test_probabilities /= len(models)
test_predictions = np.argmax(test_probabilities, axis=1)

print('Soft Voting Inference Done.')

Soft Voting Inference Done.
