밑의 첫 두개의 코드 블록은 Cardinality, Less important Feature를 구분하기 위해 활용했던 코드입니다. 특히 2번째 블록은 한번 Catboost 모델 학습 이후 알게된 결과값이니, 이 점 참조 바랍니다.

In [11]:
import pandas as pd

def calculate_nunique_chunks(csv_path, chunksize=50000):
    chunk_iter = pd.read_csv(csv_path, chunksize=chunksize)
    unique_counts = {}
    for chunk in chunk_iter:
        for col in chunk.select_dtypes(include='object').columns:
            if col not in unique_counts:
                unique_counts[col] = set()
            unique_counts[col].update(chunk[col].unique())

    unique_counts = {col: len(unique_set) for col, unique_set in unique_counts.items()}
    return unique_counts

if __name__ == "__main__":
    train_path = 'train.csv'
    nunique_counts = calculate_nunique_chunks(train_path)
    print(nunique_counts)

{'ID': 28605391, 'F01': 4760931, 'F02': 304405, 'F03': 64, 'F05': 5343557, 'F07': 151200, 'F08': 79, 'F09': 27551, 'F10': 1404255, 'F12': 4174064, 'F13': 1307, 'F15': 4, 'F16': 15467, 'F17': 10, 'F20': 178603, 'F21': 33, 'F22': 7187, 'F23': 950, 'F25': 10700, 'F26': 2205, 'F28': 55, 'F30': 19444, 'F31': 14, 'F34': 3165581, 'F35': 3, 'F37': 9423, 'F39': 6800}


In [10]:
from catboost import CatBoostClassifier

label = 'Click'
categorical_columns = [
    'F02', 'F03', 'F07', 'F08', 'F09', 'F13', 'F15', 'F16', 'F17',
    'F20', 'F21', 'F22', 'F23', 'F25', 'F26', 'F28', 'F30', 'F31',
    'F35', 'F37', 'F39'
]

best_model_path = "best_catboost_model.cbm"
model = CatBoostClassifier()
model.load_model(best_model_path)

feature_importances = model.get_feature_importance(type='FeatureImportance')
feature_names = model.feature_names_

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print(feature_importance_df)

    Feature  Importance
6      F09   11.044925
15     F20    7.612939
27     F32    6.910839
19     F24    5.379934
12     F17    5.195723
11     F16    5.039283
24     F29    4.950099
0      F02    4.917526
33     F39    4.065120
14     F19    3.389363
30     F36    3.306575
2      F04    2.992188
16     F21    2.894445
31     F37    2.873075
3      F06    2.848689
7      F11    2.574435
20     F25    2.432538
10     F15    2.332472
8      F13    2.321559
26     F31    2.320940
5      F08    1.839616
23     F28    1.807348
4      F07    1.686763
28     F33    1.526051
9      F14    1.455953
1      F03    1.297226
13     F18    1.275427
21     F26    1.099800
22     F27    1.077820
29     F35    0.823446
32     F38    0.552689
18     F23    0.064059
17     F22    0.049730
25     F30    0.041406


데이터 전처리 과정입니다. 결측치는 0으로, Categorical에 대해서는 'NAN'으로 처리했습니다. 그리고 Label Encoding시 Train에 없는 Test class 에 대해서도 'NAN'으로 처리하도록 했습니다.

In [None]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Class가 100만개 이상의 High Cardinal한 것, Feature Importance 낮았던 Feature 제거
columns_to_remove = ['F01', 'F05', 'F10', 'F12', 'F34', 'F23', 'F22', 'F30']

categorical_columns = ['F02', 'F03', 'F07', 'F08', 'F09', 'F13', 'F15', 'F16', 'F17', 'F20', 'F21', 'F25', 'F26', 'F28', 'F31', 'F35', 'F37', 'F39']
numerical_columns = ['F04', 'F06', 'F11', 'F14', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32', 'F33', 'F36', 'F38']

# 메인메모리에 부담을 없애고자 Data Type downsizing 적용
def determine_dtype(series):
    if pd.api.types.is_float_dtype(series):
        min_val, max_val = series.min(), series.max()
        if min_val >= np.finfo(np.float16).min and max_val <= np.finfo(np.float16).max:
            return np.float16
        elif min_val >= np.finfo(np.float32).min and max_val <= np.finfo(np.float32).max:
            return np.float32
        else:
            return np.float64
    elif pd.api.types.is_integer_dtype(series):
        min_val, max_val = series.min(), series.max()
        if min_val >= np.iinfo(np.int8).min and max_val <= np.iinfo(np.int8).max:
            return np.int8
        elif min_val >= np.iinfo(np.int16).min and max_val <= np.iinfo(np.int16).max:
            return np.int16
        elif min_val >= np.iinfo(np.int32).min and max_val <= np.iinfo(np.int32).max:
            return np.int32
        else:
            return np.int64
    else:
        return series.dtype

train_path = 'train.csv'
test_path = 'test.csv'

unique_classes = {col: set() for col in categorical_columns}

#메모리 부족으로, Chunk로 진행
train_chunks = pd.read_csv(train_path, chunksize=1000000)
for chunk in train_chunks:
    for col in categorical_columns:
        #결측치 'NAN'으로 설정
        chunk[col].fillna('NAN', inplace=True)
        unique_classes[col].update(chunk[col].astype(str).unique())

label_encoders = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    label_encoders[col].fit(list(unique_classes[col]) + ['NAN'])

def preprocess_chunk(chunk, label_encoders, is_train=True):
    if is_train:
        chunk.drop(columns=['ID'], inplace=True)
    chunk.drop(columns=columns_to_remove, inplace=True)

    for col in categorical_columns:
        chunk[col].fillna('NAN', inplace=True)

    for col in numerical_columns:
        chunk[col].fillna(0, inplace=True)

    for col in categorical_columns:
        if col in label_encoders:
            chunk[col] = chunk[col].apply(lambda x: x if x in label_encoders[col].classes_ else 'NAN')
            chunk[col] = label_encoders[col].transform(chunk[col].astype(str))

    for col in numerical_columns:
        dtype = determine_dtype(chunk[col])
        if pd.api.types.is_integer_dtype(dtype):
            min_val, max_val = np.iinfo(dtype).min, np.iinfo(dtype).max
            chunk[col] = chunk[col].clip(min_val, max_val)
        elif pd.api.types.is_float_dtype(dtype):
            min_val, max_val = np.finfo(dtype).min, np.finfo(dtype).max
            chunk[col] = chunk[col].clip(min_val, max_val)
            #결측값 0으로 설정
        chunk[col] = pd.to_numeric(chunk[col], errors='coerce').fillna(0).astype(dtype)

    return chunk

train_processed_chunks = []
train_chunks = pd.read_csv(train_path, chunksize=1000000)

for chunk in train_chunks:
    processed_chunk = preprocess_chunk(chunk, label_encoders, is_train=True)
    train_processed_chunks.append(processed_chunk)

train_final = pd.concat(train_processed_chunks)
train_final.to_csv('train_final.csv', index=False)

test_processed_chunks = []
test_chunks = pd.read_csv(test_path, chunksize=1000000)

for chunk in test_chunks:
    processed_chunk = preprocess_chunk(chunk, label_encoders, is_train=False)
    test_processed_chunks.append(processed_chunk)

test_final = pd.concat(test_processed_chunks)
test_final.to_csv('test_final.csv', index=False)


Memory에 대한 부담을 최소화 하기 위해 Train/val을 따로 저장해서, 새로운 Kernal 환경에서 Load해서 활용했습니다

In [None]:
from sklearn.model_selection import train_test_split

train_df = pd.read_csv('train_final.csv')
train, val = train_test_split(train_df, test_size=0.05, stratify=train_df['Click'], random_state=42)
train.to_csv('train_sampled.csv', index=False)
val.to_csv('val_sampled.csv', index=False)

Data Type 축소

In [3]:
def downcast_df(df):
    for col in df.select_dtypes(include=['int']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    
    for col in df.select_dtypes(include=['float']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    
    return df

train_data = pd.read_csv('train_sampled.csv')
val_data = pd.read_csv('val_sampled.csv')

train_data = downcast_df(train_data)
val_data = downcast_df(val_data)

Catboost 공식 사이트에 scale_pos_weight는 Num_negative / Num_positive로 계산하라고 적혀있어, 그대로 적용했습니다

In [4]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

label = 'Click'
categorical_features = ['F02', 'F03', 'F07', 'F08', 'F09', 'F13', 'F15', 'F16', 'F17', 'F20', 'F21', 'F25', 'F26', 'F28', 'F31', 'F35', 'F37', 'F39']

num_positive = train_data[train_data[label] == 1].shape[0]
num_negative = train_data[train_data[label] == 0].shape[0]
scale_pos_weight = num_negative / num_positive

train_pool = Pool(data=train_data.drop(columns=[label]), label=train_data[label], cat_features=categorical_features)
val_pool = Pool(data=val_data.drop(columns=[label]), label=val_data[label], cat_features=categorical_features)

print(f"weight: {scale_pos_weight}")

weight: 4.135746773943293


Optuna로 찾은 Hyperparameter로 학습 진행하였습니다. 다만, 그냥 진행할 경우 Model Shrink도중 메모리 용량 제한으로 인해 kernal이 꺼지게 됩니다. 이는 Category가 많을 때 발생하는데, Model_size_reg를 활용하면 이 문제를 해결할 수 있습니다 (성능 저하는 발생합니다). 데이터의 갯수가 많으므로, boosting type은 Plain으로 설정하여 조금이라도 훈련 속도를 가속화하고자 했고, 혹시나 Kernal이 꺼지게 될 경우가 있어, snapshot을 활용하였습니다. Border Count의 경우 Best Performance를 위해서는 254를 추천하고 있고, 저는 255로 해봤습니다. 

In [6]:
catboost_params = {
    'iterations': 5000,
    'depth': 10,
    'learning_rate': 0.110521996103408,
    'border_count': 255,
    'eval_metric': 'AUC',
    'verbose': True,
    'task_type': 'GPU',
    'devices': '0',
    'l2_leaf_reg': 3.322521996103408,
    'scale_pos_weight': scale_pos_weight,
    'bagging_temperature': 0.1277658067340062,
    'metric_period': 100,
    'save_snapshot': True,
    'snapshot_file': 'catboost_snapshot',
    'random_strength': 5,
    'snapshot_interval': 1200, 
    'use_best_model': True,
    'model_size_reg': 1,
    'boosting_type': 'Plain'
}

model = CatBoostClassifier(**catboost_params)

print("훈련 개시")
model.fit(train_pool, eval_set=val_pool, use_best_model=True, early_stopping_rounds=50)
print("종료")

훈련 개시
0:	test: 0.7268263	best: 0.7268263 (0)	total: 2.25s	remaining: 3h 7m 49s
100:	test: 0.7726049	best: 0.7726049 (100)	total: 3m 24s	remaining: 2h 44m 56s
200:	test: 0.7787056	best: 0.7787056 (200)	total: 6m 36s	remaining: 2h 37m 54s
300:	test: 0.7819455	best: 0.7819455 (300)	total: 9m 47s	remaining: 2h 32m 55s
400:	test: 0.7839791	best: 0.7839791 (400)	total: 13m	remaining: 2h 29m 9s
500:	test: 0.7852241	best: 0.7852241 (500)	total: 16m 6s	remaining: 2h 24m 42s
600:	test: 0.7862383	best: 0.7862383 (600)	total: 19m 27s	remaining: 2h 22m 24s
700:	test: 0.7870005	best: 0.7870005 (700)	total: 22m 34s	remaining: 2h 18m 25s
800:	test: 0.7876433	best: 0.7876433 (800)	total: 25m 42s	remaining: 2h 14m 48s
900:	test: 0.7881540	best: 0.7881540 (900)	total: 28m 52s	remaining: 2h 11m 23s
1000:	test: 0.7885418	best: 0.7885418 (1000)	total: 32m 5s	remaining: 2h 8m 13s
1100:	test: 0.7889055	best: 0.7889055 (1100)	total: 35m 15s	remaining: 2h 4m 50s
1200:	test: 0.7892120	best: 0.7892120 (1200)	tota

In [7]:
model.save_model('catboost_model.cbm')
print("모델 저장")

모델 저장


In [9]:
test_data = pd.read_csv("test_final.csv")

# 문제 발생했을 경우 모델 로드
#model = CatBoostClassifier()
#model.load_model("catboost_model.cbm")
#categorical_features = ['F02', 'F03', 'F07', 'F08', 'F09', 'F13', 'F15', 'F16', 'F17', 'F20', 'F21', 'F25', 'F26', 'F28', 'F31', 'F35', 'F37', 'F39']

test_pool = Pool(data=test_data.drop(columns=['ID']), cat_features=categorical_features)
predictions = model.predict_proba(test_pool)[:, 1]
submission = pd.DataFrame({
    'ID': test_data['ID'],
    'Click': predictions
})
submission.to_csv("submission.csv", index=False)
print("저장 ")

저장 


In [10]:
print(submission)

                   ID     Click
0        TEST_0000000  0.773276
1        TEST_0000001  0.362602
2        TEST_0000002  0.276485
3        TEST_0000003  0.657640
4        TEST_0000004  0.702472
...               ...       ...
4538536  TEST_4538536  0.578357
4538537  TEST_4538537  0.578365
4538538  TEST_4538538  0.105785
4538539  TEST_4538539  0.666890
4538540  TEST_4538540  0.682025

[4538541 rows x 2 columns]
