In [3]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import gc

In [4]:
# Load the pre-aggregated train and test frames from gzip-compressed pickles.
train = pd.read_pickle(
    "../data/train_agg.pkl",
    compression="gzip",
)
test = pd.read_pickle(
    "../data/test_agg.pkl",
    compression="gzip",
)

### 범주형 변수 분류

In [5]:
# Base names of the categorical features.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68"
]
# The columns actually encoded (and later passed to the model) are the
# "<name>_last" variants of the base names above.
cat_features = [f"{cf}_last" for cf in cat_features]

# Original author's note: the D_63 / D_64 "_last" values are category-typed,
# hence the label encoding.
label_en = LabelEncoder()
for cf in cat_features:
    # Learn ONE value -> integer mapping per column from the union of the train
    # and test values. Calling fit_transform separately on each frame gives
    # each frame its own mapping, so a given category can receive a different
    # integer in `test` than it had in `train` whenever the two frames do not
    # contain exactly the same set of values.
    label_en.fit(pd.concat([train[cf], test[cf]], ignore_index=True))
    train[cf] = label_en.transform(train[cf])
    test[cf] = label_en.transform(test[cf])

### 데이터 분류

In [6]:
# Separate the label column from the remaining columns of `train`.
y = train["target"]
X = train.drop(columns="target")

In [7]:
# Split features/labels into a fitting part (x1, y1) and a held-out part
# (x2, y2). No split size is given, so the library default applies; the fixed
# seed keeps the split reproducible.
x1, x2, y1, y2 = train_test_split(
    X,
    y,
    random_state=2022,
)

### 모델 학습

In [8]:
# Configure the classifier: up to 5000 boosting iterations, fixed seed.
clf = CatBoostClassifier(
    iterations=5000,
    random_state=2022,
    nan_mode='Min',
)

# Fit on (x1, y1) while tracking the loss on the held-out (x2, y2) pair;
# a progress line is printed every 50 iterations.
clf.fit(
    x1,
    y1,
    eval_set=[(x2, y2)],
    cat_features=cat_features,
    verbose=50,
)

# Second column of the predicted probabilities for the held-out rows
# (presumably the positive class -- confirm against clf.classes_).
preds = clf.predict_proba(x2)[:, 1]

Learning rate set to 0.066595
0:	learn: 0.6164535	test: 0.6162028	best: 0.6162028 (0)	total: 423ms	remaining: 35m 13s
50:	learn: 0.2379742	test: 0.2368111	best: 0.2368111 (50)	total: 19.1s	remaining: 30m 48s
100:	learn: 0.2283090	test: 0.2279880	best: 0.2279880 (100)	total: 37.6s	remaining: 30m 23s
150:	learn: 0.2242458	test: 0.2247777	best: 0.2247777 (150)	total: 56.5s	remaining: 30m 14s
200:	learn: 0.2211917	test: 0.2226610	best: 0.2226610 (200)	total: 1m 15s	remaining: 29m 52s
250:	learn: 0.2184978	test: 0.2211792	best: 0.2211792 (250)	total: 1m 33s	remaining: 29m 21s
300:	learn: 0.2161886	test: 0.2202339	best: 0.2202339 (300)	total: 1m 51s	remaining: 28m 56s
350:	learn: 0.2141665	test: 0.2195701	best: 0.2195701 (350)	total: 2m 9s	remaining: 28m 32s
400:	learn: 0.2123434	test: 0.2191675	best: 0.2191675 (400)	total: 2m 26s	remaining: 28m 4s
450:	learn: 0.2106073	test: 0.2187503	best: 0.2187503 (450)	total: 2m 44s	remaining: 27m 42s
500:	learn: 0.2088656	test: 0.2184223	best: 0.218422

### 데이터 추출

In [9]:
# Read the submission file; its 'customer_ID' and 'prediction' columns are
# used by the blending step.
sub = pd.read_csv(
    "../data/amex-default-prediction/sample_submission.csv"
)

In [10]:
# Model probabilities for the test rows (second column of predict_proba).
y_preds = clf.predict_proba(test)[:, 1]

# Predictions and customer ids already stored in the submission file.
y_preds_ = sub['prediction'].to_numpy()
cust_id = sub['customer_ID'].to_numpy()

# The blend below is purely positional, so both vectors must have one entry
# per test row. Fail loudly instead of letting a length-1 array broadcast.
if y_preds_.shape[0] != y_preds.shape[0]:
    raise ValueError(
        f"prediction length mismatch: submission file has {y_preds_.shape[0]} rows, "
        f"model produced {y_preds.shape[0]}"
    )
# NOTE(review): assumes the rows of `sub` are in the same customer order as
# the rows of `test` -- confirm, otherwise align on customer_ID first.

# Weighted average: 95.5% existing submission, 4.5% this model.
blend_preds = (y_preds_ * 0.955 + y_preds * 0.045)

In [12]:
# Write the blended predictions keyed by the index of `test`.
# The values go into a standalone Series rather than a new column on `test`:
# adding a "prediction" column to the feature frame would make a re-run of the
# prediction cell feed that extra column to the model.
submission = pd.Series(blend_preds, index=test.index, name="prediction")
submission.to_csv("submission4.csv", index=True)