In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import gc

In [2]:
# Load the train and test tables from gzip-compressed pickle files.
train = pd.read_pickle(
    "../data/train_agg.pkl",
    compression="gzip",
)
test = pd.read_pickle(
    "../data/test_agg.pkl",
    compression="gzip",
)

### 범주형 변수 분류

In [3]:
# Base names of the categorical features; the columns actually used carry a
# "_last" suffix, which is appended below.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68"
]
cat_features = [f"{cf}_last" for cf in cat_features]

# Label-encode every categorical column.
# The encoder is fitted once per column on the train and test values combined,
# so a given category always maps to the same integer code in both frames.
# Fitting separately on each frame lets the codes diverge whenever the two
# frames do not contain exactly the same set of categories, which would make
# the model read test categories under the wrong code.
# (Original author's note: the D_63 / D_64 "last" values are category dtype,
# hence the label encoding.)
label_en = LabelEncoder()
for cf in cat_features:
    label_en.fit(pd.concat([train[cf], test[cf]], ignore_index=True))
    train[cf] = label_en.transform(train[cf])
    test[cf] = label_en.transform(test[cf])

### 데이터 분류

In [4]:
# Separate the label column from the remaining feature columns.
y = train["target"]
X = train.drop(columns='target')

In [5]:
# Split features and labels into a fitting part (x1, y1) and a hold-out part
# (x2, y2); the seed is fixed so the split is repeatable.
x1, x2, y1, y2 = train_test_split(
    X,
    y,
    random_state=2022,
)

### 모델 학습

In [6]:
# Hyper-parameters for the classifier, kept together so they are easy to tune.
catboost_params = dict(
    iterations=20000,
    random_state=2022,
    nan_mode='Min',
)
clf = CatBoostClassifier(**catboost_params)

# Fit on (x1, y1) while evaluating on the hold-out pair (x2, y2);
# progress is logged every 50 iterations.
clf.fit(
    x1,
    y1,
    cat_features=cat_features,
    eval_set=[(x2, y2)],
    verbose=50,
)

# Second column of predict_proba on the hold-out rows
# (presumably the positive-class probability -- confirm against class order).
preds = clf.predict_proba(x2)[:, 1]

Learning rate set to 0.036437
0:	learn: 0.6498290	test: 0.6496878	best: 0.6496878 (0)	total: 548ms	remaining: 3h 2m 35s
50:	learn: 0.2545931	test: 0.2529470	best: 0.2529470 (50)	total: 22.8s	remaining: 2h 28m 42s
100:	learn: 0.2364406	test: 0.2352766	best: 0.2352766 (100)	total: 43.5s	remaining: 2h 22m 53s
150:	learn: 0.2306272	test: 0.2299752	best: 0.2299752 (150)	total: 1m 4s	remaining: 2h 20m 45s
200:	learn: 0.2273175	test: 0.2270953	best: 0.2270953 (200)	total: 1m 26s	remaining: 2h 21m 20s
250:	learn: 0.2249996	test: 0.2252728	best: 0.2252728 (250)	total: 1m 46s	remaining: 2h 20m 1s
300:	learn: 0.2231394	test: 0.2239152	best: 0.2239152 (300)	total: 2m 7s	remaining: 2h 18m 34s
350:	learn: 0.2215297	test: 0.2228843	best: 0.2228843 (350)	total: 2m 27s	remaining: 2h 17m 12s
400:	learn: 0.2199787	test: 0.2219767	best: 0.2219767 (400)	total: 2m 47s	remaining: 2h 16m 4s
450:	learn: 0.2185292	test: 0.2212081	best: 0.2212081 (450)	total: 3m 7s	remaining: 2h 15m 5s
500:	learn: 0.2172126	test

### 데이터 추출

In [10]:
# Read the submission CSV; its 'prediction' and 'customer_ID' columns are
# consumed by the blending step that follows.
sub = pd.read_csv(
    "../data/amex-default-prediction/sample_submission.csv",
)

In [11]:
# Model output for the test rows: second column of predict_proba.
# (The zero-filled preallocations that used to precede these assignments were
# dead stores -- both arrays were overwritten immediately -- so they are gone.)
y_preds = clf.predict_proba(test)[:, 1]

# Predictions and customer ids taken from the CSV loaded into `sub`.
y_preds_ = sub['prediction'].to_numpy()
cust_id = sub['customer_ID'].to_numpy()

# Weighted blend: 95.5% of the predictions read from `sub`, 4.5% of this model.
# NOTE(review): the two arrays are combined purely by position, which assumes
# the rows of `sub` and `test` are in the same customer order -- confirm.
blend_preds = (y_preds_ * 0.955 + y_preds * 0.045)

In [12]:
# Build the submission as a standalone Series indexed like `test` instead of
# adding a "prediction" column to `test` itself. Mutating `test` made the
# notebook non-idempotent: re-running the earlier predict_proba(test) cell
# after this one would pass the extra column to the model as a feature.
# The CSV written is the same: the index of `test` plus a "prediction" column.
submission = pd.Series(blend_preds, index=test.index, name="prediction")
submission.to_csv("submission6.csv", index=True)