In [2]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import gc

In [3]:
train = pd.read_pickle("../data/train_agg.pkl",compression = "gzip")
test = pd.read_pickle("../data/test_agg.pkl",compression = "gzip")

### 범주형 변수 분류

In [4]:
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68"
]
cat_features = [f"{cf}_last" for cf in cat_features]
label_en = LabelEncoder()
for cf in cat_features: ### 두개의 63 64 last 값 category-> 라벨 인코딩
    train[cf] = label_en.fit_transform(train[cf])
    test[cf] = label_en.fit_transform(test[cf])

### 데이터 분류

In [5]:
X = train.drop('target',axis = 1)
y = train["target"]

In [6]:
x1, x2, y1, y2 = train_test_split(X, y, random_state=2022)

### 모델 학습

In [7]:
clf = CatBoostClassifier(iterations=5000, random_state=2022, nan_mode='Min')
clf.fit(x1, y1, eval_set=[(x2, y2)], cat_features=cat_features,  verbose=50)
preds = clf.predict_proba(x2)[:, 1]

Learning rate set to 0.066595
0:	learn: 0.6164535	test: 0.6162028	best: 0.6162028 (0)	total: 623ms	remaining: 51m 55s
50:	learn: 0.2379742	test: 0.2368111	best: 0.2368111 (50)	total: 22.1s	remaining: 35m 41s
100:	learn: 0.2283090	test: 0.2279880	best: 0.2279880 (100)	total: 43s	remaining: 34m 45s
150:	learn: 0.2242458	test: 0.2247777	best: 0.2247777 (150)	total: 1m 3s	remaining: 34m 3s
200:	learn: 0.2211917	test: 0.2226610	best: 0.2226610 (200)	total: 1m 24s	remaining: 33m 38s
250:	learn: 0.2184978	test: 0.2211792	best: 0.2211792 (250)	total: 1m 45s	remaining: 33m 8s
300:	learn: 0.2161886	test: 0.2202339	best: 0.2202339 (300)	total: 2m 5s	remaining: 32m 40s
350:	learn: 0.2141665	test: 0.2195701	best: 0.2195701 (350)	total: 2m 26s	remaining: 32m 19s
400:	learn: 0.2123434	test: 0.2191675	best: 0.2191675 (400)	total: 2m 46s	remaining: 31m 46s
450:	learn: 0.2106073	test: 0.2187503	best: 0.2187503 (450)	total: 3m 6s	remaining: 31m 18s
500:	learn: 0.2088656	test: 0.2184223	best: 0.2184223 (5

ValueError: Found input variables with inconsistent numbers of samples: [344184, 114729]

In [9]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(clf,x1,y1)
print(score)

Learning rate set to 0.02593
0:	learn: 0.6618143	total: 177ms	remaining: 14m 43s
1:	learn: 0.6317250	total: 376ms	remaining: 15m 39s
2:	learn: 0.6043560	total: 568ms	remaining: 15m 46s
3:	learn: 0.5783008	total: 791ms	remaining: 16m 28s
4:	learn: 0.5550428	total: 998ms	remaining: 16m 36s
5:	learn: 0.5340468	total: 1.21s	remaining: 16m 43s
6:	learn: 0.5152161	total: 1.39s	remaining: 16m 34s
7:	learn: 0.4982116	total: 1.59s	remaining: 16m 29s
8:	learn: 0.4815785	total: 1.78s	remaining: 16m 29s
9:	learn: 0.4661713	total: 1.97s	remaining: 16m 24s
10:	learn: 0.4527578	total: 2.17s	remaining: 16m 23s
11:	learn: 0.4398604	total: 2.37s	remaining: 16m 26s
12:	learn: 0.4274945	total: 2.61s	remaining: 16m 42s
13:	learn: 0.4175330	total: 2.82s	remaining: 16m 43s
14:	learn: 0.4067457	total: 3.03s	remaining: 16m 47s
15:	learn: 0.3978008	total: 3.23s	remaining: 16m 46s
16:	learn: 0.3879330	total: 3.43s	remaining: 16m 45s
17:	learn: 0.3795140	total: 3.62s	remaining: 16m 41s
18:	learn: 0.3722759	total:

### 데이터 추출

In [10]:
sub = pd.read_csv("../data/amex-default-prediction/sample_submission.csv")

In [11]:
y_preds = np.zeros(test.shape[0])
y_preds_ = np.zeros(test.shape[0])
y_preds = clf.predict_proba(test)[:, 1]
y_preds_ = sub['prediction'].to_numpy()
cust_id = sub['customer_ID'].to_numpy()
blend_preds = (y_preds_ * 0.955 + y_preds * 0.045)

In [12]:
test["prediction"] = blend_preds
test["prediction"].to_csv(f"submission5.csv", index=True)

  test["prediction"] = blend_preds
