# 전역변수

In [37]:
# Seed passed as random_state to the CV splitter and the CatBoost models.
SEED = 42

# Directory prefixes; each ends with a separator so a file name can be
# appended directly inside an f-string.
DATA_PATH = "./"
MODEL_PATH = "./models/"
SUBMIT_PATH = "./submit/"

# 라이브러리

In [7]:
import math
import os
import platform
import random
import sys
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd
import sklearn
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm

# Report the runtime environment: OS, interpreter and key library versions.
print(f"- os: {platform.platform()}")
print(f"- python: {sys.version}")
print(f"- pandas: {pd.__version__}")
print(f"- numpy: {np.__version__}")
print(f"- sklearn: {sklearn.__version__}")

- os: Windows-10-10.0.19041-SP0
- python: 3.8.7 (tags/v3.8.7:6503f05, Dec 21 2020, 17:59:51) [MSC v.1928 64 bit (AMD64)]
- pandas: 1.2.0
- numpy: 1.19.5
- sklearn: 0.24.0


# 데이터 불러오기

In [8]:
# Load the train and test tables from DATA_PATH.
train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test.csv')

# Load the attribute code lookup tables (D, H and L). The last column of the
# H table is discarded by the iloc slice.
code_d = pd.read_csv(f'{DATA_PATH}속성_D_코드.csv')
code_h = pd.read_csv(f'{DATA_PATH}속성_H_코드.csv').iloc[:,:-1]
code_l = pd.read_csv(f'{DATA_PATH}속성_L_코드.csv')

# Notebook display of both table shapes.
train_data.shape , test_data.shape

((501951, 35), (46404, 34))

In [9]:
# Derive coarse "class" features by integer-dividing the person/contents
# identifiers by 100000. Both columns are appended in the same order as before.
train_data = train_data.assign(
    person_class=train_data["person_rn"] // 100000,
    contents_class=train_data["contents_rn"] // 100000,
)
test_data = test_data.assign(
    person_class=test_data["person_rn"] // 100000,
    contents_class=test_data["contents_rn"] // 100000,
)

In [10]:
# Load the cluster tables for the train and test sets; they are appended
# column-wise to train_data / test_data in a later cell.
train_cluster = pd.read_csv(f'{DATA_PATH}cluster.csv')
test_cluster = pd.read_csv(f'{DATA_PATH}test_cluster.csv')

In [11]:
# Cast every cluster column to integer with a single vectorized conversion.
# The previous applymap(lambda x: int(x)) invoked a Python function once per
# cell and is deprecated in newer pandas releases. "int64" is spelled out so
# the resulting dtype does not depend on the platform's default integer width.
train_cluster = train_cluster.astype("int64")
test_cluster = test_cluster.astype("int64")

In [12]:
# Append the cluster columns to the right of each feature table.
# pd.concat along the column axis matches rows by index label.
train_data = pd.concat(objs=[train_data, train_cluster], axis="columns")
test_data = pd.concat(objs=[test_data, test_cluster], axis="columns")

# 속성 코드 데이터 컬럼명 변경


In [13]:
# Give the three code lookup tables uniform column names. The first column of
# each table is the code itself (used as the join key by merge_codes); the
# remaining columns are its descriptive levels.
code_d.columns = [
    "attribute_d",
    "attribute_d_d",
    "attribute_d_s",
    "attribute_d_m",
    "attribute_d_l",
]
code_h.columns = [
    "attribute_h",
    "attribute_h_p",
]
code_l.columns = [
    "attribute_l",
    "attribute_l_d",
    "attribute_l_s",
    "attribute_l_m",
    "attribute_l_l",
]

# 속성코드 데이터 merge 함수

In [14]:
def merge_codes(df:pd.DataFrame,df_code:pd.DataFrame,col:str)->pd.DataFrame:
    """Left-join a code lookup table onto ``df`` using ``col`` as the key.

    The first column of ``df_code`` is treated as the join key and renamed to
    ``col``; every other column is renamed to ``f"{col}_{original_name}"`` so
    the same lookup table can be merged several times under different keys
    without column-name collisions.

    Args:
        df: Table that contains the key column ``col``.
        df_code: Lookup table whose first column holds the code values.
        col: Name of the key column in ``df``.

    Returns:
        A new DataFrame with the lookup columns appended. Rows of ``df``
        without a matching code get missing values in those columns. Neither
        input is modified.
    """
    lookup = df_code.copy()
    # Assign the new names in one step instead of writing into
    # ``columns.values`` in place, which mutates the Index's backing array.
    lookup.columns = [col] + [f"{col}_{name}" for name in df_code.columns[1:]]
    return pd.merge(df, lookup, how="left", on=col)

# 데이터 전처리 함수

In [15]:
def preprocess_data(
    df: pd.DataFrame,
    is_train: bool = True,
    cols_merge: Optional[List[Tuple[str, pd.DataFrame]]] = None,
    cols_equi: Optional[List[Tuple[str, str]]] = None,
    cols_drop: Optional[List[str]] = None,
) -> Tuple[pd.DataFrame, Optional[np.ndarray]]:
    """Build the model feature table (and target vector) from a raw table.

    Steps, in order:
      1. When ``is_train`` is true, split off the ``"target"`` column.
      2. Left-join each lookup table in ``cols_merge`` via ``merge_codes``.
      3. Cast boolean columns to integers.
      4. For each pair in ``cols_equi`` add a 0/1 column named
         ``f"{col1}_{col2}"`` that is 1 where the two columns are equal.
      5. Drop the columns listed in ``cols_drop``.

    Args:
        df: Raw input table; it is copied, not modified.
        is_train: Whether ``df`` contains a ``"target"`` column to extract.
        cols_merge: ``(key column, lookup table)`` pairs to merge. Defaults to
            no merges.
        cols_equi: ``(col1, col2)`` pairs to compare for equality. Defaults to
            no comparisons.
        cols_drop: Columns removed at the end. Defaults to ``["id",
            "person_prefer_f", "person_prefer_g", "contents_open_dt"]``.

    Returns:
        ``(features, target)`` where ``target`` is a NumPy array when
        ``is_train`` is true and ``None`` otherwise.

    Raises:
        KeyError: If a referenced column (target, a comparison column, or a
            column to drop) is missing from the table.
    """
    # None sentinels replace the former mutable list defaults, which would be
    # shared between calls.
    if cols_merge is None:
        cols_merge = []
    if cols_equi is None:
        cols_equi = []
    if cols_drop is None:
        cols_drop = ["id", "person_prefer_f", "person_prefer_g", "contents_open_dt"]

    df = df.copy()

    y_data = None
    if is_train:
        y_data = df["target"].to_numpy()
        df = df.drop(columns="target")

    for col, df_code in cols_merge:
        df = merge_codes(df, df_code, col)

    bool_cols = df.select_dtypes(include=bool).columns.tolist()
    if bool_cols:
        df[bool_cols] = df[bool_cols].astype(int)

    # Equality flags are computed after the merges because some of the
    # compared columns only exist once the lookup tables have been joined.
    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)

    df = df.drop(columns=cols_drop)
    return (df, y_data)

# 전처리 컬럼명 정의

In [16]:
# (key column, lookup table) pairs: each key column is enriched with the
# descriptive columns of the matching attribute code table.
cols_merge = [
    # attribute D codes
    ("person_prefer_d_1", code_d),
    ("person_prefer_d_2", code_d),
    ("person_prefer_d_3", code_d),
    ("contents_attribute_d", code_d),
    # attribute H codes
    ("person_prefer_h_1", code_h),
    ("person_prefer_h_2", code_h),
    ("person_prefer_h_3", code_h),
    ("contents_attribute_h", code_h),
    # attribute L codes
    ("contents_attribute_l", code_l),
]

# Column pairs compared for equality: for each pair a 0/1 feature is created
# that tells whether the member attribute and the content attribute hold the
# same code.
# NOTE(review): no person_prefer_d_1 pairs are listed here - confirm that this
# omission is intentional.
cols_equi = [
    # raw C / E attributes
    ("contents_attribute_c", "person_prefer_c"),
    ("contents_attribute_e", "person_prefer_e"),
    # attribute D levels, 2nd preference
    ("person_prefer_d_2_attribute_d_s", "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m", "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_2_attribute_d_l", "contents_attribute_d_attribute_d_l"),
    # attribute D levels, 3rd preference
    ("person_prefer_d_3_attribute_d_s", "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m", "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_3_attribute_d_l", "contents_attribute_d_attribute_d_l"),
    # attribute H parent code, all three preferences
    ("person_prefer_h_1_attribute_h_p", "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_2_attribute_h_p", "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_3_attribute_h_p", "contents_attribute_h_attribute_h_p"),
]

# Columns that are not used for training and are removed after preprocessing.
cols_drop = [
    "id",
    "person_prefer_f",
    "person_prefer_g",
    "contents_open_dt",
    "contents_rn",
]

# 학습 및 추론셋 전처리 !!

In [17]:
# Apply the same preprocessing recipe to both splits; only the training split
# carries a target column.
x_train, y_train = preprocess_data(
    train_data,
    cols_merge=cols_merge,
    cols_equi=cols_equi,
    cols_drop=cols_drop,
)
x_test, _ = preprocess_data(
    test_data,
    is_train=False,
    cols_merge=cols_merge,
    cols_equi=cols_equi,
    cols_drop=cols_drop,
)
# Notebook display of the resulting shapes.
x_train.shape, y_train.shape, x_test.shape

((501951, 68), (501951,), (46404, 68))

# 범주형 컬럼 리스트(catboost 파라미터에 넣을 용도)

In [18]:
# Treat every column with more than two distinct values as categorical; the
# list is handed to CatBoost through its cat_features parameter.
cat_features = x_train.nunique().loc[lambda counts: counts > 2].index.tolist()

# 학습 파라미터

In [19]:
# Cross-validation setup.
n_splits = 5
is_holdout = False  # when True, training stops after the first fold

# Boosting budget: maximum number of iterations and early-stopping patience.
iterations = 5000
patience = 300

cv = KFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=SEED,
)

# 학습 시작!!

In [20]:
# Train one CatBoost classifier per CV fold, keeping every fitted model and
# its best validation F1. The duplicated ``models = []`` initialisation and
# the unused ``preds`` list from the earlier version were removed.
scores = []
models = []

for tri, vai in cv.split(x_train):
    print("=" * 50)

    model = CatBoostClassifier(
        iterations=iterations,
        random_state=SEED,
        eval_metric="F1",
        cat_features=cat_features,
        one_hot_max_size=6,
        boosting_type="Plain",
        bootstrap_type="Bernoulli",
        depth=9,
        colsample_bylevel=0.051442184840004124,
        subsample=0.9934302225658067,
    )
    # The held-out fold serves as the eval set that drives early stopping.
    model.fit(
        x_train.iloc[tri],
        y_train[tri],
        eval_set=[(x_train.iloc[vai], y_train[vai])],
        early_stopping_rounds=patience,
        verbose=100,
    )

    models.append(model)
    scores.append(model.get_best_score()["validation"]["F1"])
    # Hold-out mode: a single train/validation split is enough.
    if is_holdout:
        break

Learning rate set to 0.06918
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 187ms	remaining: 15m 35s
100:	learn: 0.6664816	test: 0.6853277	best: 0.6854313 (98)	total: 1m 16s	remaining: 1h 1m 49s
200:	learn: 0.6777230	test: 0.6895179	best: 0.6897563 (196)	total: 2m 34s	remaining: 1h 1m 30s
300:	learn: 0.6867494	test: 0.6914426	best: 0.6918077 (284)	total: 3m 53s	remaining: 1h 42s
400:	learn: 0.6932630	test: 0.6923243	best: 0.6926288 (380)	total: 5m 14s	remaining: 1h 2s
500:	learn: 0.6992234	test: 0.6932069	best: 0.6932069 (499)	total: 6m 33s	remaining: 58m 49s
600:	learn: 0.7042409	test: 0.6934888	best: 0.6935262 (589)	total: 7m 45s	remaining: 56m 49s
700:	learn: 0.7089850	test: 0.6933198	best: 0.6936568 (621)	total: 9m 7s	remaining: 55m 58s
800:	learn: 0.7137174	test: 0.6929962	best: 0.6937163 (717)	total: 10m 28s	remaining: 54m 56s
900:	learn: 0.7175098	test: 0.6933079	best: 0.6937163 (717)	total: 11m 49s	remaining: 53m 47s
1000:	learn: 0.7221331	test: 0.6932733	best: 

# CV 결과 확인

In [21]:
# Show the per-fold scores followed by their mean, one per line.
print(scores, np.mean(scores), sep="\n")

[0.693716290121827, 0.6986713673570806, 0.692712872787904, 0.6935040482018452, 0.6953982516924406]
0.6948005660322194


# threshold값 변경에 따른 검증점수 확인 및 추론

In [36]:
# Probability cut-off applied to the positive-class score; the same value is
# reused when the test predictions are ensembled.
threshold = 0.36
pred_list = []
scores = []
# Pair each fitted model with the fold it was validated on. This relies on
# cv.split reproducing the training folds (the splitter has a fixed
# random_state). zip() stops at the shorter input, so a hold-out run with a
# single model no longer raises IndexError on the second fold.
for model, (_, vai) in zip(models, cv.split(x_train)):
    # Validation F1 at the custom threshold.
    valid_proba = model.predict_proba(x_train.iloc[vai])[:, 1]
    valid_label = np.where(valid_proba >= threshold, 1, 0)
    scores.append(f1_score(y_train[vai], valid_label))
    # Raw test probabilities are kept for the averaging ensemble.
    pred_list.append(model.predict_proba(x_test)[:, 1])
print(scores)
print(np.mean(scores))

[0.7199803539458466, 0.71892242265189, 0.717522143542491, 0.7180340073019871, 0.7165692255213829]
0.7182056305927194


# 산술평균 앙상블!!

In [33]:
# Average the per-model test probabilities, then binarise the mean with the
# shared threshold.
mean_proba = np.mean(pred_list, axis=0)
pred = (mean_proba >= threshold).astype(int)

# 제출파일!!

In [34]:
# Load the sample submission file and overwrite its target column with the
# ensembled predictions.
sample_submission = pd.read_csv(f'{DATA_PATH}sample_submission.csv')
sample_submission['target'] = pred
# Notebook display of the resulting frame.
sample_submission

Unnamed: 0,id,target
0,0,1
1,1,0
2,2,1
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


# 저장

In [35]:
# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs(SUBMIT_PATH, exist_ok=True)
sample_submission.to_csv(f"{SUBMIT_PATH}prediction.csv", index=False)

In [38]:
# Persist every trained model. Iterating over ``models`` instead of a
# hard-coded range(5) also works when fewer models were trained (hold-out
# mode), and the directory is created up front so save_model does not fail on
# a missing folder.
os.makedirs(MODEL_PATH, exist_ok=True)
for i, model in enumerate(models):
    model.save_model(f"{MODEL_PATH}{i}th_cv_0.69480.cbm")