# 1. 라이브러리 및 데이터

In [1]:
import os
os.chdir('D:/star2data')
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
pd.set_option('display.max_row', 500)

import numpy as np
import random
import time   # 시간 측정
import re     # 정규 표현식

random.seed(2020)
random_seed = 2020

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn import metrics

from catboost import CatBoostClassifier, Pool
from bayes_opt import BayesianOptimization # 베이지안 최적화 라이브러리
from functools import partial   # 함수 변수 고정

import lightgbm as lgb


## 데이터 불러오기 

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 2. Data Cleansing & Preprocessing

In [4]:
train.head(100)

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"
5,0,1,0.14,0,T,Camera,"at (142.5078125, 24.98828125)"
6,0,1,0.14,0,T,Camera,"at (139.6171875, 27.8828125)"
7,0,1,0.14,0,T,Camera,"at (138.3359375, 29.1640625)"
8,0,1,0.14,0,T,Camera,"at (136.23828125, 31.26171875)"
9,0,1,0.14,0,T,Camera,"at (135.23828125, 32.26171875)"


In [5]:
train.shape

(67091776, 7)

In [6]:
train

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.00,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.00,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"
...,...,...,...,...,...,...,...
67091771,38871,0,8.51,0,Z,Camera,"at (139.578125, 62.58203125)"
67091772,38871,0,8.52,1,T,GetControlGroup,
67091773,38871,0,8.52,0,Z,Camera,"at (122.42578125, 45.4296875)"
67091774,38871,0,8.52,0,Z,Camera,"at (122.42578125, 43.25390625)"


In [7]:
test.head(100)

Unnamed: 0,game_id,time,player,species,event,event_contents
0,38872,0.0,0,P,Camera,"at (22.25, 81.5078125)"
1,38872,0.0,1,P,Camera,"at (120.25, 153.83984375)"
2,38872,0.01,1,P,Selection,['Nexus [3100001]']
3,38872,0.01,1,P,Ability,(15E0) - TrainProbe
4,38872,0.01,1,P,AddToControlGroup,
5,38872,0.03,0,P,SetControlGroup,
6,38872,0.03,1,P,Camera,"at (120.25, 153.25390625)"
7,38872,0.03,1,P,Camera,"at (120.25, 152.43359375)"
8,38872,0.03,1,P,Camera,"at (117.96875, 152.765625)"
9,38872,0.03,1,P,Camera,"at (117.09765625, 153.890625)"


In [8]:
test.shape

(28714849, 6)

In [9]:
test

Unnamed: 0,game_id,time,player,species,event,event_contents
0,38872,0.00,0,P,Camera,"at (22.25, 81.5078125)"
1,38872,0.00,1,P,Camera,"at (120.25, 153.83984375)"
2,38872,0.01,1,P,Selection,['Nexus [3100001]']
3,38872,0.01,1,P,Ability,(15E0) - TrainProbe
4,38872,0.01,1,P,AddToControlGroup,
...,...,...,...,...,...,...
28714844,55658,4.54,1,T,Right Click,"Location: (120.584228515625, 56.930419921875, ..."
28714845,55658,4.54,0,Z,Camera,"at (70.9921875, 117.65234375)"
28714846,55658,4.54,1,T,Right Click,"Location: (122.146728515625, 55.52099609375, 3..."
28714847,55658,4.54,1,T,Right Click,"Location: (123.6767578125, 54.140625, 33215)"


In [10]:
# 반응변수 전처리
def preprocess_y(data, exchange_player=False):
    y = data.drop_duplicates(['game_id', 'winner']).winner.reset_index(drop=True)
    if (exchange_player == True):
        y = y.append(-(y - 1)).reset_index(drop = True)
    return y

In [11]:
# 설명변수 전처리
def preprocess_X(data, exchange_player=False):
    
    # game_id 개수만큼의 index를 가진 DataFrame X 생성
    n = data.game_id.max() + 1
    X = pd.DataFrame(index=range(n)[data.game_id.min():])
    
    # time 변수
    X['time'] = data.drop_duplicates(['game_id'],keep='last').set_index('game_id').time
    X['time'] = (X.time*100//100*60 + X.time*100%100).astype(int)
    
    # species 더미 변수
    X = pd.concat([pd.get_dummies(data[data.player == 0].drop_duplicates(['game_id']).set_index('game_id').species).rename(columns={'P':'0_protoss','T':'0_terran','Z':'0_zerg'}),
                pd.get_dummies(data[data.player == 1].drop_duplicates(['game_id']).set_index('game_id').species).rename(columns={'P':'1_protoss','T':'1_terran','Z':'1_zerg'}),
                X],axis=1)
    
    # event 카운트
    contents = data.loc[:,['player','game_id','time']].groupby(['player', 'game_id']).count().unstack(level=0)
    contents.columns = ['0_event', '1_event']
    X['0_event'], X['1_event'] = contents['0_event'], contents['1_event']

    # event 카운트 / time
    X['0_event_per_sec'], X['1_event_per_sec'] = X['0_event'] /X.time, X['1_event'] /X.time

    # event == Ability, AddToControlGroup, Camera, ControlGroup, GetControlGroup, Right Click, Selection, SetControlGroup일 때 각각 카운트
    contents = data.loc[:,['player','event','game_id','time']].groupby(['player', 'event', 'game_id']).count().unstack(level=[0,1]).fillna(0).astype(int)
    contents.columns = ['0_'+x for x in sorted(data.event.unique())] + ['1_'+x for x in sorted(data.event.unique())]
    for i in contents.columns:
        X[i] = contents[i]
    
    # event == Camera일 때 event_contents의 2차원 좌표 간 euclidean distance sum, min, median, max
    def move_sum(i):
        return sum(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
    def move_min(i):
        if len(i) == 1:
            return 0
        return min(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
    def move_median(i):
        if len(i) == 1:
            return 0
        return np.median(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 + 
                             np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
    def move_max(i):
        if len(i) == 1:
            return 0
        return max(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
    contents = (data[data.event == 'Camera'].loc[:,['player','game_id','event_contents']].
              groupby(['player','game_id'])).agg([move_sum,move_min,move_median,move_max]).unstack(level=0)
    contents.columns = [y+x for x in ['sum','min','median','max'] for y in ['0_move_','1_move_']]
    for i in contents.columns:
        X[i] = contents[i].fillna(0)

    # 30초 이내 move_sum
    contents = (data[(data.time < (data.event == 'Camera'))].loc[:,['player','game_id','event_contents']].groupby(['player','game_id'])).agg(move_sum).unstack(level=0)
    contents.columns = ['0_move_sum_30sec','1_move_sum_30sec']
    for i in contents.columns:
        X[i] = contents[i]

    # event == Ability일 때 event_contents 더미 변수 생성, 카운트
    contents = pd.DataFrame(data.event_contents[(data.event == 'Ability')].map(lambda x: x[x.find('(')+1:x.find(')')]))  # event_contents의 16진수 코드만 추출
    contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        X[i] = contents_X[i]
        X[i] = X[i].fillna(0).astype(int)

    # event == Ability일 때 event_contents 더미 변수 생성 / time
    for i in contents_X.columns:
        X[i+'_div_time'] = X[i] /X.time

    # event == Selection일 때 event_contents 더미 변수 생성, 카운트
    contents = data[data.event == 'Selection'].event_contents.map(lambda x: re.sub('\s\[.....\]', '', re.sub('\s\[......\]', '', re.sub('\s\[.......\]', '', x))).
                                                                replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
    contents = contents.str.split(',')
    max_num = max(contents.map(lambda x: len(x)))
    t = [0 for x in range(max_num)]
    for i in range(max_num):
        t[i] = pd.DataFrame(contents[contents.map(lambda x: len(x) > i)].map(lambda x: x[i]))
    contents = pd.concat([t[i] for i in range(max_num)])
    contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        X[i] = contents_X[i]
        X[i] = X[i].fillna(0).astype(int)

    # event == Selection일 때 event_contents 더미 변수 생성 / time
    for i in contents_X.columns:
        X[i+'_div_time'] = X[i] /X.time

    # 30초 이내 event == Selection일 때 event_contents 더미 변수 생성, 카운트
    contents = data[(data.time < (data.event == 'Selection'))].event_contents.map(lambda x: re.sub('\s\[.....\]', '', re.sub('\s\[......\]', '', re.sub('\s\[.......\]', '', x))).
                                                                                      replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
    contents = contents.str.split(',')
    max_num = max(contents.map(lambda x: len(x)))
    t = [0 for x in range(max_num)]
    for i in range(max_num):
        t[i] = pd.DataFrame(contents[contents.map(lambda x: len(x) > i)].map(lambda x: x[i]))
    contents = pd.concat([t[i] for i in range(max_num)])
    contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        X[i+'_30sec'] = contents_X[i]
        X[i+'_30sec'] = X[i+'_30sec'].fillna(0).astype(int)

    # event == Right Click일 때 Target 이름 더미 변수 생성, 카운트
    contents = pd.DataFrame(data.event_contents[(data.event == 'Right Click') & (data.event_contents.map(lambda x: str(x)[:6]) == 'Target')].map(lambda x: x[x.find(':')+2:x.find(' [')]))  # event_contents의 Target만 추출
    contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_Target_','1_Target_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_Target_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        X[i] = contents_X[i]
        X[i] = X[i].fillna(0).astype(int)

    # 컬럼 이름 순서로 정렬
    X = X[sorted(X.columns)]

    # player 0,1 자리 바꾼 X1생성, X와 행 병합해 데이터 2배로 만들기
    if (exchange_player == True):
        c = X.shape[1]//2
        X1 = X.copy()
        X1.columns = list(X.columns[c:2*c])+list(X.columns[:c])+['time']
        X1.index = [x+n for x in range(n)]
        X = pd.concat([X, X1])

    return X

###  모든 데이터를 쪼갠다

...0_	0_1000	0_1000_div_time	0_1020	0_1020_div_time	0_1021	0_1021_div_time	0_1022	0_1022_div_time	0_1023	...	1_event_per_sec	1_move_max	1_move_median	1_move_min	1_move_sum	1_move_sum_30sec	1_protoss	1_terran	1_zerg	time ...

인덱스는, game_id 개수만큼의 index를 가진 DataFrame X 생성

In [12]:
# Train, Test 전처리 수행, y, X, test_X 생성
y = preprocess_y(train, True) # 플레이어 1
X = preprocess_X(train, True) # 플레이어 1
test_X = preprocess_X(test, False) # 플레이어 0


# 메모리 효율을 위해 Original Train, Test data 삭제
del train, test

In [13]:
X

Unnamed: 0,0_,0_1000,0_1000_div_time,0_1020,0_1020_div_time,0_1021,0_1021_div_time,0_1022,0_1022_div_time,0_1023,...,1_event_per_sec,1_move_max,1_move_median,1_move_min,1_move_sum,1_move_sum_30sec,1_protoss,1_terran,1_zerg,time
0,5,0,0.0,1,0.002252,4,0.009009,2,0.004505,2,...,1.234234,10.136719,1.664816,0.226562,845.484223,45.381415,0,1,0,444
1,9,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,...,2.021703,172.687456,1.629660,0.050781,6283.402180,185.941775,0,1,0,599
2,4,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,...,2.288815,135.928846,3.023498,0.003906,5422.672223,83.727884,0,0,1,599
3,9,0,0.0,2,0.003339,6,0.010017,1,0.001669,3,...,2.707846,149.132687,2.203295,0.000000,4831.495281,364.933032,1,0,0,599
4,3,0,0.0,1,0.002915,2,0.005831,2,0.005831,1,...,1.897959,128.656583,24.699837,0.000000,6175.010389,1719.574150,0,0,1,343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77739,2,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,...,1.655378,150.695528,1.726562,0.023438,1830.368939,116.045365,0,1,0,502
77740,0,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,...,1.345576,167.326058,1.562500,0.382812,1332.075703,233.165970,0,1,0,599
77741,7,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,...,2.429703,151.112551,1.519175,0.023438,3500.897532,151.399349,1,0,0,505
77742,3,0,0.0,0,0.000000,0,0.000000,0,0.000000,0,...,1.140449,18.675781,3.605469,0.000000,1401.942248,114.863214,1,0,0,356


In [14]:
y

0        1
1        1
2        0
3        0
4        0
        ..
77739    0
77740    1
77741    1
77742    0
77743    1
Name: winner, Length: 77744, dtype: int64

In [15]:
test_X

Unnamed: 0,0_,0_1000,0_1000_div_time,0_1020,0_1020_div_time,0_1021,0_1021_div_time,0_1022,0_1022_div_time,0_1023,...,1_event_per_sec,1_move_max,1_move_median,1_move_min,1_move_sum,1_move_sum_30sec,1_protoss,1_terran,1_zerg,time
38872,4,0,0.0,0,0.0,0,0.000000,0,0.000000,0,...,1.678404,126.251407,1.691406,0.000000,3657.453866,106.488723,1,0,0,426
38873,7,0,0.0,0,0.0,0,0.000000,0,0.000000,0,...,1.059946,166.073907,2.245074,0.410156,2105.334646,449.889201,1,0,0,367
38874,1,0,0.0,0,0.0,3,0.010676,2,0.007117,1,...,2.932384,90.214032,1.382812,0.128906,903.872600,97.016800,0,1,0,281
38875,9,0,0.0,0,0.0,6,0.012552,2,0.004184,2,...,1.805439,161.667451,2.945312,0.000000,3977.746880,189.929042,1,0,0,478
38876,0,0,0.0,0,0.0,0,0.000000,0,0.000000,0,...,1.252941,6.656250,1.451031,0.203125,316.519178,112.194960,0,0,1,170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55654,24,0,0.0,0,0.0,0,0.000000,0,0.000000,0,...,3.133690,139.716244,4.003906,0.000000,4487.602561,432.603384,0,1,0,374
55655,7,0,0.0,0,0.0,0,0.000000,0,0.000000,0,...,1.742475,103.040503,1.339639,0.000000,1878.000899,79.260655,0,1,0,598
55656,6,0,0.0,0,0.0,0,0.000000,0,0.000000,0,...,1.815104,165.277559,2.398438,0.000000,4420.024763,967.359201,1,0,0,384
55657,0,0,0.0,0,0.0,1,0.012987,0,0.000000,1,...,2.792208,138.695125,5.785513,0.000000,808.591096,501.611450,0,1,0,77


In [16]:
# X, test_X에만 있는 컬럼 삭제 (맞춰서 Train Test 하기 위히)
X.drop(set(X.columns) - set(test_X.columns), axis=1, inplace=True)
test_X.drop(set(test_X.columns) - set(X.columns), axis=1, inplace=True)

# 3. EDA

# 4. Feature Engineering & Initial Modeling

In [17]:
# CatBoost 모델링
def catboost_modeling(x_train, y_train, x_test, grow_policy, depth, learning_rate, l2_leaf_reg, random_seed, n):
    
    # 빈 Series인 test_pred 생성
    test_pred = pd.Series([0 for x in range(len(x_test))], index=x_test.index)
    
    # 10-fold 모델링을 n회 반복할 것
    for i in range(n):
        kf = KFold(n_splits=10, random_state=random_seed+i)
        for train_index, valid_index in kf.split(x_train):
            train_X, train_y = x_train.iloc[train_index], y_train[train_index]
            valid_X, valid_y = x_train.iloc[valid_index], y_train[valid_index]
            
            # catBoost(grow_policy='Depthwise')
            model = CatBoostClassifier(eval_metric = 'AUC',              # AUC로 성능 측정
                                       iterations = 250,               # 반복횟수 최대 25000
                                       metric_period = 250,            # 중간결과 출력X
                                       early_stopping_rounds = 100,     # 1000iteration 동안 AUC 증가 없으면 학습 중단
                                       #task_type = 'GPU',                # GPU 사용
                                       grow_policy = grow_policy,        # 트리 노드 생성 방식
                                                                     # 1) Depthwise(지정한 depth에 이를 때까지 level 순으로 노드 분할)
                                                                     # 2) Lossguide(loss 변화가 큰 순으로 노드 분할)
                                       depth = depth,                    # 트리 깊이
                                       learning_rate = learning_rate,    # 러닝레이트
                                       l2_leaf_reg = l2_leaf_reg,        # L2 정규화
                                       random_seed = random_seed+i,      # 랜덤시드 고정
                                       )
        
            # 모델 학습
            model.fit(train_X, train_y, eval_set=(valid_X, valid_y))
    
            # 모델 적용
            test_pred += model.predict_proba(x_test)[:,1] /(10*n)
        
    # 설정된 디렉토리에 결과물 저장
    sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
    submission = pd.DataFrame(data=test_pred, columns=sample_submission.columns, index=sample_submission.index)
    submission.to_csv('CatBoost_'+grow_policy+'_'+str(depth)+'.csv', index=True)

    return test_pred

In [18]:
data1 = catboost_modeling(X, y, test_X, 'Depthwise', 10, 0.02423, 20.35, 2014, 2)



0:	test: 0.6153353	best: 0.6153353 (0)	total: 3s	remaining: 12m 25s
249:	test: 0.7194022	best: 0.7194022 (249)	total: 12m 21s	remaining: 0us

bestTest = 0.7194021612
bestIteration = 249





0:	test: 0.6229307	best: 0.6229307 (0)	total: 2.95s	remaining: 12m 14s
249:	test: 0.7304529	best: 0.7304529 (249)	total: 12m 35s	remaining: 0us

bestTest = 0.7304529077
bestIteration = 249





0:	test: 0.6373337	best: 0.6373337 (0)	total: 2.86s	remaining: 11m 52s
249:	test: 0.7481951	best: 0.7481951 (249)	total: 12m 22s	remaining: 0us

bestTest = 0.7481951097
bestIteration = 249





0:	test: 0.6293211	best: 0.6293211 (0)	total: 3.13s	remaining: 12m 58s
249:	test: 0.7299080	best: 0.7299080 (249)	total: 12m 45s	remaining: 0us

bestTest = 0.7299079599
bestIteration = 249





0:	test: 0.6196783	best: 0.6196783 (0)	total: 2.84s	remaining: 11m 48s
249:	test: 0.7323880	best: 0.7324809 (248)	total: 12m 38s	remaining: 0us

bestTest = 0.7324809433
bestIteration = 248

Shrink model to first 249 iterations.




0:	test: 0.6232320	best: 0.6232320 (0)	total: 3.27s	remaining: 13m 33s
249:	test: 0.7195816	best: 0.7195816 (249)	total: 12m 49s	remaining: 0us

bestTest = 0.719581588
bestIteration = 249





0:	test: 0.6186208	best: 0.6186208 (0)	total: 2.78s	remaining: 11m 32s
249:	test: 0.7296651	best: 0.7296651 (249)	total: 12m 34s	remaining: 0us

bestTest = 0.7296651285
bestIteration = 249





0:	test: 0.6321081	best: 0.6321081 (0)	total: 3.78s	remaining: 15m 40s
249:	test: 0.7494059	best: 0.7494176 (248)	total: 14m 23s	remaining: 0us

bestTest = 0.7494176241
bestIteration = 248

Shrink model to first 249 iterations.




0:	test: 0.6383464	best: 0.6383464 (0)	total: 3.54s	remaining: 14m 42s
249:	test: 0.7322259	best: 0.7322259 (249)	total: 13m 22s	remaining: 0us

bestTest = 0.7322258582
bestIteration = 249





0:	test: 0.6403191	best: 0.6403191 (0)	total: 2.47s	remaining: 10m 15s
249:	test: 0.7324413	best: 0.7324413 (249)	total: 13m 1s	remaining: 0us

bestTest = 0.7324413319
bestIteration = 249





0:	test: 0.6299581	best: 0.6299581 (0)	total: 3.79s	remaining: 15m 44s
249:	test: 0.7193969	best: 0.7193969 (249)	total: 13m 10s	remaining: 0us

bestTest = 0.7193969338
bestIteration = 249





0:	test: 0.6281442	best: 0.6281442 (0)	total: 3.7s	remaining: 15m 21s
249:	test: 0.7318963	best: 0.7318963 (249)	total: 13m 25s	remaining: 0us

bestTest = 0.7318962967
bestIteration = 249





0:	test: 0.6329974	best: 0.6329974 (0)	total: 3.58s	remaining: 14m 50s
249:	test: 0.7486724	best: 0.7487083 (248)	total: 15m 17s	remaining: 0us

bestTest = 0.7487083349
bestIteration = 248

Shrink model to first 249 iterations.




0:	test: 0.6302545	best: 0.6302545 (0)	total: 3.83s	remaining: 15m 53s
249:	test: 0.7311165	best: 0.7311165 (249)	total: 15m 39s	remaining: 0us

bestTest = 0.7311165275
bestIteration = 249





0:	test: 0.6300880	best: 0.6300880 (0)	total: 3.81s	remaining: 15m 48s
249:	test: 0.7325851	best: 0.7325940 (247)	total: 15m 44s	remaining: 0us

bestTest = 0.7325939905
bestIteration = 247

Shrink model to first 248 iterations.




0:	test: 0.6256765	best: 0.6256765 (0)	total: 3.61s	remaining: 14m 58s
249:	test: 0.7185243	best: 0.7185243 (249)	total: 15m 42s	remaining: 0us

bestTest = 0.7185243194
bestIteration = 249





0:	test: 0.6237461	best: 0.6237461 (0)	total: 3.97s	remaining: 16m 27s
249:	test: 0.7318461	best: 0.7318461 (249)	total: 15m 47s	remaining: 0us

bestTest = 0.7318460891
bestIteration = 249





0:	test: 0.6454465	best: 0.6454465 (0)	total: 3.51s	remaining: 14m 35s
249:	test: 0.7492345	best: 0.7492440 (248)	total: 15m 52s	remaining: 0us

bestTest = 0.7492440117
bestIteration = 248

Shrink model to first 249 iterations.




0:	test: 0.6193991	best: 0.6193991 (0)	total: 3.86s	remaining: 16m 2s
249:	test: 0.7287292	best: 0.7287292 (249)	total: 16m 11s	remaining: 0us

bestTest = 0.7287292233
bestIteration = 249





0:	test: 0.6343809	best: 0.6343809 (0)	total: 4.23s	remaining: 17m 33s
249:	test: 0.7342964	best: 0.7342964 (249)	total: 16m	remaining: 0us

bestTest = 0.7342964196
bestIteration = 249



In [19]:
data2 = catboost_modeling(X, y, test_X, 'Lossguide', 8, 0.01063, 5.127, 2014, 2)



0:	test: 0.6142121	best: 0.6142121 (0)	total: 924ms	remaining: 3m 50s
249:	test: 0.6925307	best: 0.6925307 (249)	total: 3m 52s	remaining: 0us

bestTest = 0.6925307405
bestIteration = 249





0:	test: 0.6167211	best: 0.6167211 (0)	total: 955ms	remaining: 3m 57s
249:	test: 0.7088263	best: 0.7088263 (249)	total: 3m 47s	remaining: 0us

bestTest = 0.7088262911
bestIteration = 249





0:	test: 0.6407961	best: 0.6407961 (0)	total: 890ms	remaining: 3m 41s
249:	test: 0.7235994	best: 0.7235994 (249)	total: 3m 51s	remaining: 0us

bestTest = 0.7235994102
bestIteration = 249





0:	test: 0.6282832	best: 0.6282832 (0)	total: 917ms	remaining: 3m 48s
249:	test: 0.7055534	best: 0.7055534 (249)	total: 3m 47s	remaining: 0us

bestTest = 0.7055533763
bestIteration = 249





0:	test: 0.6132036	best: 0.6132036 (0)	total: 909ms	remaining: 3m 46s
249:	test: 0.7067094	best: 0.7067167 (248)	total: 3m 47s	remaining: 0us

bestTest = 0.7067167048
bestIteration = 248

Shrink model to first 249 iterations.




0:	test: 0.6059035	best: 0.6059035 (0)	total: 899ms	remaining: 3m 43s
249:	test: 0.6935725	best: 0.6935725 (249)	total: 3m 45s	remaining: 0us

bestTest = 0.693572477
bestIteration = 249





0:	test: 0.6404242	best: 0.6404242 (0)	total: 952ms	remaining: 3m 57s
249:	test: 0.7086936	best: 0.7086936 (249)	total: 3m 44s	remaining: 0us

bestTest = 0.7086935946
bestIteration = 249





0:	test: 0.6294622	best: 0.6294622 (0)	total: 857ms	remaining: 3m 33s
249:	test: 0.7229694	best: 0.7229694 (249)	total: 3m 43s	remaining: 0us

bestTest = 0.7229693688
bestIteration = 249





0:	test: 0.6064800	best: 0.6064800 (0)	total: 858ms	remaining: 3m 33s
249:	test: 0.7051668	best: 0.7051668 (249)	total: 3m 44s	remaining: 0us

bestTest = 0.7051667832
bestIteration = 249





0:	test: 0.6224937	best: 0.6224937 (0)	total: 897ms	remaining: 3m 43s
249:	test: 0.7080685	best: 0.7080685 (249)	total: 3m 41s	remaining: 0us

bestTest = 0.7080684532
bestIteration = 249





0:	test: 0.6135116	best: 0.6135116 (0)	total: 878ms	remaining: 3m 38s
249:	test: 0.6928257	best: 0.6928257 (249)	total: 3m 47s	remaining: 0us

bestTest = 0.6928256593
bestIteration = 249





0:	test: 0.6289895	best: 0.6289895 (0)	total: 903ms	remaining: 3m 44s
249:	test: 0.7076322	best: 0.7076322 (249)	total: 3m 42s	remaining: 0us

bestTest = 0.7076321684
bestIteration = 249





0:	test: 0.6229243	best: 0.6229243 (0)	total: 900ms	remaining: 3m 44s
249:	test: 0.7236528	best: 0.7236528 (249)	total: 3m 45s	remaining: 0us

bestTest = 0.7236528105
bestIteration = 249





0:	test: 0.6292144	best: 0.6292144 (0)	total: 839ms	remaining: 3m 28s
249:	test: 0.7055442	best: 0.7055442 (249)	total: 3m 43s	remaining: 0us

bestTest = 0.7055442445
bestIteration = 249





0:	test: 0.6184945	best: 0.6184945 (0)	total: 807ms	remaining: 3m 20s
249:	test: 0.7082657	best: 0.7082657 (249)	total: 3m 43s	remaining: 0us

bestTest = 0.7082656765
bestIteration = 249





0:	test: 0.6052939	best: 0.6052939 (0)	total: 845ms	remaining: 3m 30s
249:	test: 0.6935880	best: 0.6935880 (249)	total: 3m 39s	remaining: 0us

bestTest = 0.6935879647
bestIteration = 249





0:	test: 0.6221975	best: 0.6221975 (0)	total: 925ms	remaining: 3m 50s
249:	test: 0.7089251	best: 0.7089251 (249)	total: 3m 46s	remaining: 0us

bestTest = 0.7089251202
bestIteration = 249





0:	test: 0.6245155	best: 0.6245155 (0)	total: 903ms	remaining: 3m 44s
249:	test: 0.7227106	best: 0.7227106 (249)	total: 3m 40s	remaining: 0us

bestTest = 0.7227105719
bestIteration = 249





0:	test: 0.6337283	best: 0.6337283 (0)	total: 895ms	remaining: 3m 42s
249:	test: 0.7050674	best: 0.7050674 (249)	total: 3m 41s	remaining: 0us

bestTest = 0.7050673672
bestIteration = 249





0:	test: 0.6190620	best: 0.6190620 (0)	total: 843ms	remaining: 3m 29s
249:	test: 0.7088689	best: 0.7088689 (249)	total: 3m 42s	remaining: 0us

bestTest = 0.7088689181
bestIteration = 249



In [20]:
data3 = catboost_modeling(X, y, test_X, 'Depthwise', 12, 0.01564, 49.99, 2022, 2)



0:	test: 0.6270105	best: 0.6270105 (0)	total: 7.11s	remaining: 29m 30s
249:	test: 0.7153928	best: 0.7153928 (249)	total: 38m 8s	remaining: 0us

bestTest = 0.7153928011
bestIteration = 249





0:	test: 0.6438431	best: 0.6438431 (0)	total: 8.4s	remaining: 34m 51s
249:	test: 0.7296075	best: 0.7296075 (249)	total: 39m 23s	remaining: 0us

bestTest = 0.7296075064
bestIteration = 249





0:	test: 0.6361481	best: 0.6361481 (0)	total: 8.56s	remaining: 35m 31s
249:	test: 0.7455584	best: 0.7455584 (249)	total: 39m 12s	remaining: 0us

bestTest = 0.7455583792
bestIteration = 249





0:	test: 0.6471517	best: 0.6471517 (0)	total: 8.12s	remaining: 33m 41s
249:	test: 0.7264648	best: 0.7264648 (249)	total: 40m 16s	remaining: 0us

bestTest = 0.7264648258
bestIteration = 249





0:	test: 0.6364877	best: 0.6364877 (0)	total: 8s	remaining: 33m 12s
249:	test: 0.7312666	best: 0.7312666 (249)	total: 38m 23s	remaining: 0us

bestTest = 0.7312666125
bestIteration = 249





0:	test: 0.6421707	best: 0.6421707 (0)	total: 8.18s	remaining: 33m 57s
249:	test: 0.7175435	best: 0.7175435 (249)	total: 39m 28s	remaining: 0us

bestTest = 0.7175434967
bestIteration = 249





0:	test: 0.6501887	best: 0.6501887 (0)	total: 6.22s	remaining: 25m 48s
249:	test: 0.7268971	best: 0.7268971 (249)	total: 38m 52s	remaining: 0us

bestTest = 0.7268971465
bestIteration = 249





0:	test: 0.6664365	best: 0.6664365 (0)	total: 7.68s	remaining: 31m 52s
249:	test: 0.7445082	best: 0.7445082 (249)	total: 38m 1s	remaining: 0us

bestTest = 0.7445081597
bestIteration = 249





0:	test: 0.6312765	best: 0.6312765 (0)	total: 8.12s	remaining: 33m 41s
249:	test: 0.7283637	best: 0.7283637 (249)	total: 41m 51s	remaining: 0us

bestTest = 0.7283636612
bestIteration = 249





0:	test: 0.6473315	best: 0.6473315 (0)	total: 6.31s	remaining: 26m 10s
249:	test: 0.7309564	best: 0.7309564 (249)	total: 40m 26s	remaining: 0us

bestTest = 0.7309564278
bestIteration = 249





0:	test: 0.6320791	best: 0.6320791 (0)	total: 8.43s	remaining: 34m 58s
249:	test: 0.7168514	best: 0.7168514 (249)	total: 40m 40s	remaining: 0us

bestTest = 0.7168513819
bestIteration = 249





0:	test: 0.6178732	best: 0.6178732 (0)	total: 6.41s	remaining: 26m 35s
249:	test: 0.7282955	best: 0.7282955 (249)	total: 39m 24s	remaining: 0us

bestTest = 0.7282955331
bestIteration = 249





0:	test: 0.6600631	best: 0.6600631 (0)	total: 7.83s	remaining: 32m 29s
249:	test: 0.7452871	best: 0.7452871 (249)	total: 40m 35s	remaining: 0us

bestTest = 0.7452870765
bestIteration = 249





0:	test: 0.6211586	best: 0.6211586 (0)	total: 7.27s	remaining: 30m 11s
249:	test: 0.7297433	best: 0.7297697 (248)	total: 41m 34s	remaining: 0us

bestTest = 0.7297696601
bestIteration = 248

Shrink model to first 249 iterations.




0:	test: 0.6358297	best: 0.6358297 (0)	total: 8.38s	remaining: 34m 46s
249:	test: 0.7298778	best: 0.7298778 (249)	total: 39m 42s	remaining: 0us

bestTest = 0.7298778131
bestIteration = 249





0:	test: 0.6260292	best: 0.6260292 (0)	total: 7.86s	remaining: 32m 38s
249:	test: 0.7180968	best: 0.7180968 (249)	total: 41m 9s	remaining: 0us

bestTest = 0.7180968186
bestIteration = 249





0:	test: 0.6260173	best: 0.6260173 (0)	total: 7.87s	remaining: 32m 39s
249:	test: 0.7287610	best: 0.7287610 (249)	total: 42m 10s	remaining: 0us

bestTest = 0.7287610004
bestIteration = 249





0:	test: 0.6363456	best: 0.6363456 (0)	total: 8.59s	remaining: 35m 39s
249:	test: 0.7449820	best: 0.7449820 (249)	total: 41m 53s	remaining: 0us

bestTest = 0.744982003
bestIteration = 249





0:	test: 0.6519272	best: 0.6519272 (0)	total: 6.43s	remaining: 26m 40s
249:	test: 0.7280975	best: 0.7280975 (249)	total: 40m 31s	remaining: 0us

bestTest = 0.728097515
bestIteration = 249





0:	test: 0.6455449	best: 0.6455449 (0)	total: 8.55s	remaining: 35m 29s
249:	test: 0.7303315	best: 0.7303315 (249)	total: 41m 16s	remaining: 0us

bestTest = 0.7303314907
bestIteration = 249



In [21]:
data4 = catboost_modeling(X, y, test_X, 'Lossguide', 16, 0.01213, 5.027, 2022, 2)



0:	test: 0.6109606	best: 0.6109606 (0)	total: 1.21s	remaining: 5m
249:	test: 0.6993290	best: 0.6993290 (249)	total: 4m 54s	remaining: 0us

bestTest = 0.6993289589
bestIteration = 249





0:	test: 0.6127867	best: 0.6127867 (0)	total: 1.18s	remaining: 4m 54s
249:	test: 0.7156043	best: 0.7156043 (249)	total: 4m 42s	remaining: 0us

bestTest = 0.7156042574
bestIteration = 249





0:	test: 0.6208614	best: 0.6208614 (0)	total: 1.1s	remaining: 4m 33s
249:	test: 0.7287900	best: 0.7287900 (249)	total: 4m 43s	remaining: 0us

bestTest = 0.7287899586
bestIteration = 249





0:	test: 0.6232773	best: 0.6232773 (0)	total: 1.1s	remaining: 4m 33s
249:	test: 0.7122781	best: 0.7122781 (249)	total: 4m 41s	remaining: 0us

bestTest = 0.7122781183
bestIteration = 249





0:	test: 0.6327244	best: 0.6327244 (0)	total: 1.13s	remaining: 4m 42s
249:	test: 0.7144522	best: 0.7144522 (249)	total: 4m 37s	remaining: 0us

bestTest = 0.7144521649
bestIteration = 249





0:	test: 0.6141842	best: 0.6141842 (0)	total: 1.12s	remaining: 4m 39s
249:	test: 0.7006672	best: 0.7006672 (249)	total: 4m 44s	remaining: 0us

bestTest = 0.7006671766
bestIteration = 249





0:	test: 0.6140670	best: 0.6140670 (0)	total: 1.11s	remaining: 4m 35s
249:	test: 0.7152656	best: 0.7152656 (249)	total: 4m 37s	remaining: 0us

bestTest = 0.7152655992
bestIteration = 249





0:	test: 0.6176935	best: 0.6176935 (0)	total: 1.08s	remaining: 4m 30s
249:	test: 0.7277872	best: 0.7277872 (249)	total: 4m 34s	remaining: 0us

bestTest = 0.7277871622
bestIteration = 249





0:	test: 0.6288913	best: 0.6288913 (0)	total: 1.08s	remaining: 4m 28s
249:	test: 0.7129319	best: 0.7129319 (249)	total: 4m 37s	remaining: 0us

bestTest = 0.7129319497
bestIteration = 249





0:	test: 0.6273262	best: 0.6273262 (0)	total: 1.09s	remaining: 4m 32s
249:	test: 0.7147952	best: 0.7147952 (249)	total: 4m 39s	remaining: 0us

bestTest = 0.7147952308
bestIteration = 249





0:	test: 0.6089990	best: 0.6089990 (0)	total: 1.11s	remaining: 4m 37s
249:	test: 0.7015844	best: 0.7015844 (249)	total: 4m 38s	remaining: 0us

bestTest = 0.7015843563
bestIteration = 249





0:	test: 0.6080061	best: 0.6080061 (0)	total: 1.1s	remaining: 4m 34s
249:	test: 0.7161397	best: 0.7161397 (249)	total: 4m 35s	remaining: 0us

bestTest = 0.7161397135
bestIteration = 249





0:	test: 0.6214850	best: 0.6214850 (0)	total: 1.04s	remaining: 4m 20s
249:	test: 0.7286000	best: 0.7286000 (249)	total: 4m 41s	remaining: 0us

bestTest = 0.7286000467
bestIteration = 249





0:	test: 0.6211127	best: 0.6211127 (0)	total: 1.05s	remaining: 4m 21s
249:	test: 0.7123426	best: 0.7123426 (249)	total: 4m 35s	remaining: 0us

bestTest = 0.7123425699
bestIteration = 249





0:	test: 0.6414526	best: 0.6414526 (0)	total: 1.11s	remaining: 4m 37s
249:	test: 0.7143690	best: 0.7143690 (249)	total: 4m 35s	remaining: 0us

bestTest = 0.7143690341
bestIteration = 249





0:	test: 0.6201040	best: 0.6201040 (0)	total: 1.16s	remaining: 4m 48s
249:	test: 0.7004572	best: 0.7004572 (249)	total: 4m 36s	remaining: 0us

bestTest = 0.7004572319
bestIteration = 249





0:	test: 0.6114292	best: 0.6114292 (0)	total: 1.11s	remaining: 4m 36s
249:	test: 0.7153095	best: 0.7153095 (249)	total: 4m 39s	remaining: 0us

bestTest = 0.715309548
bestIteration = 249





0:	test: 0.6290625	best: 0.6290625 (0)	total: 1.2s	remaining: 4m 59s
249:	test: 0.7278087	best: 0.7278649 (248)	total: 4m 39s	remaining: 0us

bestTest = 0.7278649337
bestIteration = 248

Shrink model to first 249 iterations.




0:	test: 0.6199489	best: 0.6199489 (0)	total: 1.09s	remaining: 4m 31s
249:	test: 0.7125244	best: 0.7125244 (249)	total: 4m 35s	remaining: 0us

bestTest = 0.7125244238
bestIteration = 249





0:	test: 0.6422138	best: 0.6422138 (0)	total: 1.17s	remaining: 4m 50s
249:	test: 0.7149988	best: 0.7149988 (249)	total: 4m 35s	remaining: 0us

bestTest = 0.7149987557
bestIteration = 249



# 5. Ensemble

In [24]:
# 최종 모델 앙상블
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
data_final = pd.DataFrame((data1+data2)/2 *1/3 + (data3+data4)/2 *2/3)
data_final.columns = sample_submission.columns
data_final.to_csv('data_final_new_new.csv', index =True)
data_final

Unnamed: 0,winner
38872,0.547855
38873,0.565183
38874,0.496100
38875,0.306808
38876,0.585904
...,...
55654,0.624778
55655,0.229581
55656,0.664449
55657,0.597755


 - Reference
 
 [2등][도발하려던건 아니었습니다만]Ensembled CatBoost Model
 https://dacon.io/competitions/official/235583/codeshare/981?page=1&dtype=recent&ptype=pub