In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import os
import random

# 데이터 준비


In [2]:
train_df = pd.read_csv("/opt/ml/input/data/train_data.csv")
test_df = pd.read_csv("/opt/ml/input/data/test_data.csv")

In [3]:
def feature_engineering(df):
    
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    
    return df

In [4]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [5]:
test_df_pre = feature_engineering(test_df)
df_infer = test_df_pre.loc[test_df['answerCode'] == -1].reset_index(drop=True)
df_infer

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
0,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,717.0,1035,0.692754,0.661765,90,0.542662,159
1,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,465.0,670,0.694030,0.740385,77,0.565693,155
2,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,915.0,1316,0.695289,0.417857,117,0.446753,172
3,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,1031.0,1259,0.818904,0.625000,30,0.514286,36
4,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,293.0,386,0.759067,0.678571,133,0.602767,305
...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615,7.0,23,0.304348,0.753846,147,0.654902,167
740,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636,7.0,14,0.500000,0.866667,156,0.834661,419
741,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402,7.0,14,0.500000,0.750000,75,0.792517,233
742,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402,2.0,14,0.142857,0.750000,75,0.792517,233


# 캣부스트

In [12]:
from catboost import CatBoostClassifier, Pool
train_df = feature_engineering(train_df)
train, test = custom_train_test_split(train_df)

In [13]:
# categorical_cols =  ['userID', 'assessmentItemID', 'testId', 'KnowledgeTag']
categorical_cols =  ['assessmentItemID', 'testId', 'KnowledgeTag']
train = train[['assessmentItemID', 'testId', 'KnowledgeTag','answerCode', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']]
train[categorical_cols] = train[categorical_cols].fillna("")
# train.drop(columns=['Timestamp'], inplace=True)
train

Unnamed: 0,assessmentItemID,testId,KnowledgeTag,answerCode,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
745,A040013001,A040000013,2048,1,,0,,0.595472,789,0.616900,971
746,A040013002,A040000013,2048,1,1.0,1,1.000000,0.595472,789,0.616900,971
747,A040013003,A040000013,2047,1,2.0,2,1.000000,0.595472,789,0.570962,1050
748,A040013004,A040000013,2047,1,3.0,3,1.000000,0.595472,789,0.570962,1050
749,A040013005,A040000013,2047,0,4.0,4,1.000000,0.595472,789,0.570962,1050
...,...,...,...,...,...,...,...,...,...,...,...
2266572,A030197001,A030000197,1984,1,5.0,10,0.500000,0.738462,1008,0.749632,2039
2266573,A030197002,A030000197,1984,0,6.0,11,0.545455,0.738462,1008,0.749632,2039
2266574,A030197003,A030000197,1984,0,6.0,12,0.500000,0.738462,1008,0.749632,2039
2266575,A030197004,A030000197,1984,0,6.0,13,0.461538,0.738462,1008,0.749632,2039


In [14]:
TARGET_COL = 'answerCode'
X_train = train.drop([TARGET_COL],axis=1)
y_train = train[TARGET_COL]

train_pool = Pool(data=X_train,label = y_train,cat_features=categorical_cols)

In [15]:
# test_pool
# categorical_cols =  ['userID', 'assessmentItemID', 'testId', 'KnowledgeTag']
categorical_cols =  ['assessmentItemID', 'testId', 'KnowledgeTag']
test = test[['assessmentItemID', 'testId', 'KnowledgeTag','answerCode', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']]
test[categorical_cols] = test[categorical_cols].fillna("")
# test.drop(columns=['Timestamp'], inplace=True)

TARGET_COL = 'answerCode'
X_test = test.drop([TARGET_COL],axis=1)
y_test = test[TARGET_COL]

test_pool = Pool(data=X_test,label = y_test,cat_features=categorical_cols)

In [18]:
# infer_pool
# df_infer = feature_engineering(df_infer)
# categorical_cols =  ['userID', 'assessmentItemID', 'testId', 'KnowledgeTag']
categorical_cols =  ['assessmentItemID', 'testId', 'KnowledgeTag']
df_infer = df_infer[['assessmentItemID', 'testId', 'KnowledgeTag','answerCode', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']]
df_infer[categorical_cols] = df_infer[categorical_cols].fillna("")
# df_infer.drop(columns=['Timestamp'], inplace=True)

TARGET_COL = 'answerCode'
X_infer = df_infer.drop([TARGET_COL],axis=1)
y_infer = df_infer[TARGET_COL]

infer_pool = Pool(data=X_infer,label = y_infer,cat_features=categorical_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_infer[categorical_cols] = df_infer[categorical_cols].fillna("")


In [14]:
params = {'iterations':5000,
          'random_seed':63,
          'learning_rate':0.01,
          # 'max_depth' : 10,
          'eval_metric' : 'AUC',
          # 'objective' : 'CrossEntropy',
        #   'loss_function':'CrossEntropy',
          # 'custom_metric':['Logloss','AUC'],##, '
          'early_stopping_rounds':100,
          'use_best_model': True,
          'task_type':"GPU",
        #   'bagging_temperature':1,
          'verbose':100}

In [15]:
model_basic = CatBoostClassifier(**params)#,learning_rate=0.1, task_type="GPU",)
# model_basic = CatBoostClassifier(verbose=50)#,learning_rate=0.1, task_type="GPU",)
model_basic.fit(train_pool, eval_set=test_pool, use_best_model=True)
print(model_basic.get_best_score())

0:	test: 0.7096283	best: 0.7096283 (0)	total: 45ms	remaining: 3m 45s
100:	test: 0.7335236	best: 0.7335236 (100)	total: 4.46s	remaining: 3m 36s
200:	test: 0.7362826	best: 0.7362826 (200)	total: 9.75s	remaining: 3m 52s
300:	test: 0.7383969	best: 0.7384347 (299)	total: 15.1s	remaining: 3m 55s
400:	test: 0.7396531	best: 0.7396531 (400)	total: 20.4s	remaining: 3m 53s
500:	test: 0.7404800	best: 0.7405019 (490)	total: 25.8s	remaining: 3m 51s
600:	test: 0.7409740	best: 0.7409938 (594)	total: 31.1s	remaining: 3m 47s
700:	test: 0.7413178	best: 0.7413178 (700)	total: 36.3s	remaining: 3m 42s
800:	test: 0.7416145	best: 0.7416508 (792)	total: 41.6s	remaining: 3m 37s
900:	test: 0.7417924	best: 0.7418152 (882)	total: 46.8s	remaining: 3m 33s
bestTest = 0.7418152392
bestIteration = 882
Shrink model to first 883 iterations.
{'learn': {'Logloss': 0.5260044008884367}, 'validation': {'Logloss': 0.6022343428858371, 'AUC': 0.7418152391910553}}


In [16]:
print("train ACC :", model_basic.score(train_pool))
print("test ACC :", model_basic.score(test_pool))

pred = model_basic.predict_proba(test_pool)[:,1]
print("test AUC :", roc_auc_score(y_test, pred))

train ACC : 0.7440782974194862
test ACC : 0.6781265570503239
test AUC : 0.7418162393162394


In [19]:
print("test set 평균, 최대값, 최소값")
print(pred.mean())
print(pred.max())
print(pred.min())

test set 평균, 최대값, 최소값
0.5296340005876892
0.9630352145295576
0.06839380770802914


In [77]:
# inference
infer_cbc = model_basic.predict_proba(infer_pool)[:,1]
infer_cbc[:10]

array([0.59013311, 0.69597954, 0.46701134, 0.65193911, 0.37170677,
       0.8175224 , 0.56512769, 0.26366763, 0.25907911, 0.8223259 ])

In [78]:
print("infer set 평균, 최대값, 최소값")
print(infer_cbc.mean())
print(infer_cbc.max())
print(infer_cbc.min())

infer set 평균, 최대값, 최소값
0.5205111221616208
0.9589302419928564
0.09271138075620447


In [88]:
output = pd.read_csv("xgbc.csv")

output['prediction'] = infer_cbc
output.to_csv("cbc2.csv", index=False)

In [91]:
bm = pd.read_csv("lgbm_sunny.csv")
cn = pd.read_csv("lgcn_sunny.csv")

total = (bm['prediction'] * 0.9 + cn['prediction'] * 0.9 + infer_cbc * 1.2) / 3
total

0      0.592945
1      0.754369
2      0.451799
3      0.749818
4      0.431325
         ...   
739    0.143949
740    0.757912
741    0.777485
742    0.480022
743    0.539645
Name: prediction, Length: 744, dtype: float64

In [92]:
print("infer set 평균, 최대값, 최소값")
print(total.mean())
print(total.max())
print(total.min())

infer set 평균, 최대값, 최소값
0.5151900090383756
0.9535746351924256
0.061973999622638665


In [None]:
output['prediction'] = total
output.to_csv("ensemble3.csv", index=False)

# 랜덤 포레스트

In [50]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=-1, random_state=42, min_impurity_decrease = 0.0000002)

In [6]:
df = pd.concat([train_df, test_df]).reset_index(drop=True)
# df = feature_engineering(df)

user2idx = {j:i for i, j in enumerate(sorted(df['userID'].unique()))}
assess2idx = {j:i for i, j in enumerate(sorted(df['assessmentItemID'].unique()))}
test2idx = {j:i for i, j in enumerate(sorted(df['testId'].unique()))}
know2idx = {j:i for i, j in enumerate(sorted(df['KnowledgeTag'].unique()))}

---

In [9]:
# df_temp = df.drop_duplicates(subset=['userID', 'assessmentItemID']).reset_index(drop=True)
df_temp = feature_engineering(df)

# infer
df_infer_xg = df_temp.loc[df_temp['answerCode'] == -1].reset_index(drop=True)
df_infer_xg = change2idx(df_infer_xg)
df_infer_xg = df_infer_xg[['assessmentItemID', 'testId', 'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']]


In [10]:
df_infer_xg

Unnamed: 0,assessmentItemID,testId,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
0,4965,914,469,717.0,1035,0.692754,0.654500,1309,0.559077,1817
1,7748,1306,781,465.0,670,0.694030,0.645000,774,0.541569,1381
2,7484,1271,820,915.0,1316,0.695289,0.456000,912,0.492333,1477
3,9381,1526,309,1031.0,1259,0.818904,0.443333,266,0.417647,355
4,6231,1109,183,293.0,386,0.759067,0.639429,1119,0.609263,2894
...,...,...,...,...,...,...,...,...,...,...
739,3692,705,883,7.0,23,0.304348,0.790000,1185,0.700952,1472
740,2590,496,589,7.0,14,0.500000,0.866000,1299,0.824167,3956
741,5353,974,865,7.0,14,0.500000,0.746000,746,0.820896,2750
742,5353,974,865,2.0,14,0.142857,0.746000,746,0.820896,2750


In [11]:

df_temp = df_temp.loc[df_temp['answerCode'] != -1].reset_index(drop=True)
df_temp

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,0.952667,1429,0.957333,718
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.000000,0.952667,1429,0.917067,3439
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.000000,0.952667,1429,0.917067,3439
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.000000,0.952667,1429,0.917067,3439
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.000000,0.952667,1429,0.917067,3439
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2525951,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,1.0,4,0.250000,0.666000,999,0.694889,3127
2525952,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,1.0,5,0.200000,0.652500,783,0.698551,2410
2525953,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,2.0,6,0.333333,0.652500,783,0.698551,2410
2525954,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,3.0,7,0.428571,0.652500,783,0.698551,2410


In [12]:
train_xg, test_xg = custom_train_test_split(df_temp)
print(train_xg.shape, test_xg.shape)

(1768018, 13) (2235, 13)


In [13]:
train_xg = change2idx(train_xg.reset_index(drop=True))
test_xg = change2idx(test_xg.reset_index(drop=True))

In [14]:
train_xg.dropna(inplace=True)
test_xg.dropna(inplace=True)

In [15]:
train_xg_X = train_xg[['assessmentItemID', 'testId', 'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']]
train_xg_y = train_xg['answerCode']

test_xg_X = test_xg[['assessmentItemID', 'testId', 'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']]
test_xg_y = test_xg['answerCode']

In [52]:
# # infer
# df_infer_xg = df_temp.loc[df_temp['answerCode'] == -1].reset_index(drop=True)
# df_infer_xg = change2idx(df_infer_xg)
# df_infer_xg = df_infer_xg[['assessmentItemID', 'testId', 'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 
#          'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum']]

In [16]:
train_xg_X

Unnamed: 0,assessmentItemID,testId,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
1,5355,975,557,1.0,1,1.000000,0.952667,1429,0.917067,3439
2,5356,975,557,2.0,2,1.000000,0.952667,1429,0.917067,3439
3,5357,975,557,3.0,3,1.000000,0.952667,1429,0.917067,3439
4,5358,975,557,4.0,4,1.000000,0.952667,1429,0.917067,3439
5,5359,975,557,5.0,5,1.000000,0.952667,1429,0.917067,3439
...,...,...,...,...,...,...,...,...,...,...
1768013,3047,582,236,5.0,10,0.500000,0.728000,1092,0.741356,2187
1768014,3048,582,236,6.0,11,0.545455,0.728000,1092,0.741356,2187
1768015,3049,582,236,6.0,12,0.500000,0.728000,1092,0.741356,2187
1768016,3050,582,236,6.0,13,0.461538,0.728000,1092,0.741356,2187


In [17]:
test_xg_X

Unnamed: 0,assessmentItemID,testId,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
0,7747,1306,781,464.0,669,0.693572,0.645000,774,0.541569,1381
1,7087,1220,167,344.0,535,0.642991,0.517333,776,0.462000,3696
2,7872,1322,792,407.0,605,0.672727,0.550000,440,0.360571,631
3,6230,1109,183,293.0,385,0.761039,0.639429,1119,0.609263,2894
4,7756,1307,783,317.0,689,0.460087,0.501500,1003,0.544706,926
...,...,...,...,...,...,...,...,...,...,...
2230,3824,732,283,11.0,14,0.785714,0.780000,1170,0.718958,3451
2231,6376,1130,192,7.0,14,0.500000,0.689600,862,0.714286,1250
2232,2093,404,20,7.0,14,0.500000,0.687333,1031,0.733333,1100
2233,3731,713,697,10.0,14,0.714286,0.639333,959,0.762500,915


---

In [7]:
def change2idx(df):
    # df['userID'] = df['userID'].apply(lambda x : user2idx[x])
    df['assessmentItemID'] = df['assessmentItemID'].apply(lambda x : assess2idx[x])
    df['testId'] = df['testId'].apply(lambda x : test2idx[x])
    df['KnowledgeTag'] = df['KnowledgeTag'].apply(lambda x : know2idx[x])
    return df

In [18]:
train_rf = train.dropna()
train_rf = change2idx(train_rf)
X_train_rf = train_rf.drop([TARGET_COL], axis=1).values
y_train_rf = train_rf[TARGET_COL].values

NameError: name 'train' is not defined

In [56]:
rf.fit(X_train_rf, y_train_rf)

RandomForestClassifier(min_impurity_decrease=2e-07, n_jobs=-1, random_state=42)

In [20]:
test_rf = test.dropna()
test_rf = change2idx(test_rf)
X_test_rf = test_rf.drop([TARGET_COL], axis=1).values
y_test_rf = test_rf[TARGET_COL].values

In [21]:
infer_rf = change2idx(df_infer)
X_infer_rf = infer_rf.drop([TARGET_COL], axis=1).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['assessmentItemID'] = df['assessmentItemID'].apply(lambda x : assess2idx[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['testId'] = df['testId'].apply(lambda x : test2idx[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['KnowledgeTag'] = df['KnowledgeTag'].apply(lambda x : know2idx[x

In [61]:
print("train ACC :", rf.score(X_train_rf, y_train_rf))
print("test ACC :", rf.score(X_test_rf, y_test_rf))

pred = rf.predict_proba(X_test_rf)[:,1]
print("test AUC :", roc_auc_score(y_test_rf, pred))


train ACC : 0.9605840403777255
test ACC : 0.615346287992028
test AUC : 0.669869310276287


In [80]:
infer_rfc = rf.predict_proba(X_infer_rf)[:, 1]
infer_rfc[:10]

array([0.52282057, 0.58698249, 0.45425311, 0.60221068, 0.80316091,
       0.63400228, 0.61209546, 0.55617954, 0.30067077, 0.73985769])

In [81]:
print("infer set 평균, 최대값, 최소값")
print(infer_rfc.mean())
print(infer_rfc.max())
print(infer_rfc.min())

infer set 평균, 최대값, 최소값
0.57053491079615
0.9246656634739304
0.12589701760265806


# XG부스트

---

In [27]:
import xgboost as xgb
dtrain = xgb.DMatrix(train_xg_X, train_xg_y)
dval = xgb.DMatrix(test_xg_X, test_xg_y)
dtest = xgb.DMatrix(df_infer_xg)


In [None]:

# 모델 생성
# num_boost_round 만큼 반복하는데 early_stopping_rounds 만큼 성능 향상이 없으면 중단
# early_stopping_rounds를 사용하려면 eval 데이터 셋을 명기해야함
# param = {파라미터 설정}
params1 = {
    'seed' : 42,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'min_child_weight': 1,
    'eval_metric': 'auc',
    # 'n_estimators': 2000,
    # 'early_stopping_rounds': 100,
    'max_depth': 5,
    'lambda' : 0.5 
    # 'max_leaves':5, 
    }

xgb_model = xgb.train(
                    params = params1, 
                    dtrain = dtrain, 
                    num_boost_round = 400, 
                    early_stopping_rounds = 30, 
                    evals=[(dtrain,'train'),(dval,'eval')]
                    )


In [20]:
# 예측하기, 확률값으로 반환됨
train_pred = xgb_model.predict(dtrain)
test_pred = xgb_model.predict(dval)

In [21]:
print(roc_auc_score(train_xg_y, train_pred))
print(roc_auc_score(test_xg_y, test_pred))

0.7501453341890393
0.6986728795942324


In [29]:
xgb_model.predict(dtest)

array([0.6714251 , 0.69186723, 0.58297205, 0.8965642 , 0.6940091 ,
       0.8096116 , 0.71061635, 0.6534531 , 0.23402894, 0.8730594 ,
       0.7091386 , 0.67900926, 0.9449733 , 0.5633424 , 0.8788543 ,
       0.867618  , 0.26812026, 0.8672425 , 0.7820452 , 0.45185453,
       0.8322462 , 0.5425873 , 0.48899218, 0.6851879 , 0.55609727,
       0.7269843 , 0.91548985, 0.88791263, 0.60836583, 0.76859176,
       0.5750762 , 0.78953654, 0.78450143, 0.45800975, 0.78571784,
       0.70640355, 0.69666076, 0.5800986 , 0.62601346, 0.5750419 ,
       0.6747439 , 0.47476095, 0.34010002, 0.38200086, 0.46427017,
       0.73143053, 0.7476703 , 0.2431307 , 0.9232747 , 0.7372267 ,
       0.5925137 , 0.5246484 , 0.64087296, 0.45996702, 0.7155116 ,
       0.6703693 , 0.60551846, 0.9385981 , 0.5496929 , 0.61548656,
       0.8417219 , 0.93898624, 0.43242228, 0.43038586, 0.18434143,
       0.5853126 , 0.7739745 , 0.34861752, 0.32737848, 0.50347584,
       0.59834456, 0.8518308 , 0.57753986, 0.49151623, 0.77743

---

In [30]:
params = {
    'seed' : 42,
    'learning_rate': 0.05,
    'objective': 'binary:logistic',
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'n_estimators': 2000,
    'early_stopping_rounds': 100,
    'max_depth': 12, 
    'max_leaves':5, 
    # 'min_child_weigh'1
    }

In [31]:
import xgboost as xgb
from xgboost import XGBClassifier
xg = XGBClassifier(**params)
# xg = XGBClassifier(eval_metric='auc')

In [32]:
xg.fit(train_xg_X, train_xg_y, eval_set=[(test_xg_X, test_xg_y)])

[0]	validation_0-auc:0.68406
[1]	validation_0-auc:0.68820
[2]	validation_0-auc:0.68873
[3]	validation_0-auc:0.68710
[4]	validation_0-auc:0.68641
[5]	validation_0-auc:0.68675
[6]	validation_0-auc:0.68744
[7]	validation_0-auc:0.68750
[8]	validation_0-auc:0.68817
[9]	validation_0-auc:0.68815
[10]	validation_0-auc:0.68886
[11]	validation_0-auc:0.68946
[12]	validation_0-auc:0.68934
[13]	validation_0-auc:0.68960
[14]	validation_0-auc:0.68945
[15]	validation_0-auc:0.68915
[16]	validation_0-auc:0.68931
[17]	validation_0-auc:0.68946
[18]	validation_0-auc:0.68948
[19]	validation_0-auc:0.68964
[20]	validation_0-auc:0.68963
[21]	validation_0-auc:0.68956
[22]	validation_0-auc:0.68930
[23]	validation_0-auc:0.68960
[24]	validation_0-auc:0.68949
[25]	validation_0-auc:0.68979
[26]	validation_0-auc:0.68990
[27]	validation_0-auc:0.68981
[28]	validation_0-auc:0.69013
[29]	validation_0-auc:0.69056
[30]	validation_0-auc:0.69063
[31]	validation_0-auc:0.69104
[32]	validation_0-auc:0.69118
[33]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=100, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.05, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=5, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=2000,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, ...)

In [33]:
print("train ACC :", xg.score(train_xg_X, train_xg_y))
print("test ACC :", xg.score(test_xg_X, test_xg_y))

pred = xg.predict_proba(test_xg_X)[:,1]
print("test AUC :", roc_auc_score(test_xg_y, pred))


train ACC : 0.7965766040715653
test ACC : 0.6407158836689038
test AUC : 0.7213415503277563


In [None]:
print("train ACC :", xg.score(X_train_rf, y_train_rf))
print("test ACC :", xg.score(X_test_rf, y_test_rf))

pred = xg.predict_proba(X_test_rf)[:,1]
print("test AUC :", roc_auc_score(y_test_rf, pred))


train ACC : 0.7770050067469044
test ACC : 0.6218236173393124
test AUC : 0.6997530312065194


In [34]:
infer_xgc = xg.predict_proba(df_infer_xg)[:, 1]
infer_xgc[:10]

array([0.7496511 , 0.6661245 , 0.48368952, 0.97167146, 0.4896818 ,
       0.86565614, 0.675887  , 0.4423526 , 0.26797757, 0.8934992 ],
      dtype=float32)

In [35]:
print("infer set 평균, 최대값, 최소값")
print(infer_xgc.mean())
print(infer_xgc.max())
print(infer_xgc.min())

infer set 평균, 최대값, 최소값
0.5839226
0.9801948
0.029627837


In [36]:
xg_output = pd.read_csv("cbc.csv")
xg_output['prediction'] = infer_xgc
xg_output

Unnamed: 0,id,prediction
0,0,0.749651
1,1,0.666125
2,2,0.483690
3,3,0.971671
4,4,0.489682
...,...,...
739,739,0.449469
740,740,0.733220
741,741,0.803654
742,742,0.408220


In [37]:
xg_output.to_csv('xgbc.csv', index=False)

In [38]:
cb_output = pd.read_csv("catboost.csv")
cb_output

Unnamed: 0,id,prediction
0,0,0.590133
1,1,0.695980
2,2,0.467011
3,3,0.651939
4,4,0.371707
...,...,...
739,739,0.240993
740,740,0.830760
741,741,0.751222
742,742,0.554715


In [39]:
temp = (infer_xgc + cb_output['prediction']) / 2
cb_output['prediction'] = temp
cb_output.to_csv('cb_xg.csv', index=False)