In [1]:
import pandas as pd
import os
import random
import numpy as np

In [48]:
data_dir = '/opt/ml/input/data' # 경로는 상황에 맞춰서 수정해주세요!
csv_file_path = os.path.join(data_dir, 'train_data.csv') # 데이터는 대회홈페이지에서 받아주세요 :)
df = pd.read_csv(csv_file_path) 

# 전처리

In [49]:
def feature_engineering(df):
    
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    k_n = pd.DataFrame(df.groupby(['userID'])['KnowledgeTag'].nunique())
    k_n.columns = ['unique_tag_cnt']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, k_n, on=['userID'], how="left")
    
    return df

In [50]:
df = feature_engineering(df)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,unique_tag_cnt
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,0.947683,1268,0.955022,637,165
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,0.947683,1268,0.913187,3040,165
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,0.947683,1268,0.913187,3040,165
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,0.947683,1268,0.913187,3040,165
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,0.947683,1268,0.913187,3040,165


In [51]:
from sklearn.preprocessing import LabelEncoder
import time
from datetime import datetime
asset_dir = '/opt/ml/input/code/'#/asset/'

def preprocessing(df, is_train=True):
    cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"]

    #if not os.path.exists(self.args.asset_dir):
    #     os.makedirs(self.args.asset_dir)
    #df["assessmentItemID_ori"] = df["assessmentItemID"]
    for col in cate_cols:

        le = LabelEncoder()
        if is_train:
            # For UNKNOWN class
            a = df[col].unique().tolist() + ["unknown"]
            le.fit(a)
            le_path = os.path.join(asset_dir, col + "_classes.npy")
            print(le_path)
            #print(le_path)
            np.save(le_path, le.classes_)
        
        else:
            label_path = os.path.join(asset_dir, col + "_classes.npy")
            print(label_path)
            le.classes_ = np.load(label_path)

            df[col] = df[col].apply(
                lambda x: x if str(x) in le.classes_ else "unknown"
            )

        # 모든 컬럼이 범주형이라고 가정
        df[col] = df[col].astype(str)
        test = le.transform(df[col])
        df[col] = test

    def convert_time(s):
        timestamp = time.mktime(
            datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
        )
        return int(timestamp)

    df["Timestamp"] = df["Timestamp"].apply(convert_time)

    return df

In [52]:
df = preprocessing(df, is_train=True)

/opt/ml/input/code/assessmentItemID_classes.npy
/opt/ml/input/code/testId_classes.npy
/opt/ml/input/code/KnowledgeTag_classes.npy


In [53]:
df = df.sort_values(['userID', 'Timestamp'])

In [54]:
# diff
df['diff'] = df.sort_values(['userID','Timestamp']).groupby('userID')['Timestamp'].diff()#periods=-1).apply(abs)

diff_df = df['diff']
diff_df.dropna(inplace=True)

# nan은 -1
# 600(10분) 이상이면 다 600
df['diff'].fillna(-1, inplace=True)
idx = df[df['diff'] >= 600].index
df.loc[idx, 'diff'] = 600

In [55]:
tmp = df[df['diff'] >= 0]
correct_k = tmp.groupby(['KnowledgeTag'])['diff'].agg(['mean'])
df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")

In [56]:
item_group = df.groupby('assessmentItemID')['answerCode'].mean()
dict_item_mean = dict(item_group)

df['item_acc'] = df['assessmentItemID'].apply(lambda x: dict_item_mean[x])

In [57]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    #test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [58]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,unique_tag_cnt,diff,mean,item_acc
0,0,5354,975,1,1585009031,618,,0,,0.947683,1268,0.955022,637,165,-1.0,345.923228,0.982063
1,0,5355,975,1,1585009034,619,1.0,1,1.000000,0.947683,1268,0.913187,3040,165,3.0,59.008799,0.964126
2,0,5356,975,1,1585009042,619,2.0,2,1.000000,0.947683,1268,0.913187,3040,165,8.0,59.008799,0.910314
3,0,5357,975,1,1585009049,619,3.0,3,1.000000,0.947683,1268,0.913187,3040,165,7.0,59.008799,0.968610
4,0,5358,975,1,1585009056,619,4.0,4,1.000000,0.947683,1268,0.913187,3040,165,7.0,59.008799,0.941704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,2373,456,0,1591339821,375,1.0,4,0.250000,0.662590,921,0.689706,2814,2,24.0,160.096973,0.435252
2266582,7441,3909,748,1,1597971999,784,1.0,5,0.200000,0.655109,718,0.697874,2199,2,600.0,162.502079,0.649635
2266583,7441,3910,748,1,1597972010,784,2.0,6,0.333333,0.655109,718,0.697874,2199,2,11.0,162.502079,0.635036
2266584,7441,3911,748,1,1597972056,784,3.0,7,0.428571,0.655109,718,0.697874,2199,2,46.0,162.502079,0.791971


In [59]:
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc',
       'test_mean', 'test_sum', 'tag_mean', 'tag_sum', 'unique_tag_cnt',
       'diff', 'mean', 'item_acc'],
      dtype='object')

In [60]:
def add_user_recent_acc(train_df, shift_size):
    train_df = train_df.sort_values(by=['userID'])
    train_df_temp = train_df.copy()

    # 새롭게 이력이 시작되는 유저 구함
    user_start_pos = train_df['userID'].diff() > 0
    train_df_temp['previous_answer_count'] = train_df_temp.groupby('userID')['answerCode'].cumsum().shift(fill_value=0)
    train_df_temp['shift_previous_answer_count'] = train_df_temp.groupby('userID')['answerCode'].cumsum().shift(fill_value=0)
    train_df_temp.loc[user_start_pos, ['previous_answer_count', 'shift_previous_answer_count']] = 0
    train_df_temp['shift_previous_answer_count'] = train_df_temp['shift_previous_answer_count'].shift(shift_size)
    train_df_temp['temp'] = len(train_df_temp) * [1]
    train_df_temp['previous_problem_count'] = train_df_temp.groupby('userID')['temp'].cumsum().shift(fill_value=0)
    train_df_temp['previous_problem_count'] = train_df_temp['previous_problem_count'].apply(lambda x: shift_size if x > shift_size else x)
    train_df_temp.loc[user_start_pos, ['previous_problem_count']] = 0

    train_df_temp['shift_previous_answer_count'] = train_df_temp.apply(lambda x: 0 if x['previous_problem_count'] < shift_size else x['shift_previous_answer_count'], axis=1)
    train_df_temp['count'] = train_df_temp['previous_answer_count'] - train_df_temp['shift_previous_answer_count']
    train_df['user_recent_acc_'] = (train_df_temp['count'] / train_df_temp['previous_problem_count']).fillna(0)
    
    return train_df

In [61]:
df = add_user_recent_acc(df, shift_size=5)


In [62]:
df


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,unique_tag_cnt,diff,mean,item_acc,user_recent_acc_
0,0,5354,975,1,1585009031,618,,0,,0.947683,1268,0.955022,637,165,-1.0,345.923228,0.982063,0.0
491,0,5993,1073,1,1599095338,594,304.0,491,0.619145,0.736464,1129,0.766124,3433,165,29.0,126.123658,0.319635,1.0
492,0,8499,1406,1,1599268080,293,305.0,492,0.619919,0.657609,363,0.536178,289,165,600.0,122.560297,0.695652,1.0
493,0,8500,1406,1,1599268160,299,306.0,493,0.620690,0.657609,363,0.551144,2360,165,80.0,101.186128,0.641304,1.0
494,0,8501,1406,0,1599268195,299,307.0,494,0.621457,0.657609,363,0.551144,2360,165,35.0,101.186128,0.684783,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266580,7441,2372,456,0,1591339797,375,1.0,3,0.333333,0.662590,921,0.689706,2814,2,34.0,160.096973,0.597122,0.5
2266581,7441,2373,456,0,1591339821,375,1.0,4,0.250000,0.662590,921,0.689706,2814,2,24.0,160.096973,0.435252,0.4
2266582,7441,3909,748,1,1597971999,784,1.0,5,0.200000,0.655109,718,0.697874,2199,2,600.0,162.502079,0.649635,0.2
2266583,7441,3910,748,1,1597972010,784,2.0,6,0.333333,0.655109,718,0.697874,2199,2,11.0,162.502079,0.635036,0.4


In [63]:
# 유저별 분리
train, test = custom_train_test_split(df)

#FEATS = ['userID', 'assessmentItemID', 'testId', 'Timestamp',
#       'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc',
#       'test_mean', 'test_sum', 'tag_mean', 'tag_sum', 'diff', 'mean']
# 사용할 Feature 설정
FEATS = ['KnowledgeTag', 'user_correct_answer',
         'user_acc', 'test_mean', 'test_sum', 'tag_mean',
         'assessmentItemID', 'Timestamp', 'diff', 'mean']#, 'item_acc']#, 'tag_sum']

# X, y 값 분리
train.reset_index(inplace=True, drop=True)
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)
train.fillna(0, axis=1, inplace=True)

test.reset_index(inplace=True, drop=True)
y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)
test.fillna(0, axis=1, inplace=True)

In [64]:
from sklearn.preprocessing import MinMaxScaler

#diff_scalar = MinMaxScaler()
#train['mean'] = diff_scalar.fit_transform(train['mean'].values.reshape(-1, 1))
#test['mean'] = diff_scalar.transform(test['mean'].values.reshape(-1, 1))

user_total_scaler = MinMaxScaler()
train['user_total_answer'] = user_total_scaler.fit_transform(train['user_total_answer'].values.reshape(-1, 1))
test['user_total_answer'] = user_total_scaler.transform(test['user_total_answer'].values.reshape(-1, 1))

user_correct_scalar = MinMaxScaler()
train['user_correct_answer'] = user_correct_scalar.fit_transform(train['user_correct_answer'].values.reshape(-1, 1))
test['user_correct_answer'] = user_correct_scalar.transform(test['user_correct_answer'].values.reshape(-1, 1))

tmestamp_scalar = MinMaxScaler()
train['Timestamp'] = tmestamp_scalar.fit_transform(train['Timestamp'].values.reshape(-1, 1))
test['Timestamp'] = tmestamp_scalar.transform(test['Timestamp'].values.reshape(-1, 1))

test_sum_scalar = MinMaxScaler()
train['test_sum'] = test_sum_scalar.fit_transform(train['test_sum'].values.reshape(-1, 1))
test['test_sum'] = test_sum_scalar.transform(test['test_sum'].values.reshape(-1, 1))

tag_sum_scalar = MinMaxScaler()
train['tag_sum'] = tag_sum_scalar.fit_transform(train['tag_sum'].values.reshape(-1, 1))
test['tag_sum'] = tag_sum_scalar.transform(test['tag_sum'].values.reshape(-1, 1))

item_acc_scaler = MinMaxScaler()
train['item_acc'] = item_acc_scaler.fit_transform(train['item_acc'].values.reshape(-1, 1))
test['item_acc'] = item_acc_scaler.transform(test['item_acc'].values.reshape(-1, 1))

# 모델링

### lightgbm

In [65]:
save_df = df.copy()

In [66]:
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

In [67]:
kfold = StratifiedKFold(n_splits=5)
fold_models = []
oof = np.zeros(len(train))
for i, (train_index, valid_index) in enumerate(kfold.split(train, y_train)):

            train_data, valid_data = train.iloc[train_index], train.iloc[valid_index]
            train_ydata, valid_ydata = y_train.iloc[train_index], y_train.iloc[valid_index]
            lgb_train = lgb.Dataset(train_data[FEATS], train_ydata)
            lgb_valid = lgb.Dataset(valid_data[FEATS], valid_ydata)

            # 모델 생성 및 훈련
            print(f'Calculating train oof {i + 1}')
            model_basic = lgb.train({'objective': 'binary'}, 
                                    lgb_train,
                                    valid_sets=[lgb_train, lgb_valid],
                                    verbose_eval=100,
                                    num_boost_round=2500, #config.num_boost_round,
                                    early_stopping_rounds=100 #config.early_stopping_rounds
                                    )
            # 모델 검증
            predict = model_basic.predict(valid_data[FEATS])
            
            # fold별 oof 값 모으기
            oof[valid_index] = predict
            fold_models.append(model_basic)

Calculating train oof 1




[LightGBM] [Info] Number of positive: 831652, number of negative: 437273
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 1268925, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642856
[LightGBM] [Info] Start training from score 0.642856
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.520274	valid_1's binary_logloss: 0.52306
[200]	training's binary_logloss: 0.516022	valid_1's binary_logloss: 0.521251
[300]	training's binary_logloss: 0.512842	valid_1's binary_logloss: 0.52097
[400]	training's binary_logloss: 0.510177	valid_1's binary_logloss: 0.520651
[500]	training's binary_logloss: 0.507435	valid_1's binary_logloss: 0.52001
[600]	training's binary_logloss: 0.505168	valid_1's binary_logloss: 0.519693
[700]	training's binary



[LightGBM] [Info] Number of positive: 831652, number of negative: 437273
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2547
[LightGBM] [Info] Number of data points in the train set: 1268925, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642856
[LightGBM] [Info] Start training from score 0.642856
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.520038	valid_1's binary_logloss: 0.523789
[200]	training's binary_logloss: 0.515921	valid_1's binary_logloss: 0.522553
[300]	training's binary_logloss: 0.512643	valid_1's binary_logloss: 0.521843
[400]	training's binary_logloss: 0.509884	valid_1's binary_logloss: 0.521335
[500]	training's binary_logloss: 0.507354	valid_1's binary_logloss: 0.521256
[600]	training's binary_logloss: 0.504914	valid_1's binary_logloss: 0.521495
Early stopping, best



[LightGBM] [Info] Number of positive: 831652, number of negative: 437274
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2547
[LightGBM] [Info] Number of data points in the train set: 1268926, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655398 -> initscore=0.642854
[LightGBM] [Info] Start training from score 0.642854
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.519127	valid_1's binary_logloss: 0.526928
[200]	training's binary_logloss: 0.514779	valid_1's binary_logloss: 0.525776
[300]	training's binary_logloss: 0.511446	valid_1's binary_logloss: 0.52549
[400]	training's binary_logloss: 0.508468	valid_1's binary_logloss: 0.525097
Early stopping, best iteration is:
[380]	training's binary_logloss: 0.5091	valid_1's binary_logloss: 0.524942
Calculating train oof 4




[LightGBM] [Info] Number of positive: 831652, number of negative: 437274
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2547
[LightGBM] [Info] Number of data points in the train set: 1268926, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655398 -> initscore=0.642854
[LightGBM] [Info] Start training from score 0.642854
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.520988	valid_1's binary_logloss: 0.518823
[200]	training's binary_logloss: 0.516586	valid_1's binary_logloss: 0.517437
[300]	training's binary_logloss: 0.513373	valid_1's binary_logloss: 0.516839
[400]	training's binary_logloss: 0.510209	valid_1's binary_logloss: 0.516031
[500]	training's binary_logloss: 0.507327	valid_1's binary_logloss: 0.515638
[600]	training's binary_logloss: 0.50483	valid_1's binary_logloss: 0.515466
[700]	training's bina



[LightGBM] [Info] Number of positive: 831652, number of negative: 437274
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2546
[LightGBM] [Info] Number of data points in the train set: 1268926, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655398 -> initscore=0.642854
[LightGBM] [Info] Start training from score 0.642854
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.518745	valid_1's binary_logloss: 0.530649
[200]	training's binary_logloss: 0.514335	valid_1's binary_logloss: 0.529773
[300]	training's binary_logloss: 0.511301	valid_1's binary_logloss: 0.529303
[400]	training's binary_logloss: 0.508424	valid_1's binary_logloss: 0.528715
[500]	training's binary_logloss: 0.505792	valid_1's binary_logloss: 0.528356
[600]	training's binary_logloss: 0.503378	valid_1's binary_logloss: 0.528611
Early stopping, best

In [26]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)
test_df = preprocessing(test_df, is_train=False)

test_df = test_df.sort_values(['userID', 'Timestamp'])

# diff: nan은 -1 , 600(10분) 이상이면 다 600
test_df['diff'] = test_df.sort_values(['userID','Timestamp']).groupby('userID')['Timestamp'].diff()#periods=-1).apply(abs)
test_df['diff'].fillna(-1, inplace=True)
idx = test_df[test_df['diff'] >= 600].index
test_df.loc[idx, 'diff'] = 600

tmp = test_df[test_df['diff'] >= 0]
correct_k = tmp.groupby(['KnowledgeTag'])['diff'].agg(['mean'])
test_df = pd.merge(test_df, correct_k, on=['KnowledgeTag'], how="left")

# item_acc
item_group = test_df.groupby('assessmentItemID')['answerCode'].mean()
dict_item_mean = dict(item_group)
test_df['item_acc'] = test_df['assessmentItemID'].apply(lambda x: dict_item_mean[x])

#user_acc
test_df = add_user_recent_acc(test_df, shift_size=10)

# min max scalar
#test_df['mean'] = diff_scalar.transform(test_df['mean'].values.reshape(-1, 1))
test_df['user_total_answer'] = user_total_scaler.transform(test_df['user_total_answer'].values.reshape(-1, 1))
test_df['user_correct_answer'] = user_correct_scalar.transform(test_df['user_correct_answer'].values.reshape(-1, 1))
test_df['Timestamp'] = tmestamp_scalar.transform(test_df['Timestamp'].values.reshape(-1, 1))
test_df['test_sum'] = test_sum_scalar.transform(test_df['test_sum'].values.reshape(-1, 1))
test_df['tag_sum'] = tag_sum_scalar.transform(test_df['tag_sum'].values.reshape(-1, 1))
test_df['item_acc'] = item_acc_scaler.transform(test_df['item_acc'].values.reshape(-1, 1))


# LEAVE LAST INTERACTION ONLY
test_df_check = test_df[test_df['answerCode'] >= 0]
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
check_ans = test_df_check.pop('answerCode')
#test_df = test_df.drop(['answerCode'], axis=1)
test_df_check.fillna(0, axis=1, inplace=True)

test_df = test_df.drop(['answerCode'], axis=1)
test_df.fillna(0, axis=1, inplace=True)

/opt/ml/input/code/assessmentItemID_classes.npy
/opt/ml/input/code/testId_classes.npy
/opt/ml/input/code/KnowledgeTag_classes.npy


In [68]:
pred = np.zeros(len(test))
for i, model in enumerate(fold_models):
    print(f'Calculating test {i + 1}')
    predict = model.predict(test[FEATS])        
    # fold별 prediction 값 모으기
    pred += predict
pred /= len(fold_models)
lgbm_valid_acc = accuracy_score(y_test, np.where(pred >= 0.5, 1, 0))
lgbm_valid_auc = roc_auc_score(y_test, pred)
print(f'VALID LGBM AUC : {lgbm_valid_auc} ACC : {lgbm_valid_acc}')

pred_test = np.zeros(len(test_df_check))
for i, model in enumerate(fold_models):
    print(f'Calculating test {i + 1}')
    predict = model.predict(test_df_check[FEATS])
    # fold별 prediction 값 모으기
    pred_test += predict
pred_test /= len(fold_models)
lgbm_test_acc = accuracy_score(check_ans, np.where(pred_test >= 0.5, 1, 0))
lgbm_test_auc = roc_auc_score(check_ans, pred_test)
print(f'TEST LGBM AUC : {lgbm_test_auc} ACC : {lgbm_test_acc}')

Calculating test 1
Calculating test 2
Calculating test 3
Calculating test 4
Calculating test 5
VALID LGBM AUC : 0.7815311673268399 ACC : 0.6956052725559904
Calculating test 1
Calculating test 2
Calculating test 3
Calculating test 4
Calculating test 5
TEST LGBM AUC : 0.7725734271649278 ACC : 0.7030805413116398


# Inference

In [70]:
# inference
total_preds = model_basic.predict(test_df[FEATS])

In [71]:
# SAVE OUTPUT
output_dir = 'output/lgbm/'
write_path = os.path.join(output_dir, "update_diff_submission.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/lgbm/update_diff_submission.csv
