In [554]:
import pandas as pd
import os
import random
import numpy as np

In [555]:
data_dir = '/opt/ml/input/data'
train_data_path = os.path.join(data_dir, 'train_data.csv')
train_df = pd.read_csv(train_data_path)

In [556]:
def feature_engineering(df):
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)

    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    correct_a = df.groupby(['assessmentItemID'])['answerCode'].agg(['mean', 'sum'])
    correct_a.columns = ['item_mean', 'item_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, correct_a, on=['assessmentItemID'], how="left")

    return df

In [557]:
df = feature_engineering(train_df)
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,item_mean,item_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,0.947683,1268,0.955022,637,0.982063,219
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.000000,0.947683,1268,0.913187,3040,0.964126,215
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.000000,0.947683,1268,0.913187,3040,0.910314,203
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.000000,0.947683,1268,0.913187,3040,0.968610,216
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.000000,0.947683,1268,0.913187,3040,0.941704,210
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,1.0,4,0.250000,0.662590,921,0.689706,2814,0.435252,121
2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,1.0,5,0.200000,0.655109,718,0.697874,2199,0.649635,178
2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,2.0,6,0.333333,0.655109,718,0.697874,2199,0.635036,174
2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,3.0,7,0.428571,0.655109,718,0.697874,2199,0.791971,217


In [558]:
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import time
asset_dir = '/opt/ml/input/code/asset'

def convert_time(s):
    timestamp = time.mktime(
        datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
    )
    return int(timestamp)

def preprocessing(df, is_train=True):
    category_columns = ["assessmentItemID", "testId", "KnowledgeTag"]

    if not os.path.exists(asset_dir):
        os.makedirs(asset_dir)

    for column in category_columns:
        label_encoder = LabelEncoder()
        if is_train:
            unique_values = df[column].unique().tolist() + ["unknown"]
            label_encoder.fit(unique_values)
            label_encoder_path = os.path.join(asset_dir, column + "_classes.npy")
            np.save(label_encoder_path, label_encoder.classes_)
        else:
            label_encoder_path = os.path.join(asset_dir, column + "_classes.npy")
            label_encoder.classes_ = np.load(label_encoder_path)

            df[column] = df[column].apply(
                lambda x: x if str(x) in label_encoder.classes_ else "unknown"
            )

        df[column] = df[column].astype(str)
        df[column] = label_encoder.transform(df[column])

    df["Timestamp"] = df["Timestamp"].apply(convert_time)

    return df

In [559]:
df = preprocessing(df, is_train=True)

In [560]:
df = df.sort_values(['userID', 'Timestamp'])

In [561]:
df['diff'] = df.sort_values(['userID', 'Timestamp']).groupby('userID')['Timestamp'].diff()

In [562]:
diff_df = df['diff']
diff_df.fillna(-1, inplace=True)
idx = df[df['diff'] >= 600].index
df.loc[idx, 'diff'] = 600

In [563]:
temp = df[df['diff'] >= 0]
diff_group_by_knowledgeTag = temp.groupby(['KnowledgeTag'])['diff'].agg(['mean'])
diff_group_by_knowledgeTag.columns = ['diffMean']
df = pd.merge(df, diff_group_by_knowledgeTag, on=['KnowledgeTag'], how='left')

In [564]:
random.seed(42)

def custom_train_test_split(df, ratio=0.7):
    # unique한 값들의 등장 횟수를 묶어서 리스트로 만들어준다.
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    # train 데이터 셋을 랜덤으로 섞어서 합이 비율 이하가 되도록 선정한다.
    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    # test데이터셋은 각 유저의 마지막 interaction만 추출
    # test = test[test['userID'] != test['userID'].shift(-1)]

    return train, test

In [565]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,item_mean,item_sum,diff,diffMean
0,0,5354,975,1,1585009031,618,,0,,0.947683,1268,0.955022,637,0.982063,219,-1.0,345.923228
1,0,5355,975,1,1585009034,619,1.0,1,1.000000,0.947683,1268,0.913187,3040,0.964126,215,3.0,59.008799
2,0,5356,975,1,1585009042,619,2.0,2,1.000000,0.947683,1268,0.913187,3040,0.910314,203,8.0,59.008799
3,0,5357,975,1,1585009049,619,3.0,3,1.000000,0.947683,1268,0.913187,3040,0.968610,216,7.0,59.008799
4,0,5358,975,1,1585009056,619,4.0,4,1.000000,0.947683,1268,0.913187,3040,0.941704,210,7.0,59.008799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,2373,456,0,1591339821,375,1.0,4,0.250000,0.662590,921,0.689706,2814,0.435252,121,24.0,160.096973
2266582,7441,3909,748,1,1597971999,784,1.0,5,0.200000,0.655109,718,0.697874,2199,0.649635,178,600.0,162.502079
2266583,7441,3910,748,1,1597972010,784,2.0,6,0.333333,0.655109,718,0.697874,2199,0.635036,174,11.0,162.502079
2266584,7441,3911,748,1,1597972056,784,3.0,7,0.428571,0.655109,718,0.697874,2199,0.791971,217,46.0,162.502079


In [566]:
df['user_correct_answer'].fillna(0, inplace=True)

In [567]:
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,item_mean,item_sum,diff,diffMean
0,0,5354,975,1,1585009031,618,0.0,0,,0.947683,1268,0.955022,637,0.982063,219,-1.0,345.923228
1,0,5355,975,1,1585009034,619,1.0,1,1.000000,0.947683,1268,0.913187,3040,0.964126,215,3.0,59.008799
2,0,5356,975,1,1585009042,619,2.0,2,1.000000,0.947683,1268,0.913187,3040,0.910314,203,8.0,59.008799
3,0,5357,975,1,1585009049,619,3.0,3,1.000000,0.947683,1268,0.913187,3040,0.968610,216,7.0,59.008799
4,0,5358,975,1,1585009056,619,4.0,4,1.000000,0.947683,1268,0.913187,3040,0.941704,210,7.0,59.008799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,2373,456,0,1591339821,375,1.0,4,0.250000,0.662590,921,0.689706,2814,0.435252,121,24.0,160.096973
2266582,7441,3909,748,1,1597971999,784,1.0,5,0.200000,0.655109,718,0.697874,2199,0.649635,178,600.0,162.502079
2266583,7441,3910,748,1,1597972010,784,2.0,6,0.333333,0.655109,718,0.697874,2199,0.635036,174,11.0,162.502079
2266584,7441,3911,748,1,1597972056,784,3.0,7,0.428571,0.655109,718,0.697874,2199,0.791971,217,46.0,162.502079


In [568]:
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc',
       'test_mean', 'test_sum', 'tag_mean', 'tag_sum', 'item_mean', 'item_sum',
       'diff', 'diffMean'],
      dtype='object')

In [569]:
train, test = custom_train_test_split(df)

features = ['KnowledgeTag', 'user_correct_answer', 'user_acc',
            'test_mean', 'test_sum', 'tag_mean', 'Timestamp', 'diff', 'diffMean', 'item_mean']

In [570]:
y_train = train['answerCode']
train = train.drop('answerCode', axis=1)
train.fillna(0, axis=1, inplace=True)

y_test = test['answerCode']
test = test.drop('answerCode', axis=1)
test.fillna(0, axis=1, inplace=True)

In [571]:
from sklearn.preprocessing import MinMaxScaler

features_for_scaling = ['user_correct_answer', 'Timestamp', 'test_sum']

for feature in features_for_scaling:
    temp_scaler = MinMaxScaler()
    train[feature] = temp_scaler.fit_transform(train[feature].values.reshape(-1, 1))
    test[feature] = temp_scaler.transform(test[feature].values.reshape(-1, 1))

In [572]:
train

Unnamed: 0,userID,assessmentItemID,testId,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,item_mean,item_sum,diff,diffMean
745,1.0,3122.0,596.0,0.014725,179.0,0.000000,0.0,0.000000,0.595472,0.270648,0.616900,971.0,0.800000,212.0,-1.0,245.954282
746,1.0,3123.0,596.0,0.014731,179.0,0.000674,1.0,1.000000,0.595472,0.270648,0.616900,971.0,0.615094,163.0,183.0,245.954282
747,1.0,3124.0,596.0,0.014732,178.0,0.001348,2.0,1.000000,0.595472,0.270648,0.570962,1050.0,0.781132,207.0,43.0,136.552890
748,1.0,3125.0,596.0,0.014735,178.0,0.002022,3.0,1.000000,0.595472,0.270648,0.570962,1050.0,0.381132,101.0,104.0,136.552890
749,1.0,3126.0,596.0,0.014742,178.0,0.002695,4.0,1.000000,0.595472,0.270648,0.570962,1050.0,0.400000,106.0,212.0,136.552890
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266572,7440.0,3047.0,582.0,0.809380,175.0,0.003369,10.0,0.500000,0.738462,0.355564,0.749632,2039.0,0.893773,244.0,600.0,149.561785
2266573,7440.0,3048.0,582.0,0.809382,175.0,0.004043,11.0,0.545455,0.738462,0.355564,0.749632,2039.0,0.472527,129.0,61.0,149.561785
2266574,7440.0,3049.0,582.0,0.809383,175.0,0.004043,12.0,0.500000,0.738462,0.355564,0.749632,2039.0,0.901099,246.0,29.0,149.561785
2266575,7440.0,3050.0,582.0,0.809383,175.0,0.004043,13.0,0.461538,0.738462,0.355564,0.749632,2039.0,0.838828,229.0,2.0,149.561785


In [573]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1586157 entries, 745 to 2266576
Data columns (total 16 columns):
 #   Column               Non-Null Count    Dtype  
---  ------               --------------    -----  
 0   userID               1586157 non-null  float64
 1   assessmentItemID     1586157 non-null  float64
 2   testId               1586157 non-null  float64
 3   Timestamp            1586157 non-null  float64
 4   KnowledgeTag         1586157 non-null  float64
 5   user_correct_answer  1586157 non-null  float64
 6   user_total_answer    1586157 non-null  float64
 7   user_acc             1586157 non-null  float64
 8   test_mean            1586157 non-null  float64
 9   test_sum             1586157 non-null  float64
 10  tag_mean             1586157 non-null  float64
 11  tag_sum              1586157 non-null  float64
 12  item_mean            1586157 non-null  float64
 13  item_sum             1586157 non-null  float64
 14  diff                 1586157 non-null  float64
 

In [574]:
from sklearn.preprocessing import MinMaxScaler

#diff_scalar = MinMaxScaler()
#train['mean'] = diff_scalar.fit_transform(train['mean'].values.reshape(-1, 1))
#test['mean'] = diff_scalar.transform(test['mean'].values.reshape(-1, 1))

user_total_scaler = MinMaxScaler()
train['user_total_answer'] = user_total_scaler.fit_transform(train['user_total_answer'].values.reshape(-1, 1))
test['user_total_answer'] = user_total_scaler.transform(test['user_total_answer'].values.reshape(-1, 1))

user_correct_scalar = MinMaxScaler()
train['user_correct_answer'] = user_correct_scalar.fit_transform(train['user_correct_answer'].values.reshape(-1, 1))
test['user_correct_answer'] = user_correct_scalar.transform(test['user_correct_answer'].values.reshape(-1, 1))

tmestamp_scalar = MinMaxScaler()
train['Timestamp'] = tmestamp_scalar.fit_transform(train['Timestamp'].values.reshape(-1, 1))
test['Timestamp'] = tmestamp_scalar.transform(test['Timestamp'].values.reshape(-1, 1))

test_sum_scalar = MinMaxScaler()
train['test_sum'] = test_sum_scalar.fit_transform(train['test_sum'].values.reshape(-1, 1))
test['test_sum'] = test_sum_scalar.transform(test['test_sum'].values.reshape(-1, 1))

tag_sum_scalar = MinMaxScaler()
train['tag_sum'] = tag_sum_scalar.fit_transform(train['tag_sum'].values.reshape(-1, 1))
test['tag_sum'] = tag_sum_scalar.transform(test['tag_sum'].values.reshape(-1, 1))

LightGBM

In [575]:
import lightgbm
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as np

lightgbm_train = lightgbm.Dataset(train[features], y_train)
lightgbm_test = lightgbm.Dataset(test[features], y_test)

In [576]:
def train_lightgbm():
    model = lightgbm.train(
        {'objective': 'binary'},
        lightgbm_train,
        valid_sets=[lightgbm_train, lightgbm_test],
        verbose_eval=100,
        num_boost_round=2500,
        early_stopping_rounds=100
    )

    preds = model.predict(test[features])
    acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_test, preds)

    print(f'Valid AUC: {auc} / Valid ACC: {acc}\n')
    return model

In [577]:
model = train_lightgbm()



[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2547
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.475565	valid_1's binary_logloss: 0.480616
[200]	training's binary_logloss: 0.472526	valid_1's binary_logloss: 0.479367
[300]	training's binary_logloss: 0.470382	valid_1's binary_logloss: 0.478875
[400]	training's binary_logloss: 0.468347	valid_1's binary_logloss: 0.478581
[500]	training's binary_logloss: 0.46659	valid_1's binary_logloss: 0.478469
[600]	training's binary_logloss: 0.464991	valid_1's binary_logloss: 0.478355
[700]	training's bin

In [578]:
lgbm_train_preds = model.predict(train[features])
lgbm_valid_preds = model.predict(test[features])

In [579]:
import xgboost as xgb
from xgboost import XGBClassifier

params = {
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'n_estimators': 800,
    'early_stopping_rounds': 100,
    'max_depth': 12,
    'max_leaves':5,
}

xg = XGBClassifier(**params)

xg.fit(train[features], y_train, eval_set=[(test[features], y_test)])

[0]	validation_0-auc:0.81534
[1]	validation_0-auc:0.81824
[2]	validation_0-auc:0.81947
[3]	validation_0-auc:0.82023
[4]	validation_0-auc:0.82077
[5]	validation_0-auc:0.82117
[6]	validation_0-auc:0.82150
[7]	validation_0-auc:0.82173
[8]	validation_0-auc:0.82198
[9]	validation_0-auc:0.82220
[10]	validation_0-auc:0.82241
[11]	validation_0-auc:0.82256
[12]	validation_0-auc:0.82271
[13]	validation_0-auc:0.82288
[14]	validation_0-auc:0.82301
[15]	validation_0-auc:0.82313
[16]	validation_0-auc:0.82325
[17]	validation_0-auc:0.82334
[18]	validation_0-auc:0.82344
[19]	validation_0-auc:0.82349
[20]	validation_0-auc:0.82362
[21]	validation_0-auc:0.82371
[22]	validation_0-auc:0.82381
[23]	validation_0-auc:0.82390
[24]	validation_0-auc:0.82402
[25]	validation_0-auc:0.82408
[26]	validation_0-auc:0.82415
[27]	validation_0-auc:0.82424
[28]	validation_0-auc:0.82431
[29]	validation_0-auc:0.82435
[30]	validation_0-auc:0.82442
[31]	validation_0-auc:0.82445
[32]	validation_0-auc:0.82451
[33]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=100, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=5, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=800,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [580]:
xg_train_preds = xg.predict_proba(train[features])
xg_valid_preds = xg.predict_proba(test[features])

## Catboost 앙상블

In [581]:
train_data_path = os.path.join(data_dir, 'train_data.csv')
test_data_path = os.path.join(data_dir, 'test_data.csv')
train_df_for_catboost = pd.read_csv(train_data_path)
test_df_for_catboost = pd.read_csv(test_data_path)

In [582]:
# # train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
# random.seed(42)
# def custom_train_test_split_for_catboost(df, ratio=0.7, split=True):
#
#     users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
#     random.shuffle(users)
#
#     max_train_data_len = ratio*len(df)
#     sum_of_train_data = 0
#     user_ids =[]
#
#     for user_id, count in users:
#         sum_of_train_data += count
#         if max_train_data_len < sum_of_train_data:
#             break
#         user_ids.append(user_id)
#
#
#     train = df[df['userID'].isin(user_ids)]
#     test = df[df['userID'].isin(user_ids) == False]
#
#     #test데이터셋은 각 유저의 마지막 interaction만 추출
#     test = test[test['userID'] != test['userID'].shift(-1)]
#     return train, test

In [583]:
# def feature_engineering_for_catboost(df):
#     scaler= MinMaxScaler()
#
#     df["Timestamp"] = df["Timestamp"].apply(convert_time)
#
#     # --- diff
#     # df['diff'] = df.sort_values(['userID','Timestamp']).groupby('userID')['Timestamp'].diff(periods=1).apply(abs)
#     df['diff'] = df.sort_values(['userID','Timestamp']).groupby('userID')['Timestamp'].diff()
#
#     # nan은 -1
#     # 600(10분) 이상이면 다 600
#     df['diff'].fillna(-1, inplace=True)
#     idx = df[df['diff'] >= 600].index
#     df.loc[idx, 'diff'] = 600
#
#     # # --- mean 태그별 문제 풀이 평균 시간
#     tmp= df[df['diff']>=0]
#     correct_k= tmp.groupby(['KnowledgeTag'])['diff'].agg(['mean'])
#     df= pd.merge(df, correct_k, on=['KnowledgeTag'], how= 'left')
#     df['mean']= scaler.fit_transform(df['mean'].values.reshape(-1, 1)).reshape(-1) # minmax scaling
#
#
#     # --- before_tag 이전 태그 문제 풀이 여부
#     df2= df.sort_values(['userID','KnowledgeTag','Timestamp'])
#     df2.reset_index(inplace=True, drop=True)
#     df2['cumsum']= df2.groupby(['userID','KnowledgeTag'])['answerCode'].cumsum()
#     df2['temp']= 1
#     df2['seq']= df2.groupby(['userID','KnowledgeTag'])['temp'].cumsum()
#     df2.drop(['temp'], axis=1, inplace=True)
#     df2['cumsum'] -= df2['answerCode']
#
#     df2['seq'] -= 1
#     df2['before_tag']= df2['cumsum']/df2['seq']
#
#     tag_avg= dict(df2.groupby(['KnowledgeTag'])['answerCode'].mean())
#     def match_avg(x):
#         if x>1:
#             return tag_avg[x]
#         else:
#             return x
#
#     df2['before_tag'].fillna(df2.KnowledgeTag, inplace=True)
#     df2['before_tag']= df2['before_tag'].apply(match_avg)
#     df2.loc[df2[df2['before_tag'] >= 0.5].index,'before_tag'] = 1
#     df2.loc[df2[df2['before_tag'] < 0.5].index,'before_tag'] = 0
#
#     df= pd.merge(df2[['userID','assessmentItemID','Timestamp','before_tag']],df, on=['userID','assessmentItemID','Timestamp'])
#
#     # # --- user_correct_answer
#     df2= df.sort_values(by=['userID','Timestamp'])
#     df2['user_correct_answer'] = df2.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
#     df2['user_correct_answer'].fillna(0)
#
#     # # --- user_acc
#     df2['user_total_answer'] = df2.groupby('userID')['answerCode'].cumcount()
#     df2['user_acc'] = df2['user_correct_answer']/df2['user_total_answer']
#
#     df2['user_correct_answer']= scaler.fit_transform(df2['user_correct_answer'].values.reshape(-1, 1)).reshape(-1)
#
#     # # test mean, sum
#     correct_t = df2.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
#     correct_t.columns = ["test_mean", 'test_sum']
#     df2 = pd.merge(df2, correct_t, on=['testId'], how="left")
#     df2['test_mean']= scaler.fit_transform(df2['test_mean'].values.reshape(-1,1)).reshape(-1)
#     df2['test_sum']= scaler.fit_transform(df2['test_sum'].values.reshape(-1,1)).reshape(-1)
#
#     df= pd.merge(df2[['userID','assessmentItemID','Timestamp','user_correct_answer', 'user_acc', 'test_mean', 'test_sum']],df, on=['userID','assessmentItemID','Timestamp'])
#
#     # # --- test_seq
#     df2= df.sort_values(['userID','testId','Timestamp'])
#     df2.reset_index(inplace=True, drop=True)
#     df2['test_seq']= df2.groupby(['userID','testId']).cumcount()
#     # self.args.num_test_seq= df2['test_seq'].max()+2 # 개수 + 패딩
#     df= pd.merge(df2[['userID','assessmentItemID','Timestamp','test_seq']], df, on=['userID','assessmentItemID','Timestamp'])
#
#     # # --- Timestamp 스케일링
#     df['Timestamp']= scaler.fit_transform(df['Timestamp'].values.reshape(-1,1)).reshape(-1)
#
#     df['item']= df['assessmentItemID']
#
#     return df

In [584]:
# train_df_cb = feature_engineering(train_df_for_catboost)

In [585]:
# train_df_cb

In [586]:
test_df_cb = feature_engineering(test_df_for_catboost)

In [587]:
test_df_cb

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,item_mean,item_sum
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,,0,,0.473214,106,0.587097,91,0.750000,24
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,1.0,1,1.000000,0.473214,106,0.587097,91,0.562500,18
2,3,A050023003,A050000023,0,2020-01-09 10:58:31,2625,2.0,2,1.000000,0.473214,106,0.588517,123,0.343750,11
3,3,A050023004,A050000023,0,2020-01-09 10:58:36,2625,2.0,3,0.666667,0.473214,106,0.588517,123,0.500000,16
4,3,A050023006,A050000023,0,2020-01-09 10:58:43,2623,2.0,4,0.500000,0.473214,106,0.581986,252,0.281250,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260109,7439,A040130001,A040000130,0,2020-10-14 23:07:23,8832,8.0,11,0.727273,0.604762,127,0.584848,193,0.380952,16
260110,7439,A040130002,A040000130,1,2020-10-14 23:07:41,8832,8.0,12,0.666667,0.604762,127,0.584848,193,0.523810,22
260111,7439,A040130003,A040000130,1,2020-10-14 23:08:02,8244,9.0,13,0.692308,0.604762,127,0.725490,111,0.761905,32
260112,7439,A040130004,A040000130,1,2020-10-14 23:09:31,8244,10.0,14,0.714286,0.604762,127,0.725490,111,0.857143,36


In [588]:
inference_cb = test_df_cb.loc[test_df_cb['answerCode'] == -1]

In [589]:
inference_cb

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,item_mean,item_sum
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,717.0,1035,0.692754,0.661765,90,0.542662,159,0.470588,8
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,465.0,670,0.694030,0.740385,77,0.565693,155,0.153846,2
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,915.0,1316,0.695289,0.417857,117,0.446753,172,0.285714,10
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,1031.0,1259,0.818904,0.625000,30,0.514286,36,0.250000,2
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,293.0,386,0.759067,0.678571,133,0.602767,305,0.321429,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260052,7395,A040122005,A040000122,-1,2020-09-08 02:05:20,10615,7.0,23,0.304348,0.753846,147,0.654902,167,0.358974,14
260067,7404,A030111005,A030000111,-1,2020-10-13 09:49:18,7636,7.0,14,0.500000,0.866667,156,0.834661,419,0.861111,31
260082,7416,A050193004,A050000193,-1,2020-10-04 02:44:41,10402,7.0,14,0.500000,0.750000,75,0.792517,233,0.760000,19
260097,7417,A050193004,A050000193,-1,2020-09-06 13:09:15,10402,2.0,14,0.142857,0.750000,75,0.792517,233,0.760000,19


In [590]:
train_cb, test_cb = custom_train_test_split(df)

In [591]:
categorical_cols =  ['assessmentItemID', 'testId', 'before_tag']
train_cb = train_cb[['assessmentItemID', 'testId', 'answerCode', 'test_seq','user_correct_answer', 'user_acc',
                'test_mean', 'before_tag', 'diff']]
train_cb[categorical_cols] = train_cb[categorical_cols].fillna("")
train_cb['diff'] = train_cb['diff'].astype(int)
train_cb['before_tag'] = train_cb['before_tag'].astype(int)
train_cb

KeyError: "['test_seq', 'before_tag'] not in index"

In [None]:
from catboost import CatBoostClassifier, Pool

TARGET_COL = 'answerCode'
X_train_cb = train_cb.drop([TARGET_COL],axis=1)
y_train_cb = train_cb[TARGET_COL]

train_pool = Pool(data=X_train_cb, label = y_train_cb, cat_features=categorical_cols)

In [None]:
categorical_cols =  ['assessmentItemID', 'testId', 'before_tag']
test_cb = test_cb[['assessmentItemID', 'testId', 'answerCode', 'test_seq','user_correct_answer', 'user_acc',
                'test_mean', 'before_tag', 'diff']]
test_cb[categorical_cols] = test_cb[categorical_cols].fillna("")
test_cb['diff'] = test_cb['diff'].astype(int)
test_cb['before_tag'] = test_cb['before_tag'].astype(int)

In [None]:
TARGET_COL = 'answerCode'
X_test_cb = test_cb.drop([TARGET_COL],axis=1)
y_test_cb = test_cb[TARGET_COL]

test_pool = Pool(data=X_test_cb,label=y_test_cb,cat_features=categorical_cols)

In [None]:
categorical_cols =  ['assessmentItemID', 'testId', 'before_tag']
inference_cb = inference_cb[['assessmentItemID', 'testId', 'answerCode', 'test_seq','user_correct_answer', 'user_acc',
                'test_mean', 'before_tag', 'diff']]
inference_cb[categorical_cols] = inference_cb[categorical_cols].fillna("")
inference_cb['diff'] = inference_cb['diff'].astype(int)
# inference[['before_tag', 'diff']] = inference[['before_tag', 'diff']].astype(int)
inference_cb['before_tag'] = inference_cb['before_tag'].astype(int)
# inference.drop(columns=['Timestamp'], inplace=True)

TARGET_COL = 'answerCode'
X_infer_cb = inference_cb.drop([TARGET_COL],axis=1)
y_infer_cb = inference_cb[TARGET_COL]

infer_pool = Pool(data=X_infer_cb,label = y_infer_cb,cat_features=categorical_cols)

In [None]:
params = {'iterations':5000,
          'random_seed':63,
          'learning_rate':0.01,
          # 'max_depth' : 10,
          'eval_metric' : 'AUC',
          # 'objective' : 'CrossEntropy',
          'loss_function':'CrossEntropy',
          # 'custom_metric':['Logloss','AUC'],##, '
          'early_stopping_rounds':100,
          'use_best_model': True,
          'task_type':"GPU",
        #   'bagging_temperature':1,
          'verbose':100}

In [None]:
model_basic = CatBoostClassifier(**params)#,learning_rate=0.1, task_type="GPU",)
# model_basic = CatBoostClassifier(verbose=50)#,learning_rate=0.1, task_type="GPU",)
model_basic.fit(train_pool, eval_set=test_pool, use_best_model=True)
print(model_basic.get_best_score())

In [None]:
from matplotlib import pyplot as plt
plt.figure(figsize=(20,10))

importances = pd.Series(model_basic.feature_importances_, index = train_pool.get_feature_names())
importances = importances.sort_values()
importances.plot(kind = "barh")
plt.title("importance in the CBClassifier Model")

plt.show()

In [None]:
print("train ACC :", model_basic.score(train_pool))
print("test ACC :", model_basic.score(test_pool))

pred = model_basic.predict_proba(test_pool)[:,1]
print("test AUC :", roc_auc_score(y_test_cb, pred))

In [None]:
print("test set 평균, 최대값, 최소값")
print(pred.mean())
print(pred.max())
print(pred.min())

In [None]:
cb_train_preds = model_basic.predict_proba(train_pool)[:,-1]
cb_valid_preds = model_basic.predict_proba(test_pool)[:,-1]

In [None]:
# inference
infer_cbc = model_basic.predict_proba(infer_pool)[:,1]
infer_cbc[:10]

In [None]:
len(infer_cbc)

In [None]:
print("infer set 평균, 최대값, 최소값")
print(infer_cbc.mean())
print(infer_cbc.max())
print(infer_cbc.min())

In [None]:
output = pd.DataFrame({"id":range(744), 'prediction':infer_cbc})
output

In [None]:
output.to_csv("cbc_fe_all4.csv", index=False)
print("output 저장.")

## Stacking과 Meta Model 사용

In [592]:
lgbm_train_acc = accuracy_score(y_train, np.where(lgbm_train_preds >= 0.5, 1, 0))
lgbm_train_auc = roc_auc_score(y_train, lgbm_train_preds)

lgbm_valid_acc = accuracy_score(y_test, np.where(lgbm_valid_preds >= 0.5, 1, 0))
lgbm_valid_auc = roc_auc_score(y_test, lgbm_valid_preds)

xg_train_preds = xg_train_preds[:,-1]
xg_valid_preds = xg_valid_preds[:,-1]

xgboost_train_acc = accuracy_score(y_train, np.where(xg_train_preds >= 0.5, 1, 0))
xgboost_train_auc = roc_auc_score(y_train, xg_train_preds)

xgboost_valid_acc = accuracy_score(y_test, np.where(xg_valid_preds >= 0.5, 1, 0))
xgboost_valid_auc = roc_auc_score(y_test, xg_valid_preds)

# catboost_train_acc = accuracy_score(y_train_cb, np.where(cb_train_preds >= 0.5, 1, 0))
# catboost_train_auc = roc_auc_score(y_train_cb, cb_train_preds)
#
# catboost_valid_acc = accuracy_score(y_test_cb, np.where(cb_valid_preds >= 0.5, 1, 0))
# catboost_valid_auc = roc_auc_score(y_test_cb, cb_valid_preds)

In [593]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

ensemble_train = pd.DataFrame()
ensemble_train['lightgbm'] = lgbm_train_preds
ensemble_train['xgboost'] = xg_train_preds
# ensemble_train['catboost'] = cb_train_preds

meta_model = GaussianNB()
meta_model.fit(ensemble_train, y_train)

meta_train_pred = meta_model.predict_proba(ensemble_train)
meta_train_pred = meta_train_pred[:,-1]

meta_train_acc = accuracy_score(y_train, np.where(meta_train_pred >= 0.5, 1, 0))
meta_train_auc = roc_auc_score(y_train, meta_train_pred)

In [594]:
ensemble_valid = pd.DataFrame()
ensemble_valid['lightgbm'] = lgbm_valid_preds
ensemble_valid['xgboost'] = xg_valid_preds
# ensemble_valid['catboost'] = cb_valid_preds

meta_valid_pred = meta_model.predict_proba(ensemble_valid)
meta_valid_pred = meta_valid_pred[:,-1]
meta_valid_acc = accuracy_score(y_test, np.where(meta_valid_pred >= 0.5, 1, 0))
meta_valid_auc = roc_auc_score(y_test, meta_valid_pred)

In [595]:
# 앙상블 기존 방법: 가중 합
# train_ensem = (ensemble_train.lightgbm + ensemble_train.xgboost + ensemble_train.catboost)/2
train_ensem = (ensemble_train.lightgbm + ensemble_train.xgboost)/2
ensem_train_acc = accuracy_score(y_train, np.where(train_ensem >= 0.5, 1, 0))
ensem_train_auc = roc_auc_score(y_train, train_ensem)

# valid_ensem = (ensemble_valid.lightgbm + ensemble_valid.xgboost + ensemble_valid.catboost)/2
valid_ensem = (ensemble_valid.lightgbm + ensemble_valid.xgboost)/2
ensem_valid_acc = accuracy_score(y_test, np.where(valid_ensem >= 0.5, 1, 0))
ensem_valid_auc = roc_auc_score(y_test, valid_ensem)

In [596]:
print(f'TRAIN LGBM AUC : {lgbm_train_auc} ACC : {lgbm_train_acc}')
print(f'VALID LGBM AUC : {lgbm_valid_auc} ACC : {lgbm_valid_acc}\n')

print(f'TRAIN XGBOOST AUC : {xgboost_train_auc} ACC : {xgboost_train_acc}')
print(f'VALID XGBOOST AUC : {xgboost_valid_auc} ACC : {xgboost_valid_acc}\n')

print(f'TRAIN META AUC : {meta_train_auc} ACC : {meta_train_acc}')
print(f'VALID META AUC : {meta_valid_auc} ACC : {meta_valid_acc}\n')

print(f'TRAIN ENSEM_5 AUC : {ensem_train_auc} ACC : {ensem_train_acc}')
print(f'VALID ENSEM_5 AUC : {ensem_valid_auc} ACC : {ensem_valid_acc}')

TRAIN LGBM AUC : 0.8435949562103464 ACC : 0.7879768522283733
VALID LGBM AUC : 0.825441141750568 ACC : 0.7764586753357073

TRAIN XGBOOST AUC : 0.8603065456885014 ACC : 0.798779692048139
VALID XGBOOST AUC : 0.8253224427676324 ACC : 0.7764645539799157

TRAIN META AUC : 0.8537903732638416 ACC : 0.7933962400947699
VALID META AUC : 0.8263644130435972 ACC : 0.7754093373445282

TRAIN ENSEM_5 AUC : 0.8532725119653202 ACC : 0.7939327569717247
VALID ENSEM_5 AUC : 0.826379132888058 ACC : 0.7769980409418176


In [597]:
# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path)

# FEATURE ENGINEERING
test_df = feature_engineering(test_df)
test_df = preprocessing(test_df, is_train=False)

test_df = test_df.sort_values(['userID', 'Timestamp'])

# diff: nan은 -1 , 600(10분) 이상이면 다 600
test_df['diff'] = test_df.sort_values(['userID','Timestamp']).groupby('userID')['Timestamp'].diff()#periods=-1).apply(abs)
test_df['diff'].fillna(-1, inplace=True)
idx = test_df[test_df['diff'] >= 600].index
test_df.loc[idx, 'diff'] = 600

tmp = test_df[test_df['diff'] >= 0]
correct_k = tmp.groupby(['KnowledgeTag'])['diff'].agg(['mean'])
correct_k.columns = ['diffMean']
test_df = pd.merge(test_df, correct_k, on=['KnowledgeTag'], how="left")

# min max scalar
# test_df['mean'] = diff_scalar.transform(test_df['mean'].values.reshape(-1, 1))
test_df['user_total_answer'] = user_total_scaler.transform(test_df['user_total_answer'].values.reshape(-1, 1))
test_df['user_correct_answer'] = user_correct_scalar.transform(test_df['user_correct_answer'].values.reshape(-1, 1))
test_df['Timestamp'] = tmestamp_scalar.transform(test_df['Timestamp'].values.reshape(-1, 1))
test_df['test_sum'] = test_sum_scalar.transform(test_df['test_sum'].values.reshape(-1, 1))
test_df['tag_sum'] = tag_sum_scalar.transform(test_df['tag_sum'].values.reshape(-1, 1))

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)
test_df.fillna(0, axis=1, inplace=True)

In [600]:
lgbm_test_preds = model.predict(test_df[features])
xg_test_preds = xg.predict_proba(test_df[features])
xg_test_preds = xg_test_preds[:,-1]

## valid
ensem_test = pd.DataFrame()
ensem_test['lgbm'] = lgbm_test_preds
ensem_test['xg'] =  xg_test_preds

# meta_test_pred = meta_model.predict_proba(ensem_test)
# meta_test_pred = meta_test_pred[:,-1]
meta_test_pred = (lgbm_test_preds + xg_test_preds) / 2

In [601]:
# SAVE OUTPUT
output_dir = 'output/lgbm/'
write_path = os.path.join(output_dir, "meta_xg_lgbm.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(meta_test_pred):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/lgbm/meta_xg_lgbm.csv
