In [1]:
import pandas as pd
import os
import random
import numpy as np

In [167]:
data_dir = '/opt/ml/input/data' # Adjust this path to match your environment.
csv_file_path = os.path.join(data_dir, 'train_data.csv') # Download the data from the competition homepage.
df = pd.read_csv(csv_file_path) 

# 전처리

In [168]:
def feature_engineering(df):
    """Add per-user running statistics and per-test / per-tag accuracy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction log containing at least ``userID``, ``Timestamp``,
        ``answerCode``, ``testId`` and ``KnowledgeTag``.

    Returns
    -------
    pandas.DataFrame
        A new frame ordered by ``(userID, Timestamp)`` with these extra
        columns:

        * ``user_correct_answer`` - sum of the user's earlier ``answerCode``
          values (NaN on the user's first row).
        * ``user_total_answer`` - number of earlier rows of the same user.
        * ``user_acc`` - ratio of the two columns above (NaN on the first row).
        * ``test_mean`` / ``test_sum`` - mean and sum of ``answerCode`` per
          ``testId`` over every row of ``df``.
        * ``tag_mean`` / ``tag_sum`` - the same per ``KnowledgeTag``.

    The frame passed in is not modified.
    """
    # Work on a sorted copy: sorting in place would reorder the caller's frame
    # and the column assignments below would leak into it as well.
    df = df.sort_values(by=['userID', 'Timestamp'])

    # Running counts per user in chronological order. shift(1) excludes the
    # current row, so each feature only sees the user's earlier answers.
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(
        lambda answers: answers.cumsum().shift(1)
    )
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']

    # Overall accuracy per testId and per KnowledgeTag, computed in one pass
    # over the whole frame.
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    # Left merges keep the sorted row order of the left frame.
    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")

    return df

In [169]:
# Build the engineered features and preview the first rows.
# `df` is rebound to the result, so re-running this cell feeds the function
# its own output.
df = feature_engineering(df)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,,0,,0.947683,1268,0.955022,637
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1.0,1,1.0,0.947683,1268,0.913187,3040
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,0.947683,1268,0.913187,3040
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,0.947683,1268,0.913187,3040
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,0.947683,1268,0.913187,3040


In [170]:
from sklearn.preprocessing import LabelEncoder
import time
from datetime import datetime
asset_dir = '/opt/ml/input/code/'#/asset/'

def preprocessing(df, is_train=True):
    """Label-encode the categorical ID columns and turn ``Timestamp`` into epoch seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``assessmentItemID``, ``testId``,
        ``KnowledgeTag`` and ``Timestamp`` (text formatted as
        ``%Y-%m-%d %H:%M:%S``). The frame is modified in place.
    is_train : bool, default True
        When True, a ``LabelEncoder`` is fitted per column (with an extra
        ``"unknown"`` class) and its classes are saved to
        ``<asset_dir>/<column>_classes.npy``. When False, the saved classes
        are loaded and any value not among them is encoded as ``"unknown"``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three ID columns replaced by integer
        codes and ``Timestamp`` replaced by integer seconds since
        1970-01-01 00:00:00.
    """
    cate_cols = ["assessmentItemID", "testId", "KnowledgeTag"]

    # np.save does not create missing directories, so make sure the target exists.
    if is_train and asset_dir:
        os.makedirs(asset_dir, exist_ok=True)

    for col in cate_cols:
        le = LabelEncoder()
        classes_path = os.path.join(asset_dir, col + "_classes.npy")
        print(classes_path)

        # Every categorical column is handled as text.
        values = df[col].astype(str)

        if is_train:
            # Reserve an explicit class for values that only show up at inference.
            le.fit(values.unique().tolist() + ["unknown"])
            np.save(classes_path, le.classes_)
        else:
            le.classes_ = np.load(classes_path)
            # Vectorised membership test; a per-row `in` on the class array
            # would scan the whole array for every row.
            values = values.where(values.isin(le.classes_), "unknown")

        df[col] = le.transform(values)

    # Timezone-independent conversion: the timestamps carry no zone, so they
    # are read as-is and counted in whole seconds from the epoch. time.mktime
    # would interpret them in the host's local zone and give machine-dependent
    # values.
    parsed = pd.to_datetime(df["Timestamp"], format="%Y-%m-%d %H:%M:%S")
    df["Timestamp"] = (parsed - pd.Timestamp("1970-01-01")) // pd.Timedelta(seconds=1)

    return df

In [171]:
# Fit and save the label encoders (is_train=True) and encode `df`.
df = preprocessing(df, is_train=True)

/opt/ml/input/code/assessmentItemID_classes.npy
/opt/ml/input/code/testId_classes.npy
/opt/ml/input/code/KnowledgeTag_classes.npy


In [172]:
# Order rows chronologically within each user.
df = df.sort_values(['userID', 'Timestamp'])

In [173]:
# diff: seconds elapsed since the same user's previous interaction
# (NaN on each user's first row).
df['diff'] = df.sort_values(['userID', 'Timestamp']).groupby('userID')['Timestamp'].diff()

# First interactions (NaN) become the sentinel -1; gaps of 600 s (10 minutes)
# or more are capped at 600. The result is assigned back to the column rather
# than filled in place through `df['diff']`, which is a chained write that may
# not reach `df`.
df['diff'] = df['diff'].fillna(-1).clip(upper=600)

In [175]:
# Mean elapsed time per KnowledgeTag, ignoring the -1 sentinel rows, attached
# to every row as the column 'mean'.
tmp = df.loc[df['diff'] >= 0]
correct_k = tmp.groupby('KnowledgeTag')['diff'].mean().to_frame('mean')
df = df.merge(correct_k, on='KnowledgeTag', how='left')

In [177]:
# Train and test rows must be separated user by user, so that no user
# contributes rows to both sets.
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True, seed=42):
    """Split ``df`` into train and test frames along user boundaries.

    Users are shuffled and added to the train set one by one until adding the
    next user would push the number of train rows above ``ratio * len(df)``.
    Every remaining user goes to the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``userID`` column.
    ratio : float, default 0.7
        Upper bound on the fraction of rows placed in the train set.
    split : bool, default True
        Unused; kept so existing calls keep working.
    seed : int, default 42
        Seed of the private random generator used for the shuffle. With the
        default, a call gives the same split every time it is made.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` and ``test`` row subsets of ``df``.
    """
    counts = df['userID'].value_counts()
    users = list(zip(counts.index, counts))
    # A private generator keeps the split independent of how often the global
    # `random` state has been consumed elsewhere in the session.
    random.Random(seed).shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    test = df[~in_train]

    # Optional variant (disabled): keep only each user's last interaction in
    # the test set.
    #test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [178]:
# Display the assembled feature table.
df

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,test_mean,test_sum,tag_mean,tag_sum,diff,mean
0,0,5354,975,1,1585009031,618,,0,,0.947683,1268,0.955022,637,-1.0,345.923228
1,0,5355,975,1,1585009034,619,1.0,1,1.000000,0.947683,1268,0.913187,3040,3.0,59.008799
2,0,5356,975,1,1585009042,619,2.0,2,1.000000,0.947683,1268,0.913187,3040,8.0,59.008799
3,0,5357,975,1,1585009049,619,3.0,3,1.000000,0.947683,1268,0.913187,3040,7.0,59.008799
4,0,5358,975,1,1585009056,619,4.0,4,1.000000,0.947683,1268,0.913187,3040,7.0,59.008799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,7441,2373,456,0,1591339821,375,1.0,4,0.250000,0.662590,921,0.689706,2814,24.0,160.096973
2266582,7441,3909,748,1,1597971999,784,1.0,5,0.200000,0.655109,718,0.697874,2199,600.0,162.502079
2266583,7441,3910,748,1,1597972010,784,2.0,6,0.333333,0.655109,718,0.697874,2199,11.0,162.502079
2266584,7441,3911,748,1,1597972056,784,3.0,7,0.428571,0.655109,718,0.697874,2199,46.0,162.502079


In [179]:
# List the columns available for feature selection.
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc',
       'test_mean', 'test_sum', 'tag_mean', 'tag_sum', 'diff', 'mean'],
      dtype='object')

In [180]:
# Split by user.
train, test = custom_train_test_split(df)

# All candidate columns, for reference:
#   'userID', 'assessmentItemID', 'testId', 'Timestamp',
#   'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'user_acc',
#   'test_mean', 'test_sum', 'tag_mean', 'tag_sum', 'diff', 'mean'
# Features actually fed to the models ('tag_sum' is left out).
FEATS = ['KnowledgeTag', 'user_correct_answer',
         'user_acc', 'test_mean', 'test_sum', 'tag_mean',
         'assessmentItemID', 'Timestamp', 'diff', 'mean']

# Separate the target from the features and replace missing values with 0.
# A scalar fill does not depend on the axis, so the default axis is used, and
# the result is assigned instead of mutating the frame in place.
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1).fillna(0)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1).fillna(0)

In [181]:
from sklearn.preprocessing import MinMaxScaler

# One MinMaxScaler per column. Each scaler keeps its own module-level name
# because the inference cell applies the fitted scalers to the submission data.
user_total_scaler = MinMaxScaler()
user_correct_scalar = MinMaxScaler()
tmestamp_scalar = MinMaxScaler()
test_sum_scalar = MinMaxScaler()
tag_sum_scalar = MinMaxScaler()

# Fit on the training split only, then apply the same mapping to the
# validation split.
for scaled_col, col_scaler in (
    ('user_total_answer', user_total_scaler),
    ('user_correct_answer', user_correct_scalar),
    ('Timestamp', tmestamp_scalar),
    ('test_sum', test_sum_scalar),
    ('tag_sum', tag_sum_scalar),
):
    train[scaled_col] = col_scaler.fit_transform(train[scaled_col].values.reshape(-1, 1))
    test[scaled_col] = col_scaler.transform(test[scaled_col].values.reshape(-1, 1))

# 모델링

### lightgbm

In [182]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

# Wrap the selected features and targets of both splits as LightGBM datasets.
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

In [183]:
def model_train():
    """Train a binary LightGBM model with early stopping and print validation metrics.

    Reads the module-level ``lgb_train``, ``lgb_test``, ``test``, ``FEATS``
    and ``y_test``.

    Returns
    -------
    The booster returned by ``lgb.train``.
    """
    model = lgb.train(
        {'objective': 'binary'},
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        num_boost_round=2500,
        # Early stopping and periodic logging are configured through
        # callbacks; the former `early_stopping_rounds` / `verbose_eval`
        # keyword arguments are no longer accepted by recent LightGBM releases.
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

    # Validation metrics: AUC on the raw scores, accuracy at a 0.5 threshold.
    preds = model.predict(test[FEATS])
    acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_test, preds)

    print(f'VALID AUC : {auc} ACC : {acc}\n')

    return model

In [184]:
# Train the LightGBM model; training progress is printed below.
model = model_train()



[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2547
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.520279	valid_1's binary_logloss: 0.52452
[200]	training's binary_logloss: 0.516122	valid_1's binary_logloss: 0.52202
[300]	training's binary_logloss: 0.513424	valid_1's binary_logloss: 0.521018
[400]	training's binary_logloss: 0.510904	valid_1's binary_logloss: 0.520107
[500]	training's binary_logloss: 0.508831	valid_1's binary_logloss: 0.519422
[600]	training's binary_logloss: 0.506856	valid_1's binary_logloss: 0.51886
[700]	training's binar

In [185]:
# LightGBM predictions on both splits; the ensemble cells further down reuse them.
lgbm_train_preds = model.predict(train[FEATS])
lgbm_valid_preds = model.predict(test[FEATS])


### xgboost

In [213]:
import xgboost as xgb
from xgboost import XGBClassifier

# Hyper-parameters handed to the XGBClassifier constructor.
params = {
    'learning_rate': 0.1,
    'objective': 'binary:logistic',
    'min_child_weight': 1,
    'eval_metric': 'auc',             # metric reported on eval_set during fit
    'n_estimators': 800,
    'early_stopping_rounds': 100,
    'max_depth': 12, 
    'max_leaves':5, 
    }

xg = XGBClassifier(**params)

# The validation split is passed as eval_set; it is the `validation_0`
# entry in the training log below.
xg.fit(train[FEATS], y_train, eval_set=[(test[FEATS], y_test)])

[0]	validation_0-auc:0.76518
[1]	validation_0-auc:0.76891
[2]	validation_0-auc:0.77056
[3]	validation_0-auc:0.77177
[4]	validation_0-auc:0.77237
[5]	validation_0-auc:0.77285
[6]	validation_0-auc:0.77352
[7]	validation_0-auc:0.77404
[8]	validation_0-auc:0.77444
[9]	validation_0-auc:0.77508
[10]	validation_0-auc:0.77549
[11]	validation_0-auc:0.77585
[12]	validation_0-auc:0.77614
[13]	validation_0-auc:0.77648
[14]	validation_0-auc:0.77685
[15]	validation_0-auc:0.77714
[16]	validation_0-auc:0.77743
[17]	validation_0-auc:0.77762
[18]	validation_0-auc:0.77793
[19]	validation_0-auc:0.77813
[20]	validation_0-auc:0.77840
[21]	validation_0-auc:0.77861
[22]	validation_0-auc:0.77892
[23]	validation_0-auc:0.77908
[24]	validation_0-auc:0.77930
[25]	validation_0-auc:0.77955
[26]	validation_0-auc:0.77971
[27]	validation_0-auc:0.77985
[28]	validation_0-auc:0.77999
[29]	validation_0-auc:0.78027
[30]	validation_0-auc:0.78049
[31]	validation_0-auc:0.78074
[32]	validation_0-auc:0.78115
[33]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=100, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=5, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=800,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [214]:
# Class-probability arrays for both splits; the metrics cell below keeps only
# the last column of each.
xg_train_preds = xg.predict_proba(train[FEATS])
xg_valid_preds = xg.predict_proba(test[FEATS])

# 앙상블

In [215]:
# Stand-alone performance of each base model (accuracy at a 0.5 threshold,
# AUC on the raw scores).
lgbm_train_acc = accuracy_score(y_train, np.where(lgbm_train_preds >= 0.5, 1, 0))
lgbm_train_auc = roc_auc_score(y_train, lgbm_train_preds)

lgbm_valid_acc = accuracy_score(y_test, np.where(lgbm_valid_preds >= 0.5, 1, 0))
lgbm_valid_auc = roc_auc_score(y_test, lgbm_valid_preds)

# Keep only the last probability column. The dimensionality guard makes the
# cell safe to re-run: once the arrays are 1-D, slicing them with [:, -1]
# again would raise an IndexError.
if np.ndim(xg_train_preds) == 2:
    xg_train_preds = xg_train_preds[:, -1]
if np.ndim(xg_valid_preds) == 2:
    xg_valid_preds = xg_valid_preds[:, -1]

xgboost_train_acc = accuracy_score(y_train, np.where(xg_train_preds >= 0.5, 1, 0))
xgboost_train_auc = roc_auc_score(y_train, xg_train_preds)

xgboost_valid_acc = accuracy_score(y_test, np.where(xg_valid_preds >= 0.5, 1, 0))
xgboost_valid_auc = roc_auc_score(y_test, xg_valid_preds)

In [216]:
# 메타러닝
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

# Stacking: the two base-model scores are the only inputs of the meta model.
# NOTE(review): the meta model is fitted on base-model predictions for the
# same rows the base models were trained on.
ensem_train = pd.DataFrame({'lgbm': lgbm_train_preds, 'xg': xg_train_preds})

# Alternatives that can be swapped in: LogisticRegression(), DecisionTreeClassifier()
meta_model = GaussianNB()
meta_model.fit(ensem_train, y_train)

# Last probability column of the meta model, scored on the training split.
meta_train_pred = meta_model.predict_proba(ensem_train)[:, -1]
meta_train_acc = accuracy_score(y_train, np.where(meta_train_pred >= 0.5, 1, 0))
meta_train_auc = roc_auc_score(y_train, meta_train_pred)

# Same procedure on the validation split.
ensem_valid = pd.DataFrame({'lgbm': lgbm_valid_preds, 'xg': xg_valid_preds})

meta_valid_pred = meta_model.predict_proba(ensem_valid)[:, -1]
meta_valid_acc = accuracy_score(y_test, np.where(meta_valid_pred >= 0.5, 1, 0))
meta_valid_auc = roc_auc_score(y_test, meta_valid_pred)

In [217]:
# Baseline ensemble: equal-weight average of the two base-model scores.
train_ensem_5 = 0.5 * (ensem_train['lgbm'] + ensem_train['xg'])
ensem_5_train_acc = accuracy_score(y_train, (train_ensem_5 >= 0.5).astype(int).to_numpy())
ensem_5_train_auc = roc_auc_score(y_train, train_ensem_5)

valid_ensem_5 = 0.5 * (ensem_valid['lgbm'] + ensem_valid['xg'])
ensem_5_valid_acc = accuracy_score(y_test, (valid_ensem_5 >= 0.5).astype(int).to_numpy())
ensem_5_valid_auc = roc_auc_score(y_test, valid_ensem_5)

In [218]:
# Side-by-side train / validation AUC and accuracy for the two base models,
# the stacked meta model and the equal-weight average.
print(f'TRAIN LGBM AUC : {lgbm_train_auc} ACC : {lgbm_train_acc}')
print(f'VALID LGBM AUC : {lgbm_valid_auc} ACC : {lgbm_valid_acc}\n')

print(f'TRAIN XGBOOST AUC : {xgboost_train_auc} ACC : {xgboost_train_acc}')
print(f'VALID XGBOOST AUC : {xgboost_valid_auc} ACC : {xgboost_valid_acc}\n')

print(f'TRAIN META AUC : {meta_train_auc} ACC : {meta_train_acc}')
print(f'VALID META AUC : {meta_valid_auc} ACC : {meta_valid_acc}\n')

print(f'TRAIN ENSEM_5 AUC : {ensem_5_train_auc} ACC : {ensem_5_train_acc}')
print(f'VALID ENSEM_5 AUC : {ensem_5_valid_auc} ACC : {ensem_5_valid_acc}')

TRAIN LGBM AUC : 0.817897566425905 ACC : 0.7726360000933072
VALID LGBM AUC : 0.7844633271046747 ACC : 0.7557232275520297

TRAIN XGBOOST AUC : 0.9089044442253097 ACC : 0.8274180929126184
VALID XGBOOST AUC : 0.8020092042657967 ACC : 0.7642310953824719

TRAIN META AUC : 0.886815593222321 ACC : 0.8169140885801343
VALID META AUC : 0.8003611384783527 ACC : 0.7619707566843859

TRAIN ENSEM_5 AUC : 0.876101012354168 ACC : 0.8004857022350247
VALID ENSEM_5 AUC : 0.7986952096820791 ACC : 0.7620736329580309


# Inference

In [193]:
# Load the submission data.
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path)

# Same feature pipeline as for training; is_train=False loads the saved
# label-encoder classes instead of fitting new ones.
# NOTE(review): feature_engineering derives test_mean / test_sum / tag_mean /
# tag_sum from the frame it is given, so here they come from test_df rather
# than from the training data - confirm this is intended.
test_df = feature_engineering(test_df)
test_df = preprocessing(test_df, is_train=False)

test_df = test_df.sort_values(['userID', 'Timestamp'])

# diff: seconds since the user's previous interaction. NaN (first row of a
# user) becomes -1 and gaps of 600 s (10 minutes) or more are capped at 600.
# The result is assigned back rather than filled in place through
# `test_df['diff']`, which is a chained write that may not reach the frame.
test_df['diff'] = test_df.sort_values(['userID', 'Timestamp']).groupby('userID')['Timestamp'].diff()
test_df['diff'] = test_df['diff'].fillna(-1).clip(upper=600)

# Mean elapsed time per KnowledgeTag (sentinel rows excluded), as column 'mean'.
tmp = test_df[test_df['diff'] >= 0]
correct_k = tmp.groupby(['KnowledgeTag'])['diff'].agg(['mean'])
test_df = pd.merge(test_df, correct_k, on=['KnowledgeTag'], how="left")

# Apply the scalers fitted on the training split.
test_df['user_total_answer'] = user_total_scaler.transform(test_df['user_total_answer'].values.reshape(-1, 1))
test_df['user_correct_answer'] = user_correct_scalar.transform(test_df['user_correct_answer'].values.reshape(-1, 1))
test_df['Timestamp'] = tmestamp_scalar.transform(test_df['Timestamp'].values.reshape(-1, 1))
test_df['test_sum'] = test_sum_scalar.transform(test_df['test_sum'].values.reshape(-1, 1))
test_df['tag_sum'] = tag_sum_scalar.transform(test_df['tag_sum'].values.reshape(-1, 1))

# Keep only each user's last interaction: the rows whose next row belongs to
# a different user.
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# Drop the target column and replace missing values with 0. A scalar fill
# does not depend on the axis, so the default axis is used.
test_df = test_df.drop(['answerCode'], axis=1).fillna(0)

/opt/ml/input/code/assessmentItemID_classes.npy
/opt/ml/input/code/testId_classes.npy
/opt/ml/input/code/KnowledgeTag_classes.npy


In [194]:
# Base-model scores for the submission rows; for XGBoost only the last
# probability column is kept.
lgbm_test_preds = model.predict(test_df[FEATS])
xg_test_preds = xg.predict_proba(test_df[FEATS])[:, -1]

# Feed both scores to the fitted meta model and keep its last probability column.
ensem_test = pd.DataFrame({'lgbm': lgbm_test_preds, 'xg': xg_test_preds})
meta_test_pred = meta_model.predict_proba(ensem_test)[:, -1]

In [196]:
# Write the meta-model predictions as "id,prediction" rows, where id is the
# zero-based position of the prediction.
output_dir = 'output/lgbm/'
write_path = os.path.join(output_dir, "meta_xg_lgbm.csv")
# exist_ok avoids the separate existence check and the race that comes with it.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` rather than `id`, so the builtin id() is not shadowed for the
    # rest of the session.
    for row_id, p in enumerate(meta_test_pred):
        w.write('{},{}\n'.format(row_id, p))

writing prediction : output/lgbm/meta_xg_lgbm.csv
