# **Import**

In [1]:
import warnings

warnings.filterwarnings('ignore')

In [19]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.calibration import calibration_curve

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import shap

# **Data Load**

In [3]:
cd /content/drive/MyDrive/[Projects]/Dacon/운수종사자 인지적 특성 데이터를 활용한 교통사고 위험 예측 AI 경진대회/Data

/content/drive/MyDrive/[Projects]/Dacon/운수종사자 인지적 특성 데이터를 활용한 교통사고 위험 예측 AI 경진대회/Data


In [6]:
train_a = pd.read_csv('./train_a_renamed.csv')
train_a = train_a.dropna()
train_a.info()

<class 'pandas.core.frame.DataFrame'>
Index: 647237 entries, 0 to 647240
Data columns (total 38 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   Test_id             647237 non-null  object
 1   Test                647237 non-null  object
 2   Label               647237 non-null  int64 
 3   PrimaryKey          647237 non-null  object
 4   Age                 647237 non-null  object
 5   TestDate            647237 non-null  int64 
 6   A1_Direction        647237 non-null  object
 7   A1_Speed            647237 non-null  object
 8   A1_Response         647237 non-null  object
 9   A1_ResponseTime     647237 non-null  object
 10  A2_Speed1           647237 non-null  object
 11  A2_Speed2           647237 non-null  object
 12  A2_Response         647237 non-null  object
 13  A2_ResponseTime     647237 non-null  object
 14  A3_ArrowSize        647237 non-null  object
 15  A3_ArrowPosition    647237 non-null  object
 16  A3_Arro

# **Features**

In [7]:
list_cols = ['A1_Direction', 'A1_Speed', 'A1_Response', 'A1_ResponseTime',
             'A2_Speed1', 'A2_Speed2', 'A2_Response', 'A2_ResponseTime',
             'A3_ArrowSize', 'A3_ArrowPosition', 'A3_ArrowDirection',
             'A3_CorrectPosition', 'A3_ResponseType', 'A3_Response', 'A3_ResponseTime',
             'A4_Condition', 'A4_Color', 'A4_Response1', 'A4_Response2', 'A4_ResponseTime',
             'A5_ChangeType', 'A5_Response1', 'A5_Response']

for col in list_cols:
    train_a[col + '_list'] = train_a[col].apply(lambda x: list(map(int, x.split(','))) if isinstance(x, str) else x)

In [8]:
train_a['Age'] = train_a['Age'].astype(str).str.extract(r'(\d+)').astype(int)
print(train_a['Age'].head())

0    50
1    60
2    50
3    50
4    40
Name: Age, dtype: int64


In [9]:
def compute_A1_features(row):
    # 리스트 추출
    d = row['A1_Direction_list'] or []
    s = row['A1_Speed_list'] or []
    r = row['A1_Response_list'] or []
    rt = row['A1_ResponseTime_list'] or []

    L = min(len(d), len(s), len(r), len(rt))
    if L == 0:
        return pd.Series({
            'A1_response_rate': 0,
            'A1_left_response_rate': 0,
            'A1_right_response_rate': 0,
            'A1_fast_response_rate': 0,
            'A1_mean_response_time': np.nan,
            'A1_fast_avg_rt': np.nan,
            'A1_direction_diff_rt': np.nan
        })

    d, s, r, rt = np.array(d[:L]), np.array(s[:L]), np.array(r[:L]), np.array(rt[:L])

    # 전체 응답률
    A1_response_rate = r.mean()

    # 왼쪽/오른쪽 조건 응답률
    A1_left_response_rate = r[d == 1].mean() if np.any(d == 1) else 0
    A1_right_response_rate = r[d == 2].mean() if np.any(d == 2) else 0

    # 빠름 조건(3)의 응답률
    A1_fast_response_rate = r[s == 3].mean() if np.any(s == 3) else 0

    # 평균 반응시간 (응답한 trial만)
    valid_rt = rt[r == 1]
    A1_mean_response_time = valid_rt.mean() if len(valid_rt) > 0 else np.nan

    # 빠름 조건에서의 평균 반응시간
    fast_rt = rt[(s == 3) & (r == 1)]
    A1_fast_avg_rt = fast_rt.mean() if len(fast_rt) > 0 else np.nan

    # 방향별 반응시간 차이 (left - right)
    left_rt = rt[(d == 1) & (r == 1)]
    right_rt = rt[(d == 2) & (r == 1)]
    A1_direction_diff_rt = left_rt.mean() - right_rt.mean() if len(left_rt) > 0 and len(right_rt) > 0 else np.nan

    return pd.Series({
        'A1_response_rate': A1_response_rate,
        'A1_left_response_rate': A1_left_response_rate,
        'A1_right_response_rate': A1_right_response_rate,
        'A1_fast_response_rate': A1_fast_response_rate,
        'A1_mean_response_time': A1_mean_response_time,
        'A1_fast_avg_rt': A1_fast_avg_rt,
        'A1_direction_diff_rt': A1_direction_diff_rt
    })


# 적용
a1_feats = train_a.apply(compute_A1_features, axis=1)
train_a = pd.concat([train_a, a1_feats], axis=1)

# NaN 처리: 각 컬럼 최대값으로 채우기
rt_cols = ['A1_mean_response_time', 'A1_fast_avg_rt', 'A1_direction_diff_rt']
for col in rt_cols:
    max_val = train_a[col].max(skipna=True)
    train_a[col].fillna(max_val, inplace=True)

# 결과 확인
print(train_a[['A1_response_rate','A1_left_response_rate','A1_right_response_rate',
               'A1_fast_response_rate','A1_mean_response_time','A1_fast_avg_rt','A1_direction_diff_rt']].head())


   A1_response_rate  A1_left_response_rate  A1_right_response_rate  \
0          0.000000               0.000000                0.000000   
1          0.000000               0.000000                0.000000   
2          0.000000               0.000000                0.000000   
3          0.277778               0.333333                0.222222   
4          0.000000               0.000000                0.000000   

   A1_fast_response_rate  A1_mean_response_time  A1_fast_avg_rt  \
0               0.000000                  753.0           817.0   
1               0.000000                  753.0           817.0   
2               0.000000                  753.0           817.0   
3               0.333333                  646.6           723.5   
4               0.000000                  753.0           817.0   

   A1_direction_diff_rt  
0                1585.0  
1                1585.0  
2                1585.0  
3                 -24.0  
4                1585.0  


In [10]:
def compute_A2_features(row):
    s1 = row['A2_Speed1_list'] or []
    s2 = row['A2_Speed2_list'] or []
    r  = row['A2_Response_list'] or []
    rt = row['A2_ResponseTime_list'] or []

    L = min(len(s1), len(s2), len(r), len(rt))
    if L == 0:
        return pd.Series({
            'A2_response_rate': 0,
            'A2_slow_to_fast_rt_diff': np.nan,
            'A2_correct_ratio_by_speed': np.nan,
            'A2_mean_response_time': np.nan
        })

    s1, s2, r, rt = np.array(s1[:L]), np.array(s2[:L]), np.array(r[:L]), np.array(rt[:L])

    # 전체 응답률
    A2_response_rate = r.mean()

    # 느림/빠름 인덱스 (가정: 1=느림, 2=빠름)
    slow_idx = np.where(s1 == 1)[0]
    fast_idx = np.where(s1 == 2)[0]

    # 느림→빠름 조건 반응시간 차이
    slow_rt = rt[slow_idx & (r[slow_idx]==1)] if len(slow_idx) > 0 else np.array([])
    fast_rt = rt[fast_idx & (r[fast_idx]==1)] if len(fast_idx) > 0 else np.array([])
    A2_slow_to_fast_rt_diff = fast_rt.mean() - slow_rt.mean() if len(slow_rt) > 0 and len(fast_rt) > 0 else np.nan

    # 속도 조건별 응답률 비교 (fast/slow)
    slow_resp = r[slow_idx].mean() if len(slow_idx) > 0 else np.nan
    fast_resp = r[fast_idx].mean() if len(fast_idx) > 0 else np.nan
    if not np.isnan(slow_resp) and not np.isnan(fast_resp) and slow_resp != 0:
        A2_correct_ratio_by_speed = fast_resp / slow_resp
    else:
        A2_correct_ratio_by_speed = np.nan

    # 전체 평균 반응시간 (응답한 trial만)
    valid_rt = rt[r == 1]
    A2_mean_response_time = valid_rt.mean() if len(valid_rt) > 0 else np.nan

    return pd.Series({
        'A2_response_rate': A2_response_rate,
        'A2_slow_to_fast_rt_diff': A2_slow_to_fast_rt_diff,
        'A2_correct_ratio_by_speed': A2_correct_ratio_by_speed,
        'A2_mean_response_time': A2_mean_response_time
    })

# 적용
a2_feats = train_a.apply(compute_A2_features, axis=1)
train_a = pd.concat([train_a, a2_feats], axis=1)

# NaN 처리: 각 컬럼 최대값으로 채우기
rt_cols = ['A2_slow_to_fast_rt_diff', 'A2_correct_ratio_by_speed', 'A2_mean_response_time']
for col in rt_cols:
    max_val = train_a[col].max(skipna=True)
    train_a[col].fillna(max_val, inplace=True)

# 결과 확인
print(train_a[['A2_response_rate','A2_slow_to_fast_rt_diff','A2_correct_ratio_by_speed','A2_mean_response_time']].head())

   A2_response_rate  A2_slow_to_fast_rt_diff  A2_correct_ratio_by_speed  \
0               0.0                      0.0                        6.0   
1               0.0                      0.0                        6.0   
2               0.0                      0.0                        6.0   
3               0.0                      0.0                        6.0   
4               0.0                      0.0                        6.0   

   A2_mean_response_time  
0                  999.0  
1                  999.0  
2                  999.0  
3                  999.0  
4                  999.0  


In [11]:
def compute_A3_features(row):
    arrow_size     = row['A3_ArrowSize_list'] or []
    arrow_pos      = row['A3_ArrowPosition_list'] or []
    arrow_dir      = row['A3_ArrowDirection_list'] or []
    correct_pos    = row['A3_CorrectPosition_list'] or []
    resp_type      = row['A3_ResponseType_list'] or []
    resp           = row['A3_Response_list'] or []
    rt             = row['A3_ResponseTime_list'] or []

    L = min(len(arrow_size), len(arrow_pos), len(arrow_dir), len(correct_pos), len(resp_type), len(resp), len(rt))
    if L == 0:
        return pd.Series({
            'A3_valid_accuracy': np.nan,
            'A3_invalid_accuracy': np.nan,
            'A3_total_accuracy': np.nan,
            'A3_valid_rt': np.nan,
            'A3_invalid_rt': np.nan,
            'A3_correct_rt': np.nan,
            'A3_incorrect_rt': np.nan,
            'A3_accuracy_gap': np.nan
        })

    # 배열로 변환
    arrow_size, arrow_pos, arrow_dir = np.array(arrow_size[:L]), np.array(arrow_pos[:L]), np.array(arrow_dir[:L])
    correct_pos, resp_type, resp, rt = np.array(correct_pos[:L]), np.array(resp_type[:L]), np.array(resp[:L]), np.array(rt[:L])

    # valid / invalid trial 인덱스 (예: 1=valid, 3=invalid)
    valid_idx = np.where(resp_type == 1)[0]
    invalid_idx = np.where(resp_type == 3)[0]

    # 정확도 계산
    A3_valid_accuracy = (resp[valid_idx] == 1).mean() if len(valid_idx) > 0 else np.nan
    A3_invalid_accuracy = (resp[invalid_idx] == 1).mean() if len(invalid_idx) > 0 else np.nan
    A3_total_accuracy = (resp == 1).mean() if len(resp) > 0 else np.nan

    # 반응시간 계산 (응답한 trial만)
    A3_valid_rt = rt[valid_idx].mean() if len(valid_idx) > 0 else np.nan
    A3_invalid_rt = rt[invalid_idx].mean() if len(invalid_idx) > 0 else np.nan
    A3_correct_rt = rt[resp == 1].mean() if np.any(resp == 1) else np.nan
    A3_incorrect_rt = rt[resp == 0].mean() if np.any(resp == 0) else np.nan

    # valid / invalid 정확도 차이
    if not np.isnan(A3_valid_accuracy) and not np.isnan(A3_invalid_accuracy):
        A3_accuracy_gap = A3_valid_accuracy - A3_invalid_accuracy
    else:
        A3_accuracy_gap = np.nan

    return pd.Series({
        'A3_valid_accuracy': A3_valid_accuracy,
        'A3_invalid_accuracy': A3_invalid_accuracy,
        'A3_total_accuracy': A3_total_accuracy,
        'A3_valid_rt': A3_valid_rt,
        'A3_invalid_rt': A3_invalid_rt,
        'A3_correct_rt': A3_correct_rt,
        'A3_incorrect_rt': A3_incorrect_rt,
        'A3_accuracy_gap': A3_accuracy_gap
    })


# 적용
a3_feats = train_a.apply(compute_A3_features, axis=1)
train_a = pd.concat([train_a, a3_feats], axis=1)

# NaN 처리: 반응시간 관련 컬럼 최대값으로 채우기
rt_cols = ['A3_valid_rt', 'A3_invalid_rt', 'A3_correct_rt', 'A3_incorrect_rt']
for col in rt_cols:
    max_val = train_a[col].max(skipna=True)
    train_a[col].fillna(max_val, inplace=True)

# 결과 확인
print(train_a[['A3_valid_accuracy','A3_invalid_accuracy','A3_total_accuracy',
               'A3_valid_rt','A3_invalid_rt','A3_correct_rt','A3_incorrect_rt','A3_accuracy_gap']].head())

   A3_valid_accuracy  A3_invalid_accuracy  A3_total_accuracy  A3_valid_rt  \
0                0.0                  0.0                0.0   728.833333   
1                0.0                  0.0                0.0  1083.791667   
2                0.0                  0.0                0.0   728.833333   
3                0.0                  0.0                0.0   712.416667   
4                0.0                  0.0                0.0   693.208333   

   A3_invalid_rt  A3_correct_rt  A3_incorrect_rt  A3_accuracy_gap  
0    1092.375000         2577.0        819.71875              0.0  
1    1015.250000         2577.0       1066.65625              0.0  
2    1092.375000         2577.0        819.71875              0.0  
3    1291.142857         2577.0        874.12500              0.0  
4    1202.750000         2577.0        820.59375              0.0  


In [12]:
# A3 결측치 최대값으로 채우기
for col in ['A3_valid_accuracy','A3_invalid_accuracy','A3_accuracy_gap']:
    max_val = train_a[col].max()
    train_a[col].fillna(max_val, inplace=True)

In [13]:
def compute_A4_features(row):
    condition = row['A4_Condition_list'] or []
    resp1     = row['A4_Response1_list'] or []
    resp2     = row['A4_Response2_list'] or []
    rt        = row['A4_ResponseTime_list'] or []

    L = min(len(condition), len(resp1), len(resp2), len(rt))
    if L == 0:
        return pd.Series({
            'A4_congruent_accuracy': np.nan,
            'A4_incongruent_accuracy': np.nan,
            'A4_accuracy_gap': np.nan,
            'A4_mean_rt_con': np.nan,
            'A4_mean_rt_incon': np.nan,
            'A4_rt_gap': np.nan,
            'A4_response_rate': 0
        })

    condition, resp1, resp2, rt = np.array(condition[:L]), np.array(resp1[:L]), np.array(resp2[:L]), np.array(rt[:L])

    # 응답이 있는 trial만
    valid_idx = np.where((resp1 != -1) & (resp2 != -1))[0]  # -1 등으로 결측 없음 가정
    response_rate = len(valid_idx)/L if L>0 else 0

    # congruent / incongruent trial
    con_idx = valid_idx[condition[valid_idx] == 1]
    incon_idx = valid_idx[condition[valid_idx] == 2]

    # 정확도 계산 (resp1==1이 정답)
    con_acc = (resp1[con_idx] == 1).mean() if len(con_idx) > 0 else np.nan
    incon_acc = (resp1[incon_idx] == 1).mean() if len(incon_idx) > 0 else np.nan
    acc_gap = con_acc - incon_acc if not np.isnan(con_acc) and not np.isnan(incon_acc) else np.nan

    # 반응시간 계산
    mean_rt_con = rt[con_idx].mean() if len(con_idx) > 0 else np.nan
    mean_rt_incon = rt[incon_idx].mean() if len(incon_idx) > 0 else np.nan
    rt_gap = mean_rt_incon - mean_rt_con if not np.isnan(mean_rt_con) and not np.isnan(mean_rt_incon) else np.nan

    return pd.Series({
        'A4_congruent_accuracy': con_acc,
        'A4_incongruent_accuracy': incon_acc,
        'A4_accuracy_gap': acc_gap,
        'A4_mean_rt_con': mean_rt_con,
        'A4_mean_rt_incon': mean_rt_incon,
        'A4_rt_gap': rt_gap,
        'A4_response_rate': response_rate
    })


# 적용
a4_feats = train_a.apply(compute_A4_features, axis=1)
train_a = pd.concat([train_a, a4_feats], axis=1)

# NaN 처리: 반응시간 관련 컬럼 최대값으로 채우기
rt_cols = ['A4_mean_rt_con', 'A4_mean_rt_incon', 'A4_rt_gap']
for col in rt_cols:
    max_val = train_a[col].max(skipna=True)
    train_a[col].fillna(max_val, inplace=True)

# 결과 확인
print(train_a[['A4_congruent_accuracy','A4_incongruent_accuracy','A4_accuracy_gap',
               'A4_mean_rt_con','A4_mean_rt_incon','A4_rt_gap','A4_response_rate']].head())

   A4_congruent_accuracy  A4_incongruent_accuracy  A4_accuracy_gap  \
0                  1.000                      1.0            0.000   
1                  0.975                      1.0           -0.025   
2                  1.000                      1.0            0.000   
3                  1.000                      1.0            0.000   
4                  0.975                      1.0           -0.025   

   A4_mean_rt_con  A4_mean_rt_incon  A4_rt_gap  A4_response_rate  
0         579.075           565.900    -13.175               1.0  
1         529.925           571.175     41.250               1.0  
2         579.075           565.900    -13.175               1.0  
3         597.675           605.375      7.700               1.0  
4         596.225           622.550     26.325               1.0  


In [14]:
def compute_A5_features(row):
    change_type = row['A5_ChangeType_list'] or []
    resp1       = row['A5_Response1_list'] or []
    resp2       = row['A5_Response_list'] or []  # 혹시 필요 시
    L = min(len(change_type), len(resp1))
    if L == 0:
        return pd.Series({
            'A5_accuracy_non_change': np.nan,
            'A5_accuracy_pos_change': np.nan,
            'A5_accuracy_color_change': np.nan,
            'A5_accuracy_shape_change': np.nan,
            'A5_accuracy_var': np.nan
        })

    change_type = np.array(change_type[:L])
    resp1 = np.array(resp1[:L])

    # 각 조건별 인덱스
    idx_non_change  = np.where(change_type == 1)[0]
    idx_pos_change  = np.where(change_type == 2)[0]
    idx_color_change = np.where(change_type == 3)[0]
    idx_shape_change = np.where(change_type == 4)[0]

    # 정확도 계산 (1=정답)
    acc_non = (resp1[idx_non_change] == 1).mean() if len(idx_non_change) > 0 else np.nan
    acc_pos = (resp1[idx_pos_change] == 1).mean() if len(idx_pos_change) > 0 else np.nan
    acc_color = (resp1[idx_color_change] == 1).mean() if len(idx_color_change) > 0 else np.nan
    acc_shape = (resp1[idx_shape_change] == 1).mean() if len(idx_shape_change) > 0 else np.nan

    # 변화 유형 간 정확도 분산
    acc_list = [acc_non, acc_pos, acc_color, acc_shape]
    acc_var = np.nanvar(acc_list)  # NaN 자동 무시

    return pd.Series({
        'A5_accuracy_non_change': acc_non,
        'A5_accuracy_pos_change': acc_pos,
        'A5_accuracy_color_change': acc_color,
        'A5_accuracy_shape_change': acc_shape,
        'A5_accuracy_var': acc_var
    })

# 적용
a5_feats = train_a.apply(compute_A5_features, axis=1)
train_a = pd.concat([train_a, a5_feats], axis=1)

# NaN 처리: 정확도 관련 컬럼은 최대값으로 채우기
acc_cols = ['A5_accuracy_non_change','A5_accuracy_pos_change','A5_accuracy_color_change','A5_accuracy_shape_change','A5_accuracy_var']
for col in acc_cols:
    max_val = train_a[col].max(skipna=True)
    train_a[col].fillna(max_val, inplace=True)

# 결과 확인
print(train_a[['A5_accuracy_non_change','A5_accuracy_pos_change','A5_accuracy_color_change','A5_accuracy_shape_change','A5_accuracy_var']].head())

   A5_accuracy_non_change  A5_accuracy_pos_change  A5_accuracy_color_change  \
0                1.000000                     1.0                  0.500000   
1                1.000000                     1.0                  0.000000   
2                1.000000                     1.0                  0.500000   
3                0.944444                     1.0                  0.333333   
4                1.000000                     1.0                  0.166667   

   A5_accuracy_shape_change  A5_accuracy_var  
0                  0.833333         0.041667  
1                  0.500000         0.171875  
2                  0.833333         0.041667  
3                  0.500000         0.081019  
4                  0.333333         0.144097  


In [15]:
# A6, A7 총 점수
train_a['A6_score'] = train_a['A6_Count']
train_a['A7_score'] = train_a['A7_Count']

# 정규화 점수 (z-score)
train_a['A6_zscore'] = (train_a['A6_score'] - train_a['A6_score'].mean()) / train_a['A6_score'].std()
train_a['A7_zscore'] = (train_a['A7_score'] - train_a['A7_score'].mean()) / train_a['A7_score'].std()

# 결과 확인
print(train_a[['A6_score','A6_zscore','A7_score','A7_zscore']].head())

   A6_score  A6_zscore  A7_score  A7_zscore
0        12   0.570933        15   0.857024
1        10  -0.108453        10  -0.407901
2        12   0.570933        17   1.362994
3        13   0.910627        14   0.604039
4        13   0.910627        10  -0.407901


In [16]:
# A8 점수
train_a['A8_distortion_score'] = train_a['A8_Count1']
train_a['A8_consistency_score'] = train_a['A8_Count2']

# distortion_flag 설정 (예: 5 초과 시 이상 응답)
threshold = 5
train_a['A8_distortion_flag'] = (train_a['A8_distortion_score'] > threshold).astype(int)

# 결과 확인
print(train_a[['A8_distortion_score','A8_consistency_score','A8_distortion_flag']].head())

   A8_distortion_score  A8_consistency_score  A8_distortion_flag
0                    0                     1                   0
1                    1                     2                   0
2                   10                     0                   1
3                    7                     1                   1
4                    9                     0                   1


In [17]:
# 개별 점수
train_a['A9_emotional_stability'] = train_a['A9_Count1']
train_a['A9_behavior_stability'] = train_a['A9_Count2']
train_a['A9_reality_checking'] = train_a['A9_Count3']
train_a['A9_cognitive_agility'] = train_a['A9_Count4']
train_a['A9_stress_level'] = train_a['A9_Count5']

# 총합 점수
train_a['A9_total_score'] = train_a[['A9_emotional_stability','A9_behavior_stability',
                                     'A9_reality_checking','A9_cognitive_agility','A9_stress_level']].sum(axis=1)

# 정서 / 행동 안정성 차이
train_a['A9_stability_gap'] = train_a['A9_emotional_stability'] - train_a['A9_behavior_stability']

# 결과 확인
print(train_a[['A9_emotional_stability','A9_behavior_stability','A9_reality_checking',
               'A9_cognitive_agility','A9_stress_level','A9_total_score','A9_stability_gap']].head())


   A9_emotional_stability  A9_behavior_stability  A9_reality_checking  \
0                      10                     12                    6   
1                       2                     15                    6   
2                       0                      0                    0   
3                       5                      1                    0   
4                       1                      6                    0   

   A9_cognitive_agility  A9_stress_level  A9_total_score  A9_stability_gap  
0                    10               14              52                -2  
1                     0                5              28               -13  
2                     0                0               0                 0  
3                    20                9              35                 4  
4                     6                2              15                -5  


# **Fit**

In [87]:
results = []

def expected_calibration_error(y_true, y_prob, n_bins=10):
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy='uniform')
    bin_totals = np.histogram(y_prob, bins=np.linspace(0, 1, n_bins + 1), density=False)[0]
    non_empty_bins = bin_totals > 0
    bin_weights = bin_totals / len(y_prob)
    bin_weights = bin_weights[non_empty_bins]
    prob_true = prob_true[:len(bin_weights)]
    prob_pred = prob_pred[:len(bin_weights)]
    ece = np.sum(bin_weights * np.abs(prob_true - prob_pred))
    return ece

def auc_brier_ece(answer_df, submission_df):
    if submission_df.isnull().values.any():
        raise ValueError("The submission dataframe contains missing values.")

    if len(answer_df.columns) != len(submission_df.columns) or not all(answer_df.columns == submission_df.columns):
        raise ValueError("The columns of the answer and submission dataframes do not match.")

    submission_df = submission_df[submission_df.iloc[:, 0].isin(answer_df.iloc[:, 0])]
    submission_df.index = range(submission_df.shape[0])

    auc_scores, brier_scores, ece_scores = [], [], []

    for column in answer_df.columns[1:]:
        y_true = answer_df[column]
        y_prob = submission_df[column]
        auc_scores.append(roc_auc_score(y_true, y_prob))
        brier_scores.append(mean_squared_error(y_true, y_prob))
        ece_scores.append(expected_calibration_error(y_true, y_prob))

    mean_auc = np.mean(auc_scores)
    mean_brier = np.mean(brier_scores)
    mean_ece = np.mean(ece_scores)
    combined_score = 0.5 * (1 - mean_auc) + 0.25 * mean_brier + 0.25 * mean_ece
    return combined_score

## **LightGBM**

In [88]:
lgb_results = []

### All Features

In [89]:
feature_cols = [
    'Age',
    # A1
    'A1_response_rate','A1_left_response_rate','A1_right_response_rate','A1_fast_response_rate',
    'A1_mean_response_time','A1_fast_avg_rt','A1_direction_diff_rt',
    # A2
    'A2_response_rate','A2_slow_to_fast_rt_diff','A2_correct_ratio_by_speed','A2_mean_response_time',
    # A3
    'A3_valid_accuracy','A3_invalid_accuracy','A3_total_accuracy',
    'A3_valid_rt','A3_invalid_rt','A3_correct_rt','A3_incorrect_rt','A3_accuracy_gap',
    # A4
    'A4_congruent_accuracy','A4_incongruent_accuracy','A4_accuracy_gap',
    'A4_mean_rt_con','A4_mean_rt_incon','A4_rt_gap','A4_response_rate',
    # A5
    'A5_accuracy_non_change','A5_accuracy_pos_change','A5_accuracy_color_change','A5_accuracy_shape_change','A5_accuracy_var',
    # A6
    'A6_score','A6_zscore',
    # A7
    'A7_score','A7_zscore',
    # A8
    'A8_distortion_score','A8_consistency_score','A8_distortion_flag',
    # A9
    'A9_emotional_stability','A9_behavior_stability','A9_reality_checking','A9_cognitive_agility','A9_stress_level','A9_total_score','A9_stability_gap'
]

In [90]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# LightGBM 학습
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    device='gpu',
    verbose=-1
)
lgb_model.fit(x_res, y_res)

# 예측 및 평가
y_prob = lgb_model.predict_proba(x_val_scaled)[:, 1]

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

lgb_results.append({
    'features': 'All Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'All Features (LGBM)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
lgb_results[-1]

{'features': 'All Features',
 'AUC': np.float64(0.6141374792910088),
 'Brier': 0.023137108324408878,
 'ECE': np.float64(0.009565034141253387),
 'Combined': np.float64(0.20110679597091116)}

### Top 20 Features

In [91]:
feature_cols = [
    'Age',
    # A1
    'A1_response_rate',
    # A3
    'A3_valid_rt',
    # A4
    'A4_congruent_accuracy','A4_incongruent_accuracy','A4_mean_rt_con','A4_mean_rt_incon',
    # A5
    'A5_accuracy_non_change','A5_accuracy_color_change','A5_accuracy_shape_change',
    # A6
    'A6_score','A6_zscore',
    # A7
    'A7_score','A7_zscore',
    # A8
    'A8_distortion_score','A8_consistency_score',
    # A9
    'A9_emotional_stability','A9_reality_checking','A9_cognitive_agility','A9_stress_level','A9_stability_gap'
]

In [92]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# LightGBM 학습
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    device='gpu',
    verbose=-1
)
lgb_model.fit(x_res, y_res)

# 예측 및 평가
y_prob = lgb_model.predict_proba(x_val_scaled)[:, 1]

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

lgb_results.append({
    'features': 'Top 20 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Top 20 Features (LGBM)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
lgb_results[-1]

{'features': 'Top 20 Features',
 'AUC': np.float64(0.6130107699937781),
 'Brier': 0.02342254703166787,
 'ECE': np.float64(0.01091818134823397),
 'Combined': np.float64(0.2020797970980864)}

### Over 0.3 Features

In [93]:
feature_cols = [
    'Age',
    # A4
    'A4_incongruent_accuracy',
    # A5
    'A5_accuracy_non_change','A5_accuracy_color_change',
    # A6
    'A6_zscore',
    # A8
    'A8_distortion_score','A8_consistency_score',
    # A9
    'A9_reality_checking','A9_stability_gap'
]

In [94]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# LightGBM 학습
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    device='gpu',
    verbose=-1
)
lgb_model.fit(x_res, y_res)

# 예측 및 평가
y_prob = lgb_model.predict_proba(x_val_scaled)[:, 1]

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

lgb_results.append({
    'features': 'Over 0.3 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Over 0.3 Features (LGBM)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
lgb_results[-1]

{'features': 'Over 0.3 Features',
 'AUC': np.float64(0.6087687488037631),
 'Brier': 0.02424510317135594,
 'ECE': np.float64(0.022948629921213996),
 'Combined': np.float64(0.20741405887126096)}

### Over 0.5 Features

In [95]:
feature_cols = [
    'Age',
    # A5
    'A5_accuracy_non_change',
    # A8
    'A8_distortion_score',
    # A9
    'A9_reality_checking','A9_stability_gap'
]

In [96]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# LightGBM 학습
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    device='gpu',
    verbose=-1
)
lgb_model.fit(x_res, y_res)

# 예측 및 평가
y_prob = lgb_model.predict_proba(x_val_scaled)[:, 1]

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

lgb_results.append({
    'features': 'Over 0.5 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Over 0.5 Features (LGBM)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
lgb_results[-1]

{'features': 'Over 0.5 Features',
 'AUC': np.float64(0.5956228689359023),
 'Brier': 0.1048811458029333,
 'ECE': np.float64(0.2493875772432912),
 'Combined': np.float64(0.29075574629360496)}

### Result

In [97]:
lgb_results_df = pd.DataFrame(lgb_results)
best = lgb_results_df.sort_values('Combined').head(5)
display(best)

Unnamed: 0,features,AUC,Brier,ECE,Combined
0,All Features,0.614137,0.023137,0.009565,0.201107
1,Top 20 Features,0.613011,0.023423,0.010918,0.20208
2,Over 0.3 Features,0.608769,0.024245,0.022949,0.207414
3,Over 0.5 Features,0.595623,0.104881,0.249388,0.290756


## **XGBoost**

In [98]:
xgb_results = []

### All Features

In [99]:
feature_cols = [
    'Age',
    # A1
    'A1_response_rate','A1_left_response_rate','A1_right_response_rate','A1_fast_response_rate',
    'A1_mean_response_time','A1_fast_avg_rt','A1_direction_diff_rt',
    # A2
    'A2_response_rate','A2_slow_to_fast_rt_diff','A2_correct_ratio_by_speed','A2_mean_response_time',
    # A3
    'A3_valid_accuracy','A3_invalid_accuracy','A3_total_accuracy',
    'A3_valid_rt','A3_invalid_rt','A3_correct_rt','A3_incorrect_rt','A3_accuracy_gap',
    # A4
    'A4_congruent_accuracy','A4_incongruent_accuracy','A4_accuracy_gap',
    'A4_mean_rt_con','A4_mean_rt_incon','A4_rt_gap','A4_response_rate',
    # A5
    'A5_accuracy_non_change','A5_accuracy_pos_change','A5_accuracy_color_change','A5_accuracy_shape_change','A5_accuracy_var',
    # A6
    'A6_score','A6_zscore',
    # A7
    'A7_score','A7_zscore',
    # A8
    'A8_distortion_score','A8_consistency_score','A8_distortion_flag',
    # A9
    'A9_emotional_stability','A9_behavior_stability','A9_reality_checking','A9_cognitive_agility','A9_stress_level','A9_total_score','A9_stability_gap'
]

In [100]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# LightGBM 학습
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    tree_method='hist',
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(x_res, y_res)

# 예측 및 평가
y_prob = xgb_model.predict_proba(x_val_scaled)[:, 1]

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

xgb_results.append({
    'features': 'All Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'All Features (XGB)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
xgb_results[-1]

{'features': 'All Features',
 'AUC': np.float64(0.6078631614104343),
 'Brier': 0.02411486767232418,
 'ECE': np.float64(0.015619388219178411),
 'Combined': np.float64(0.2060019832676585)}

### Top 20 Features

In [101]:
feature_cols = [
    'Age',
    # A1
    'A1_response_rate','A1_mean_response_time',
    # A4
    'A4_congruent_accuracy','A4_incongruent_accuracy','A4_accuracy_gap','A4_mean_rt_incon',
    # A5
    'A5_accuracy_non_change','A5_accuracy_color_change','A5_accuracy_shape_change',
    # A6
    'A6_score',
    # A7
    'A7_score',
    # A8
    'A8_distortion_score','A8_consistency_score','A8_distortion_flag',
    # A9
    'A9_emotional_stability','A9_behavior_stability','A9_reality_checking','A9_cognitive_agility','A9_stress_level','A9_stability_gap'
]

In [102]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# LightGBM 학습
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    tree_method='hist',
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(x_res, y_res)

# 예측 및 평가
y_prob = xgb_model.predict_proba(x_val_scaled)[:, 1]

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

xgb_results.append({
    'features': 'Top 20 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Top 20 Features (XGB)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
xgb_results[-1]

{'features': 'Top 20 Features',
 'AUC': np.float64(0.5901849362895676),
 'Brier': 0.0395066924393177,
 'ECE': np.float64(0.04979918083754115),
 'Combined': np.float64(0.22723400017443093)}

### Over 0.3 Features

In [103]:
feature_cols = [
    'Age',
    # A5
    'A5_accuracy_non_change',
    # A6
    'A6_score',
    # A8
    'A8_distortion_score',
    # A9
    'A9_emotional_stability',
]

In [104]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# LightGBM 학습
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    tree_method='hist',
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(x_res, y_res)

# 예측 및 평가
y_prob = xgb_model.predict_proba(x_val_scaled)[:, 1]

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

xgb_results.append({
    'features': 'Over 0.3 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Over 0.3 Features (XGB)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
xgb_results[-1]

{'features': 'Over 0.3 Features',
 'AUC': np.float64(0.6113640456774337),
 'Brier': 0.10661613196134567,
 'ECE': np.float64(0.2552227915334428),
 'Combined': np.float64(0.28477770803498026)}

### Result

In [105]:
xgb_results_df = pd.DataFrame(xgb_results)
best = xgb_results_df.sort_values('Combined').head(5)
display(best)

Unnamed: 0,features,AUC,Brier,ECE,Combined
0,All Features,0.607863,0.024115,0.015619,0.206002
1,Top 20 Features,0.590185,0.039507,0.049799,0.227234
2,Over 0.3 Features,0.611364,0.106616,0.255223,0.284778


## **DNN**

In [116]:
dnn_results = []

### All Features

In [117]:
feature_cols = [
    'Age',
    # A1
    'A1_response_rate','A1_left_response_rate','A1_right_response_rate','A1_fast_response_rate',
    'A1_mean_response_time','A1_fast_avg_rt','A1_direction_diff_rt',
    # A2
    'A2_response_rate','A2_slow_to_fast_rt_diff','A2_correct_ratio_by_speed','A2_mean_response_time',
    # A3
    'A3_valid_accuracy','A3_invalid_accuracy','A3_total_accuracy',
    'A3_valid_rt','A3_invalid_rt','A3_correct_rt','A3_incorrect_rt','A3_accuracy_gap',
    # A4
    'A4_congruent_accuracy','A4_incongruent_accuracy','A4_accuracy_gap',
    'A4_mean_rt_con','A4_mean_rt_incon','A4_rt_gap','A4_response_rate',
    # A5
    'A5_accuracy_non_change','A5_accuracy_pos_change','A5_accuracy_color_change','A5_accuracy_shape_change','A5_accuracy_var',
    # A6
    'A6_score','A6_zscore',
    # A7
    'A7_score','A7_zscore',
    # A8
    'A8_distortion_score','A8_consistency_score','A8_distortion_flag',
    # A9
    'A9_emotional_stability','A9_behavior_stability','A9_reality_checking','A9_cognitive_agility','A9_stress_level','A9_total_score','A9_stability_gap'
]

In [118]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# DNN 모델 정의
input_dim = x_res.shape[1]

dnn_model = Sequential([
    Dense(128, input_dim=input_dim, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

dnn_model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 학습
history = dnn_model.fit(
    x_res, y_res,
    validation_data=(x_val_scaled, y_val),
    epochs=500,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

# 예측 및 평가
y_prob = dnn_model.predict(x_val_scaled).flatten()

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

dnn_results.append({
    'features': 'All Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'All Features (DNN)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})

Epoch 1/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.8983 - loss: 0.2395 - val_accuracy: 0.9745 - val_loss: 0.1348
Epoch 2/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9805 - loss: 0.0795 - val_accuracy: 0.9763 - val_loss: 0.1173
Epoch 3/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9822 - loss: 0.0716 - val_accuracy: 0.9770 - val_loss: 0.1111
Epoch 4/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9832 - loss: 0.0681 - val_accuracy: 0.9765 - val_loss: 0.1121
Epoch 5/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9833 - loss: 0.0671 - val_accuracy: 0.9764 - val_loss: 0.1093
Epoch 6/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9836 - loss: 0.0662 - val_accuracy: 0.9741 - val_loss: 0.1181
Epoch 7/5

In [123]:
dnn_results[-1]

{'features': 'All Features',
 'AUC': np.float64(0.6371571323113857),
 'Brier': 0.022919973358511925,
 'ECE': np.float64(0.009261092074672643),
 'Combined': np.float64(0.18946670020260328)}

### Top 20 Features

In [120]:
feature_cols = [
    'Age',
    # A3
    'A3_valid_rt','A3_incorrect_rt',
    # A4
    'A4_congruent_accuracy','A4_incongruent_accuracy','A4_accuracy_gap','A4_mean_rt_incon',
    # A5
    'A5_accuracy_non_change','A5_accuracy_pos_change','A5_accuracy_color_change','A5_accuracy_shape_change','A5_accuracy_var',
    # A6
    'A6_score','A6_zscore',
    # A7
    'A7_zscore',
    # A8
    'A8_distortion_score','A8_distortion_flag',
    # A9
    'A9_emotional_stability','A9_behavior_stability','A9_total_score','A9_stability_gap'
]

In [121]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# DNN 모델 정의
input_dim = x_res.shape[1]

dnn_model = Sequential([
    Dense(128, input_dim=input_dim, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

dnn_model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 학습
history = dnn_model.fit(
    x_res, y_res,
    validation_data=(x_val_scaled, y_val),
    epochs=500,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

# 예측 및 평가
y_prob = dnn_model.predict(x_val_scaled).flatten()

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

dnn_results.append({
    'features': 'Top 20 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Top 20 Features (DNN)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})

Epoch 1/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 2ms/step - accuracy: 0.8568 - loss: 0.3253 - val_accuracy: 0.9634 - val_loss: 0.1772
Epoch 2/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9447 - loss: 0.1623 - val_accuracy: 0.9514 - val_loss: 0.1749
Epoch 3/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9526 - loss: 0.1441 - val_accuracy: 0.9635 - val_loss: 0.1792
Epoch 4/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9566 - loss: 0.1340 - val_accuracy: 0.9686 - val_loss: 0.1570
Epoch 5/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9605 - loss: 0.1246 - val_accuracy: 0.9737 - val_loss: 0.1330
Epoch 6/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.9636 - loss: 0.1173 - val_accuracy: 0.9763 - val_loss: 0.1213
Epoch 7/5

In [124]:
dnn_results[-1]

{'features': 'Top 20 Features',
 'AUC': np.float64(0.6094919848825239),
 'Brier': 0.024284182116389275,
 'ECE': np.float64(0.016888032119455887),
 'Combined': np.float64(0.20554706111769935)}

### Over 1.0 Features

In [125]:
feature_cols = [
    'Age',
    # A5
    'A5_accuracy_color_change','A5_accuracy_shape_change','A5_accuracy_var',
]

In [126]:
x = train_a[feature_cols]
y = train_a['Label']

x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# 정규화
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# DNN 모델 정의
input_dim = x_res.shape[1]

dnn_model = Sequential([
    Dense(128, input_dim=input_dim, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

dnn_model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# 학습
history = dnn_model.fit(
    x_res, y_res,
    validation_data=(x_val_scaled, y_val),
    epochs=500,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

# 예측 및 평가
y_prob = dnn_model.predict(x_val_scaled).flatten()

# 평가를 위한 데이터프레임 형태 맞추기
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

dnn_results.append({
    'features': 'Over 1.0 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Over 1.0 Features (DNN)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})

Epoch 1/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2ms/step - accuracy: 0.6127 - loss: 0.6557 - val_accuracy: 0.6989 - val_loss: 0.6421
Epoch 2/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.6160 - loss: 0.6525 - val_accuracy: 0.7246 - val_loss: 0.6411
Epoch 3/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.6178 - loss: 0.6520 - val_accuracy: 0.7270 - val_loss: 0.6470
Epoch 4/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.6183 - loss: 0.6512 - val_accuracy: 0.6357 - val_loss: 0.6781
Epoch 5/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.6193 - loss: 0.6502 - val_accuracy: 0.6539 - val_loss: 0.6589
Epoch 6/500
[1m3954/3954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.6190 - loss: 0.6501 - val_accuracy: 0.7454 - val_loss: 0.6382
Epoch 7/5

In [127]:
dnn_results[-1]

{'features': 'Over 1.0 Features',
 'AUC': np.float64(0.6450058608550574),
 'Brier': 0.21618904173374176,
 'ECE': np.float64(0.4223293013152849),
 'Combined': np.float64(0.33712665533472796)}

### Result

In [128]:
dnn_results_df = pd.DataFrame(dnn_results)
best = dnn_results_df.sort_values('Combined').head(5)
display(best)

Unnamed: 0,features,AUC,Brier,ECE,Combined
0,All Features,0.637157,0.02292,0.009261,0.189467
1,Top 20 Features,0.609492,0.024284,0.016888,0.205547
2,Over 1.0 Features,0.645006,0.216189,0.422329,0.337127


# **Final Result**

In [129]:
results_df = pd.DataFrame(results)
best = results_df.sort_values('Combined').head(5)
display(best)

Unnamed: 0,features,AUC,Brier,ECE,Combined
7,All Features (DNN),0.637157,0.02292,0.009261,0.189467
0,All Features (LGBM),0.614137,0.023137,0.009565,0.201107
1,Top 20 Features (LGBM),0.613011,0.023423,0.010918,0.20208
8,Top 20 Features (DNN),0.609492,0.024284,0.016888,0.205547
4,All Features (XGB),0.607863,0.024115,0.015619,0.206002
