# **Import**

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.calibration import calibration_curve

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import shap

# **Data Load**

In [4]:
cd /content/drive/MyDrive/[Projects]/Dacon/운수종사자 인지적 특성 데이터를 활용한 교통사고 위험 예측 AI 경진대회/Data

/content/drive/MyDrive/[Projects]/Dacon/운수종사자 인지적 특성 데이터를 활용한 교통사고 위험 예측 AI 경진대회/Data


In [5]:
# Load the B-test training table and discard every row that has a missing value.
train_b = pd.read_csv('./train_b_renamed.csv').dropna()
train_b.info()

<class 'pandas.core.frame.DataFrame'>
Index: 297513 entries, 0 to 297525
Data columns (total 32 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   Test_id          297513 non-null  object
 1   Test             297513 non-null  object
 2   Label            297513 non-null  int64 
 3   PrimaryKey       297513 non-null  object
 4   Age              297513 non-null  object
 5   TestDate         297513 non-null  int64 
 6   B1_Response1     297513 non-null  object
 7   B1_ResponseTime  297513 non-null  object
 8   B1_Response2     297513 non-null  object
 9   B2_Response1     297513 non-null  object
 10  B2_ResponseTime  297513 non-null  object
 11  B2_Response2     297513 non-null  object
 12  B3_Response      297513 non-null  object
 13  B3_ResponseTime  297513 non-null  object
 14  B4_Response      297513 non-null  object
 15  B4_ResponseTime  297513 non-null  object
 16  B5_Response      297513 non-null  object
 17  B5_ResponseTime

# **Features**

In [6]:
# Raw columns holding one comma-separated value per trial.
list_cols = [
    'B1_Response1', 'B1_ResponseTime', 'B1_Response2',
    'B2_Response1', 'B2_ResponseTime', 'B2_Response2',
    'B3_Response', 'B3_ResponseTime',
    'B4_Response', 'B4_ResponseTime',
    'B5_Response', 'B5_ResponseTime',
    'B6_Response', 'B7_Response', 'B8_Response',
]


def _split_to_floats(cell):
    """Turn a comma-separated string into a list of floats; pass non-strings through unchanged."""
    if not isinstance(cell, str):
        return cell
    return [float(token) for token in cell.split(',')]


# Store the parsed trials next to each raw column as '<column>_list'.
for col in list_cols:
    train_b[f'{col}_list'] = train_b[col].apply(_split_to_floats)

In [7]:
# Keep only the first run of digits found in each Age value and store it as an integer.
age_digits = train_b['Age'].astype(str).str.extract(r'(\d+)', expand=False)
train_b['Age'] = age_digits.astype(int)
print(train_b['Age'].head())

0    60
1    60
2    60
3    70
4    70
Name: Age, dtype: int64


In [8]:
def compute_B1_features(row):
    """Summarise one row's B1 trial lists as accuracy and reaction-time features.

    Reads 'B1_Response1_list', 'B1_ResponseTime_list' and 'B1_Response2_list'
    from ``row`` (a missing or empty value means "no trials"), truncates the
    three lists to the shortest one, and counts a response equal to 1 as correct.

    Returns a pd.Series with:
      B1_task1_accuracy       share of task-1 responses equal to 1
      B1_task2_change_acc     share of task-2 responses equal to 1 in the first half
      B1_task2_non_change_acc same, for the second half
      B1_task2_accuracy_gap   change accuracy minus non-change accuracy
      B1_task2_mean_rt        mean response time over trials whose task-2
                              response is 1 (NaN when there are none)
    """
    task1 = row.get('B1_Response1_list', []) or []
    times = row.get('B1_ResponseTime_list', []) or []
    task2 = row.get('B1_Response2_list', []) or []

    n_trials = min(len(task1), len(times), len(task2))
    if n_trials == 0:
        # No usable trials: accuracies default to 0, reaction time stays missing.
        return pd.Series({
            'B1_task1_accuracy': 0,
            'B1_task2_change_acc': 0,
            'B1_task2_non_change_acc': 0,
            'B1_task2_accuracy_gap': 0,
            'B1_task2_mean_rt': np.nan
        })

    task1_hit = np.asarray(task1[:n_trials]) == 1
    task2_hit = np.asarray(task2[:n_trials]) == 1
    times = np.asarray(times[:n_trials])

    # Assumes the first half of the trials are "change" trials and the
    # second half "non-change" trials -- TODO confirm against the task spec.
    half = n_trials // 2
    first_half, second_half = task2_hit[:half], task2_hit[half:]
    change_acc = first_half.mean() if first_half.size else 0
    non_change_acc = second_half.mean() if second_half.size else 0

    # Reaction time is averaged only over trials whose task-2 response is 1.
    rt_on_hits = times[task2_hit]
    mean_rt = rt_on_hits.mean() if rt_on_hits.size else np.nan

    return pd.Series({
        'B1_task1_accuracy': task1_hit.mean(),
        'B1_task2_change_acc': change_acc,
        'B1_task2_non_change_acc': non_change_acc,
        'B1_task2_accuracy_gap': change_acc - non_change_acc,
        'B1_task2_mean_rt': mean_rt
    })

# Apply the B1 feature extractor row by row.
b1_feats = train_b.apply(compute_B1_features, axis=1)
# Drop any columns left over from an earlier run so re-executing this cell
# replaces them instead of appending duplicate column names.
train_b = pd.concat(
    [train_b.drop(columns=b1_feats.columns, errors='ignore'), b1_feats],
    axis=1,
)

# Missing reaction times (no trial with a response of 1) are filled with the
# column maximum. The result is assigned back explicitly: a chained
# `train_b[col].fillna(..., inplace=True)` is deprecated and does not update
# the frame under pandas Copy-on-Write.
rt_cols = ['B1_task2_mean_rt']
for col in rt_cols:
    max_val = train_b[col].max(skipna=True)
    train_b[col] = train_b[col].fillna(max_val)

# Inspect the result.
print(train_b[['B1_task1_accuracy','B1_task2_change_acc','B1_task2_non_change_acc',
               'B1_task2_accuracy_gap','B1_task2_mean_rt']].head())


   B1_task1_accuracy  B1_task2_change_acc  B1_task2_non_change_acc  \
0             0.9375                0.000                    0.000   
1             0.9375                0.375                    0.625   
2             0.8125                0.500                    0.500   
3             1.0000                0.375                    0.500   
4             1.0000                0.000                    0.000   

   B1_task2_accuracy_gap  B1_task2_mean_rt  
0                  0.000          1.327321  
1                 -0.250          0.679636  
2                  0.000          0.719479  
3                 -0.125          0.657621  
4                  0.000          1.327321  


In [9]:
def compute_B2_features(row):
    """Summarise one row's B2 trial lists as accuracy and reaction-time features.

    Reads 'B2_Response1_list', 'B2_ResponseTime_list' and 'B2_Response2_list'
    from ``row`` (a missing or empty value means "no trials"), truncates the
    three lists to the shortest one, and counts a response equal to 1 as correct.

    Returns a pd.Series with:
      B2_task1_accuracy       share of task-1 responses equal to 1
      B2_task2_change_acc     share of task-2 responses equal to 1 in the first half
      B2_task2_non_change_acc same, for the second half
      B2_task2_accuracy_gap   change accuracy minus non-change accuracy
      B2_task2_mean_rt        mean response time over trials whose task-2
                              response is 1 (NaN when there are none)
    """
    task1 = row.get('B2_Response1_list', []) or []
    times = row.get('B2_ResponseTime_list', []) or []
    task2 = row.get('B2_Response2_list', []) or []

    n_trials = min(len(task1), len(times), len(task2))
    if n_trials == 0:
        # No usable trials: accuracies default to 0, reaction time stays missing.
        return pd.Series({
            'B2_task1_accuracy': 0,
            'B2_task2_change_acc': 0,
            'B2_task2_non_change_acc': 0,
            'B2_task2_accuracy_gap': 0,
            'B2_task2_mean_rt': np.nan
        })

    task1_hit = np.asarray(task1[:n_trials]) == 1
    task2_hit = np.asarray(task2[:n_trials]) == 1
    times = np.asarray(times[:n_trials])

    # Assumes the first half of the trials are "change" trials and the
    # second half "non-change" trials -- TODO confirm against the task spec.
    half = n_trials // 2
    first_half, second_half = task2_hit[:half], task2_hit[half:]
    change_acc = first_half.mean() if first_half.size else 0
    non_change_acc = second_half.mean() if second_half.size else 0

    # Reaction time is averaged only over trials whose task-2 response is 1.
    rt_on_hits = times[task2_hit]
    mean_rt = rt_on_hits.mean() if rt_on_hits.size else np.nan

    return pd.Series({
        'B2_task1_accuracy': task1_hit.mean(),
        'B2_task2_change_acc': change_acc,
        'B2_task2_non_change_acc': non_change_acc,
        'B2_task2_accuracy_gap': change_acc - non_change_acc,
        'B2_task2_mean_rt': mean_rt
    })

# Apply the B2 feature extractor row by row.
b2_feats = train_b.apply(compute_B2_features, axis=1)
# Drop any columns left over from an earlier run so re-executing this cell
# replaces them instead of appending duplicate column names.
train_b = pd.concat(
    [train_b.drop(columns=b2_feats.columns, errors='ignore'), b2_feats],
    axis=1,
)

# Missing-value handling: reaction-time columns are filled with the column
# maximum. The result is assigned back explicitly: a chained
# `train_b[col].fillna(..., inplace=True)` is deprecated and does not update
# the frame under pandas Copy-on-Write.
rt_cols = ['B2_task2_mean_rt']
for col in rt_cols:
    max_val = train_b[col].max(skipna=True)
    train_b[col] = train_b[col].fillna(max_val)

# Inspect the result.
print(train_b[['B2_task1_accuracy','B2_task2_change_acc','B2_task2_non_change_acc','B2_task2_accuracy_gap', 'B2_task2_mean_rt']].head())

   B2_task1_accuracy  B2_task2_change_acc  B2_task2_non_change_acc  \
0             1.0000                0.000                    0.000   
1             1.0000                0.500                    0.500   
2             1.0000                0.375                    0.625   
3             0.9375                0.250                    0.750   
4             0.6250                0.000                    0.000   

   B2_task2_accuracy_gap  B2_task2_mean_rt  
0                   0.00          1.315023  
1                   0.00          0.640186  
2                  -0.25          0.565130  
3                  -0.50          0.626532  
4                   0.00          1.315023  


In [10]:
def compute_B3_features(row):
    """Summarise one row's B3 trial lists as overall accuracy and mean reaction time.

    Reads 'B3_Response_list' and 'B3_ResponseTime_list' from ``row`` (raises
    KeyError if either key is absent; an empty value means "no trials") and
    truncates both to the shorter length.

    A response counts as correct only when it equals 1, the same rule used
    for the reaction-time filter below and by the other B-test extractors.
    Averaging the raw codes instead would give a wrong accuracy whenever a
    code other than 0 or 1 appears.

    Returns a pd.Series with:
      B3_accuracy  share of responses equal to 1 (0 when there are no trials)
      B3_mean_rt   mean response time over trials whose response is 1
                   (NaN when there are none)
    """
    # Extract the trial lists.
    r = row['B3_Response_list'] or []
    rt = row['B3_ResponseTime_list'] or []

    L = min(len(r), len(rt))
    if L == 0:
        return pd.Series({
            'B3_accuracy': 0,
            'B3_mean_rt': np.nan
        })

    r, rt = np.array(r[:L]), np.array(rt[:L])

    # Binarise first so accuracy and the reaction-time filter agree on what
    # "correct" means.
    is_correct = r == 1

    # Overall accuracy.
    B3_accuracy = is_correct.mean()

    # Mean reaction time over trials whose response is 1.
    valid_rt = rt[is_correct]
    B3_mean_rt = valid_rt.mean() if len(valid_rt) > 0 else np.nan

    return pd.Series({
        'B3_accuracy': B3_accuracy,
        'B3_mean_rt': B3_mean_rt
    })

# Apply the B3 feature extractor row by row.
b3_feats = train_b.apply(compute_B3_features, axis=1)
# Drop any columns left over from an earlier run so re-executing this cell
# replaces them instead of appending duplicate column names.
train_b = pd.concat(
    [train_b.drop(columns=b3_feats.columns, errors='ignore'), b3_feats],
    axis=1,
)

# Missing reaction times are filled with the column maximum. The result is
# assigned back explicitly: a chained `.fillna(..., inplace=True)` is
# deprecated and does not update the frame under pandas Copy-on-Write.
max_rt = train_b['B3_mean_rt'].max(skipna=True)
train_b['B3_mean_rt'] = train_b['B3_mean_rt'].fillna(max_rt)

# Inspect the result.
print(train_b[['B3_accuracy','B3_mean_rt']].head())

   B3_accuracy  B3_mean_rt
0          1.0    0.726614
1          1.0    0.484873
2          1.0    0.691859
3          1.0    0.623723
4          1.0    0.719648


In [11]:
def compute_B4_features(row):
    """Summarise one row's B4 trial lists, split into first-half and second-half trials.

    Reads 'B4_Response_list' and 'B4_ResponseTime_list' from ``row`` (a missing
    or empty value means "no trials"), truncates both to the shorter length,
    and counts a response equal to 1 as correct.

    Returns a pd.Series with:
      B4_congruent_accuracy   share of responses equal to 1 in the first half
      B4_incongruent_accuracy same, for the second half
      B4_accuracy_gap         incongruent accuracy minus congruent accuracy
      B4_mean_rt_congruent    mean response time over first-half trials with response 1
      B4_mean_rt_incongruent  same, for the second half
      B4_rt_gap               incongruent mean RT minus congruent mean RT
                              (NaN unless both means exist)
    """
    responses = row.get('B4_Response_list', []) or []
    times = row.get('B4_ResponseTime_list', []) or []

    n_trials = min(len(responses), len(times))
    if n_trials == 0:
        return pd.Series({
            'B4_congruent_accuracy': 0,
            'B4_incongruent_accuracy': 0,
            'B4_accuracy_gap': 0,
            'B4_mean_rt_congruent': np.nan,
            'B4_mean_rt_incongruent': np.nan,
            'B4_rt_gap': np.nan
        })

    hit = np.asarray(responses[:n_trials]) == 1
    times = np.asarray(times[:n_trials])

    # Assumes the first half of the trials are congruent and the second half
    # incongruent (the original note mentions 30 + 30 trials) -- TODO confirm.
    half = n_trials // 2
    con_hit, incon_hit = hit[:half], hit[half:]

    con_acc = con_hit.mean() if con_hit.size else 0
    incon_acc = incon_hit.mean() if incon_hit.size else 0

    # Reaction times only from trials whose response is 1, per half.
    con_rt = times[:half][con_hit]
    incon_rt = times[half:][incon_hit]

    con_mean_rt = con_rt.mean() if con_rt.size else np.nan
    incon_mean_rt = incon_rt.mean() if incon_rt.size else np.nan
    if con_rt.size and incon_rt.size:
        rt_gap = incon_mean_rt - con_mean_rt
    else:
        rt_gap = np.nan

    return pd.Series({
        'B4_congruent_accuracy': con_acc,
        'B4_incongruent_accuracy': incon_acc,
        'B4_accuracy_gap': incon_acc - con_acc,
        'B4_mean_rt_congruent': con_mean_rt,
        'B4_mean_rt_incongruent': incon_mean_rt,
        'B4_rt_gap': rt_gap
    })

# Apply the B4 feature extractor row by row.
b4_feats = train_b.apply(compute_B4_features, axis=1)
# Drop any columns left over from an earlier run so re-executing this cell
# replaces them instead of appending duplicate column names.
train_b = pd.concat(
    [train_b.drop(columns=b4_feats.columns, errors='ignore'), b4_feats],
    axis=1,
)

# Missing-value handling: each reaction-time column is filled with its own
# maximum. The result is assigned back explicitly: a chained
# `train_b[col].fillna(..., inplace=True)` is deprecated and does not update
# the frame under pandas Copy-on-Write.
rt_cols = ['B4_mean_rt_congruent','B4_mean_rt_incongruent','B4_rt_gap']
for col in rt_cols:
    train_b[col] = train_b[col].fillna(train_b[col].max(skipna=True))

# Inspect the result.
print(train_b[['B4_congruent_accuracy','B4_incongruent_accuracy','B4_accuracy_gap',
               'B4_mean_rt_congruent','B4_mean_rt_incongruent','B4_rt_gap']].head())

   B4_congruent_accuracy  B4_incongruent_accuracy  B4_accuracy_gap  \
0               0.600000                 0.400000        -0.200000   
1               0.566667                 0.433333        -0.133333   
2               0.400000                 0.600000         0.200000   
3               0.366667                 0.600000         0.233333   
4               0.433333                 0.566667         0.133333   

   B4_mean_rt_congruent  B4_mean_rt_incongruent  B4_rt_gap  
0              0.740000                0.811667   0.071667  
1              0.579412                0.657692   0.078281  
2              0.655000                0.547222  -0.107778  
3              0.597273                0.611667   0.014394  
4              0.619231                0.610588  -0.008643  


In [12]:
def compute_B5_features(row):
    """Summarise one row's B5 trial lists as overall accuracy and mean reaction time.

    Reads 'B5_Response_list' and 'B5_ResponseTime_list' from ``row`` (a missing
    or empty value means "no trials"), truncates both to the shorter length,
    and counts a response equal to 1 as correct.

    Returns a pd.Series with:
      B5_accuracy  share of responses equal to 1 (0 when there are no trials)
      B5_mean_rt   mean response time over trials whose response is 1
                   (NaN when there are none)
    """
    responses = row.get('B5_Response_list', []) or []
    times = row.get('B5_ResponseTime_list', []) or []

    n_trials = min(len(responses), len(times))
    if n_trials == 0:
        return pd.Series({
            'B5_accuracy': 0,
            'B5_mean_rt': np.nan
        })

    hit = np.asarray(responses[:n_trials]) == 1
    rt_on_hits = np.asarray(times[:n_trials])[hit]

    return pd.Series({
        'B5_accuracy': hit.mean(),
        'B5_mean_rt': rt_on_hits.mean() if rt_on_hits.size else np.nan
    })

# Apply the B5 feature extractor row by row.
b5_feats = train_b.apply(compute_B5_features, axis=1)
# Drop any columns left over from an earlier run so re-executing this cell
# replaces them instead of appending duplicate column names.
train_b = pd.concat(
    [train_b.drop(columns=b5_feats.columns, errors='ignore'), b5_feats],
    axis=1,
)

# Missing reaction times are filled with the column maximum. The result is
# assigned back explicitly: a chained `.fillna(..., inplace=True)` is
# deprecated and does not update the frame under pandas Copy-on-Write.
train_b['B5_mean_rt'] = train_b['B5_mean_rt'].fillna(train_b['B5_mean_rt'].max(skipna=True))

# Inspect the result.
print(train_b[['B5_accuracy','B5_mean_rt']].head())

   B5_accuracy  B5_mean_rt
0          1.0      5.2955
1          1.0      4.7675
2          1.0      6.0410
3          1.0      5.2890
4          1.0      6.4040


In [13]:
def compute_B6_features(row):
    """Return a one-entry pd.Series with 'B6_accuracy' for ``row``.

    Accuracy is the share of values in 'B6_Response_list' equal to 1;
    a missing or empty list yields 0.
    """
    responses = row.get('B6_Response_list', []) or []
    if not len(responses):
        return pd.Series({'B6_accuracy': 0})

    # A response of 1 is correct; every other value counts as wrong.
    hit = np.asarray(responses) == 1
    return pd.Series({'B6_accuracy': hit.mean()})

# Apply the B6 feature extractor row by row.
b6_feats = train_b.apply(compute_B6_features, axis=1)
# Drop any column left over from an earlier run so re-executing this cell
# replaces it instead of appending a duplicate 'B6_accuracy' column.
train_b = pd.concat(
    [train_b.drop(columns=b6_feats.columns, errors='ignore'), b6_feats],
    axis=1,
)

# Inspect the result.
print(train_b[['B6_accuracy']].head())

   B6_accuracy
0     1.000000
1     1.000000
2     0.933333
3     0.933333
4     0.933333


In [14]:
def compute_B7_features(row):
    """Return a one-entry pd.Series with 'B7_accuracy' for ``row``.

    Accuracy is the share of values in 'B7_Response_list' equal to 1;
    a missing or empty list yields 0.
    """
    responses = row.get('B7_Response_list', []) or []
    if not len(responses):
        return pd.Series({'B7_accuracy': 0})

    # A response of 1 is correct; every other value counts as wrong.
    hit = np.asarray(responses) == 1
    return pd.Series({'B7_accuracy': hit.mean()})

# Apply the B7 feature extractor row by row.
b7_feats = train_b.apply(compute_B7_features, axis=1)
# Drop any column left over from an earlier run so re-executing this cell
# replaces it instead of appending a duplicate 'B7_accuracy' column.
train_b = pd.concat(
    [train_b.drop(columns=b7_feats.columns, errors='ignore'), b7_feats],
    axis=1,
)

# Inspect the result.
print(train_b[['B7_accuracy']].head())

   B7_accuracy
0     0.866667
1     0.933333
2     0.933333
3     0.933333
4     0.800000


In [15]:
def compute_B8_features(row):
    """Return a one-entry pd.Series with 'B8_accuracy' for ``row``.

    Accuracy is the share of values in 'B8_Response_list' equal to 1;
    a missing or empty list yields 0.
    """
    responses = row.get('B8_Response_list', []) or []
    if not len(responses):
        return pd.Series({'B8_accuracy': 0})

    # A response of 1 is correct; every other value counts as wrong.
    hit = np.asarray(responses) == 1
    return pd.Series({'B8_accuracy': hit.mean()})


# Apply the B8 feature extractor row by row.
b8_feats = train_b.apply(compute_B8_features, axis=1)
# Drop any column left over from an earlier run so re-executing this cell
# replaces it instead of appending a duplicate 'B8_accuracy' column.
train_b = pd.concat(
    [train_b.drop(columns=b8_feats.columns, errors='ignore'), b8_feats],
    axis=1,
)

# Inspect the result.
print(train_b[['B8_accuracy']].head())

   B8_accuracy
0          1.0
1          1.0
2          1.0
3          1.0
4          1.0


In [16]:
# Descriptive aliases for the raw B9 counters (meanings taken from the
# original author's notes).
b9_aliases = {
    'B9_aud_hit': 'B9_Count1',   # auditory stimuli answered correctly
    'B9_aud_miss': 'B9_Count2',  # auditory stimuli missed
    'B9_aud_fa': 'B9_Count3',    # auditory stimuli answered incorrectly
    'B9_aud_cr': 'B9_Count4',    # auditory stimuli with no response
    'B9_vis_err': 'B9_Count5',   # visual obstacle-avoidance errors
}
for alias, source in b9_aliases.items():
    train_b[alias] = train_b[source]

# Inspect the result.
print(train_b[list(b9_aliases)].head())

   B9_aud_hit  B9_aud_miss  B9_aud_fa  B9_aud_cr  B9_vis_err
0          15            0          1         34           0
1          15            0          0         35           0
2          15            0          1         34           1
3          15            0          0         35           0
4          15            0          2         33           0


In [17]:
# Descriptive aliases for the raw B10 counters, copied column by column.
b10_aliases = {
    'B10_aud_hit': 'B10_Count1',
    'B10_aud_miss': 'B10_Count2',
    'B10_aud_fa': 'B10_Count3',
    'B10_aud_cr': 'B10_Count4',
    'B10_vis1_err': 'B10_Count5',
    'B10_vis2_correct': 'B10_Count6',
}
for alias, source in b10_aliases.items():
    train_b[alias] = train_b[source]

# Inspect the result.
print(train_b[list(b10_aliases)].head())

   B10_aud_hit  B10_aud_miss  B10_aud_fa  B10_aud_cr  B10_vis1_err  \
0           20             0           2          58             0   
1           20             0           1          59             3   
2           20             0           1          59             2   
3           18             2           0          60             5   
4           20             0           0          60             0   

   B10_vis2_correct  
0                20  
1                20  
2                18  
3                19  
4                19  


# **Fit**

In [18]:
# Collects one metrics dict per (feature set, model) run; the training cells append to it.
results = []

def expected_calibration_error(y_true, y_prob, n_bins=10):
    """Expected calibration error over ``n_bins`` equal-width probability bins.

    For every non-empty bin, the absolute difference between the observed
    positive rate and the mean predicted probability (both from
    ``calibration_curve``) is weighted by the share of samples in that bin;
    the weighted differences are summed.

    Parameters
    ----------
    y_true : array-like of binary labels.
    y_prob : array-like of predicted probabilities.
    n_bins : number of equal-width bins on [0, 1].

    Returns
    -------
    The weighted sum as a NumPy float.
    """
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=n_bins, strategy='uniform')

    # Count samples per bin with the rule calibration_curve applies for
    # strategy='uniform' (searchsorted on the interior edges), so each weight
    # belongs to the same bin as its prob_true / prob_pred entry.
    # np.histogram assigns a probability lying exactly on an edge to the
    # neighbouring bin, which could misalign the weights or produce a
    # different number of non-empty bins.
    # NOTE(review): relies on scikit-learn's documented uniform binning; confirm
    # if the installed version differs.
    y_prob = np.asarray(y_prob)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    bin_ids = np.searchsorted(edges[1:-1], y_prob)
    bin_totals = np.bincount(bin_ids, minlength=n_bins)

    # calibration_curve reports non-empty bins only; keep the matching weights.
    bin_weights = bin_totals[bin_totals > 0] / len(y_prob)
    ece = np.sum(bin_weights * np.abs(prob_true - prob_pred))
    return ece

def auc_brier_ece(answer_df, submission_df):
    """Combined score 0.5 * (1 - AUC) + 0.25 * Brier + 0.25 * ECE.

    The first column of both frames is the identifier; every remaining column
    is scored separately and the per-column AUC, Brier (mean squared error)
    and ECE are averaged before being combined.

    Raises
    ------
    ValueError
        If ``submission_df`` contains missing values, or if the two frames do
        not have the same columns in the same order.
    """
    if submission_df.isnull().values.any():
        raise ValueError("The submission dataframe contains missing values.")

    same_columns = (
        len(answer_df.columns) == len(submission_df.columns)
        and all(answer_df.columns == submission_df.columns)
    )
    if not same_columns:
        raise ValueError("The columns of the answer and submission dataframes do not match.")

    # Keep only submission rows whose id appears in the answers, renumbered from 0.
    known_ids = submission_df.iloc[:, 0].isin(answer_df.iloc[:, 0])
    submission_df = submission_df[known_ids].reset_index(drop=True)

    # One (auc, brier, ece) triple per target column.
    per_target = []
    for column in answer_df.columns[1:]:
        y_true, y_prob = answer_df[column], submission_df[column]
        per_target.append((
            roc_auc_score(y_true, y_prob),
            mean_squared_error(y_true, y_prob),
            expected_calibration_error(y_true, y_prob),
        ))

    mean_auc = np.mean([scores[0] for scores in per_target])
    mean_brier = np.mean([scores[1] for scores in per_target])
    mean_ece = np.mean([scores[2] for scores in per_target])
    return 0.5 * (1 - mean_auc) + 0.25 * mean_brier + 0.25 * mean_ece

## **LightGBM**

In [19]:
# Collects one metrics dict per LightGBM run (one per feature set).
lgb_results = []

### All Features

In [22]:
# Feature set for the "All Features" run: Age plus every engineered B1-B10 column.
feature_cols = [
    'Age',
    # B1
    'B1_task1_accuracy','B1_task2_change_acc','B1_task2_non_change_acc','B1_task2_accuracy_gap','B1_task2_mean_rt',
    # B2
    'B2_task1_accuracy','B2_task2_change_acc','B2_task2_non_change_acc','B2_task2_accuracy_gap','B2_task2_mean_rt',
    # B3
    'B3_accuracy','B3_mean_rt',
    # B4
    'B4_congruent_accuracy','B4_incongruent_accuracy','B4_accuracy_gap',
    'B4_mean_rt_congruent','B4_mean_rt_incongruent','B4_rt_gap',
    # B5
    'B5_accuracy','B5_mean_rt',
    # B6
    'B6_accuracy',
    # B7
    'B7_accuracy',
    # B8
    'B8_accuracy',
    # B9
    'B9_aud_hit','B9_aud_miss','B9_aud_fa','B9_aud_cr','B9_vis_err',
    # B10
    'B10_aud_hit','B10_aud_miss','B10_aud_fa','B10_aud_cr','B10_vis1_err','B10_vis2_correct'
]

In [23]:
# Features and target for this run.
x = train_b[feature_cols]
y = train_b['Label']

# 80/20 train/validation split, stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE: resample the training split only.
# NOTE(review): the model is fit on resampled data but scored on the untouched
# validation split -- check the effect on Brier/ECE.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardisation: fit the scaler on the resampled training data, reuse it for validation.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Train LightGBM.
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    device='gpu',
    verbose=-1
)
lgb_model.fit(x_res, y_res)

# Predict the positive-class probability on the validation split.
y_prob = lgb_model.predict_proba(x_val_scaled)[:, 1]

# Build the (id, target) frames that auc_brier_ece expects.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# NOTE(review): re-running this cell appends the same entry again to both lists.
lgb_results.append({
    'features': 'All Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'All Features (LGBM)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
lgb_results[-1]

{'features': 'All Features',
 'AUC': np.float64(0.5374525596977536),
 'Brier': 0.04058156417708454,
 'ECE': np.float64(0.0030817784542322055),
 'Combined': np.float64(0.24218955580895238)}

### Top 20 Features

In [27]:
# Feature set for the LightGBM "Top 20 Features" run: Age plus 20 engineered columns.
# NOTE(review): the ranking that produced this selection is not shown in this part of the notebook.
feature_cols = [
    'Age',
    # B1
    'B1_task1_accuracy','B1_task2_accuracy_gap',
    # B2
    'B2_task1_accuracy','B2_task2_change_acc',
    # B4
    'B4_congruent_accuracy','B4_incongruent_accuracy','B4_accuracy_gap',
    # B5
    'B5_accuracy',
    # B6
    'B6_accuracy',
    # B7
    'B7_accuracy',
    # B9
    'B9_aud_miss','B9_aud_fa','B9_aud_cr','B9_vis_err',
    # B10
    'B10_aud_hit','B10_aud_miss','B10_aud_fa','B10_aud_cr','B10_vis1_err','B10_vis2_correct'
]

In [28]:
# Features and target for this run.
x = train_b[feature_cols]
y = train_b['Label']

# 80/20 train/validation split, stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE: resample the training split only.
# NOTE(review): the model is fit on resampled data but scored on the untouched
# validation split -- check the effect on Brier/ECE.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardisation: fit the scaler on the resampled training data, reuse it for validation.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Train LightGBM.
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    device='gpu',
    verbose=-1
)
lgb_model.fit(x_res, y_res)

# Predict the positive-class probability on the validation split.
y_prob = lgb_model.predict_proba(x_val_scaled)[:, 1]

# Build the (id, target) frames that auc_brier_ece expects.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# NOTE(review): re-running this cell appends the same entry again to both lists.
lgb_results.append({
    'features': 'Top 20 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Top 20 Features (LGBM)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
lgb_results[-1]

{'features': 'Top 20 Features',
 'AUC': np.float64(0.5343635850829019),
 'Brier': 0.040575301233831275,
 'ECE': np.float64(0.0027161464479221954),
 'Combined': np.float64(0.24364106937898738)}

### Over 0.3 Features

In [29]:
# Feature set for the LightGBM "Over 0.3 Features" run.
# NOTE(review): the quantity thresholded at 0.3 is not shown in this part of the notebook.
feature_cols = [
    'Age',
    # B1
    'B1_task1_accuracy',
    # B2
    'B2_task1_accuracy',
    # B5
    'B5_accuracy',
    # B6
    'B6_accuracy',
    # B7
    'B7_accuracy',
    # B9
    'B9_vis_err',
    # B10
    'B10_aud_hit','B10_aud_miss','B10_vis1_err','B10_vis2_correct'
]

In [30]:
# Features and target for this run.
x = train_b[feature_cols]
y = train_b['Label']

# 80/20 train/validation split, stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE: resample the training split only.
# NOTE(review): the model is fit on resampled data but scored on the untouched
# validation split -- check the effect on Brier/ECE.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardisation: fit the scaler on the resampled training data, reuse it for validation.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Train LightGBM.
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    device='gpu',
    verbose=-1
)
lgb_model.fit(x_res, y_res)

# Predict the positive-class probability on the validation split.
y_prob = lgb_model.predict_proba(x_val_scaled)[:, 1]

# Build the (id, target) frames that auc_brier_ece expects.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# NOTE(review): re-running this cell appends the same entry again to both lists.
lgb_results.append({
    'features': 'Over 0.3 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Over 0.3 Features (LGBM)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
lgb_results[-1]

{'features': 'Over 0.3 Features',
 'AUC': np.float64(0.4996292030363745),
 'Brier': 0.04280329663902809,
 'ECE': np.float64(0.02105058537616434),
 'Combined': np.float64(0.2661488689856109)}

### Over 0.5 Features

In [31]:
# Feature set for the LightGBM "Over 0.5 Features" run.
# NOTE(review): the quantity thresholded at 0.5 is not shown in this part of the notebook.
feature_cols = [
    'Age',
    # B1
    'B1_task1_accuracy',
    # B9
    'B9_vis_err',
    # B10
    'B10_aud_hit','B10_aud_miss','B10_vis2_correct'
]

In [32]:
# Features and target for this run.
x = train_b[feature_cols]
y = train_b['Label']

# 80/20 train/validation split, stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE: resample the training split only.
# NOTE(review): the model is fit on resampled data but scored on the untouched
# validation split -- check the effect on Brier/ECE.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardisation: fit the scaler on the resampled training data, reuse it for validation.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Train LightGBM.
lgb_model = LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
    device='gpu',
    verbose=-1
)
lgb_model.fit(x_res, y_res)

# Predict the positive-class probability on the validation split.
y_prob = lgb_model.predict_proba(x_val_scaled)[:, 1]

# Build the (id, target) frames that auc_brier_ece expects.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# NOTE(review): re-running this cell appends the same entry again to both lists.
lgb_results.append({
    'features': 'Over 0.5 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Over 0.5 Features (LGBM)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
lgb_results[-1]

{'features': 'Over 0.5 Features',
 'AUC': np.float64(0.501239836884182),
 'Brier': 0.18946848374907474,
 'ECE': np.float64(0.36569833463496443),
 'Combined': np.float64(0.3881717861539188)}

### Result

In [33]:
# Tabulate the LightGBM runs and show up to five with the smallest Combined
# score (the score is built from 1 - AUC, Brier and ECE, so smaller is better).
lgb_results_df = pd.DataFrame(lgb_results)
best = lgb_results_df.sort_values('Combined').head(5)
display(best)

Unnamed: 0,features,AUC,Brier,ECE,Combined
0,All Features,0.537453,0.040582,0.003082,0.24219
1,Top 20 Features,0.534364,0.040575,0.002716,0.243641
2,Over 0.3 Features,0.499629,0.042803,0.021051,0.266149
3,Over 0.5 Features,0.50124,0.189468,0.365698,0.388172


## **XGBoost**

In [34]:
# Collects one metrics dict per XGBoost run (one per feature set).
xgb_results = []

### All Features

In [35]:
# Feature set for the XGBoost "All Features" run: Age plus every engineered B1-B10 column.
feature_cols = [
    'Age',
    # B1
    'B1_task1_accuracy','B1_task2_change_acc','B1_task2_non_change_acc','B1_task2_accuracy_gap','B1_task2_mean_rt',
    # B2
    'B2_task1_accuracy','B2_task2_change_acc','B2_task2_non_change_acc','B2_task2_accuracy_gap','B2_task2_mean_rt',
    # B3
    'B3_accuracy','B3_mean_rt',
    # B4
    'B4_congruent_accuracy','B4_incongruent_accuracy','B4_accuracy_gap',
    'B4_mean_rt_congruent','B4_mean_rt_incongruent','B4_rt_gap',
    # B5
    'B5_accuracy','B5_mean_rt',
    # B6
    'B6_accuracy',
    # B7
    'B7_accuracy',
    # B8
    'B8_accuracy',
    # B9
    'B9_aud_hit','B9_aud_miss','B9_aud_fa','B9_aud_cr','B9_vis_err',
    # B10
    'B10_aud_hit','B10_aud_miss','B10_aud_fa','B10_aud_cr','B10_vis1_err','B10_vis2_correct'
]

In [36]:
# Features and target for this run.
x = train_b[feature_cols]
y = train_b['Label']

# 80/20 train/validation split, stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE: resample the training split only.
# NOTE(review): the model is fit on resampled data but scored on the untouched
# validation split -- check the effect on Brier/ECE.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardisation: fit the scaler on the resampled training data, reuse it for validation.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Train XGBoost.
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    tree_method='hist',
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(x_res, y_res)

# Predict the positive-class probability on the validation split.
y_prob = xgb_model.predict_proba(x_val_scaled)[:, 1]

# Build the (id, target) frames that auc_brier_ece expects.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# NOTE(review): re-running this cell appends the same entry again to both lists.
xgb_results.append({
    'features': 'All Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'All Features (XGB)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
xgb_results[-1]

{'features': 'All Features',
 'AUC': np.float64(0.5344941811603641),
 'Brier': 0.04061109945178032,
 'ECE': np.float64(0.0044943377367686905),
 'Combined': np.float64(0.2440292687169552)}

### Top 20 Features

In [39]:
# Feature set for the XGBoost "Top 20 Features" run: Age plus 20 engineered columns.
# NOTE(review): the ranking that produced this selection is not shown in this part of the notebook.
feature_cols = [
    'Age',
    # B1
    'B1_task1_accuracy',
    # B2
    'B2_task1_accuracy','B2_task2_change_acc','B2_task2_accuracy_gap',
    # B4
    'B4_congruent_accuracy','B4_incongruent_accuracy',
    # B5
    'B5_accuracy',
    # B6
    'B6_accuracy',
    # B7
    'B7_accuracy',
    # B9
    'B9_aud_hit','B9_aud_miss','B9_aud_fa','B9_aud_cr','B9_vis_err',
    # B10
    'B10_aud_hit','B10_aud_miss','B10_aud_fa','B10_aud_cr','B10_vis1_err','B10_vis2_correct'
]

In [40]:
# Features and target for this run.
x = train_b[feature_cols]
y = train_b['Label']

# 80/20 train/validation split, stratified on the label.
x_train, x_val, y_train, y_val = train_test_split(x,
                                                  y,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y)

# SMOTE: resample the training split only.
# NOTE(review): the model is fit on resampled data but scored on the untouched
# validation split -- check the effect on Brier/ECE.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardisation: fit the scaler on the resampled training data, reuse it for validation.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Train XGBoost.
xgb_model = XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.05,
    tree_method='hist',
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(x_res, y_res)

# Predict the positive-class probability on the validation split.
y_prob = xgb_model.predict_proba(x_val_scaled)[:, 1]

# Build the (id, target) frames that auc_brier_ece expects.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# NOTE(review): re-running this cell appends the same entry again to both lists.
xgb_results.append({
    'features': 'Top 20 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({
    'features': 'Top 20 Features (XGB)',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
xgb_results[-1]

{'features': 'Top 20 Features',
 'AUC': np.float64(0.5372427341253008),
 'Brier': 0.040595218539237976,
 'ECE': np.float64(0.004785476352137912),
 'Combined': np.float64(0.24272380666019358)}

### Over 0.3 Features

In [41]:
# Feature subset for the 'Over 0.3 Features' run.
# NOTE(review): the quantity the 0.3 threshold applies to is not shown here — document it.
feature_cols = [
    'Age',
    # B9
    'B9_aud_fa',
    'B9_aud_cr',
    'B9_vis_err',
    # B10
    'B10_aud_hit',
    'B10_aud_miss',
    'B10_aud_fa',
    'B10_aud_cr',
    'B10_vis2_correct',
]

In [42]:
# Feature matrix and binary target for this run.
x, y = train_b[feature_cols], train_b['Label']

# Stratified 80/20 hold-out split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# SMOTE oversampling is applied to the training fold only; the validation fold is left as-is.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardize: statistics are fitted on the resampled training data and reused for validation.
scaler = StandardScaler().fit(x_res)
x_res = scaler.transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Train the XGBoost classifier.
xgb_model = XGBClassifier(
    tree_method='hist',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(x_res, y_res)

# Positive-class probabilities on the validation fold.
y_prob = xgb_model.predict_proba(x_val_scaled)[:, 1]

# auc_brier_ece is called with answer/submission frames carrying 'id' and 'target' columns.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

# Individual metrics plus the combined score.
auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)  # MSE of the predicted probabilities serves as the Brier score
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# Record the run in the XGB-only list, then copy it into the cross-model list
# under a model-tagged label.
xgb_results.append(
    {'features': 'Over 0.3 Features', 'AUC': auc, 'Brier': brier, 'ECE': ece, 'Combined': combined_score}
)
results.append({**xgb_results[-1], 'features': 'Over 0.3 Features (XGB)'})
xgb_results[-1]

{'features': 'Over 0.3 Features',
 'AUC': np.float64(0.49507933856316994),
 'Brier': 0.18680579960346222,
 'ECE': np.float64(0.3570772004573781),
 'Combined': np.float64(0.3884310807336251)}

### Over 0.5 Features

In [43]:
# Feature subset for the 'Over 0.5 Features' run.
# NOTE(review): the quantity the 0.5 threshold applies to is not shown here — document it.
feature_cols = [
    'Age',
    # B10
    'B10_aud_hit',
    'B10_aud_miss',
]

In [44]:
# Feature matrix and binary target for this run.
x, y = train_b[feature_cols], train_b['Label']

# Stratified 80/20 hold-out split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=42
)

# SMOTE oversampling is applied to the training fold only; the validation fold is left as-is.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardize: statistics are fitted on the resampled training data and reused for validation.
scaler = StandardScaler().fit(x_res)
x_res = scaler.transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Train the XGBoost classifier.
xgb_model = XGBClassifier(
    tree_method='hist',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    eval_metric='logloss',
    random_state=42,
)
xgb_model.fit(x_res, y_res)

# Positive-class probabilities on the validation fold.
y_prob = xgb_model.predict_proba(x_val_scaled)[:, 1]

# auc_brier_ece is called with answer/submission frames carrying 'id' and 'target' columns.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

# Individual metrics plus the combined score.
auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)  # MSE of the predicted probabilities serves as the Brier score
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# Record the run in the XGB-only list, then copy it into the cross-model list
# under a model-tagged label.
xgb_results.append(
    {'features': 'Over 0.5 Features', 'AUC': auc, 'Brier': brier, 'ECE': ece, 'Combined': combined_score}
)
results.append({**xgb_results[-1], 'features': 'Over 0.5 Features (XGB)'})
xgb_results[-1]

{'features': 'Over 0.5 Features',
 'AUC': np.float64(0.5293808523528373),
 'Brier': 0.2482588291168213,
 'ECE': np.float64(0.45456713408301325),
 'Combined': np.float64(0.41101606462354)}

### Result

In [45]:
# Tabulate the XGBoost runs and show up to five rows, ordered by ascending Combined score.
# NOTE(review): ascending order assumes a lower Combined score is better — confirm against auc_brier_ece.
xgb_results_df = pd.DataFrame(xgb_results)
best = xgb_results_df.sort_values(by='Combined', ascending=True).iloc[:5]
display(best)

Unnamed: 0,features,AUC,Brier,ECE,Combined
1,Top 20 Features,0.537243,0.040595,0.004785,0.242724
0,All Features,0.534494,0.040611,0.004494,0.244029
2,Over 0.3 Features,0.495079,0.186806,0.357077,0.388431
3,Over 0.5 Features,0.529381,0.248259,0.454567,0.411016


## **DNN**

In [46]:
# Accumulator for the DNN runs; each training cell below appends one metrics dict.
dnn_results = list()

### All Features

In [47]:
# Feature set for the 'All Features' DNN run: Age plus the B1–B10 task summary columns.
feature_cols = [
    'Age',
    # B1
    'B1_task1_accuracy',
    'B1_task2_change_acc',
    'B1_task2_non_change_acc',
    'B1_task2_accuracy_gap',
    'B1_task2_mean_rt',
    # B2
    'B2_task1_accuracy',
    'B2_task2_change_acc',
    'B2_task2_non_change_acc',
    'B2_task2_accuracy_gap',
    'B2_task2_mean_rt',
    # B3
    'B3_accuracy',
    'B3_mean_rt',
    # B4
    'B4_congruent_accuracy',
    'B4_incongruent_accuracy',
    'B4_accuracy_gap',
    'B4_mean_rt_congruent',
    'B4_mean_rt_incongruent',
    'B4_rt_gap',
    # B5
    'B5_accuracy',
    'B5_mean_rt',
    # B6
    'B6_accuracy',
    # B7
    'B7_accuracy',
    # B8
    'B8_accuracy',
    # B9
    'B9_aud_hit',
    'B9_aud_miss',
    'B9_aud_fa',
    'B9_aud_cr',
    'B9_vis_err',
    # B10
    'B10_aud_hit',
    'B10_aud_miss',
    'B10_aud_fa',
    'B10_aud_cr',
    'B10_vis1_err',
    'B10_vis2_correct',
]

In [48]:
# Feature matrix and binary target for this run.
x = train_b[feature_cols]
y = train_b['Label']

# Stratified 80/20 hold-out split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# SMOTE oversampling is applied to the training fold only; the validation fold is left as-is.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardize: statistics are fitted on the resampled training data and reused for validation.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Seed Python, NumPy and TensorFlow so weight initialization and batch shuffling repeat
# across runs, matching the seed (42) used by the split and SMOTE above.
# NOTE(review): bit-for-bit determinism on GPU may additionally require
# tf.config.experimental.enable_op_determinism() — confirm if exact repeatability matters.
tf.keras.utils.set_random_seed(42)

# Define the DNN: two hidden ReLU layers and a sigmoid output for the positive-class probability.
input_dim = x_res.shape[1]

dnn_model = Sequential([
    # Explicit Input object instead of the deprecated `input_dim=` argument on the first Dense layer.
    tf.keras.Input(shape=(input_dim,)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

dnn_model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
# NOTE(review): the stopping epoch is chosen on the same fold that is scored below, so the
# reported metrics are optimistic; consider a separate early-stopping split.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train; `epochs` is only an upper bound because of early stopping.
history = dnn_model.fit(
    x_res, y_res,
    validation_data=(x_val_scaled, y_val),
    epochs=500,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

# Predicted probabilities on the validation fold, flattened from (n, 1) to (n,).
y_prob = dnn_model.predict(x_val_scaled).flatten()

# auc_brier_ece is called with answer/submission frames carrying 'id' and 'target' columns.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

# Individual metrics plus the combined score.
auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)  # MSE of the predicted probabilities serves as the Brier score
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# Record the run in the DNN-only list, then copy it into the cross-model list
# under a model-tagged label.
dnn_results.append({
    'features': 'All Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({**dnn_results[-1], 'features': 'All Features (DNN)'})

Epoch 1/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.7192 - loss: 0.5302 - val_accuracy: 0.8464 - val_loss: 0.3989
Epoch 2/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8414 - loss: 0.3514 - val_accuracy: 0.8892 - val_loss: 0.3289
Epoch 3/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8538 - loss: 0.3296 - val_accuracy: 0.8620 - val_loss: 0.3715
Epoch 4/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8621 - loss: 0.3145 - val_accuracy: 0.8818 - val_loss: 0.3411
Epoch 5/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8688 - loss: 0.3023 - val_accuracy: 0.8861 - val_loss: 0.3337
Epoch 6/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8763 - loss: 0.2889 - val_accuracy: 0.8837 - val_loss: 0.3309
Epoch 7/50

In [49]:
# Display the metrics dict recorded by the most recent DNN run.
dnn_results[-1]

{'features': 'All Features',
 'AUC': np.float64(0.5028994259668546),
 'Brier': 0.09116245806217194,
 'ECE': np.float64(0.13659927290950993),
 'Combined': np.float64(0.3054907197594932)}

### Top 20 Features

In [51]:
# Feature subset for the 'Top 20 Features' DNN run: Age plus 20 task-derived columns.
feature_cols = [
    'Age',
    # B1
    'B1_task1_accuracy',
    'B1_task2_non_change_acc',
    'B1_task2_mean_rt',
    # B2
    'B2_task2_change_acc',
    'B2_task2_non_change_acc',
    'B2_task2_accuracy_gap',
    'B2_task2_mean_rt',
    # B4
    'B4_congruent_accuracy',
    'B4_incongruent_accuracy',
    'B4_accuracy_gap',
    # B5
    'B5_accuracy',
    # B7
    'B7_accuracy',
    # B9
    'B9_aud_fa',
    'B9_aud_cr',
    'B9_vis_err',
    # B10
    'B10_aud_hit',
    'B10_aud_miss',
    'B10_aud_fa',
    'B10_vis1_err',
    'B10_vis2_correct',
]

In [52]:
# Feature matrix and binary target for this run.
x = train_b[feature_cols]
y = train_b['Label']

# Stratified 80/20 hold-out split with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# SMOTE oversampling is applied to the training fold only; the validation fold is left as-is.
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

# Standardize: statistics are fitted on the resampled training data and reused for validation.
scaler = StandardScaler()
x_res = scaler.fit_transform(x_res)
x_val_scaled = scaler.transform(x_val)

# Seed Python, NumPy and TensorFlow so weight initialization and batch shuffling repeat
# across runs, matching the seed (42) used by the split and SMOTE above.
# NOTE(review): bit-for-bit determinism on GPU may additionally require
# tf.config.experimental.enable_op_determinism() — confirm if exact repeatability matters.
tf.keras.utils.set_random_seed(42)

# Define the DNN: two hidden ReLU layers and a sigmoid output for the positive-class probability.
input_dim = x_res.shape[1]

dnn_model = Sequential([
    # Explicit Input object instead of the deprecated `input_dim=` argument on the first Dense layer.
    tf.keras.Input(shape=(input_dim,)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

dnn_model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
# NOTE(review): the stopping epoch is chosen on the same fold that is scored below, so the
# reported metrics are optimistic; consider a separate early-stopping split.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train; `epochs` is only an upper bound because of early stopping.
history = dnn_model.fit(
    x_res, y_res,
    validation_data=(x_val_scaled, y_val),
    epochs=500,
    batch_size=256,
    callbacks=[early_stop],
    verbose=1
)

# Predicted probabilities on the validation fold, flattened from (n, 1) to (n,).
y_prob = dnn_model.predict(x_val_scaled).flatten()

# auc_brier_ece is called with answer/submission frames carrying 'id' and 'target' columns.
answer_df = pd.DataFrame({'id': x_val.index, 'target': y_val})
submission_df = pd.DataFrame({'id': x_val.index, 'target': y_prob})

# Individual metrics plus the combined score.
auc = roc_auc_score(y_val, y_prob)
brier = mean_squared_error(y_val, y_prob)  # MSE of the predicted probabilities serves as the Brier score
ece = expected_calibration_error(y_val, y_prob)
combined_score = auc_brier_ece(answer_df, submission_df)

# Record the run in the DNN-only list, then copy it into the cross-model list
# under a model-tagged label.
dnn_results.append({
    'features': 'Top 20 Features',
    'AUC': auc,
    'Brier': brier,
    'ECE': ece,
    'Combined': combined_score
})
results.append({**dnn_results[-1], 'features': 'Top 20 Features (DNN)'})

Epoch 1/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.6693 - loss: 0.5943 - val_accuracy: 0.7754 - val_loss: 0.4783
Epoch 2/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7559 - loss: 0.4855 - val_accuracy: 0.7559 - val_loss: 0.4850
Epoch 3/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7775 - loss: 0.4523 - val_accuracy: 0.8678 - val_loss: 0.3754
Epoch 4/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8151 - loss: 0.3972 - val_accuracy: 0.8407 - val_loss: 0.4244
Epoch 5/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8310 - loss: 0.3690 - val_accuracy: 0.8649 - val_loss: 0.3830
Epoch 6/500
[1m1781/1781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.8392 - loss: 0.3523 - val_accuracy: 0.8894 - val_loss: 0.3405
Epoch 7/50

In [53]:
# Display the metrics dict recorded by the most recent DNN run.
dnn_results[-1]

{'features': 'Top 20 Features',
 'AUC': np.float64(0.495587314025687),
 'Brier': 0.09551866352558136,
 'ECE': np.float64(0.16625043153197958),
 'Combined': np.float64(0.3176486167515467)}

### Result

In [54]:
# Tabulate the DNN runs and show up to five rows, ordered by ascending Combined score.
# NOTE(review): ascending order assumes a lower Combined score is better — confirm against auc_brier_ece.
dnn_results_df = pd.DataFrame(dnn_results)
best = dnn_results_df.sort_values(by='Combined', ascending=True).iloc[:5]
display(best)

Unnamed: 0,features,AUC,Brier,ECE,Combined
0,All Features,0.502899,0.091162,0.136599,0.305491
1,Top 20 Features,0.495587,0.095519,0.16625,0.317649


# **Final Result**

In [55]:
# Tabulate every recorded run across models and show the five rows with the lowest Combined score.
# NOTE(review): ascending order assumes a lower Combined score is better — confirm against auc_brier_ece.
results_df = pd.DataFrame(results)
best = results_df.sort_values(by='Combined', ascending=True).iloc[:5]
display(best)

Unnamed: 0,features,AUC,Brier,ECE,Combined
0,All Features (LGBM),0.537453,0.040582,0.003082,0.24219
5,Top 20 Features (XGB),0.537243,0.040595,0.004785,0.242724
1,Top 20 Features (LGBM),0.534364,0.040575,0.002716,0.243641
4,All Features (XGB),0.534494,0.040611,0.004494,0.244029
2,Over 0.3 Features (LGBM),0.499629,0.042803,0.021051,0.266149
