In [1]:
import os
import sklearn
import warnings
import numpy as np
import pandas as pd
import optuna
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
scale_col = ['physical_activity_minutes_per_week','diet_score','sleep_hours_per_day','screen_time_hours_per_day','bmi','waist_to_hip_ratio',
             'systolic_bp','diastolic_bp','heart_rate','cholesterol_total','hdl_cholesterol','ldl_cholesterol','triglycerides']
encode_col = ['gender','ethnicity','education_level','income_level',
              'smoking_status','employment_status']

In [4]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

def train_encode_scale_process(df,scale_col,encode_col):
    scalers = {}
    encoders = {}
    for col in scale_col:
        scaler = MinMaxScaler()
        df['scale_'+col] = scaler.fit_transform(df[[col]])
        scalers[col] = scaler
        df = df.drop(columns = col)
    for col in encode_col:
        encoder = LabelEncoder()
        df['encode_'+col] = encoder.fit_transform(df[[col]])
        encoders[col] = encoder
        df = df.drop(columns = col)
    df = df.drop(columns = ['id']).reset_index(drop = True)
    return df,scalers,encoders
def pred_encode_scale_process(df_pred,scale_col,encode_col,scalers,encoders):
    for col in scale_col:
        scaler = scalers[col]
        df_pred['scale_'+col] = scaler.transform(df_pred[[col]])
        df_pred = df_pred.drop(columns = col)
    for col in encode_col:
        encoder = encoders[col]
        df_pred['encode_'+col] = encoder.transform(df_pred[[col]])
        df_pred = df_pred.drop(columns = col)
    df_pred= df_pred.drop(columns = ['id']).reset_index(drop = True)
    return df_pred

In [5]:
train,scalers,encoders = train_encode_scale_process(df_train,scale_col,encode_col)
pred = pred_encode_scale_process(df_test,scale_col,encode_col,scalers,encoders)

In [6]:
x_col = list(train.columns)
y_col = 'diagnosed_diabetes'
x_col.remove('diagnosed_diabetes')
x = train[x_col]
y = train[y_col]

# RandomForestClassifier

In [14]:
from sklearn.model_selection import train_test_split

train_data,test_data,train_label,test_label = train_test_split(x,y,test_size=0.2,random_state=42,shuffle=True)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 建立並配置 Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,       # 樹的數量（可以增加以提升擬合能力，但可能增加計算成本）
    max_depth=24,           # 最大樹深度（限制以避免過擬合）
    min_samples_split=10,   # 分裂的最小樣本數
    min_samples_leaf=10,     # 葉節點的最小樣本數
    n_jobs=-1               # 使用所有處理器並行運算
)
rf_model.fit(train_data,train_label)

y_pred_rf = rf_model.predict(test_data)
accuracy = accuracy_score(test_label, y_pred_rf)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6669285714285714


In [29]:
pred['diagnosed_diabetes'] = rf_model.predict(pred)

In [35]:
combine = pd.DataFrame({
    'id':df_test['id'],
    'diagnosed_diabetes':pred['diagnosed_diabetes']
})

In [36]:
combine.to_csv('submission.csv', index=False)

In [38]:
# 特徵重要性
feature_importances = rf_model.feature_importances_
features = train_data.columns  # 特徵名稱

# 創建 DataFrame
importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# 打印特徵重要性
print(importance_df)

                                     Feature  Importance
5   scale_physical_activity_minutes_per_week    0.138432
2                    family_history_diabetes    0.121410
0                                        age    0.092484
9                                  scale_bmi    0.059886
17                       scale_triglycerides    0.059320
16                     scale_ldl_cholesterol    0.052580
11                         scale_systolic_bp    0.050191
8            scale_screen_time_hours_per_day    0.049385
14                   scale_cholesterol_total    0.048486
6                           scale_diet_score    0.048345
7                  scale_sleep_hours_per_day    0.042388
15                     scale_hdl_cholesterol    0.041338
13                          scale_heart_rate    0.040573
12                        scale_diastolic_bp    0.039195
10                  scale_waist_to_hip_ratio    0.031197
19                          encode_ethnicity    0.014818
1               alcohol_consump

In [42]:
# 嘗試篩選掉低於 5% 以下特徵再進行一次
scale_col = ['physical_activity_minutes_per_week','bmi',
             'systolic_bp','triglycerides']
encode_col = []
drop_col = ['cardiovascular_history','hypertension_history','employment_status','smoking_status','gender','education_level','income_level','ethnicity',
           'alcohol_consumption_per_week','waist_to_hip_ratio','diastolic_bp','heart_rate','hdl_cholesterol','sleep_hours_per_day','diet_score',
           'cholesterol_total','screen_time_hours_per_day']
new_df_train = df_train.drop(columns=drop_col)
new_df_test = df_test.drop(columns=drop_col)
train,scalers,encoders = train_encode_scale_process(new_df_train,scale_col,encode_col)
pred = pred_encode_scale_process(new_df_test,scale_col,encode_col,scalers,encoders)

In [48]:
x_col = list(train.columns)
y_col = 'diagnosed_diabetes'
x_col.remove('diagnosed_diabetes')
x = train[x_col]
y = train[y_col]
from sklearn.model_selection import StratifiedKFold
# 初始化 StratifiedKFold
n_splits = 5  # 定義摺疊數目
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 建立並配置 Random Forest
rf_model2 = RandomForestClassifier(
    n_estimators=200,       # 樹的數量（可以增加以提升擬合能力，但可能增加計算成本）
    max_depth=30,           # 最大樹深度（限制以避免過擬合）
    min_samples_split=10,   # 分裂的最小樣本數
    min_samples_leaf=10,     # 葉節點的最小樣本數
    n_jobs=-1               # 使用所有處理器並行運算
)
# 保存每折的評估結果
fold_accuracies = []

# 開始交叉驗證
for fold_index, (train_index, test_index) in enumerate(skf.split(x, y)):
    print(f"Fold {fold_index + 1}:")
    
    # 切分訓練集和測試集
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]  # 如果是 DataFrame
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # 訓練模型
    rf_model2.fit(X_train, y_train)
    
    # 進行測試集預測
    y_pred = rf_model2.predict(X_test)
    
    # 計算準確率
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for fold {fold_index + 1}: {accuracy}")
    
    # 保存當前折的準確率
    fold_accuracies.append(accuracy)

# 計算平均交叉驗證準確率
mean_accuracy = np.mean(fold_accuracies)
print(f"Mean cross-validation accuracy: {mean_accuracy}")

Fold 1:
Accuracy for fold 1: 0.6721285714285714
Fold 2:
Accuracy for fold 2: 0.6713071428571429
Fold 3:
Accuracy for fold 3: 0.6695571428571429
Fold 4:
Accuracy for fold 4: 0.6697428571428572
Fold 5:
Accuracy for fold 5: 0.6696928571428571
Mean cross-validation accuracy: 0.6704857142857142


# LogisticRegression

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split

train_data,test_data,train_label,test_label = train_test_split(x,y,test_size=0.2,random_state=42,shuffle=True)
lr_model = LogisticRegression()
lr_model.fit(train_data,train_label)

y_pred_lr = lr_model.predict(test_data)
accuracy = accuracy_score(test_label, y_pred_lr)
print(f"Accuracy: {accuracy}")


Accuracy: 0.6564142857142857


In [8]:
n_splits = 5  # 定義摺疊數目
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
lr_model_skf = LogisticRegression()
# 保存每折的評估結果
fold_accuracies = []

# 開始交叉驗證
for fold_index, (train_index, test_index) in enumerate(skf.split(x, y)):
    print(f"Fold {fold_index + 1}:")
    
    # 切分訓練集和測試集
    X_train, X_test = x.iloc[train_index], x.iloc[test_index]  # 如果是 DataFrame
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # 訓練模型
    lr_model_skf.fit(X_train, y_train)
    
    # 進行測試集預測
    y_pred = lr_model_skf.predict(X_test)
    
    # 計算準確率
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for fold {fold_index + 1}: {accuracy}")
    
    # 保存當前折的準確率
    fold_accuracies.append(accuracy)

# 計算平均交叉驗證準確率
mean_accuracy = np.mean(fold_accuracies)
print(f"Mean cross-validation accuracy: {mean_accuracy}")

Fold 1:
Accuracy for fold 1: 0.6592428571428571
Fold 2:
Accuracy for fold 2: 0.6523142857142857
Fold 3:
Accuracy for fold 3: 0.6509928571428572
Fold 4:
Accuracy for fold 4: 0.6548071428571428
Fold 5:
Accuracy for fold 5: 0.6545928571428571
Mean cross-validation accuracy: 0.65439


# Gradient Boosting

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split

train_data,test_data,train_label,test_label = train_test_split(x,y,test_size=0.2,random_state=42,shuffle=True)

gbc_model = GradientBoostingClassifier(
            n_estimators=200,            # 樹的數量（增大可能提升效果，但需注意計算成本）
            learning_rate=0.01,           # 學習率（減少可能減低步長，避免過擬合）
            max_depth=5,                 # 樹的深度（增加過擬合可能性）
            subsample=0.8,               # 每棵樹所使用的子樣本比例（控制過擬合）
            random_state=42              # 保持結果穩定
            )
gbc_model.fit(train_data,train_label)

y_pred_gbc = gbc_model.predict(test_data)
accuracy = accuracy_score(test_label, y_pred_gbc)
print(f"Accuracy: {accuracy}")

# LGBM

In [8]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 將資料分為訓練集和測試集
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# LightGBM 的數據格式
train_data = lgb.Dataset(x_train, label=y_train)
test_data = lgb.Dataset(x_test, label=y_test, reference=train_data)

# 設定參數
params = {
    'objective': 'binary',           # 二分類任務
    'metric': 'binary_error',        # 度量準確率（錯誤率，1-error）
    'boosting_type': 'gbdt',         # 使用梯度提升樹
    'learning_rate': 0.1,            # 學習率，可以根據需求調整
    'num_leaves': 31,                # 葉子節點數目（控制模型複雜度）
    'max_depth': -1,                 # 不限制樹的深度
    'verbose': -1                    # 關閉訓練過程中的輸出
}

# 訓練模型
num_round = 100                     # 訓練的迭代次數
model = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[train_data, test_data],
    valid_names=['train', 'valid']
)
y_pred_prob = model.predict(x_test, num_iteration=model.best_iteration)
y_pred = (y_pred_prob > 0.5).astype(int)   # 設定閾值為 0.5
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6801785714285714


In [9]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

# 建立模型
model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42
)

# 執行 5 折交叉驗證
scores = cross_val_score(model, x, y, cv=5, scoring='accuracy')
print(f"Cross-validation scores: {scores}")
print(f"Mean cross-validation accuracy: {scores.mean()}")

Cross-validation scores: [0.6819     0.68097143 0.68295    0.68185714 0.67711429]
Mean cross-validation accuracy: 0.6809585714285713


# Advanced Application by LGBM + optuna : To search the best hyperparameter

In [7]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    par = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'random_state': 42,
        'n_jobs': -1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'n_estimators': 2000 # Use a smaller number during tuning for speed
    }
    model = LGBMClassifier(**par)
    # 執行 5 折交叉驗證
    scores = cross_val_score(model, x, y, cv=5, scoring='accuracy')
    return scores.mean()
    
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50) # Increase to 100 for better results

print("Best Score:", study.best_value)
print("Best Params:", study.best_params)


[I 2025-12-30 15:29:02,897] A new study created in memory with name: no-name-fee9e7a1-93e9-4d5c-a721-55e850128750
[I 2025-12-30 15:45:58,873] Trial 0 finished with value: 0.6848571428571428 and parameters: {'learning_rate': 0.010827781835940048, 'num_leaves': 236, 'max_depth': 9, 'min_child_samples': 145, 'subsample': 0.5800841515127184, 'colsample_bytree': 0.6933716365120444, 'lambda_l1': 0.00030058789814287095, 'lambda_l2': 0.00023062118601318822}. Best is trial 0 with value: 0.6848571428571428.
[I 2025-12-30 15:59:33,185] Trial 1 finished with value: 0.6849514285714285 and parameters: {'learning_rate': 0.01988605235192864, 'num_leaves': 113, 'max_depth': 10, 'min_child_samples': 128, 'subsample': 0.96345889030772, 'colsample_bytree': 0.8143642206334124, 'lambda_l1': 1.0724919378383263e-08, 'lambda_l2': 1.6284337014699236e-06}. Best is trial 1 with value: 0.6849514285714285.
[W 2025-12-30 15:59:37,315] Trial 2 failed with parameters: {'learning_rate': 0.04848188921148744, 'num_leaves

KeyboardInterrupt: 

In [11]:
print("Best Score:", study.best_value)
print("Best Params:", study.best_params)
best_params = study.best_params
best_params.update({'objective': 'binary', 'metric': 'auc', 'verbosity': -1, 'n_estimators': 5000})

Best Score: 0.6849514285714285
Best Params: {'learning_rate': 0.01988605235192864, 'num_leaves': 113, 'max_depth': 10, 'min_child_samples': 128, 'subsample': 0.96345889030772, 'colsample_bytree': 0.8143642206334124, 'lambda_l1': 1.0724919378383263e-08, 'lambda_l2': 1.6284337014699236e-06}


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
x_tr,x_te,y_tr,y_te = train_test_split(x,y,test_size=0.2,random_state=42)
# 建立模型
model = LGBMClassifier(
    **best_params
)
model.fit(x_tr,y_tr)

AttributeError: 'LGBMClassifier' object has no attribute 'best_iteration'

In [15]:
y_pred_prob = model.predict(x_te)
y_pred = (y_pred_prob > 0.5).astype(int)   # 設定閾值為 0.5
accuracy = accuracy_score(y_te, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6843928571428571


In [17]:
pred['diagnosed_diabetes'] = model.predict(pred)
combine = pd.DataFrame({
    'id':df_test['id'],
    'diagnosed_diabetes':pred['diagnosed_diabetes']
})
combine.to_csv('submission.csv', index=False)