In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Load the training data and the test set; the first CSV column becomes the index.
df = pd.read_csv(
    '../balanced_cleaned_trian_dataset.csv',
    index_col=0,
)
test_set = pd.read_csv(
    '../testset.csv',
    index_col=0,
)

In [3]:
# Split the frame by class label so each class can be down-sampled separately.
grouped = df.groupby("smoking")
df_0, df_1 = (grouped.get_group(label) for label in (0, 1))

# Keep a reproducible 50% sample of each class (same seed for both).
df_0_half, df_1_half = (
    part.sample(frac=0.5, random_state=42) for part in (df_0, df_1)
)

# Stack the two halves, then shuffle the rows and renumber the index from 0.
df_half = (
    pd.concat([df_0_half, df_1_half])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [4]:
# Sanity check: number of rows per `smoking` class after the per-class down-sampling.
df_half['smoking'].value_counts()

smoking
1    5000
0    5000
Name: count, dtype: int64

In [5]:
# Training features / target: drop the `id` column, then separate the
# `smoking` label from the remaining columns.
train_copy = df_half.drop('id', axis=1)
X = train_copy.drop('smoking', axis=1)
y = train_copy['smoking']

# Same split for the test set. `test_set` is reassigned in place, so a plain
# drop would raise KeyError the second time this cell is executed;
# errors='ignore' keeps the cell re-runnable with the same result.
test_set = test_set.drop(columns='id', errors='ignore')
X_test = test_set.drop('smoking', axis=1)
y_test = test_set['smoking']

In [6]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np


# Shared 5-fold stratified splitter used by the tuning objective below.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def objective(trial):
    """Optuna objective for tuning a RandomForestClassifier.

    For each of the 5 stratified folds of (X, y) a forest is fitted on the
    training part and scored as

        0.5 * validation_accuracy - 0.5 * |train_accuracy - validation_accuracy|

    so that configurations with a large train/validation gap are penalised.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample `n_estimators`, `max_depth` and `max_features`.

    Returns
    -------
    float
        Mean of the penalised score over the folds (to be maximised).
    """
    n_estimators = trial.suggest_int('n_estimators', 100, 300, step=20)
    max_depth = trial.suggest_int('max_depth', 5, 10)
    max_features = trial.suggest_int('max_features', 3, 10)

    scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
            n_jobs=-1,
        )
        rf.fit(X_train, y_train)

        # Only accuracy feeds the score, so no other metric is computed here.
        acc = accuracy_score(y_val, rf.predict(X_val))
        acc_train = accuracy_score(y_train, rf.predict(X_train))

        # Absolute train/validation gap acts as the overfitting penalty.
        overfit_penalty = abs(acc_train - acc)
        scores.append(0.5 * acc - 0.5 * overfit_penalty)

    return np.mean(scores)

# Seeded TPE sampler so the sequence of suggested hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
)

# Run 20 trials of the objective defined above.
study.optimize(objective, n_trials=20)

# Report the best configuration found and its objective value.
print("best_params:", study.best_params)
print(f"best_value: {study.best_value:.4f}")

[I 2025-08-22 15:06:17,033] A new study created in memory with name: no-name-60f0b277-58a8-47b5-a6f5-18e18518e642
[I 2025-08-22 15:06:19,187] Trial 0 finished with value: 0.3400875 and parameters: {'n_estimators': 180, 'max_depth': 10, 'max_features': 8}. Best is trial 0 with value: 0.3400875.
[I 2025-08-22 15:06:20,436] Trial 1 finished with value: 0.372875 and parameters: {'n_estimators': 220, 'max_depth': 5, 'max_features': 4}. Best is trial 1 with value: 0.372875.
[I 2025-08-22 15:06:21,506] Trial 2 finished with value: 0.33863750000000004 and parameters: {'n_estimators': 100, 'max_depth': 10, 'max_features': 7}. Best is trial 1 with value: 0.372875.
[I 2025-08-22 15:06:23,383] Trial 3 finished with value: 0.3745125 and parameters: {'n_estimators': 240, 'max_depth': 5, 'max_features': 10}. Best is trial 3 with value: 0.3745125.
[I 2025-08-22 15:06:25,228] Trial 4 finished with value: 0.3715875 and parameters: {'n_estimators': 280, 'max_depth': 6, 'max_features': 4}. Best is trial 3

best_params: {'n_estimators': 300, 'max_depth': 5, 'max_features': 10}
best_value: 0.3755


In [7]:
# 5-fold stratified cross-validation of the random forest with the tuned
# hyperparameters (n_estimators=300, max_depth=5, max_features=10).
# Prints per-fold metrics and, once all folds are done, the mean accuracy.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the scaler on the training fold only, then apply it to the
    # validation fold, so no validation statistics leak into training.
    # NOTE(review): the tuning objective and the final model fit on unscaled
    # features; confirm whether scaling is intended here.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=5,
        max_features=10,
        random_state=42,
    )
    rf.fit(X_train_scaled, y_train)
    y_pred = rf.predict(X_val_scaled)

    acc = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='macro')
    recall = recall_score(y_val, y_pred, average='macro')
    f1 = f1_score(y_val, y_pred, average='macro')

    print(f"Fold {fold + 1}:")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print("-" * 30)

    print(f"Fold {fold + 1} Classification Report:")
    print(classification_report(y_val, y_pred, digits=2))
    print("-" * 40)

    scores.append(acc)

# Aggregate once, after every fold has been scored, and actually report it
# (previously this was recomputed inside the loop and never shown).
avg_acc = np.mean(scores)
print(f"Mean CV Accuracy: {avg_acc:.4f}")

Fold 1:
  Accuracy:  0.7890
  Precision: 0.8057
  Recall:    0.7890
  F1 Score:  0.7861
------------------------------
Fold 1 Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.67      0.76      1000
           1       0.73      0.91      0.81      1000

    accuracy                           0.79      2000
   macro avg       0.81      0.79      0.79      2000
weighted avg       0.81      0.79      0.79      2000

----------------------------------------
Fold 2:
  Accuracy:  0.7645
  Precision: 0.7826
  Recall:    0.7645
  F1 Score:  0.7607
------------------------------
Fold 2 Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.64      0.73      1000
           1       0.71      0.89      0.79      1000

    accuracy                           0.76      2000
   macro avg       0.78      0.76      0.76      2000
weighted avg       0.78      0.76      0.76      2000

-------

In [8]:
# Fit the forest with the selected hyperparameters on the full training data.
rf_final = RandomForestClassifier(
    n_estimators=300,
    max_depth=5,
    max_features=10,
    random_state=42,
)
rf_final.fit(X, y)

# Predict on the entire training set to compare train vs. test accuracy.
train_pred = rf_final.predict(X)
train_acc = accuracy_score(y, train_pred)

# Predict on the test set and compute macro-averaged metrics.
y_test_pred = rf_final.predict(X_test)
acc_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='macro')
recall_test = recall_score(y_test, y_test_pred, average='macro')
f1_test = f1_score(y_test, y_test_pred, average='macro')

print(f"Train Accuracy: {train_acc:.4f}, test Accuracy: {acc_test:.4f}")

print(f"test Accuracy:  {acc_test:.4f}")
print(f"test Precision: {precision_test:.4f}")
print(f"test Recall:    {recall_test:.4f}")
print(f"test F1 Score:  {f1_test:.4f}")
print("\ntest classification_report:")
print(classification_report(y_test, y_test_pred, digits=2))

Train Accuracy: 0.7806, test Accuracy: 0.7745
test Accuracy:  0.7745
test Precision: 0.7908
test Recall:    0.7745
test F1 Score:  0.7713

test classification_report:
              precision    recall  f1-score   support

           0       0.86      0.66      0.74      1000
           1       0.72      0.89      0.80      1000

    accuracy                           0.77      2000
   macro avg       0.79      0.77      0.77      2000
weighted avg       0.79      0.77      0.77      2000

