In [1]:
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np 
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import optuna


In [2]:
# Load the cleaned dataset; the first CSV column is used as the row index.
df = pd.read_csv('cleaned.csv', index_col=0)

# Remove the three columns that are excluded from this experiment.
train_copy = df.drop(columns=['bmi', 'avg_glucose_level', 'age'])

# Features are every remaining column except the label column.
X = train_copy.drop('smoking_status', axis=1)
y = train_copy['smoking_status']

# Hold out 30% of the rows. stratify=y keeps the class proportions of the
# label the same in both parts, consistent with the stratified
# cross-validation splits used in the rest of this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


In [3]:
# Baseline: 5-fold stratified cross-validation of an XGBoost classifier with
# fixed hyperparameters, reporting per-fold and mean accuracy.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    # Use fold-specific names here. Rebinding X_train / y_train would silently
    # replace the hold-out split produced by train_test_split with the last
    # fold's subset, while X_test / y_test stay unchanged.
    X_fold_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_fold_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the scaler on the fold's training rows only, then apply it to the
    # validation rows, so validation data never influences the scaling.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_fold_train)
    X_val_scaled = scaler.transform(X_val)

    # NOTE(review): 'binary:logistic' assumes a two-class label -- confirm
    # against the number of distinct values in smoking_status.
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        max_depth=10,
        learning_rate=0.1,
        n_estimators=100,
        eval_metric='logloss',
        random_state=42
    )

    model.fit(X_train_scaled, y_fold_train)

    y_pred = model.predict(X_val_scaled)
    acc = accuracy_score(y_val, y_pred)
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")
    scores.append(acc)


avg_acc = np.mean(scores)
print(f"\n avg_acc: {avg_acc:.4f}")

Fold 1 Accuracy: 0.5569
Fold 2 Accuracy: 0.5416
Fold 3 Accuracy: 0.5547
Fold 4 Accuracy: 0.5387
Fold 5 Accuracy: 0.5226

 avg_acc: 0.5429


In [4]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import numpy as np
import xgboost as xgb




# Splitter shared by every trial so all trials are scored on the same folds.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def objective(trial):
    """Score one hyperparameter candidate by mean cross-validated accuracy.

    Draws ``n_estimators``, ``max_depth`` and ``learning_rate`` from ``trial``,
    then, for each split that the module-level ``skf`` yields over the
    notebook-level ``X`` and ``y``, fits a MinMaxScaler and an XGBClassifier
    on the training rows and measures accuracy on the validation rows.

    Args:
        trial: Object exposing ``suggest_int`` and ``suggest_float``; this is
            what ``study.optimize`` passes in.

    Returns:
        ``np.mean`` of the per-fold accuracy scores.
    """
    # Sampled in the same order as the keys are listed.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
    }

    fold_accuracies = []

    for fit_rows, holdout_rows in skf.split(X, y):
        features_fit, features_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
        labels_fit, labels_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]

        # Scaling statistics come from the training rows of this fold only.
        rescaler = MinMaxScaler()
        fit_matrix = rescaler.fit_transform(features_fit)
        holdout_matrix = rescaler.transform(features_holdout)

        # NOTE(review): 'binary:logistic' assumes a two-class label -- confirm
        # against the number of distinct values in y.
        classifier = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            n_jobs=-1,
            random_state=42,
            **sampled,
        )
        classifier.fit(fit_matrix, labels_fit)

        predicted = classifier.predict(holdout_matrix)
        fold_accuracies.append(accuracy_score(labels_holdout, predicted))

    return np.mean(fold_accuracies)


# Search for the hyperparameters that maximise the value returned by
# `objective`. The sampler is created explicitly with a fixed seed so that a
# fresh "Restart & Run All" proposes the same sequence of candidates; without
# a seed every run explores different parameters and reports a different best.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)


print("best_params:", study.best_params)
print(f"best_value: {study.best_value:.4f}")

[I 2025-08-22 16:26:32,286] A new study created in memory with name: no-name-f4d56623-9635-4be6-b200-8d6e763a70f5
[I 2025-08-22 16:26:32,557] Trial 0 finished with value: 0.5747275010108319 and parameters: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.01596238623655878}. Best is trial 0 with value: 0.5747275010108319.
[I 2025-08-22 16:26:32,986] Trial 1 finished with value: 0.5405703219765488 and parameters: {'n_estimators': 180, 'max_depth': 8, 'learning_rate': 0.27170267655794955}. Best is trial 0 with value: 0.5747275010108319.
[I 2025-08-22 16:26:33,672] Trial 2 finished with value: 0.54319678236258 and parameters: {'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.05260210989093649}. Best is trial 0 with value: 0.5747275010108319.
[I 2025-08-22 16:26:33,978] Trial 3 finished with value: 0.5534166116916005 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.16335177525473613}. Best is trial 0 with value: 0.5747275010108319.
[I 2025-08-22 16:26:

best_params: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.014777234896168582}
best_value: 0.5765
