In [1]:
import xgboost as xgb
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np 
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import optuna
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import optuna
from optuna.visualization import plot_optimization_history

In [2]:
# Load the training data and the held-out test set. Both CSVs are read with
# their first column as the DataFrame index.
df, test_set = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../balanced_cleaned_trian_dataset.csv', '../testset.csv')
)

In [3]:
# Build a small class-balanced working set: 150 rows from each `smoking`
# class (0 and 1), drawn with a fixed seed, then shuffled together so the two
# classes are interleaved.
grouped = df.groupby("smoking")
df_0, df_1 = (grouped.get_group(label) for label in (0, 1))
df_0_sub, df_1_sub = (
    class_rows.sample(n=150, random_state=42) for class_rows in (df_0, df_1)
)
df_sub = (
    pd.concat([df_0_sub, df_1_sub])
    .sample(frac=1, random_state=42)  # frac=1 -> full shuffle
    .reset_index(drop=True)
)

In [4]:
# Training subsample: drop the `id` column, then split into the feature
# matrix X and the `smoking` target y.
train_copy = df_sub.drop(columns='id')
X = train_copy.drop(columns='smoking')
y = train_copy['smoking']

# Held-out test set: same split. `test_set` is reassigned from itself, so
# errors='ignore' keeps this cell re-runnable -- on a second execution `id`
# is already gone and a plain drop would raise KeyError.
test_set = test_set.drop(columns='id', errors='ignore')
X_test = test_set.drop(columns='smoking')
y_test = test_set['smoking']

In [5]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import numpy as np
import xgboost as xgb


# Shared cross-validation splitter: 5 stratified folds, shuffled with a fixed
# seed so the fold assignment is reproducible.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of an XGBoost classifier.

    Samples `n_estimators`, `max_depth` and `learning_rate` from `trial`,
    fits one `xgb.XGBClassifier` per fold produced by the module-level `skf`
    splitter over the module-level `X` / `y`, and scores each fold by
    accuracy on its validation part. Features are passed to the model
    unscaled.

    Parameters
    ----------
    trial : optuna trial object
        Used only for its `suggest_int` / `suggest_float` methods.

    Returns
    -------
    float
        Mean validation accuracy over all folds (the study maximizes it).
    """
    # Keep the suggest calls in this order: the seeded sampler's draws depend
    # on the sequence in which parameters are requested.
    n_estimators = trial.suggest_int('n_estimators', 50, 300, step=10)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.005, 0.3, log=True)

    scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_jobs=-1,
            random_state=42
        )
        model.fit(X_train, y_train)

        # The fold score is plain validation accuracy. Earlier per-fold
        # extras (macro-F1, train-set predictions, a train/validation gap)
        # were computed and then discarded, so they are no longer evaluated.
        scores.append(accuracy_score(y_val, model.predict(X_val)))

    return np.mean(scores)


# Run the hyper-parameter search: a seeded TPE sampler drives 20 trials of
# `objective`, and the study keeps the trial with the highest returned value.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
)
study.optimize(objective, n_trials=20)

# Report the winning configuration and its objective value.
print("best_params:", study.best_params)
print(f"best_value: {study.best_value:.4f}")

[I 2025-08-20 19:22:03,128] A new study created in memory with name: no-name-4d70d527-5d06-4a44-bcd1-79f6637208b2
[I 2025-08-20 19:22:03,379] Trial 0 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 140, 'max_depth': 10, 'learning_rate': 0.1001303991139125}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-08-20 19:22:03,592] Trial 1 finished with value: 0.7 and parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.009470040922904443}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-08-20 19:22:03,734] Trial 2 finished with value: 0.7266666666666666 and parameters: {'n_estimators': 60, 'max_depth': 9, 'learning_rate': 0.05859268690985102}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-08-20 19:22:03,888] Trial 3 finished with value: 0.71 and parameters: {'n_estimators': 230, 'max_depth': 2, 'learning_rate': 0.2652261985899885}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-08-20 19:22:04,121] Trial 4 finished wit

best_params: {'n_estimators': 210, 'max_depth': 4, 'learning_rate': 0.019973242588202714}
best_value: 0.7433


In [6]:
# Plot the objective value per trial and relabel the figure. The nested
# dict below sets the same layout properties as the flat `title_text`,
# `title_x` and `title_font_size` keywords.
fig = plot_optimization_history(study)
fig.update_layout(
    title=dict(
        text="XGBoost Optimization Progress",
        x=0.5,  # centre the title horizontally
        font=dict(size=20),
    ),
    xaxis_title="Trial Number",
    yaxis_title="Accuracy",
)
fig.show()