In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np 
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
import optuna
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Read the training and evaluation CSVs; the first column of each file is used as the index.
train = pd.read_csv('../Unbalanced_c_Dataset.csv', index_col=0)
test_set = pd.read_csv('../testset.csv', index_col=0)

# Training split: discard the 'id' column, then separate the 'smoking' label from the features.
train_copy = train.drop(columns='id')
y = train_copy['smoking']
X = train_copy.drop(columns='smoking')

# Evaluation split: same treatment as the training frame.
test_set = test_set.drop(columns='id')
y_test = test_set['smoking']
X_test = test_set.drop(columns='smoking')

In [3]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import numpy as np
import xgboost as xgb


# Stratified 5-fold splitter used inside the Optuna objective; shuffling is seeded
# so every trial is scored on the same folds.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def objective(trial):
    """Score one XGBoost hyperparameter configuration for the Optuna study.

    The configuration is evaluated with the module-level ``skf`` splitter on the
    module-level ``X`` / ``y``. For every fold the score is

        0.5 * val_accuracy - 0.5 * |train_accuracy - val_accuracy|

    so the value returned to Optuna is NOT an accuracy: it is at most 0.5 and is
    reduced by the train/validation gap. To keep trials interpretable, the mean
    validation accuracy, mean training accuracy and mean validation macro-F1 are
    stored on the trial as user attributes (``val_accuracy``, ``train_accuracy``,
    ``val_f1_macro``).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth`` and ``learning_rate``.

    Returns
    -------
    float
        Mean of the per-fold penalised scores (to be maximised).
    """
    # Sampling order is kept fixed so a seeded sampler proposes the same values.
    n_estimators = trial.suggest_int('n_estimators', 100, 300, step=20)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)

    scores = []
    val_accs = []
    train_accs = []
    val_f1s = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # A fresh model per fold so no state carries over between folds.
        model = xgb.XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_jobs=-1,
            random_state=42
        )
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        val_f1s.append(f1_score(y_val, y_pred, average='macro'))

        # Training accuracy is only needed to measure the train/validation gap.
        acc_train = accuracy_score(y_train, model.predict(X_train))

        overfit_penalty = abs(acc_train - acc)
        score = 0.5 * acc - 0.5 * overfit_penalty

        val_accs.append(acc)
        train_accs.append(acc_train)
        scores.append(score)

    # Expose the raw metrics; the returned objective alone hides the actual accuracy.
    trial.set_user_attr('val_accuracy', float(np.mean(val_accs)))
    trial.set_user_attr('train_accuracy', float(np.mean(train_accs)))
    trial.set_user_attr('val_f1_macro', float(np.mean(val_f1s)))

    return np.mean(scores)


# Seeded TPE sampler for the hyperparameter search.
sampler = optuna.samplers.TPESampler(seed=42)

# Maximise the value returned by `objective` over 20 trials.
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
)
study.optimize(objective, n_trials=20)

# Report the best configuration found and its objective value.
print("best_params:", study.best_params)
print(f"best_value: {study.best_value:.4f}")

[I 2025-08-23 17:13:57,629] A new study created in memory with name: no-name-86ea3975-8a89-4b19-8d22-d98eb279418b
[I 2025-08-23 17:13:59,186] Trial 0 finished with value: 0.2720083333333333 and parameters: {'n_estimators': 180, 'max_depth': 10, 'learning_rate': 0.1205712628744377}. Best is trial 0 with value: 0.2720083333333333.
[I 2025-08-23 17:13:59,689] Trial 1 finished with value: 0.37958333333333333 and parameters: {'n_estimators': 220, 'max_depth': 4, 'learning_rate': 0.01699897838270077}. Best is trial 1 with value: 0.37958333333333333.
[I 2025-08-23 17:14:00,441] Trial 2 finished with value: 0.30864166666666665 and parameters: {'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.07725378389307355}. Best is trial 1 with value: 0.37958333333333333.
[I 2025-08-23 17:14:00,850] Trial 3 finished with value: 0.351 and parameters: {'n_estimators': 240, 'max_depth': 3, 'learning_rate': 0.27081608642499677}. Best is trial 1 with value: 0.37958333333333333.
[I 2025-08-23 17:14:01,478

best_params: {'n_estimators': 280, 'max_depth': 4, 'learning_rate': 0.01855998084649058}
best_value: 0.3800


In [4]:
# Refit on the full training set using the hyperparameters selected by the Optuna
# study (n_estimators, max_depth, learning_rate). Reading them from
# `study.best_params` instead of hand-copied literals keeps this cell in sync
# with the search whenever the search space, seed or data change.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    **study.best_params,
)
model.fit(X, y)

# Evaluate on the separate test set; precision/recall/F1 are macro-averaged.
y_test_pred = model.predict(X_test)

acc_test = accuracy_score(y_test, y_test_pred)
precision_test = precision_score(y_test, y_test_pred, average='macro')
recall_test = recall_score(y_test, y_test_pred, average='macro')
f1_test = f1_score(y_test, y_test_pred, average='macro')

# Accuracy on the data the model was fitted on, shown next to the test accuracy
# to make the train/test gap visible.
train_pred = model.predict(X)
train_acc = accuracy_score(y, train_pred)
print(f"Train Accuracy: {train_acc:.4f}, test Accuracy: {acc_test:.4f}")

print(f"Test set Accuracy:  {acc_test:.4f}")
print(f"Test set Precision: {precision_test:.4f}")
print(f"Test set Recall:    {recall_test:.4f}")
print(f"Test set F1 Score:  {f1_test:.4f}")
print("\nTest set classification_report:")
print(classification_report(y_test, y_test_pred, digits=4))

Train Accuracy: 0.7920, test Accuracy: 0.7790
Test set Accuracy:  0.7790
Test set Precision: 0.7929
Test set Recall:    0.7790
Test set F1 Score:  0.7763

Test set classification_report:
              precision    recall  f1-score   support

           0     0.8568    0.6700    0.7520      1000
           1     0.7291    0.8880    0.8007      1000

    accuracy                         0.7790      2000
   macro avg     0.7929    0.7790    0.7763      2000
weighted avg     0.7929    0.7790    0.7763      2000



In [5]:
# Per-fold train/validation accuracy for the tuned configuration, using the same
# seeded stratified 5-fold scheme as the hyperparameter search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):

    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Use a fold-local name: rebinding `model` here would silently replace the
    # model fitted on the full training set with one trained on a single fold.
    # Hyperparameters come from the study rather than duplicated literals.
    fold_model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        random_state=42,
        **study.best_params,
    )
    fold_model.fit(X_train, y_train)

    y_train_pred = fold_model.predict(X_train)
    y_train_acc = accuracy_score(y_train, y_train_pred)

    y_pred = fold_model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    print(f"Fold {fold+1} train Accuracy: {y_train_acc:.4f}")
    print(f"Fold {fold+1} Accuracy: {acc:.4f}")

    scores.append(acc)


# Mean validation accuracy across the folds.
avg_acc = np.mean(scores)
print(f"\n avg_acc: {avg_acc:.4f}")

Fold 1 train Accuracy: 0.7904
Fold 1 Accuracy: 0.7910
Fold 2 train Accuracy: 0.7997
Fold 2 Accuracy: 0.7617
Fold 3 train Accuracy: 0.7924
Fold 3 Accuracy: 0.7870
Fold 4 train Accuracy: 0.7950
Fold 4 Accuracy: 0.7740
Fold 5 train Accuracy: 0.7951
Fold 5 Accuracy: 0.7733

 avg_acc: 0.7774
