In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from optuna.visualization import plot_optimization_history

In [2]:
# Read the training CSV and the test CSV; the first column of each file becomes the index.
df = pd.read_csv(
    '../balanced_cleaned_trian_dataset.csv',
    index_col=0,
)
test_set = pd.read_csv(
    '../testset.csv',
    index_col=0,
)

In [3]:
# Build a class-balanced subsample: 150 rows per 'smoking' label, drawn with a fixed seed.
grouped = df.groupby("smoking")
df_0, df_1 = (grouped.get_group(label) for label in (0, 1))
df_0_sub, df_1_sub = (
    part.sample(n=150, random_state=42) for part in (df_0, df_1)
)

# frac=1 returns every row in shuffled order; the index is then renumbered from 0.
df_sub = (
    pd.concat([df_0_sub, df_1_sub])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [4]:
# Show how many rows of each 'smoking' label the subsample contains.
df_sub["smoking"].value_counts()

smoking
1    150
0    150
Name: count, dtype: int64

In [5]:
# Separate the feature columns from the 'smoking' target for the training subsample.
train_copy = df_sub.drop(columns='id')
X = train_copy.drop(columns='smoking')
y = train_copy['smoking']

# Same split for the test set. test_set is rebound without 'id', so re-running this
# cell would hit an already-dropped column; errors='ignore' keeps the cell re-runnable
# instead of raising KeyError on the second execution.
test_set = test_set.drop(columns='id', errors='ignore')
X_test = test_set.drop(columns='smoking')
y_test = test_set['smoking']

In [18]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np


# Stratified 5-fold splitter; shuffling uses a fixed seed so the folds are reproducible.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of a random forest.

    Samples ``n_estimators`` (20-100 in steps of 10), ``max_depth`` (2-6) and
    ``max_features`` (2-10) from ``trial``. For every split produced by the
    module-level ``skf`` over the module-level ``X`` / ``y``, a
    ``RandomForestClassifier`` is fitted on the training part and its accuracy
    is measured on the validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean validation accuracy over all folds.
    """
    # The suggest_* calls stay in this order so a seeded sampler draws the same values.
    n_estimators = trial.suggest_int('n_estimators', 20, 100, step=10)
    max_depth = trial.suggest_int('max_depth', 2, 6)
    max_features = trial.suggest_int('max_features', 2, 10)

    scores = []
    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            max_features=max_features,
            random_state=42,
            n_jobs=-1,
        )
        rf.fit(X_train, y_train)

        # Only validation accuracy feeds the objective, so nothing is computed on
        # the training split.
        scores.append(accuracy_score(y_val, rf.predict(X_val)))

    return np.mean(scores)

# Seeded TPE sampler so the sequence of sampled hyperparameters is repeatable.
sampler = optuna.samplers.TPESampler(seed=42)

# Maximise the value returned by `objective` over 20 trials.
study = optuna.create_study(
    direction='maximize',
    sampler=sampler,
)
study.optimize(objective, n_trials=20)

# Report the best hyperparameters and the best objective value found.
print("best_params:", study.best_params)
print(f"best_value: {study.best_value:.4f}")

[I 2025-08-20 19:14:08,764] A new study created in memory with name: no-name-b6697c75-12ad-4df2-929b-98dd5232c6d6
[I 2025-08-20 19:14:09,135] Trial 0 finished with value: 0.7333333333333332 and parameters: {'n_estimators': 50, 'max_depth': 6, 'max_features': 8}. Best is trial 0 with value: 0.7333333333333332.
[I 2025-08-20 19:14:09,452] Trial 1 finished with value: 0.7433333333333334 and parameters: {'n_estimators': 70, 'max_depth': 2, 'max_features': 3}. Best is trial 1 with value: 0.7433333333333334.
[I 2025-08-20 19:14:09,680] Trial 2 finished with value: 0.6966666666666667 and parameters: {'n_estimators': 20, 'max_depth': 6, 'max_features': 7}. Best is trial 1 with value: 0.7433333333333334.
[I 2025-08-20 19:14:10,063] Trial 3 finished with value: 0.7166666666666666 and parameters: {'n_estimators': 80, 'max_depth': 2, 'max_features': 10}. Best is trial 1 with value: 0.7433333333333334.
[I 2025-08-20 19:14:10,442] Trial 4 finished with value: 0.7266666666666666 and parameters: {'n_e

best_params: {'n_estimators': 70, 'max_depth': 5, 'max_features': 4}
best_value: 0.7567


In [20]:
# Plot the objective value per trial. update_layout returns the figure it was called
# on, so it can be chained directly onto the constructor.
fig = plot_optimization_history(study).update_layout(
    title_text="Random Forest Optimization Progress",
    title_x=0.5,  # centre the title horizontally
    title_font_size=20,
    xaxis_title="Trial Number",
    yaxis_title="Accuracy",
)
fig.show()