In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import optuna
from sklearn.model_selection import StratifiedKFold
import random
import numpy as np
from optuna.visualization import plot_optimization_history

In [2]:
# Load the training data and the test set; in both files the first CSV column
# is used as the DataFrame index.
df = pd.read_csv(
    '../balanced_cleaned_trian_dataset.csv',
    index_col=0,
)
test_set = pd.read_csv(
    '../testset.csv',
    index_col=0,
)

In [3]:
# Build a small class-balanced subsample: 150 rows from each `smoking` class
# (0 and 1), drawn with a fixed seed, then shuffled together and re-indexed.
grouped = df.groupby("smoking")
df_0, df_1 = (grouped.get_group(label) for label in (0, 1))
df_0_half, df_1_half = (
    part.sample(n=150, random_state=42) for part in (df_0, df_1)
)
df_half = (
    pd.concat([df_0_half, df_1_half])
    .sample(frac=1, random_state=42)  # frac=1 -> full shuffle of the 300 rows
    .reset_index(drop=True)
)


In [4]:
# Training split: drop the identifier column, then separate the feature
# columns (X) from the `smoking` target (y).
train_copy = df_half.drop(columns='id')
X = train_copy.drop(columns='smoking')
y = train_copy['smoking']

# Same split for the test set. `test_set` is rebound to its id-less version,
# so on a second run of this cell the 'id' column is already gone;
# errors='ignore' keeps the cell re-runnable instead of raising KeyError.
test_set = test_set.drop(columns='id', errors='ignore')
X_test_new = test_set.drop(columns='smoking')
y_test_new = test_set['smoking']

In [5]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (default generator plus all CUDA devices) with ``seed``, then sets the
    cuDNN flags to deterministic mode with benchmarking disabled.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # Deterministic cuDNN kernels, no auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [10]:
# Shared cross-validation splitter used by `objective`: 5 stratified folds,
# shuffled with a fixed seed so every trial is scored on the same splits.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

def objective(trial):
    """Optuna objective: mean cross-validated accuracy of a small MLP.

    Samples the two hidden-layer widths, the Adam learning rate and the
    number of full-batch training epochs from ``trial``. For every fold
    produced by the module-level ``skf`` splitter over ``X``/``y`` it fits a
    ``StandardScaler`` on the training part only, trains a freshly seeded
    network, and scores it on the held-out part.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: Mean validation accuracy across the folds.
    """
    hidden1 = trial.suggest_int('hidden1', 32, 128)
    hidden2 = trial.suggest_int('hidden2', 16, 64)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    epochs = trial.suggest_int('epochs', 20, 60)

    def build_model(input_dim):
        """Return a 2-hidden-layer binary classifier with sigmoid output."""
        return nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, 1),
            nn.Sigmoid(),
        )

    # The loss holds no state, so one instance serves every fold.
    criterion = nn.BCELoss()
    scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the scaler on the training fold only to avoid leaking
        # validation statistics into training.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
        X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
        # BCELoss needs targets shaped like the (n, 1) model output.
        y_train_tensor = torch.tensor(
            y_train_fold.values, dtype=torch.float32
        ).view(-1, 1)

        # Re-seed right before building the model so every fold (and every
        # trial) starts from the same weight initialisation stream.
        set_seed(42)
        model = build_model(X_train_tensor.shape[1])
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Full-batch training: one optimizer step per epoch.
        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            val_probs = model(X_val_tensor)

        # Hand scikit-learn flat NumPy arrays rather than (n, 1) tensors.
        y_pred_labels = (val_probs > 0.5).numpy().ravel().astype(np.float32)
        y_val_true = np.asarray(y_val_fold.values, dtype=np.float32)
        scores.append(accuracy_score(y_val_true, y_pred_labels))

    return float(np.mean(scores))


# Run the hyperparameter search: a seeded TPE sampler maximising the value
# returned by `objective` over 20 trials.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    sampler=sampler,
    direction="maximize",
)
study.optimize(
    objective,
    n_trials=20,
)

# Report the best hyperparameter combination found.
print("best_params:", study.best_params)


[I 2025-08-20 19:01:48,485] A new study created in memory with name: no-name-c913ace5-1c84-4fe9-ad77-9ede516bfa8a
[I 2025-08-20 19:01:48,606] Trial 0 finished with value: 0.6933333333333334 and parameters: {'hidden1': 68, 'hidden2': 62, 'lr': 0.0029106359131330704, 'epochs': 44}. Best is trial 0 with value: 0.6933333333333334.
[I 2025-08-20 19:01:48,716] Trial 1 finished with value: 0.5633333333333334 and parameters: {'hidden1': 47, 'hidden2': 23, 'lr': 0.00013066739238053285, 'epochs': 55}. Best is trial 0 with value: 0.6933333333333334.
[I 2025-08-20 19:01:48,840] Trial 2 finished with value: 0.6333333333333333 and parameters: {'hidden1': 90, 'hidden2': 50, 'lr': 0.00010994335574766199, 'epochs': 59}. Best is trial 0 with value: 0.6933333333333334.
[I 2025-08-20 19:01:48,889] Trial 3 finished with value: 0.6599999999999999 and parameters: {'hidden1': 112, 'hidden2': 26, 'lr': 0.0002310201887845295, 'epochs': 27}. Best is trial 0 with value: 0.6933333333333334.
[I 2025-08-20 19:01:48,

best_params: {'hidden1': 103, 'hidden2': 29, 'lr': 0.0010533018717800082, 'epochs': 31}


In [12]:
# Plot the objective value of every trial, with a centred title and explicit
# axis labels.
layout_options = {
    "title_text": "MLP Optimization Progress",
    "title_x": 0.5,
    "title_font_size": 20,
    "xaxis_title": "Trial Number",
    "yaxis_title": "Accuracy",
}
fig = plot_optimization_history(study)
fig.update_layout(**layout_options)
fig.show()