In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import optuna
from sklearn.model_selection import StratifiedKFold
import random
import numpy as np

In [2]:
# Read the modelling table and the separate evaluation table; in both files
# the first CSV column is used as the row index.
df, testset = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('cleaned.csv', 'test.csv')
)

In [3]:

# Split each table into the target column ('SMOKING') and the remaining feature columns.
y = df['SMOKING']
X = df.drop(columns='SMOKING')

y_test_new = testset['SMOKING']
X_test_new = testset.drop(columns='SMOKING')

In [4]:
def set_seed(seed=42):
    """Seed Python's, NumPy's and PyTorch's random generators with ``seed``.

    Also switches cuDNN to deterministic mode and turns off its benchmark mode.

    Args:
        seed: Integer seed handed to every generator (defaults to 42).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: deterministic kernels on, benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [5]:
# Shared 5-fold stratified splitter used by the objective below
# (shuffled, with a fixed random_state).
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _build_mlp(input_dim, hidden1, hidden2):
    """Return a two-hidden-layer perceptron whose final sigmoid emits one probability per row."""
    return nn.Sequential(
        nn.Linear(input_dim, hidden1),
        nn.ReLU(),
        nn.Linear(hidden1, hidden2),
        nn.ReLU(),
        nn.Linear(hidden2, 1),
        nn.Sigmoid(),
    )


def objective(trial):
    """Optuna objective: mean validation accuracy over the folds of ``skf``.

    For the configuration sampled from ``trial`` (two hidden-layer widths,
    learning rate, number of epochs) a fresh MLP is trained full-batch on each
    training fold of the module-level ``X`` / ``y`` and scored on the
    corresponding validation fold.

    Args:
        trial: The ``optuna`` trial used to sample the hyperparameters.

    Returns:
        float: The accuracy averaged over all folds.
    """
    hidden1 = trial.suggest_int('hidden1', 32, 128)
    hidden2 = trial.suggest_int('hidden2', 16, 64)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    epochs = trial.suggest_int('epochs', 20, 60)

    scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the scaler on the training fold only, then apply it to the
        # validation fold, so no validation statistics leak into training.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
        X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
        # BCELoss needs float targets shaped like the (n, 1) model output.
        y_train_tensor = torch.tensor(y_train_fold.values, dtype=torch.float32).view(-1, 1)
        y_val_tensor = torch.tensor(y_val_fold.values, dtype=torch.float32).view(-1, 1)

        # Re-seed right before building the network so every fold and trial
        # starts from the same random state.
        set_seed(42)
        model = _build_mlp(X_train_tensor.shape[1], hidden1, hidden2)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Full-batch training: one optimizer step per epoch.
        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            loss = criterion(model(X_train_tensor), y_train_tensor)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            y_pred_labels = (model(X_val_tensor) > 0.5).float()

        # Hand plain NumPy arrays to scikit-learn instead of relying on its
        # implicit conversion of torch tensors.
        fold_accuracy = accuracy_score(
            y_val_tensor.cpu().numpy().ravel(),
            y_pred_labels.cpu().numpy().ravel(),
        )
        scores.append(fold_accuracy)

    return float(np.mean(scores))


# Run a 20-trial search that maximizes the value returned by `objective`,
# using a TPE sampler constructed with seed=42.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    sampler=sampler,
    direction="maximize",
)
study.optimize(objective, n_trials=20)

# Report the best configuration found and its objective value.
print("best_params:", study.best_params)
print(f"best_value: {study.best_value:.4f}")

[I 2025-08-16 09:40:19,197] A new study created in memory with name: no-name-be618f9a-41ae-4c69-a775-a9168f9ebfdd
[I 2025-08-16 09:40:19,805] Trial 0 finished with value: 0.777061224489796 and parameters: {'hidden1': 68, 'hidden2': 62, 'lr': 0.0029106359131330704, 'epochs': 44}. Best is trial 0 with value: 0.777061224489796.
[I 2025-08-16 09:40:19,881] Trial 1 finished with value: 0.6233469387755102 and parameters: {'hidden1': 47, 'hidden2': 23, 'lr': 0.00013066739238053285, 'epochs': 55}. Best is trial 0 with value: 0.777061224489796.
[I 2025-08-16 09:40:19,970] Trial 2 finished with value: 0.5178775510204082 and parameters: {'hidden1': 90, 'hidden2': 50, 'lr': 0.00010994335574766199, 'epochs': 59}. Best is trial 0 with value: 0.777061224489796.
[I 2025-08-16 09:40:20,015] Trial 3 finished with value: 0.6150204081632653 and parameters: {'hidden1': 112, 'hidden2': 26, 'lr': 0.0002310201887845295, 'epochs': 27}. Best is trial 0 with value: 0.777061224489796.
[I 2025-08-16 09:40:20,064] 

best_params: {'hidden1': 38, 'hidden2': 62, 'lr': 0.00853618986286683, 'epochs': 53}
best_value: 0.8422


In [6]:
# Re-run the 5-fold cross-validation with the configuration selected by the
# Optuna study and print per-fold metrics. The hyperparameters are read from
# `study.best_params` instead of being copied by hand, so this cell cannot
# drift out of sync with the search.
best_params = study.best_params
epochs = best_params['epochs']


class MLP(nn.Module):
    """Two-hidden-layer perceptron with a sigmoid output.

    The hidden widths default to the values of the study's best trial.
    """

    def __init__(self, input_dim, hidden1=best_params['hidden1'], hidden2=best_params['hidden2']):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.model(x)


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X, y):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the scaler on the training fold only; the validation fold is
    # transformed with the training statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_val_scaled = scaler.transform(X_val_fold)

    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
    # BCELoss needs float targets shaped like the (n, 1) model output.
    y_train_tensor = torch.tensor(y_train_fold.values, dtype=torch.float32).view(-1, 1)
    y_val_tensor = torch.tensor(y_val_fold.values, dtype=torch.float32).view(-1, 1)

    # Re-seed before building the network so every fold starts from the same random state.
    set_seed(42)
    model = MLP(X_train_tensor.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=best_params['lr'])

    # Full-batch training: one optimizer step per epoch.
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        y_pred = model(X_val_tensor)
        y_pred_labels = (y_pred > 0.5).float()

        y_pred_train = model(X_train_tensor)
        y_pred_labels_train = (y_pred_train > 0.5).float()

    # Convert to NumPy before calling scikit-learn rather than relying on
    # its implicit handling of torch tensors.
    y_val_np = y_val_tensor.cpu().numpy()
    y_pred_labels_np = y_pred_labels.cpu().numpy()
    y_pred_labels_train_np = y_pred_labels_train.cpu().numpy()

    acc = accuracy_score(y_val_np, y_pred_labels_np)
    train_acc = accuracy_score(y_train_fold, y_pred_labels_train_np)

    print(f"\ntrain Accuracy: {train_acc:.4f}")
    # "test" in this message refers to the held-out validation fold of the current split.
    print(f"\ntest Accuracy: {acc:.4f}")

    print("\nClassification Report:\n", classification_report(y_val_np, y_pred_labels_np, digits=4))



train Accuracy: 1.0000

test Accuracy: 0.8200

Classification Report:
               precision    recall  f1-score   support

         0.0     0.8095    0.7727    0.7907        22
         1.0     0.8276    0.8571    0.8421        28

    accuracy                         0.8200        50
   macro avg     0.8186    0.8149    0.8164        50
weighted avg     0.8196    0.8200    0.8195        50


train Accuracy: 0.9949

test Accuracy: 0.8400

Classification Report:
               precision    recall  f1-score   support

         0.0     0.8182    0.8182    0.8182        22
         1.0     0.8571    0.8571    0.8571        28

    accuracy                         0.8400        50
   macro avg     0.8377    0.8377    0.8377        50
weighted avg     0.8400    0.8400    0.8400        50


train Accuracy: 0.9899

test Accuracy: 0.8980

Classification Report:
               precision    recall  f1-score   support

         0.0     0.8333    0.9524    0.8889        21
         1.0     0.96

In [7]:
from sklearn.model_selection import train_test_split
# Final model: train on the whole training table and evaluate on the separate
# test table. The hyperparameters are read from `study.best_params` instead of
# being copied by hand, so this cell cannot drift out of sync with the search.
best_params = study.best_params

# Standardize with statistics from the training table only, then apply them
# to the test table.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test_new)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
# NOTE: the `*_val_*` names hold the test table here; the names are kept
# because the following cell reads `X_val_tensor`.
X_val_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
# BCELoss needs float targets shaped like the (n, 1) model output.
y_train_tensor = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)
y_val_tensor = torch.tensor(y_test_new.values, dtype=torch.float32).view(-1, 1)


class MLP(nn.Module):
    """Two-hidden-layer perceptron with a sigmoid output.

    The hidden widths default to the values of the study's best trial.
    """

    def __init__(self, input_dim, hidden1=best_params['hidden1'], hidden2=best_params['hidden2']):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        return self.model(x)


# Re-seed right before building the network so the weight initialization is repeatable.
set_seed(42)
model = MLP(X_train_tensor.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=best_params['lr'])

# Full-batch training: one optimizer step per epoch.
epochs = best_params['epochs']
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()


model.eval()
with torch.no_grad():
    y_pred = model(X_val_tensor)
    y_pred_labels = (y_pred > 0.5).float()

    y_pred_train = model(X_train_tensor)
    y_pred_labels_train = (y_pred_train > 0.5).float()

# Convert to NumPy before calling scikit-learn rather than relying on its
# implicit handling of torch tensors.
y_val_np = y_val_tensor.cpu().numpy()
y_pred_labels_np = y_pred_labels.cpu().numpy()
y_pred_labels_train_np = y_pred_labels_train.cpu().numpy()

acc = accuracy_score(y_val_np, y_pred_labels_np)
train_acc = accuracy_score(y, y_pred_labels_train_np)

print(f"\ntrain Accuracy: {train_acc:.4f}")
print(f"\ntest Accuracy: {acc:.4f}")

print("\nClassification Report:\n", classification_report(y_val_np, y_pred_labels_np, digits=4))


train Accuracy: 0.9919

test Accuracy: 0.8871

Classification Report:
               precision    recall  f1-score   support

         0.0     0.8571    0.8889    0.8727        27
         1.0     0.9118    0.8857    0.8986        35

    accuracy                         0.8871        62
   macro avg     0.8845    0.8873    0.8856        62
weighted avg     0.8880    0.8871    0.8873        62



In [8]:
# Score the evaluation tensor with the trained network and keep the raw
# sigmoid outputs as a flat NumPy vector.
model.eval()
with torch.no_grad():
    y_proba_mlp = model(X_val_tensor)
y_proba_mlp_np = y_proba_mlp.cpu().numpy()
y_proba_mlp_flat = y_proba_mlp_np.flatten()

# Add (or replace) the "mlp" column in the existing score table and write the
# table back to the same file without the index.
df_scores = pd.read_csv("roc_scores_rf.csv").assign(mlp=y_proba_mlp_flat)
df_scores.to_csv("roc_scores_rf.csv", index=False)