In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import optuna
from sklearn.model_selection import StratifiedKFold
import random
import numpy as np

In [2]:
# Read the frame used for tuning/training below and the held-out test frame;
# in both files the first CSV column becomes the index.
df, test_set = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../balanced_cleaned_trian_dataset.csv', '../testset.csv')
)

In [3]:
# Draw 150 rows from each `smoking` class (seeded), then shuffle the combined
# 300-row frame and renumber its index from 0.
grouped = df.groupby("smoking")
df_0, df_1 = (grouped.get_group(label) for label in (0, 1))
df_0_half, df_1_half = (
    part.sample(n=150, random_state=42) for part in (df_0, df_1)
)
df_half = (
    pd.concat([df_0_half, df_1_half])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [4]:
# Training side: drop the identifier column, then separate the features from
# the `smoking` target.
train_copy = df_half.drop('id', axis=1)
X = train_copy.drop('smoking', axis=1)
y = train_copy['smoking']

# Test side: `test_set` is rebound to its id-less version. errors='ignore'
# keeps this cell idempotent -- without it a second run of the cell raises
# KeyError because 'id' has already been removed from `test_set`.
test_set = test_set.drop(columns='id', errors='ignore')
X_test_new = test_set.drop('smoking', axis=1)
y_test_new = test_set['smoking']

In [5]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (the default generator plus all CUDA devices), and switches cuDNN to
    deterministic mode with its benchmark auto-tuner turned off.

    Parameters
    ----------
    seed : int, default 42
        Value handed to each seeding function.
    """
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

In [6]:
# Splitter shared by every Optuna trial: 5 stratified, shuffled folds with a
# fixed random_state.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def objective(trial):
    """Optuna objective: mean cross-validated accuracy of a small MLP.

    The trial supplies the two hidden-layer widths, the Adam learning rate and
    the number of full-batch training epochs. For every fold produced by the
    notebook-level ``skf`` splitter over ``X`` / ``y``, a fresh network is
    trained on the standardised training part and scored on the validation
    part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of the ``hidden1``, ``hidden2``, ``lr`` and ``epochs``
        suggestions.

    Returns
    -------
    float
        Validation accuracy averaged over all folds.
    """
    hidden1 = trial.suggest_int('hidden1', 32, 128)
    hidden2 = trial.suggest_int('hidden2', 16, 64)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    epochs = trial.suggest_int('epochs', 20, 60)

    def build_mlp(input_dim):
        """Return a newly initialised input -> hidden1 -> hidden2 -> 1 sigmoid MLP."""
        # Declared once per trial rather than once per fold; the layers are
        # created in the same order as before, so seeded initialisation is
        # unchanged.
        return nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, 1),
            nn.Sigmoid(),
        )

    scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        # The scaler is fitted on the training part of the fold only and then
        # applied to the validation part.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
        X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
        # Targets are reshaped to (N, 1) to line up with the single-unit output.
        y_train_tensor = torch.tensor(y_train_fold.values, dtype=torch.float32).view(-1, 1)
        y_val_tensor = torch.tensor(y_val_fold.values, dtype=torch.float32).view(-1, 1)

        # Re-seed before building the model so every fold starts from the
        # same RNG state.
        set_seed(42)
        model = build_mlp(X_train_tensor.shape[1])
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # Full-batch training: one optimiser step per epoch.
        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            loss = criterion(model(X_train_tensor), y_train_tensor)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            y_pred_labels = (model(X_val_tensor) > 0.5).float()

        # Hand scikit-learn plain 1-D NumPy arrays instead of torch tensors.
        fold_acc = accuracy_score(
            y_val_tensor.numpy().ravel(),
            y_pred_labels.numpy().ravel(),
        )
        scores.append(fold_acc)

    return float(np.mean(scores))


# Run a seeded TPE search that maximises the value returned by `objective`
# over 20 trials, then report the best parameter set.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(func=objective, n_trials=20)

print("best_params:", study.best_params)


[I 2025-08-22 14:56:00,951] A new study created in memory with name: no-name-4cec7da8-50f2-4b36-89d6-012fa20b9319
[I 2025-08-22 14:56:01,717] Trial 0 finished with value: 0.6933333333333334 and parameters: {'hidden1': 68, 'hidden2': 62, 'lr': 0.0029106359131330704, 'epochs': 44}. Best is trial 0 with value: 0.6933333333333334.
[I 2025-08-22 14:56:01,802] Trial 1 finished with value: 0.5633333333333334 and parameters: {'hidden1': 47, 'hidden2': 23, 'lr': 0.00013066739238053285, 'epochs': 55}. Best is trial 0 with value: 0.6933333333333334.
[I 2025-08-22 14:56:01,905] Trial 2 finished with value: 0.6333333333333333 and parameters: {'hidden1': 90, 'hidden2': 50, 'lr': 0.00010994335574766199, 'epochs': 59}. Best is trial 0 with value: 0.6933333333333334.
[I 2025-08-22 14:56:01,963] Trial 3 finished with value: 0.6599999999999999 and parameters: {'hidden1': 112, 'hidden2': 26, 'lr': 0.0002310201887845295, 'epochs': 27}. Best is trial 0 with value: 0.6933333333333334.
[I 2025-08-22 14:56:02,

best_params: {'hidden1': 103, 'hidden2': 29, 'lr': 0.0010533018717800082, 'epochs': 31}


In [7]:
# Standardise with statistics fitted on the full training features, and reuse
# the same fitted scaler for the held-out test features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_new_scaler = scaler.transform(X_test_new)

# Feature matrices as float32 tensors.
X_tensor, X_test_new_scaler_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_scaled, X_test_new_scaler)
)

# Targets as float32 tensors reshaped to (N, 1).
y_tensor, y_test_new_tensor = (
    torch.tensor(labels.values, dtype=torch.float32).view(-1, 1)
    for labels in (y, y_test_new)
)

class MLP(nn.Module):
    """Two-hidden-layer perceptron with a sigmoid output for binary targets.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    hidden1 : int, default 103
        Width of the first hidden layer.
    hidden2 : int, default 29
        Width of the second hidden layer.

    The defaults reproduce the previously hard-coded 103/29 architecture, so
    ``MLP(input_dim)`` builds the same network as before.
    """

    def __init__(self, input_dim, hidden1=103, hidden2=29):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the sigmoid output of the stacked layers for ``x``."""
        return self.model(x)
# Seed first so the weight initialisation below is repeatable.
set_seed(42)
model = MLP(input_dim=X_tensor.shape[1])

# Binary cross-entropy on the sigmoid output, optimised with Adam.
# The learning rate and epoch count are the values printed as best_params
# by the Optuna search.
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.0010533018717800082)

epochs = 31
for epoch in range(epochs):
    model.train()

    # One full-batch gradient step per epoch.
    optimizer.zero_grad()
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()

    # Report the loss on every tenth epoch (0, 10, 20, ...).
    if not epoch % 10:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# Score the trained model on the held-out test set and on its own training
# data, thresholding the sigmoid output at 0.5.
model.eval()
with torch.no_grad():
    # Held-out test predictions.
    y_pred = model(X_test_new_scaler_tensor)
    y_pred_labels = (y_pred > 0.5).float()

    y_true = y_test_new_tensor.cpu().numpy()
    y_pred_labels_np = y_pred_labels.cpu().numpy()

    # Training-set predictions.
    y_pred_train = model(X_tensor)
    y_pred_labels_train = (y_pred_train > 0.5).float()
    y_pred_labels_train_np = y_pred_labels_train.cpu().numpy()

    # Use the NumPy copy (previously computed but never used) so scikit-learn
    # receives arrays rather than a torch tensor, as the test metric does.
    train_acc = accuracy_score(y, y_pred_labels_train_np.ravel())
    print(f"\ntrain Accuracy: {train_acc:.4f}")

    acc = accuracy_score(y_true, y_pred_labels_np)
    print(f"\nTest Accuracy:  {acc:.4f}")

Epoch 0, Loss: 0.6954
Epoch 10, Loss: 0.6447
Epoch 20, Loss: 0.5849
Epoch 30, Loss: 0.5238

train Accuracy: 0.7533

Test Accuracy:  0.7465


In [8]:
# Detailed report for the trained model: accuracy, precision, recall and F1 on
# the held-out test set, followed by the training accuracy for comparison.
model.eval()
with torch.no_grad():
    # Threshold the sigmoid output at 0.5 to get hard labels.
    y_pred = model(X_test_new_scaler_tensor)
    y_pred_labels = (y_pred > 0.5).float()

    y_true = y_test_new_tensor.cpu().numpy()
    y_pred_labels_np = y_pred_labels.cpu().numpy()

    acc = accuracy_score(y_true, y_pred_labels_np)
    precision = precision_score(y_true, y_pred_labels_np)
    recall = recall_score(y_true, y_pred_labels_np)
    f1 = f1_score(y_true, y_pred_labels_np)

    print(f"\nTest Accuracy:  {acc:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall:    {recall:.4f}")
    print(f"Test F1 Score:  {f1:.4f}")
    print("\nClassification Report:\n", classification_report(y_true, y_pred_labels_np, digits=4))

    y_pred_train = model(X_tensor)
    y_pred_labels_train = (y_pred_train > 0.5).float()
    # Convert to NumPy before scoring, matching how the test metrics above are
    # computed, instead of handing scikit-learn a torch tensor.
    y_pred_labels_train_np = y_pred_labels_train.cpu().numpy()
    train_acc = accuracy_score(y, y_pred_labels_train_np.ravel())
    print(f"\ntrain Accuracy: {train_acc:.4f}")


Test Accuracy:  0.7465
Test Precision: 0.7157
Test Recall:    0.8180
Test F1 Score:  0.7634

Classification Report:
               precision    recall  f1-score   support

         0.0     0.7876    0.6750    0.7270      1000
         1.0     0.7157    0.8180    0.7634      1000

    accuracy                         0.7465      2000
   macro avg     0.7516    0.7465    0.7452      2000
weighted avg     0.7516    0.7465    0.7452      2000


train Accuracy: 0.7533


In [9]:

# 5-fold stratified cross-validation of the fixed 103/29 architecture,
# printing the train and validation accuracy of every fold.
# NOTE: the loop rebinds the notebook-level names `scaler`, `model`,
# `criterion` and `optimizer`; after this cell they refer to the last fold's
# objects.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []


class MLP(nn.Module):
    """Two-hidden-layer perceptron (103 and 29 units) with a sigmoid output."""

    # Declared once before the fold loop instead of once per fold.
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 103),
            nn.ReLU(),
            nn.Linear(103, 29),
            nn.ReLU(),
            nn.Linear(29, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the sigmoid output of the stacked layers for ``x``."""
        return self.model(x)


# Loop-invariant epoch count, set once for all folds.
epochs = 31

for train_idx, val_idx in skf.split(X, y):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # The scaler is fitted on the fold's training part only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_val_scaled = scaler.transform(X_val_fold)

    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_fold.values, dtype=torch.float32).view(-1, 1)
    y_val_tensor = torch.tensor(y_val_fold.values, dtype=torch.float32).view(-1, 1)

    # Re-seed before building the model so every fold starts from the same
    # RNG state.
    set_seed(42)
    model = MLP(X_train_tensor.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.0010533018717800082)

    # Full-batch training: one optimiser step per epoch.
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        y_pred = model(X_val_tensor)
        y_pred_labels = (y_pred > 0.5).float()
        # scikit-learn is given 1-D NumPy arrays rather than torch tensors.
        acc = accuracy_score(
            y_val_tensor.cpu().numpy().ravel(),
            y_pred_labels.cpu().numpy().ravel(),
        )
        scores.append(acc)

        y_pred_train = model(X_train_tensor)
        y_pred_labels_train = (y_pred_train > 0.5).float()
        train_acc = accuracy_score(
            y_train_fold,
            y_pred_labels_train.cpu().numpy().ravel(),
        )
        print(f"\ntrain Accuracy: {train_acc:.4f}")
        print(f"\ntest Accuracy: {acc:.4f}")




train Accuracy: 0.7625

test Accuracy: 0.7000

train Accuracy: 0.7542

test Accuracy: 0.7333

train Accuracy: 0.8000

test Accuracy: 0.6500

train Accuracy: 0.7625

test Accuracy: 0.7667

train Accuracy: 0.7542

test Accuracy: 0.7833
