In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
from sklearn.model_selection import StratifiedKFold
import optuna
import random
import numpy as np

In [2]:
# Read the training table and the separate test table.
# Both paths are relative to the notebook's working directory.
df = pd.read_csv(
    'balanced_cleaned_train_dataset.csv'
)
test_set = pd.read_csv(
    'testset.csv'
)

In [3]:
# Separate the 'smoking' target column from the feature columns,
# first for the training table and then for the test table.
X, y = df.drop(columns='smoking'), df['smoking']

X_test_new, y_test_new = test_set.drop(columns='smoking'), test_set['smoking']

In [4]:
def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (default generator plus all CUDA devices), and switches cuDNN to
    deterministic mode with auto-tuning (benchmark) disabled.

    Args:
        seed: Integer seed shared by all generators. Defaults to 42.
    """
    # The cuDNN flags are independent of the seeding calls below.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)



In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


def _build_mlp(input_dim, hidden1, hidden2):
    """Build the binary classifier: two ReLU hidden layers and a sigmoid output.

    Args:
        input_dim: Number of input features.
        hidden1: Width of the first hidden layer.
        hidden2: Width of the second hidden layer.

    Returns:
        An ``nn.Sequential`` whose single output unit passes through a sigmoid.
    """
    return nn.Sequential(
        nn.Linear(input_dim, hidden1),
        nn.ReLU(),
        nn.Linear(hidden1, hidden2),
        nn.ReLU(),
        nn.Linear(hidden2, 1),
        nn.Sigmoid(),
    )


def _threshold_accuracy(model, features, targets):
    """Return the accuracy of ``model`` on ``features`` using a 0.5 cut-off.

    Args:
        model: Network whose outputs are compared against 0.5.
        features: Float tensor of inputs.
        targets: Float tensor of 0/1 labels.

    Returns:
        The value returned by ``accuracy_score`` for the thresholded predictions.
    """
    with torch.no_grad():
        predicted = (model(features) > 0.5).float()
    # Hand scikit-learn plain 1-D NumPy arrays instead of torch tensors.
    return accuracy_score(
        targets.cpu().numpy().ravel(),
        predicted.cpu().numpy().ravel(),
    )


def objective(trial):
    """Optuna objective: mean validation accuracy over the ``skf`` folds.

    For each fold a fresh, identically seeded MLP is trained with full-batch
    Adam on the standardised training fold and scored on the validation fold.

    Args:
        trial: Optuna trial used to sample ``hidden1`` (32-128), ``hidden2``
            (16-64), ``lr`` (1e-4 to 1e-2, log scale) and ``epochs`` (20-60).

    Returns:
        Mean validation accuracy across the folds. The mean absolute gap
        between training and validation accuracy is stored on the trial as the
        user attribute ``mean_overfit_gap``; it does not affect the returned
        value.
    """
    hidden1 = trial.suggest_int('hidden1', 32, 128)
    hidden2 = trial.suggest_int('hidden2', 16, 64)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    epochs = trial.suggest_int('epochs', 20, 60)

    scores = []
    overfit_gaps = []

    for train_idx, val_idx in skf.split(X, y):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

        # Fit the scaler on the training fold only; the validation fold is
        # transformed with those statistics.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_fold)
        X_val_scaled = scaler.transform(X_val_fold)

        X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
        X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
        y_train_tensor = torch.tensor(y_train_fold.values, dtype=torch.float32).view(-1, 1)
        y_val_tensor = torch.tensor(y_val_fold.values, dtype=torch.float32).view(-1, 1)

        # Re-seed before building the model so every fold and trial starts
        # from the same random state.
        set_seed(42)
        model = _build_mlp(X_train_tensor.shape[1], hidden1, hidden2)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)

        # One full-batch gradient step per epoch.
        model.train()
        for _ in range(epochs):
            optimizer.zero_grad()
            loss = criterion(model(X_train_tensor), y_train_tensor)
            loss.backward()
            optimizer.step()

        model.eval()
        acc = _threshold_accuracy(model, X_val_tensor, y_val_tensor)
        acc_train = _threshold_accuracy(model, X_train_tensor, y_train_tensor)

        scores.append(acc)
        overfit_gaps.append(abs(acc_train - acc))

    # Keep the train/validation gap inspectable without changing what is optimised.
    trial.set_user_attr('mean_overfit_gap', float(np.mean(overfit_gaps)))
    return np.mean(scores)


# Run 20 trials of the objective with a TPE sampler constructed with a fixed seed,
# maximising the objective's return value, then show the winning hyperparameters.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    sampler=sampler,
    direction="maximize",
)
study.optimize(
    objective,
    n_trials=20,
)

print("best_params:", study.best_params)


[I 2025-08-14 11:21:22,076] A new study created in memory with name: no-name-04046b15-ed03-413f-ab5a-ca13f10805b3
[I 2025-08-14 11:21:23,394] Trial 0 finished with value: 0.8262499999999999 and parameters: {'hidden1': 68, 'hidden2': 62, 'lr': 0.0029106359131330704, 'epochs': 44}. Best is trial 0 with value: 0.8262499999999999.
[I 2025-08-14 11:21:24,184] Trial 1 finished with value: 0.739 and parameters: {'hidden1': 47, 'hidden2': 23, 'lr': 0.00013066739238053285, 'epochs': 55}. Best is trial 0 with value: 0.8262499999999999.
[I 2025-08-14 11:21:25,621] Trial 2 finished with value: 0.77815 and parameters: {'hidden1': 90, 'hidden2': 50, 'lr': 0.00010994335574766199, 'epochs': 59}. Best is trial 0 with value: 0.8262499999999999.
[I 2025-08-14 11:21:26,253] Trial 3 finished with value: 0.79195 and parameters: {'hidden1': 112, 'hidden2': 26, 'lr': 0.0002310201887845295, 'epochs': 27}. Best is trial 0 with value: 0.8262499999999999.
[I 2025-08-14 11:21:26,914] Trial 4 finished with value: 0

best_params: {'hidden1': 58, 'hidden2': 56, 'lr': 0.009970682275036459, 'epochs': 54}


In [6]:
# Standardise the features using statistics from the full training table and
# apply the same fitted transform to the test features.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_new_scaler = scaler.transform(X_test_new)

# Feature matrices become float32 tensors.
X_tensor, X_test_new_scaler_tensor = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_scaled, X_test_new_scaler)
)

# Labels become float32 column vectors of shape (n, 1), the shape used with BCELoss below.
y_tensor, y_test_new_tensor = (
    torch.tensor(labels.values, dtype=torch.float32).reshape(-1, 1)
    for labels in (y, y_test_new)
)

class MLP(nn.Module):
    """Feed-forward binary classifier with two ReLU hidden layers and a sigmoid output.

    Args:
        input_dim: Number of input features.
        hidden1: Width of the first hidden layer. Defaults to 58, the value
            reported in ``best_params`` by the Optuna study.
        hidden2: Width of the second hidden layer. Defaults to 56, likewise
            taken from ``best_params``.
    """

    def __init__(self, input_dim, hidden1=58, hidden2=56):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the sigmoid output of the network for input batch ``x``."""
        return self.model(x)
# Re-seed immediately before constructing the model so its initial weights are repeatable.
set_seed(42)
model = MLP(input_dim=X_tensor.shape[1])

criterion = nn.BCELoss()
# Learning rate and epoch count are the values printed as best_params by the Optuna study.
optimizer = optim.Adam(model.parameters(), lr=0.009970682275036459)

epochs = 54
model.train()  # the mode never changes inside the loop, so set it once
for epoch in range(epochs):
    # One full-batch gradient step over the whole training set per epoch.
    optimizer.zero_grad()
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

model.eval()
with torch.no_grad():
    # Only the forward passes need gradient tracking switched off.
    y_pred = model(X_test_new_scaler_tensor)
    y_pred_train = model(X_tensor)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
y_pred_labels = (y_pred > 0.5).float()
y_pred_labels_train = (y_pred_train > 0.5).float()

# Move everything to CPU NumPy arrays before calling scikit-learn, for the
# training predictions as well as the test predictions.
y_true = y_test_new_tensor.cpu().numpy()
y_pred_labels_np = y_pred_labels.cpu().numpy()
y_pred_labels_train_np = y_pred_labels_train.cpu().numpy()

train_acc = accuracy_score(y, y_pred_labels_train_np)
print(f"\ntrain Accuracy: {train_acc:.4f}")

acc = accuracy_score(y_true, y_pred_labels_np)
print(f"\nTest Accuracy:  {acc:.4f}")

Epoch 0, Loss: 0.6930
Epoch 10, Loss: 0.4301
Epoch 20, Loss: 0.4136
Epoch 30, Loss: 0.4067
Epoch 40, Loss: 0.4023
Epoch 50, Loss: 0.3991

train Accuracy: 0.8325

Test Accuracy:  0.8325


In [7]:
# Evaluate the current `model` on the test set (accuracy, precision, recall, F1,
# per-class report) and report its accuracy on the training set.
model.eval()
with torch.no_grad():
    # Only the forward passes need gradient tracking switched off.
    y_pred = model(X_test_new_scaler_tensor)
    y_pred_train = model(X_tensor)

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
y_pred_labels = (y_pred > 0.5).float()
y_pred_labels_train = (y_pred_train > 0.5).float()

# scikit-learn receives CPU NumPy arrays for both the test and the training metrics.
y_true = y_test_new_tensor.cpu().numpy()
y_pred_labels_np = y_pred_labels.cpu().numpy()
y_pred_labels_train_np = y_pred_labels_train.cpu().numpy()

acc = accuracy_score(y_true, y_pred_labels_np)
precision = precision_score(y_true, y_pred_labels_np)
recall = recall_score(y_true, y_pred_labels_np)
f1 = f1_score(y_true, y_pred_labels_np)

print(f"\nTest Accuracy:  {acc:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall:    {recall:.4f}")
print(f"Test F1 Score:  {f1:.4f}")
print("\nClassification Report:\n", classification_report(y_true, y_pred_labels_np, digits=4))

train_acc = accuracy_score(y, y_pred_labels_train_np)
print(f"\ntrain Accuracy: {train_acc:.4f}")


Test Accuracy:  0.8325
Test Precision: 0.7859
Test Recall:    0.9140
Test F1 Score:  0.8451

Classification Report:
               precision    recall  f1-score   support

         0.0     0.8973    0.7510    0.8176      1000
         1.0     0.7859    0.9140    0.8451      1000

    accuracy                         0.8325      2000
   macro avg     0.8416    0.8325    0.8314      2000
weighted avg     0.8416    0.8325    0.8314      2000


train Accuracy: 0.8325


In [8]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


class MLP(nn.Module):
    """Feed-forward binary classifier with two ReLU hidden layers and a sigmoid output.

    Declared once, outside the fold loop, so every fold instantiates the same class.

    Args:
        input_dim: Number of input features.
        hidden1: Width of the first hidden layer (default 58).
        hidden2: Width of the second hidden layer (default 56).
    """

    def __init__(self, input_dim, hidden1=58, hidden2=56):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden1),
            nn.ReLU(),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Linear(hidden2, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Return the sigmoid output of the network for input batch ``x``."""
        return self.model(x)


epochs = 54

# NOTE: this loop rebinds the module-level `scaler` and `model`; once it has run,
# `model` is the last fold's network rather than the one trained on all of X.
for train_idx, val_idx in skf.split(X, y):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # Fit the scaler on the training fold only; the validation fold is
    # transformed with those statistics.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_val_scaled = scaler.transform(X_val_fold)

    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train_fold.values, dtype=torch.float32).view(-1, 1)
    y_val_tensor = torch.tensor(y_val_fold.values, dtype=torch.float32).view(-1, 1)

    # Re-seed before building the model so every fold starts from the same random state.
    set_seed(42)
    model = MLP(X_train_tensor.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.009970682275036459)

    # One full-batch gradient step per epoch.
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        y_pred = model(X_val_tensor)
        y_pred_train = model(X_train_tensor)

    # Threshold at 0.5, then give scikit-learn CPU NumPy arrays rather than tensors.
    y_pred_labels = (y_pred > 0.5).float()
    y_pred_train_labels = (y_pred_train > 0.5).float()
    acc = accuracy_score(y_val_tensor.cpu().numpy(), y_pred_labels.cpu().numpy())
    acc_train = accuracy_score(y_train_tensor.cpu().numpy(), y_pred_train_labels.cpu().numpy())

    # The second figure is measured on this fold's held-out validation split.
    print(f"\ntrain Accuracy: {acc_train:.4f}")
    print(f"\ntest Accuracy: {acc:.4f}")
    print("--"*30)

    
        





train Accuracy: 0.8327

test Accuracy: 0.8260
------------------------------------------------------------

train Accuracy: 0.8330

test Accuracy: 0.8265
------------------------------------------------------------

train Accuracy: 0.8324

test Accuracy: 0.8305
------------------------------------------------------------

train Accuracy: 0.8316

test Accuracy: 0.8347
------------------------------------------------------------

train Accuracy: 0.8342

test Accuracy: 0.8210
------------------------------------------------------------


In [9]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(8,5))
# plt.plot(train_losses, label='Train Loss')
# plt.plot(test_losses, label='Test Loss')
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.title("Train vs Test Loss Curve")
# plt.legend()
# plt.grid(True)
# plt.show()

In [10]:
# from sklearn.model_selection import KFold
# from sklearn.metrics import accuracy_score
# import torch.nn as nn
# import torch.optim as optim
# import torch
# import numpy as np

# kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# fold_accuracies = []

# for fold, (train_idx, test_idx) in enumerate(kfold.split(X_scaled)):
#     print(f"\n=== Fold {fold + 1} ===")

    
#     X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
#     y_train, y_test = y.values[train_idx], y.values[test_idx]

    
#     X_train = torch.tensor(X_train, dtype=torch.float32)
#     X_test = torch.tensor(X_test, dtype=torch.float32)
#     y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
#     y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

    
#     model = MLP(input_dim=X_train.shape[1])
#     criterion = nn.BCELoss()
#     optimizer = optim.Adam(model.parameters(), lr=0.001)

    
#     epochs = 300
#     for epoch in range(epochs):
#         model.train()
#         optimizer.zero_grad()
#         outputs = model(X_train)
#         loss = criterion(outputs, y_train)
#         loss.backward()
#         optimizer.step()

    
#     model.eval()
#     with torch.no_grad():
#         y_pred = model(X_test)
#         y_pred_labels = (y_pred > 0.5).float()
#         acc = accuracy_score(y_test, y_pred_labels)
#         print(f"Fold {fold + 1} Accuracy: {acc:.4f}")
#         fold_accuracies.append(acc)


# print("\n=== Cross-Validation Summary ===")
# print(f"All Accuracies: {fold_accuracies}")
# print(f"Average Accuracy: {np.mean(fold_accuracies):.4f}")