In [1]:
# import optuna
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import os

In [4]:
dataset = "heart"
choice = "MLP"

In [18]:
filename = f"{dataset}.csv".format(dataset)
base_dir = r"C:\Users\Rein\Documents\AIyear3\Thesis\HPO\DataSets"
input_dir = os.path.join(base_dir, filename)
df = pd.read_csv(input_dir, sep = ',')
X = df.drop(['target'],axis=1)
X = pd.get_dummies(X, drop_first=True)
y = df['target']
print(X.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  
0   2     3  
1   0     3  
2   0     3  
3   1     3  
4   3     2  


In [8]:
#Scaler
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [11]:
pca = PCA(n_components=0.95)
X = pca.fit_transform(X)

In [19]:

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
#Random Forest Classifier
if choice == "RandomForest":
    def objective(trial):
        n_estimators = trial.suggest_int("n_estimators", 10, 1000)
        max_depth = trial.suggest_int("max_depth", 1, 100)          
        min_samples_split = trial.suggest_float("min_samples_split", 0.0001, 1.0, log=True)
        min_samples_leaf = trial.suggest_float("min_samples_leaf", 0.0001, 0.5, log=True)
        max_features = trial.suggest_categorical("max_features", ['sqrt', 'log2', None])
        bootstrap = trial.suggest_categorical("bootstrap", [True, False])
        criterion = trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"])
        max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 2, 1000)

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            max_features=max_features,
            bootstrap=bootstrap,
            criterion=criterion,
            # class_weight=class_weight,
            max_leaf_nodes=max_leaf_nodes,
            # min_weight_fraction_leaf=min_weight_fraction_leaf,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_valid)
        acc = accuracy_score(y_valid, preds)
        return acc

In [21]:
if choice == "LR":
    def objective(trial):
        C = trial.suggest_float("C", 1e-5, 1e5, log=True)
        penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
        solver = trial.suggest_categorical("solver", ["liblinear", "saga"])
        max_iter = trial.suggest_int("max_iter", 100, 1000)
        tol = trial.suggest_float("tol", 1e-5, 1e-1, log=True)
        class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])

        model = LogisticRegression(
            C=C,
            penalty=penalty,
            solver=solver,
            max_iter=max_iter,
            tol=tol,
            class_weight=class_weight,
            random_state=42
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_valid)
        acc = accuracy_score(y_valid, preds)
        return acc

In [20]:
if choice == "MLP":
    def objective(trial):
        hidden_layer_sizes = trial.suggest_int("hidden_layer_sizes", 1, 100)
        activation = trial.suggest_categorical("activation", ["identity", "logistic", "tanh", "relu"])
        solver = trial.suggest_categorical("solver", ["lbfgs", "sgd", "adam"])
        alpha = trial.suggest_float("alpha", 1e-5, 1e-1, log=True)
        learning_rate = trial.suggest_categorical("learning_rate", ["constant", "invscaling", "adaptive"])
        max_iter = trial.suggest_int("max_iter", 100, 1000)

        model = MLPClassifier(
            hidden_layer_sizes=hidden_layer_sizes,
            activation=activation,
            solver=solver,
            alpha=alpha,
            learning_rate=learning_rate,
            max_iter=max_iter,
            random_state=42
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_valid)
        acc = accuracy_score(y_valid, preds)
        return acc

In [21]:
study = optuna.create_study(sampler= optuna.samplers.RandomSampler(seed=42))
study.optimize(objective, n_trials = 330, n_jobs = -1)

[I 2025-05-13 14:35:49,750] A new study created in memory with name: no-name-f790089d-d1df-4080-8597-ec825bd27a34
[I 2025-05-13 14:35:50,118] Trial 2 finished with value: 0.935 and parameters: {'hidden_layer_sizes': 81, 'activation': 'identity', 'solver': 'lbfgs', 'alpha': 3.590693287291008e-05, 'learning_rate': 'constant', 'max_iter': 318}. Best is trial 2 with value: 0.935.
[I 2025-05-13 14:35:50,292] Trial 10 finished with value: 0.935 and parameters: {'hidden_layer_sizes': 35, 'activation': 'identity', 'solver': 'lbfgs', 'alpha': 6.57406851045178e-05, 'learning_rate': 'invscaling', 'max_iter': 818}. Best is trial 2 with value: 0.935.
[I 2025-05-13 14:35:52,475] Trial 6 finished with value: 0.76 and parameters: {'hidden_layer_sizes': 5, 'activation': 'relu', 'solver': 'sgd', 'alpha': 0.0005635364743665227, 'learning_rate': 'invscaling', 'max_iter': 906}. Best is trial 6 with value: 0.76.
[I 2025-05-13 14:35:53,521] Trial 14 finished with value: 0.935 and parameters: {'hidden_layer_s

In [22]:
base_filename = f'{dataset}_MD_{choice}'.format(dataset,choice)
base_dir = r"C:\Users\Rein\Documents\AIyear3\Thesis\HPO\{}\{}".format(dataset, choice)

os.makedirs(base_dir, exist_ok=True)

counter = 1
while True:
    filename = f"{base_filename}_{counter}.csv"
    filepath = os.path.join(base_dir, filename) 
    if not os.path.exists(filepath):
        break
    counter += 1

df_trials = study.trials_dataframe()
df_trials.drop(columns=['datetime_start', 'datetime_complete', 'duration', 'state'], inplace=True)
df_trials.to_csv(path_or_buf= filepath, index=False, sep=";")