## Brain-MDD project

In [3]:
import os
import numpy as np
import joblib
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import dotenv
import warnings

# Pull FEATURES_DIR_PATH (and any other settings) from a local .env file.
dotenv.load_dotenv()
# NOTE(review): this silences *all* warnings for the rest of the session,
# including scikit-learn convergence warnings raised during model tuning.
warnings.filterwarnings("ignore")
FEATURES_DIR_PATH = os.getenv("FEATURES_DIR_PATH")
if not FEATURES_DIR_PATH:
    # Fail early with an actionable message instead of a TypeError from
    # `None + "/cd"` (unset) or a silent load from "/cd/..." (empty string).
    raise RuntimeError(
        "FEATURES_DIR_PATH is not set; define it in the environment or in a .env file."
    )


def _load_features(feature_name, group):
    """Load the array stored at FEATURES_DIR_PATH/<feature_name>/<group>."""
    return np.load(os.path.join(FEATURES_DIR_PATH, feature_name, group))


# One array per (feature family, group); the first axis indexes the samples.
controlCdFeatures = _load_features("cd", "control")
mddCdFeatures = _load_features("cd", "mdd")
controlHfdFeatures = _load_features("hfd", "control")
mddHfdFeatures = _load_features("hfd", "mdd")
controlCohFeatures = _load_features("coh", "control")
mddCohFeatures = _load_features("coh", "mdd")
controlPhaseFeatures = _load_features("phase", "control")
mddPhaseFeatures = _load_features("phase", "mdd")
controlPsdFeatures = _load_features("psd", "control")
mddPsdFeatures = _load_features("psd", "mdd")

# Sanity check: print each array's shape, control then MDD for every family.
for _features in (
    controlCdFeatures,
    mddCdFeatures,
    controlHfdFeatures,
    mddHfdFeatures,
    controlCohFeatures,
    mddCohFeatures,
    controlPhaseFeatures,
    mddPhaseFeatures,
    controlPsdFeatures,
    mddPsdFeatures,
):
    print(_features.shape)


(1753, 16, 1250)
(1328, 16, 1250)
(1753, 16)
(1328, 16)
(1753, 5, 120)
(1328, 5, 120)
(1753, 16, 1250)
(1328, 16, 1250)
(1753, 5, 16)
(1328, 5, 16)


In [4]:
def flatten_and_concat(features_list):
    """Merge several per-sample feature arrays into one 2-D matrix.

    Each array in ``features_list`` is reshaped to ``(n_samples, -1)``,
    keeping its first axis and collapsing all remaining axes, and the
    results are joined side by side along the column axis.

    Args:
        features_list: Sequence of NumPy arrays that share the same first
            dimension.

    Returns:
        A 2-D array with one row per sample and the flattened features of
        every input array as columns, in the order given.
    """
    per_sample_rows = []
    for block in features_list:
        n_samples = block.shape[0]
        per_sample_rows.append(block.reshape(n_samples, -1))
    return np.concatenate(per_sample_rows, axis=1)


def save_model(model, folder_name, model_name, batch_size, feature_selection_method):
    """Dump ``model`` to disk with joblib and report where it was written.

    The file is stored as
    ``models/<folder_name>/<feature_selection_method>/batch_<batch_size>/<model_name>.joblib``;
    missing directories are created on the way.

    Args:
        model: Object to serialize.
        folder_name: Sub-folder placed directly under ``models``.
        model_name: File name without the ``.joblib`` extension.
        batch_size: Value used in the ``batch_<batch_size>`` directory name.
        feature_selection_method: Value used as the directory name between
            ``folder_name`` and the batch directory.
    """
    target_dir = os.path.join(
        "models",
        folder_name,
        f"{feature_selection_method}",
        f"batch_{batch_size}",
    )
    # exist_ok=True makes repeated saves into the same directory safe.
    os.makedirs(target_dir, exist_ok=True)

    destination = os.path.join(target_dir, f"{model_name}.joblib")
    joblib.dump(model, destination)
    print(f"Model saved: {destination}")

In [None]:
# Flatten every feature family to 2-D and join them into one matrix per group.
controlFeatures = flatten_and_concat([
    controlCdFeatures, controlHfdFeatures, controlCohFeatures, controlPhaseFeatures, controlPsdFeatures
])

mddFeatures = flatten_and_concat([
    mddCdFeatures, mddHfdFeatures, mddCohFeatures, mddPhaseFeatures, mddPsdFeatures
])

# Stack both groups into one dataset; label 0 = control, 1 = MDD.
X = np.concatenate((controlFeatures, mddFeatures), axis=0)
y = np.concatenate((np.zeros(len(controlFeatures)), np.ones(len(mddFeatures))), axis=0)

# Candidate classifiers, each paired with the hyper-parameter grid to search.
models = {
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5, 7, 9], "weights": ["uniform", "distance"]}),
    "SVM": (SVC(), {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}),
    "LDA": (LinearDiscriminantAnalysis(), {"solver": ["svd", "lsqr", "eigen"]}),
    "Decision Tree": (DecisionTreeClassifier(), {"max_depth": [3, 5, 10], "criterion": ["gini", "entropy"]}),
    "Random Forest": (RandomForestClassifier(), {"n_estimators": [50, 100, 200], "max_depth": [5, 10]}),
    "Logistic Regression": (LogisticRegression(), {"C": [0.1, 1, 10], "solver": ["lbfgs", "liblinear"]}),
}

# NOTE(review): batch_size is only used to label the records and the output
# directories; none of the estimators or the split below consume it, so every
# batch iteration repeats the same experiment (up to estimator randomness).
batch_sizes = [16, 32, 64, 128]

feature_selectors = {
    "NoFeatureSelection": None,
    "PCA": PCA(n_components=0.95),  # Keeps 95% of variance
    "SelectKBest": SelectKBest(score_func=f_classif, k=50),  # Selects top 50 features
    "VarianceThreshold": VarianceThreshold(threshold=0.01)  # Removes low variance features
}
performance_records = []

# The stratified split uses a fixed random_state and does not depend on the
# batch size, so it is computed once instead of once per batch iteration.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

for batch_size in batch_sizes:
    print(f"\n=== Processing Batch Size: {batch_size} ===\n")

    for feature_selection_method, selector in feature_selectors.items():
        print(f"\nApplying Feature Selection: {feature_selection_method}")
        if selector is None:
            X_train_transformed, X_test_transformed = X_train, X_test
        else:
            # Fit on the training split only, then apply to both splits.
            # NOTE(review): the selector is fit on the whole training split
            # before cross-validation, so the CV folds have already seen the
            # validation rows (and, for SelectKBest, their labels); CV accuracy
            # is therefore optimistic. Wrapping selector + model in a Pipeline
            # passed to GridSearchCV would remove that leakage.
            selector.fit(X_train, y_train)
            X_train_transformed = selector.transform(X_train)
            X_test_transformed = selector.transform(X_test)
            # Persist the fitted transformer next to the models: they were
            # trained on transformed features and cannot be applied to raw
            # feature vectors without it.
            save_model(selector, "Models", "FeatureSelector", batch_size, feature_selection_method)

        for name, (model, param_grid) in tqdm(models.items(), desc=f"Training Models ({feature_selection_method}, Batch {batch_size})"):
            print(f"🔍 Tuning {name}...")
            grid_search = GridSearchCV(model, param_grid, cv=10, scoring="accuracy", n_jobs=-1)
            grid_search.fit(X_train_transformed, y_train)
            # best_estimator_ is refit on the full training split (refit=True default).
            best_model = grid_search.best_estimator_
            best_params = grid_search.best_params_
            # best_score_ is already the mean 10-fold CV accuracy of the winning
            # parameters on this data; re-running cross_val_score with the same
            # cv/scoring would only repeat those ten fits.
            mean_cv_score = grid_search.best_score_
            y_pred = best_model.predict(X_test_transformed)
            test_accuracy = accuracy_score(y_test, y_pred)
            performance_records.append({
                "Model": name,
                "Batch Size": batch_size,
                "Feature Selection": feature_selection_method,
                "CV Accuracy": mean_cv_score,
                "Test Accuracy": test_accuracy,
                "Best Parameters": best_params
            })
            save_model(best_model, "Models", name, batch_size, feature_selection_method)

# One row per (batch size, feature selection, model) combination.
df_performance = pd.DataFrame(performance_records)
os.makedirs("models", exist_ok=True)
df_performance.to_csv("models/model_performance.csv", index=False)

print("\nModel training and evaluation completed. Results saved in 'models/model_performance.csv'.")


In [None]:
# def flatten_and_concat(features_list):
#     flattened = [feat.reshape(feat.shape[0], -1) for feat in features_list]
#     return np.concatenate(flattened, axis=1)


# def save_model(model, folder_name, model_name):
#     folder_path = os.path.join("models", folder_name)
#     os.makedirs(folder_path, exist_ok=True)
#     model_path = os.path.join(folder_path, f"{model_name}.pkl")
#     with open(model_path, "wb") as file:
#         pickle.dump(model, file)
#     print(f"Model saved: {model_path}")

# controlFeatures = flatten_and_concat([
#     controlCdFeatures, controlHfdFeatures, controlCohFeatures, controlPhaseFeatures, controlPsdFeatures
# ])

# mddFeatures = flatten_and_concat([
#     mddCdFeatures, mddHfdFeatures, mddCohFeatures, mddPhaseFeatures, mddPsdFeatures
# ])

# X = np.concatenate((controlFeatures, mddFeatures), axis=0)
# y = np.concatenate((np.zeros(len(controlFeatures)), np.ones(len(mddFeatures))), axis=0)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# models = {
#     "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5, 7, 9], "weights": ["uniform", "distance"]}),
#     "SVM": (SVC(), {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]}),
#     "LDA": (LinearDiscriminantAnalysis(), {"solver": ["svd", "lsqr", "eigen"]}),
#     "Decision Tree": (DecisionTreeClassifier(), {"max_depth": [3, 5, 10], "criterion": ["gini", "entropy"]}),
#     "Random Forest": (RandomForestClassifier(), {"n_estimators": [50, 100, 200], "max_depth": [5, 10]}),
#     "Logistic Regression": (LogisticRegression(), {"C": [0.1, 1, 10], "solver": ["lbfgs", "liblinear"]}),
# }

# for use_feature_selection in [False, True]:
#     folder_name = "WithFeatureSelection" if use_feature_selection else "WithoutFeatureSelection"

#     if use_feature_selection:
#         print("🔍 Applying Feature Selection...")
#         selector = PCA(n_components=0.95)  # Keeps 95% of variance
#         X_train_transformed = selector.fit_transform(X_train, y_train)
#         X_test_transformed = selector.transform(X_test)
#     else:
#         X_train_transformed = X_train
#         X_test_transformed = X_test

#     best_models = {}

#     for name, (model, param_grid) in tqdm(models.items(), desc=f"Training Models ({folder_name})"):
#         print(f"🔍 Tuning {name}...")

#         grid_search = GridSearchCV(model, param_grid, cv=10, scoring="accuracy", n_jobs=-1)
#         grid_search.fit(X_train_transformed, y_train)

#         best_model = grid_search.best_estimator_
#         best_params = grid_search.best_params_

#         cv_scores = cross_val_score(best_model, X_train_transformed, y_train, cv=10, scoring="accuracy")
#         mean_cv_score = np.mean(cv_scores)

#         y_pred = best_model.predict(X_test_transformed)
#         test_accuracy = accuracy_score(y_test, y_pred)

#         best_models[name] = (best_model, test_accuracy, mean_cv_score, best_params)

#         print(f"Best {name} Parameters: {best_params}")
#         print(f"{name} CV Accuracy: {mean_cv_score:.4f}")
#         print(f"{name} Test Accuracy: {test_accuracy:.4f}\n")

#         save_model(best_model, folder_name, name)

#     print(f"\n=== Best Models Summary ({folder_name}) ===")
#     for name, (model, test_accuracy, cv_score, params) in best_models.items():
#         print(f"{name}: CV Accuracy={cv_score:.4f}, Test Accuracy={test_accuracy:.4f}, Best Params={params}")


Training Models (WithoutFeatureSelection):   0%|          | 0/6 [00:00<?, ?it/s]

🔍 Tuning KNN...
Best KNN Parameters: {'n_neighbors': 5, 'weights': 'distance'}
KNN CV Accuracy: 0.6035
KNN Test Accuracy: 0.5867



Training Models (WithoutFeatureSelection):  17%|█▋        | 1/6 [00:30<02:34, 30.89s/it]

Model saved: models/WithoutFeatureSelection/KNN.pkl
🔍 Tuning SVM...
Best SVM Parameters: {'C': 1, 'kernel': 'rbf'}
SVM CV Accuracy: 0.5816
SVM Test Accuracy: 0.6288



Training Models (WithoutFeatureSelection):  33%|███▎      | 2/6 [3:11:57<7:31:18, 6769.58s/it]

Model saved: models/WithoutFeatureSelection/SVM.pkl
🔍 Tuning LDA...
