In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import ElasticNet, RidgeClassifier, Lasso, PassiveAggressiveClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from xgboost import XGBClassifier
from sklearn.svm import LinearSVC, SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from hyperopt import hp
import numpy as np


Load the data from the uploaded files

In [2]:

data = pd.read_csv("..\\Data\\BP_features.csv")
labels = pd.read_csv("..\\Data\\final_labels.csv")
labels = labels.iloc[:, 1]

In [3]:
count_normal = 0
for i in labels:
    count_normal += i
count_normal

128

Splitting the datasets into training and testing sets

In [4]:

# Split ratio = 80:20 -> (441: 111)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.25, random_state=42, shuffle=True)


In [5]:
count_normal = 0
for i in y_train:
    count_normal += i
count_normal

98

Scaling the data

In [6]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Train and evaluate model

In [7]:

def train_evaluate_model(X_train, X_test, y_train, y_test, model, param_grid, model_name):
    
    # Hyperparameter tuning
    
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs = -1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    best_params =  grid_search.best_params_
    best_score =  grid_search.best_score_
    # Predicting
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Evaluation
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    train_f1 = f1_score(y_train, y_train_pred)
    test_f1 = f1_score(y_test, y_test_pred)


    return best_model, train_accuracy, test_accuracy, best_params, best_score, train_f1, test_f1


In [8]:

models = {
    'Naive Bayes': (GaussianNB(), {'var_smoothing': np.logspace(0,-9, num=100)}),

    'Gradient Boosting': (GradientBoostingClassifier(random_state=42),{
        'loss': ['log_loss', 'exponential'],
        'criterion': ['friedman_mse', 'squared_error'],
        'learning_rate': [0.01, 0.1, 0.2, 0.5, 1, 10, 100],
        'n_estimators': [50, 100, 200, 300, 500],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],

    }),

    'K-Nearest Neighbors': (KNeighborsClassifier(),{
        'n_neighbors': [1,3,5,7, 9, 11, 13, 17],
        'leaf_size': [5, 10, 15, 20, 30, 40,  50],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'weights': ['uniform', 'distance'],
        'p': [1, 2, 4]
    }),

    'Logistic Regression': (LogisticRegression(max_iter = 5000, n_jobs=-1, random_state=42), {
        'penalty': ['l1','l2', 'elasticnet'], 
        'C': [0.001,0.01,0.1,1,10,100,1000],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    }),
    # # 'Elastic Net': (ElasticNet(),{}),
    'Ridge': (RidgeClassifier(random_state=42, max_iter=5000),{
        'solver': [ 'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs'],
        'alpha': [0.001, 0.01, 0.05, 0.1, 0.5, 1, 2, 5, 10, 100, 1000],
        'positive': [True, False],
        'fit_intercept': [True, False],
 
    }),
    # 'Lasso': (Lasso(),{}),
    'Extra Trees': (ExtraTreesClassifier(random_state=42, n_jobs=-1),{
        'n_estimators': [100, 150, 200, 250, 300], 
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2']
    }),
    'AdaBoost': (AdaBoostClassifier(random_state=42, algorithm='SAMME'),{
        'n_estimators': [50, 70, 90, 120, 160, 180, 200],
        'learning_rate': [0.001, 0.01, 0.1 , 0.5, 0.8, 1, 1.5, 5, 10, 100],
        'algorithm': ['SAMME', 'SAMME.R']
    }),

    'Passive Aggressive': (PassiveAggressiveClassifier(max_iter=5000, random_state=42, n_jobs=-1), {
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'loss': ['hinge', 'squared_hinge']
    }),
    'Support Vector Classification': (SVC(random_state=42), {
        'C': [0.1, 1, 10, 100, 1000],  
        'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'decision_function_shape': ['ovo', 'ovr']
    }),

    'Decision Trees': (DecisionTreeClassifier(random_state=42), {
        'max_depth': [None, 10, 20, 30],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_features': ['auto', 'sqrt', 'log2'],
        'splitter': ['best', 'random'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],

    }),

    'Random Forest': (RandomForestClassifier(random_state=42, n_jobs=-1), {
        'n_estimators': [100, 150, 200, 250, 300], 
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2']
    }),
    'XGBClassifier': (XGBClassifier(),{
        'n_estimators': [50, 100, 150, 200, 300],  # Number of boosting rounds
        'learning_rate': [0.01, 0.1, 0.3, 0.5, 1.0],  # Learning rate
        # 'base_estimator__max_depth': [1, 2, 3]  # Depth of the base estimator (Decision Tree)

        'alpha': [0, 0.001, 0.1, 1, 10, 100],
        # 'max_depth': [3, 5, 7, 9],
        # 'learning_rate': [0.1, 0.01, 0.001, 0.2, 0.5, 0.9],
        # 'subsample': [0.6, 0.8, 1],
        # 'learning_rate': [0.01, 0.1, 0.2],
        # 'n_estimators': [100, 200, 300],
        # 'max_depth': [3, 5, 7],
        # 'min_child_weight': [1, 3, 5],
        'gamma': [0, 0.001, 0.1, 1, 10, 100],
        'lambda': [0, 0.001, 0.1, 1, 10, 100]
        # 'subsample': [0.8, 1.0],
        # 'colsample_bytree': [0.8, 1.0]
    }),

}

# {'roc_auc_ovr', 'neg_log_loss', 'neg_median_absolute_error', 'neg_root_mean_squared_log_error', 
# 'recall_samples', 'recall_micro', 'positive_likelihood_ratio', 'normalized_mutual_info_score', 'f1_samples', 
# \'neg_mean_poisson_deviance', 'explained_variance', 'max_error', 'r2', 'v_measure_score', 'accuracy', 
# 'jaccard_micro', 'average_precision', 'jaccard_macro', 'f1_weighted', 'neg_brier_score', 'rand_score', 
# 'completeness_score', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'fowlkes_mallows_score', 
# 'roc_auc', 'adjusted_mutual_info_score', 'homogeneity_score', 'jaccard', 'precision_weighted', 
# \'recall_weighted', 'roc_auc_ovo', 'neg_mean_absolute_percentage_error', 'precision_macro', 
# 'roc_auc_ovr_weighted', 'jaccard_samples', 'precision', 'top_k_accuracy', 'd2_absolute_error_score', 
# 'matthews_corrcoef', 'roc_auc_ovo_weighted', 'neg_mean_absolute_error', 'f1_micro', 'jaccard_weighted', 
# 'neg_negative_likelihood_ratio', 'recall_macro', 'balanced_accuracy', 'f1',
# 'neg_mean_gamma_deviance', 'mutual_info_score', 'recall', 'neg_root_mean_squared_error', 'f1_macro', 'adjusted_rand_score', 'precision_micro', 'precision_samples'}


In [9]:

best_models = {}


In [10]:

for model_name, (model, param_grid) in models.items():
    best_model, train_accuracy, test_accuracy, best_params, best_score, train_f1, test_f1 = train_evaluate_model(X_train, X_test, y_train, y_test, model, param_grid, model_name)
    best_models[model_name] = best_model
    result = {}
    result[model_name] = [train_accuracy, test_accuracy, train_f1, test_f1, best_score, best_params]
    results = pd.DataFrame(result).T
    results.columns = ['Train Accuracy', 'Test Accuracy', 'Train F1', 'Test F1', 'Best Score', 'Best Params']
    path = model_name + ".json"
    results.T[model_name].to_json(path)


350 fits failed out of a total of 630.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
35 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\shrey\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\shrey\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\shrey\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(s

In [11]:
# results = pd.DataFrame(result).T
# results.columns = ['Train Accuracy', 'Test Accuracy', 'Train F1', 'Test F1', 'Best Score', 'Best Params']
# results


In [12]:
for i in models.keys():
    # print(results[i])
    path = i + ".json"
    results.T[i].to_json(path)


KeyError: 'Naive Bayes'

In [None]:
results['Best Params'].iloc[0]

{'alpha': 0,
 'gamma': 0.001,
 'lambda': 0,
 'learning_rate': 1.0,
 'n_estimators': 150}