In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import ElasticNet, RidgeClassifier, Lasso, PassiveAggressiveClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

from xgboost import XGBClassifier
from sklearn.svm import LinearSVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

Load the data from the uploaded files

In [12]:

data = pd.read_csv("Data\\BP_features.csv")
labels = pd.read_csv("Data\\final_labels.csv")
labels = labels.iloc[:, 1]

In [25]:
count_normal = 0
for i in labels:
    count_normal += i
count_normal

128

Splitting the datasets into training and testing sets

In [13]:

# Split ratio = 80:20 -> (441: 111)
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42, shuffle=True)


Scaling the data

In [14]:
ss_train = StandardScaler()
X_train = ss_train.fit_transform(X_train)
ss_test = StandardScaler()
X_test = ss_test.fit_transform(X_test)

Train and evaluate model

In [15]:

def train_evaluate_model(X_train, X_test, y_train, y_test, model, param_grid, model_name):
    
    # Hyperparameter tuning
    
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy',)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    best_params =  grid_search.best_params_
    best_score =  grid_search.best_score_
    # Predicting
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # Evaluation
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)


    return best_model, train_accuracy, test_accuracy, best_params, best_score


In [16]:

models = {
    'Gradient Boosting': (GradientBoostingClassifier(),{
        
    }),

    'K-Nearest Neighbors': (KNeighborsClassifier(),{
        'n_neighbors': [1,3,5,7, 9, 11],
        'leaf_size': [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'weights': ['uniform', 'distance']
    }),

    'XGBClassifier': (XGBClassifier(),{
        'alpha': [0.001, 0.01, 0.1, 0.20, 0.25, 0.30],
    }),

    'Logistic Regression': (LogisticRegression(), {
        'penalty': ['l1','l2'], 
        'C': [0.001,0.01,0.1,1,10,100,1000]
    }),
    # 'Elastic Net': (ElasticNet(),{}),
    # 'Ridge': (RidgeClassifier(),{}),
    # 'Lasso': (Lasso(),{}),
    'Extra Trees': (ExtraTreesClassifier(),{}),
    'AdaBoost': (AdaBoostClassifier(),{}),

    'Passive Aggressive': (PassiveAggressiveClassifier(max_iter=1000, random_state=42, tol=1e-3), {}),
    'Support Vector Machines': (LinearSVC(), {}),

    'Decision Trees': (DecisionTreeClassifier(), {
        'max_depth' : [i for i in range(2,10)],
        'criterion' :['gini', 'entropy']
    }),

    'Random Forest': (RandomForestClassifier(), {}),
    'Naive Bayes': (GaussianNB(), {}),

}


In [17]:

best_models = {}
result = {}

for model_name, (model, param_grid) in models.items():
    best_model, train_accuracy, test_accuracy, best_params, best_score = train_evaluate_model(X_train, X_test, y_train, y_test, model, param_grid, model_name)
    best_models[model_name] = best_model
    result[model_name] = [train_accuracy, test_accuracy, best_score, best_params]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [18]:
results = pd.DataFrame(result).T
results.columns = ['Train Accuracy', 'Test Accuracy', 'Best Score', 'Best Params']
results

Unnamed: 0,Train Accuracy,Test Accuracy,Best Score,Best Params
Gradient Boosting,0.981859,0.801802,0.830184,{}
K-Nearest Neighbors,1.0,0.801802,0.807303,"{'algorithm': 'auto', 'leaf_size': 5, 'n_neigh..."
XGBClassifier,1.0,0.846847,0.857329,{'alpha': 0.3}
Logistic Regression,0.764172,0.783784,0.764173,"{'C': 0.001, 'penalty': 'l2'}"
Extra Trees,1.0,0.828829,0.807354,{}
AdaBoost,0.904762,0.801802,0.764377,{}
Passive Aggressive,0.761905,0.738739,0.684934,{}
Support Vector Machines,0.854875,0.747748,0.739479,{}
Decision Trees,0.92517,0.81982,0.793744,"{'criterion': 'entropy', 'max_depth': 7}"
Random Forest,1.0,0.801802,0.809678,{}
