In [1]:
# Import necessary libraries
import os
import sys
import joblib
import numpy as np
import pandas as pd
from joblib import dump
import subprocess as sp
import dask.array as da
from pprint import pprint
import matplotlib.pyplot as plt
from odc.io.cgroups import get_cpu_quota
from sklearn.metrics import roc_curve, auc, balanced_accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, ShuffleSplit, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

In [2]:
# Estimator class used for every model built in this notebook
Classifier = RandomForestClassifier

# Scoring name handed to GridSearchCV when ranking parameter combinations
metric = 'balanced_accuracy'

# Path of the text file holding the training table
training_data = "S_training_data(1).txt"

In [3]:
# Cross-validation configuration:
#   inner_cv_splits / outer_cv_splits -> number of folds for the inner and outer loops
#   test_size                         -> held-out fraction used by the shuffle split
inner_cv_splits = outer_cv_splits = 5
test_size = 0.20

In [4]:
# Get the number of CPUs available.
# get_cpu_quota() may return None when no cgroup CPU quota applies, so fall
# back to the host's logical CPU count instead of crashing in round(None).
# A fractional quota below 0.5 would round to 0, hence the floor of 1.
cpu_quota = get_cpu_quota()
if cpu_quota:
    ncpus = max(1, round(cpu_quota))
else:
    ncpus = os.cpu_count() or 1
print('ncpus = ' + str(ncpus))

# Manual override: the rest of the notebook runs with a fixed worker count,
# regardless of the value detected (and printed) above.
ncpus = 4


In [5]:
# Load the numeric table. np.loadtxt ignores lines starting with '#', so the
# header line is presumably a '#'-prefixed comment -- TODO confirm file format.
model_input = np.loadtxt(training_data)

# Read the header line separately to recover the column names
with open(training_data, 'r') as file:
    header = file.readline()

# Drop the first whitespace-separated token of the header (assumed to be the
# comment marker); the remaining tokens are the column names.
column_names = header.split()[1:]

# Feature columns are every column after the first one. Use positions directly:
# looking names up with list.index() returns the FIRST match, which silently
# selects the wrong column whenever two header names are identical.
model_col_indices = list(range(1, len(column_names)))

# Convert into scikit-learn nomenclature: X = features, y = labels (column 0)
X = model_input[:, model_col_indices]
y = model_input[:, 0]

In [6]:
# Parameter grid for GridSearchCV
param_grid = {
    'class_weight': ['balanced'],
    'n_estimators': [250],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10],
    'max_features': ['sqrt', 'log2']
}

In [7]:
# Nested cross-validation: the outer loop holds out a test fold, the inner
# GridSearchCV tunes hyper-parameters on the remaining data only.
outer_cv = KFold(n_splits=outer_cv_splits, shuffle=True, random_state=0)

# Lists to store results of CV testing
acc = []      # balanced accuracy, one value per outer fold
f1 = []       # weighted F1, one value per outer fold
roc_auc = []  # one-vs-rest ROC AUC, one value per (outer fold, class)

# The label set does not change between folds, so compute it once
classes = np.unique(y)

# `fold` is a dedicated counter: the per-class loop below uses its own index
# so the progress message always reports the real outer fold number.
for fold, (train_index, test_index) in enumerate(outer_cv.split(X, y), start=1):
    print(f"Working on {fold}/{outer_cv_splits} outer cv split", end='\r')
    model = Classifier(random_state=1)

    # Index training and testing data for this outer fold
    X_tr, X_tt = X[train_index, :], X[test_index, :]
    y_tr, y_tt = y[train_index], y[test_index]

    # Inner split on data within the outer training fold
    inner_cv = KFold(n_splits=inner_cv_splits, shuffle=True, random_state=0)

    clf = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=metric,
        n_jobs=ncpus,
        refit=True,
        cv=inner_cv,
    )

    clf.fit(X_tr, y_tr)
    # Predict using the best model (refit on the whole outer training fold)
    best_model = clf.best_estimator_
    pred = best_model.predict(X_tt)

    # Evaluate model with multiple metrics
    # ROC AUC, one-vs-rest per class
    y_tt_bin = label_binarize(y_tt, classes=classes)
    probs = best_model.predict_proba(X_tt)

    # With exactly two classes label_binarize returns a single column (the
    # positive class); expand it to two columns so it lines up with `probs`.
    if len(classes) == 2 and y_tt_bin.shape[1] == 1:
        y_tt_bin = np.hstack([1 - y_tt_bin, y_tt_bin])

    for class_idx in range(len(classes)):
        fpr, tpr, _ = roc_curve(y_tt_bin[:, class_idx], probs[:, class_idx])
        auc_ = auc(fpr, tpr)
        roc_auc.append(auc_)

    # Balanced accuracy (reported below under the label "Mean Accuracy")
    ac = balanced_accuracy_score(y_tt, pred)
    acc.append(ac)

    # F1 scores
    f1_ = f1_score(y_tt, pred, average='weighted')  # Use 'weighted' or 'macro' for multiclass
    f1.append(f1_)

# Terminate the '\r' progress line so the summary is not printed over it
print()

print(f"Mean ROC AUC: {np.mean(roc_auc):.4f}")
print(f"Mean Accuracy: {np.mean(acc):.4f}")
print(f"Mean F1 Score: {np.mean(f1):.4f}")

# TODO: plot the mean scores (ROC AUC, accuracy, F1 score) as a graph to
# compare the classification performance of neural networks, RF, ...


Mean ROC AUC: 0.9658 cv split
Mean Accuracy: 0.8263
Mean F1 Score: 0.8266


In [8]:
# Report mean and standard deviation of each metric collected during nested CV.
# Each row: (label, collected scores, decimals used for the mean).
summary_rows = [
    ("balanced accuracy", acc, 2),
    ("F1", f1, 2),
    ("roc_auc", roc_auc, 3),
]

print("=== Nested K-Fold Cross-Validation Scores ===")
for position, (label, scores, mean_digits) in enumerate(summary_rows):
    if position:
        # Separator between metric groups (two line breaks, as print adds one)
        print('\n')
    print("Mean " + label + ": " + str(round(np.mean(scores), mean_digits)))
    print("Std " + label + ": " + str(round(np.std(scores), 2)))
print('=============================================')

=== Nested K-Fold Cross-Validation Scores ===
Mean balanced accuracy: 0.83
Std balanced accuracy: 0.02


Mean F1: 0.83
Std F1: 0.02


Mean roc_auc: 0.966
Std roc_auc: 0.03


In [9]:
# Random train/test splitter used for the final hyper-parameter search:
# `outer_cv_splits` shuffled splits, each holding out `test_size` of the data.
rs = ShuffleSplit(
    n_splits=outer_cv_splits,
    test_size=test_size,
    random_state=0,
)

In [10]:
# Final hyper-parameter search on the full dataset.
# - random_state=1 matches the estimator used in the nested CV and the final
#   model, so the selected parameters are reproducible between runs.
# - The splitter object itself is passed as `cv`; GridSearchCV calls its
#   split() internally, so no one-shot generator has to be created here.
clf = GridSearchCV(Classifier(random_state=1),
                   param_grid,
                   scoring=metric,
                   verbose=1,
                   cv=rs,
                   n_jobs=ncpus)

clf.fit(X, y)

print('\n')
print("The most accurate combination of tested parameters is: ")
pprint(clf.best_params_)
print('\n')
print("The " + metric + " score using these parameters is: ")
print(round(clf.best_score_, 2))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


The most accurate combination of tested parameters is: 
{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'log2',
 'n_estimators': 250}


The balanced_accuracy score using these parameters is: 
0.84


In [11]:
# Build the final estimator from the winning grid-search parameters and
# train it on the complete dataset.
tuned_params = clf.best_params_
new_model = Classifier(random_state=1, n_jobs=ncpus, **tuned_params)
new_model.fit(X, y)

In [12]:
# Persist the fitted model to disk with joblib
model_path = 'S_model(1).joblib'
dump(new_model, model_path)

['S_model(1).joblib']