In [1]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np
from medmnist import PneumoniaMNIST

In [2]:
# Build the three PneumoniaMNIST splits in a fixed order (train, val, test).
# download=True is passed for every split, exactly as before; the generator is
# consumed left to right by the tuple unpacking, so the construction order is
# unchanged.
datasetA_train, datasetA_Validation, datasetA_Test = (
    PneumoniaMNIST(split=split_name, download=True)
    for split_name in ("train", "val", "test")
)

Using downloaded and verified file: C:\Users\zhr\.medmnist\pneumoniamnist.npz
Using downloaded and verified file: C:\Users\zhr\.medmnist\pneumoniamnist.npz
Using downloaded and verified file: C:\Users\zhr\.medmnist\pneumoniamnist.npz


In [3]:
# Pull the raw image and label arrays out of each split object.
train_images_x, train_labels_y = datasetA_train.imgs, datasetA_train.labels
valid_images_x, valid_labels_y = datasetA_Validation.imgs, datasetA_Validation.labels
test_images_x, test_labels_y = datasetA_Test.imgs, datasetA_Test.labels

# Number of values in one training image; reused as the flattened feature
# width for all three splits.
size = train_images_x[0].size

# Flatten every image into a single row so each split becomes a 2-D
# (n_samples, size) feature matrix.
X_train = train_images_x.reshape((len(train_images_x), size))
X_val = valid_images_x.reshape((len(valid_images_x), size))
X_test = test_images_x.reshape((len(test_images_x), size))

# Collapse the label arrays to 1-D vectors.
y_train = train_labels_y.reshape(-1)
y_val = valid_labels_y.reshape(-1)
y_test = test_labels_y.reshape(-1)

In [4]:
# Base estimator: fixed seed so repeated runs grow the same trees.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)

# Search space: 5 depths x 6 leaf sizes x 6 forest sizes = 180 candidates.
params = dict(
    max_depth=[None, 2, 5, 10, 20],
    min_samples_leaf=[1, 5, 10, 20, 50, 100],
    n_estimators=[10, 50, 100, 150, 170, 200],
)

# Exhaustive grid search, scored by accuracy with 3-fold cross-validation.
# NOTE(review): both the forest and the search request n_jobs=-1; this nested
# parallelism may oversubscribe CPU cores -- consider parallelising only one
# level if the search is slow.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=params,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best score of GridSearchCV: ", grid_search.best_score_)
print("Best Estimator by GridSearchCV: ", grid_search.best_estimator_)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
Best score of GridSearchCV:  0.9475371414575662
Best Estimator by GridSearchCV:  RandomForestClassifier(max_depth=10, n_estimators=170, n_jobs=-1,
                       random_state=42)


In [5]:
# Take the winning configuration from the grid search and score it again with
# a shuffled, seeded 5-fold split of the training data.
# NOTE(review): the hyperparameters were selected on this same X_train/y_train,
# so this estimate is likely optimistic -- treat it as a sanity check.
rf_best = grid_search.best_estimator_
kf = KFold(n_splits=5, random_state=42, shuffle=True)

cv_scores = cross_val_score(
    rf_best,
    X_train,
    y_train,
    scoring='accuracy',
    cv=kf,
)

print("Each 5-fold's accuracy: ", cv_scores)
print(f'Average 5-Fold CV Accuracy: {np.mean(cv_scores)}')

Each 5-fold's accuracy:  [0.94798301 0.93312102 0.9522293  0.95536663 0.95430393]
Average 5-Fold CV Accuracy: 0.9486007793127879


In [6]:
# Final hold-out evaluation of the tuned forest on the validation and test
# splits.
#
# rf_best is GridSearchCV's best_estimator_, which the search has already
# refit on the full (X_train, y_train) because `refit` defaults to True.
# Fitting it again here with the same data, hyperparameters and random_state
# should only rebuild the same forest, so the redundant training pass is
# skipped and the already-fitted model is used directly.
clf = rf_best

# Accuracy on the validation split (not used for fitting or tuning above).
validation_pred = clf.predict(X_val)
validation_accuracy = accuracy_score(y_val, validation_pred)

# Accuracy on the untouched test split.
test_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)

print("The final Validation Accuracy: ", validation_accuracy)
print("The final Test Accuracy: ", test_accuracy)

The final Validation Accuracy:  0.9599236641221374
The final Test Accuracy:  0.8573717948717948
