# Random Forest in Brain Tumor Data

Trying the random forest model on brain tumor mri data. This is an example where $p >> n$, so random forest may not handle that well. Furthermore, standardization is not required and does not make a difference in random forest

## Import Libraries

In [56]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import os
from tqdm import tqdm
from tensorflow.keras.applications import EfficientNetB0
from keras import backend as K
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split

## Preprocess Data

Shuffle the test and training dataset because there is an imbalance in quality for the glioma class particularly

In [76]:
labels = ['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor']
X = []
y = []
image_size = 150
for i in labels:
    folderPath = os.path.join('/kaggle/input/brain-tumor-classification-mri','Training',i)
    for j in tqdm(os.listdir(folderPath)):
        img = cv2.imread(os.path.join(folderPath,j), cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img,(image_size, image_size))
        img = tf.cast(img, tf.float32)
        X.append(img)
        y.append(i)
for i in labels:
    folderPath = os.path.join('/kaggle/input/brain-tumor-classification-mri','Testing',i)
    for j in tqdm(os.listdir(folderPath)):
        img = cv2.imread(os.path.join(folderPath,j), cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img,(image_size, image_size))
        img = tf.cast(img, tf.float32)
        X.append(img)
        y.append(i)
        
X = np.array(X)
y = np.array(y)

X = X.reshape(X.shape[0], X.shape[1]*X.shape[2])

X.shape, y.shape

100%|██████████| 826/826 [00:02<00:00, 409.31it/s]
100%|██████████| 822/822 [00:01<00:00, 432.24it/s]
100%|██████████| 395/395 [00:00<00:00, 434.77it/s]
100%|██████████| 827/827 [00:02<00:00, 369.24it/s]
100%|██████████| 100/100 [00:00<00:00, 382.82it/s]
100%|██████████| 115/115 [00:00<00:00, 432.56it/s]
100%|██████████| 105/105 [00:00<00:00, 505.44it/s]
100%|██████████| 74/74 [00:00<00:00, 249.72it/s]


((3264, 22500), (3264,))

## Fit Model

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [78]:
y_train.shape

(2611,)

In [79]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [80]:
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8637059724349158


## Cross Validation

In [81]:
param_dist = {'n_estimators': list(range(5,21)),
              'max_depth': list(range(1,21))}

rf = RandomForestClassifier()
rand_search = RandomizedSearchCV(rf, 
                                 param_distributions = param_dist, 
                                 n_iter=5,
                                 cv=5)
rand_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=5,
                   param_distributions={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12, 13, 14, 15,
                                                      16, 17, 18, 19, 20],
                                        'n_estimators': [5, 6, 7, 8, 9, 10, 11,
                                                         12, 13, 14, 15, 16, 17,
                                                         18, 19, 20]})

In [82]:
rand_search.best_estimator_

RandomForestClassifier(max_depth=16, n_estimators=17)

In [83]:
rand_search.best_score_

0.8644212947700783