## Importing modules

In [8]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from load_data import load_data, load_resized_data
import matplotlib.pyplot as plt
%matplotlib inline

## Specifying size of images

In [2]:
# Row and column dimensions handed to the image loader.
img_rows, img_cols = 100, 100

## Loading in train and validation datasets

In [3]:
# Load the training and validation splits through the project's load_data
# helper, requesting images at img_rows x img_cols.
# NOTE(review): presumably X_* are image arrays shaped
# (n_samples, img_rows, img_cols, channels) and Y_* the matching labels --
# confirm against load_resized_data.
X_train, X_valid, Y_train, Y_valid = load_resized_data(img_rows, img_cols)

['Parasitized', 'Uninfected']
------------------------------
Creating training images...
------------------------------
Loading done.


## Reshaping (flattening) data for Random Forest

In [6]:
#changes data from shape (22096,100,100,3) to (22096,30000)
new_x_train=X_train.reshape([X_train.shape[0],np.product(X_train.shape[1:4])]) 
#changes data from shape (5512,100,100,3) to (5512,30000)
new_x_test=X_valid.reshape([X_valid.shape[0],np.product(X_valid.shape[1:4])])

## Training with default model

In [9]:
# Baseline: a random forest with default hyperparameters.
# random_state must be given to the constructor; fit() has no such
# parameter, so passing it there raises TypeError.
clf = RandomForestClassifier(random_state=1408)
clf.fit(new_x_train, Y_train)  # train on the flattened training images
preds = clf.predict(new_x_test)  # predict labels for the validation images
print("Accuracy:", accuracy_score(Y_valid, preds))  # validation accuracy

## Plot of the most important features

In [None]:
# Horizontal bar chart of the most important features of the fitted forest.
num_top_values = 15  # number of top features to display
importances = clf.feature_importances_  # one importance value per feature

# Indices of the num_top_values largest importances, most important first.
arr = importances.argsort()[-num_top_values:][::-1]
# Importance values that correspond to those indices.
y = importances[arr]

# Size the figure on creation. The original called plt.figure(figsize=(5,5))
# after plotting, which opened a second, empty figure instead of resizing
# this one.
fig, ax = plt.subplots(figsize=(6.5, 4.5))
width = 0.4  # the width of the bars
ind = np.arange(len(y))  # the y locations of the bars
ax.barh(ind, y, width, color="green")
ax.set_yticks(ind + width / 10)
ax.set_yticklabels(arr, minor=False)  # label each bar with its feature index
ax.set_title('Feature importance in RandomForest Classifier')
ax.set_xlabel('Relative importance')
ax.set_ylabel('feature')

## Hyperparameter sweep for better accuracy

In [None]:
# Settings to sweep; entries at the same position form one configuration.
n_estimator_parameters = [10, 20, 50] * 2  # n_estimators per run
oob_score_parameters = [True] * 3 + [False] * 3  # oob_score per run
accuracy_score_list = []  # validation accuracy of each run, in sweep order

# Fit one forest per (oob_score, n_estimators) pair and record its accuracy.
for use_oob, n_trees in zip(oob_score_parameters, n_estimator_parameters):
    clf = RandomForestClassifier(
        oob_score=use_oob,
        n_estimators=n_trees,
        random_state=1408,
    )
    clf.fit(new_x_train, Y_train)
    preds = clf.predict(new_x_test)
    accuracy_score_list.append(accuracy_score(Y_valid, preds))

## Plot of Hyperparameter sweep

In [None]:
# Bar chart of the validation accuracy reached by each sweep configuration.
# Label format: number = n_estimators, letter = oob_score (T/F).
labels = ["10T", "20T", "50T", "10F", "20F", "50F"]
# NOTE(review): the third bar is blue while the other oob_score=True bars are
# green -- confirm whether this highlight is intentional.
colors = ["g", "g", "b", "r", "r", "r"]
plt.rcParams["font.size"] = 25
plt.rcParams["axes.titlepad"] = 20
fig = plt.figure(figsize=[20, 10])

plt.bar(labels, accuracy_score_list, color=colors, label="Parameters Used: # = n_estimators, letter = if oob_score is True or False")
plt.ylabel("Accuracy", fontsize="large")
plt.title("Parameter sweep for our Random Forest Model")
# plt.axis() has no fontsize keyword (recent Matplotlib raises TypeError for
# it), so set the y-range with ylim and the tick label size with tick_params.
# Presumably smaller tick labels were the intent of fontsize="small".
plt.ylim(0, 1)
plt.tick_params(axis="both", labelsize="small")
plt.legend(loc=1, prop={'size': 20})