In [271]:
# import packages 
import pandas as pd
import scipy.io
import numpy as np
 

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf 


In [194]:
from tensorflow.keras.datasets import mnist

# load the mnist data 
mnist = tf.keras.datasets.mnist
(X_train, y_train), (X_test,y_test) = mnist.load_data()


# Preprocessing training and test sets 

In [267]:
# split data into training and test sets by firstly converting from 4D to 2D array
X_train = X_train.reshape(60000,28*28)
X_test = X_test.reshape(10000,28*28)


# choose two machine learning algorithms and explain why you chose them 


1. Random Forest Classifier, because it is very robust as we need a classifier that will look at the general patterns of the images with numbers and make aggregate predictions on it. 



2. Bagging Classifier, the mnist data contains images with qualitative labels which will require a method that makes predictions based on majority vote. 

In [219]:
############ 1. Random Forest Classifier ############

For the Random Forest Classifier, the parameter to tune is max_depth since it drastically affects the accuracy score i.e. when increased, the score also drastically increases.

In [314]:
# Random Forest Classifier 

rf = RandomForestClassifier(max_depth=10, n_estimators = 6, random_state = 42)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=10, n_estimators=6, random_state=42)

Use max_depth=10 for testing the test data because it yields higher accuracy score.

In [315]:
# Accuracy score
preds = rf.predict(X_test)
print('Accuracy:',accuracy_score(y_test,preds))

Accuracy: 0.9122


In [316]:
#confusion matrix

conf_mat = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(conf_mat)
print('confusion matrix:')
cm_df

confusion matrix:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,960,0,1,1,0,4,4,1,8,1
1,0,1114,6,3,0,1,4,0,6,1
2,12,1,939,13,9,4,14,15,17,8
3,5,1,18,884,1,36,4,13,34,14
4,3,2,3,2,873,4,14,2,14,65
5,15,5,5,46,8,770,16,5,10,12
6,13,3,3,1,15,16,898,1,7,1
7,1,13,29,4,5,1,0,932,6,37
8,6,3,12,30,14,19,9,11,840,30
9,6,5,7,14,33,4,0,15,13,912


The confusion matrix suggests that the model struggled with predicting the images of label/class 8. By using the f1_score below, will determine which class the model struggled on. 

In [327]:
# average f1 score

av_f1 = f1_score(y_test,preds, average = 'micro')
print('f1_score:',av_f1)

# f1 score per class
f = f1_score(y_test, preds, average=None)
lowest_score = min(f)
hardest_class = cm_df[list(f).index(lowest_score)]
print('Hardest class:', hardest_class)

f1_score: 0.9122000000000001
Hardest class: 0      8
1      6
2     17
3     34
4     14
5     10
6      7
7      6
8    840
9     13
Name: 8, dtype: int64


The f1_score per class/ label suggests that the model struggled with class 8 which can be seen from the confusion matrix. 

In [328]:
# precision and recall

prec = precision_score(y_test, preds, average = 'micro')
rec = recall_score(y_test, preds, average = 'weighted')

print('Precision:', prec)
print('Recall:', rec)


Precision: 0.9122
Recall: 0.9122


In [329]:
############################## 2. Bagging Classifier ####################

In [330]:
bg = BaggingClassifier(DecisionTreeClassifier(),max_samples = 0.5,random_state = 42)
bg.fit(X_train,y_train)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  random_state=42)

Use max_samples = 0.5 for testing data since it takes longer to run the model on full size. 

In [331]:
# Accuracy score
preds = bg.predict(X_test)
print('Accuracy:',accuracy_score(y_test,preds))

Accuracy: 0.9365


In [332]:
#confusion matrix

conf_mat = confusion_matrix(y_test, preds)
cm_df = pd.DataFrame(conf_mat)
print('confusion matrix:')
cm_df

confusion matrix:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,961,0,3,0,2,8,0,3,2,1
1,0,1118,6,4,0,1,5,0,1,0
2,13,5,967,13,7,0,4,13,7,3
3,5,1,24,925,0,27,2,11,13,2
4,5,5,2,4,921,1,9,2,5,28
5,15,3,3,27,6,815,8,3,6,6
6,16,1,7,3,12,11,902,1,5,0
7,1,8,22,13,3,3,0,967,3,8
8,8,4,20,21,8,14,15,5,861,18
9,8,9,6,13,20,7,2,10,6,928


The confusion matrix suggests that the model struggled with predicting the images of label/class 3. By using the f1_score below, will determine which class the model struggled on. 

In [334]:
# average f1 score

av_f1 = f1_score(y_test,preds, average = 'micro')
print('f1_score:',av_f1)

# f1 score per class
f = f1_score(y_test, preds, average=None)
lowest_score = min(f)
hardest_class = cm_df[list(f).index(lowest_score)]
print('Hardest class:', hardest_class)

f1_score: 0.9365
Hardest class: 0      0
1      4
2     13
3    925
4      4
5     27
6      3
7     13
8     21
9     13
Name: 3, dtype: int64


The f1_score per class/ label suggests that the model struggled with class 3 which can be seen from the confusion matrix. 

In [335]:
# precision and recall

prec = precision_score(y_test, preds, average = 'micro')
rec = recall_score(y_test, preds, average = 'weighted')

print('Precision:', prec)
print('Recall:', rec)

Precision: 0.9365
Recall: 0.9365


# Comment on the differences in model performances 

The Bagging Classifier model performed better than the Random Forest Classifier.

The accuracy of bagging model was 2.43% more than that of Random Forest Model. 

The f1_score,precision and recall for bagging model was higher than Random Forest Model.

The bagging model was more accurate than the forest model but also the Forest model predictions were very well on point.

Overall, the Bagging Classsifier did the best job. 