In [3]:
# Importing the required Libraries
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import cv2

# Fetching Dataset

In [4]:
mnist = fetch_openml('mnist_784')

In [5]:
# Extracting Data and target
x, y = np.array(mnist['data']), np.array(mnist['target'])

In [6]:
# Shuffling the dataset
shuffle_index = np.random.permutation(70000)
x, y = x[shuffle_index], y[shuffle_index]

In [7]:
# Train-test split
x_train, x_test = x[:60000], x[60000:]
y_train, y_test = y[:60000].astype(int), y[60000:].astype(int)

# Training the model

In [8]:
# Random Forest
clf1 = RandomForestClassifier()
clf1.fit(x_train, y_train)

RandomForestClassifier()

In [9]:
# KNN
clf2 = KNeighborsClassifier()
clf2.fit(x_train, y_train)

KNeighborsClassifier()

# Testing the model

In [10]:
# Random Forest
y_predicted_RF = clf1.predict(x_test)
print("Random Forest :")
print("Accuracy :", accuracy_score(y_test, y_predicted_RF)*100, "%")
print("F1 Score :", f1_score(y_test, y_predicted_RF, average='macro')*100, "%")

Random Forest :
Accuracy : 97.16 %
F1 Score : 97.1321321216276 %


In [11]:
# KNN
y_predicted_KNN = clf2.predict(x_test)
print("\nKNN :")
print("Accuracy :", accuracy_score(y_test, y_predicted_KNN)*100, "%")
print("F1 Score :", f1_score(y_test, y_predicted_KNN, average='macro')*100, "%")


KNN :
Accuracy : 97.22 %
F1 Score : 97.20303402652786 %


# Confusion Matrix

In [12]:
# Random Forest
print("Random Forest :")
print(confusion_matrix(y_test, y_predicted_RF))

Random Forest :
[[ 964    0    1    0    0    1    1    0    8    0]
 [   0 1150    4    3    2    0    3    2    0    0]
 [   8    1  997    7    3    0    3    6    9    0]
 [   1    1    5  947    0    4    2    6    8    5]
 [   2    1    1    0  932    0    2    5    2   15]
 [   2    3    2    4    2  912    6    1    6    2]
 [   3    1    1    0    1    6  951    0    2    0]
 [   2    3    8    1    2    0    0 1028    1   15]
 [   1    5    6    6    6    4    3    2  924    7]
 [   7    4    1   15    8    2    0    8    3  911]]


In [13]:
# KNN
print("KNN : ")
print(confusion_matrix(y_test, y_predicted_KNN))

KNN : 
[[ 970    1    1    0    0    2    1    0    0    0]
 [   0 1160    1    0    1    0    0    1    0    1]
 [   5   10  995    3    0    0    2   14    5    0]
 [   0    4    3  952    0    7    0    5    3    5]
 [   1    9    1    0  927    0    3    2    0   17]
 [   2    3    0    9    0  913    6    1    3    3]
 [   3    2    0    0    1    4  955    0    0    0]
 [   1    8    3    0    2    0    0 1035    1   10]
 [   6   14    2   13    7   14    4    3  891   10]
 [   4    4    1    6    4    5    0   10    1  924]]


# Cross Validation

In [14]:
# Random Forest
print(cross_val_score(clf1, x_train, y_train, cv=3, scoring="accuracy"))

[0.9651  0.9665  0.96385]


In [15]:
# KNN
print(cross_val_score(clf2, x_train, y_train, cv=3, scoring="accuracy"))

[0.96795 0.9685  0.967  ]


# Testing on Random Image outside Dataset

In [None]:
# taking the path of image for testing as input
path = input("Enter the path of image :")

# reading the image into variable as grayscale
img = cv2.imread(path, cv2.IMREAD_GRAYSCALE) 

# resizing the image to 28 x 28 shape
new_img = cv2.resize(img, (28,28)) 

# converting the image to numpy array for processing
np_img = np.array(new_img) 

# reshaping the numpy array as our model is trained for 1 x 784 shaped arrays
np_img = np_img.reshape((1, 784))

# pre-processing done

In [None]:
# Predicting
print("Random Forest predicts the digit : ",clf1.predict(np_img))
print("KNN predicts the digit : ",clf2.predict(np_img))

In [None]:
# showing the image
plt.imshow(np_img.reshape((28,28)))

In [None]:
# the probabilities for each digit
y_pr1 = clf1.predict_proba(np_img) # Random Forest
y_pr2 = clf2.predict_proba(np_img) # KNN
print("Chances for each digit prediction")
print("Digit \t Random Forest \t KNN")
for i in range(10):
    print(i," -> ", int(y_pr1[0][i]*100), "% \t", int(y_pr2[0][i]*100), "%")