<a href="https://colab.research.google.com/github/RamaBharti/Classification-of-Handwritten-Digit-on-MNSIT-dataset/blob/master/mnsit_random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import urllib.request
import gzip
import numpy as np
from scipy.stats import randint
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Base location of the four gzip-compressed MNIST archives used below.
MNIST_BASE_URL = 'http://yann.lecun.com/exdb/mnist/'


def _load_idx(archive_name, header_size):
    """Download one MNIST archive and return its payload as a flat uint8 array.

    Parameters
    ----------
    archive_name : str
        File name of the ``.gz`` archive. It is appended to
        ``MNIST_BASE_URL`` to form the download URL and is also used as the
        local path the archive is saved to.
    header_size : int
        Number of leading bytes of the decompressed file to skip before the
        payload starts (16 for the image files, 8 for the label files).

    Returns
    -------
    numpy.ndarray
        One-dimensional ``uint8`` array holding every byte after the header.
    """
    local_path, _ = urllib.request.urlretrieve(MNIST_BASE_URL + archive_name, archive_name)
    with gzip.open(local_path, 'rb') as f:
        return np.frombuffer(f.read(), np.uint8, offset=header_size)


# Images are flattened to one row of 28*28 = 784 pixel values per sample;
# labels stay as a flat vector with one entry per sample.
X_train = _load_idx('train-images-idx3-ubyte.gz', 16).reshape(-1, 28*28)
y_train = _load_idx('train-labels-idx1-ubyte.gz', 8)
X_test = _load_idx('t10k-images-idx3-ubyte.gz', 16).reshape(-1, 28*28)
y_test = _load_idx('t10k-labels-idx1-ubyte.gz', 8)

# Fail early, with a readable message, if an image file and its label file
# disagree on the number of samples (e.g. a corrupted or mismatched download).
for _split, _images, _labels in (('train', X_train, y_train), ('test', X_test, y_test)):
    if len(_images) != len(_labels):
        raise ValueError(
            "MNIST %s split is inconsistent: %d images but %d labels"
            % (_split, len(_images), len(_labels))
        )


print("_RF on gini_")
# Baseline forest using the Gini criterion; the seed is fixed at 42.
rf = RandomForestClassifier(criterion='gini', random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Metrics on the test split; precision, recall and F1 are macro-averaged.
print("Accuracy_rf:", accuracy_score(y_test, y_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred, average='macro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 5-fold cross-validation of the same estimator, first on the training split
# and then on the test split; each line reports the mean and twice the
# standard deviation of the fold scores.
for cv_template, cv_X, cv_y in (
    ("Accuracy_cross_on_train: %0.2f (+/- %0.2f)", X_train, y_train),
    ("Accuracy_cross_test: %0.2f (+/- %0.2f)", X_test, y_test),
):
    scores = cross_val_score(rf, cv_X, cv_y, cv=5)
    print(cv_template % (scores.mean(), scores.std() * 2))

# Candidate hyper-parameters for the grid search: 4 x 4 x 2 = 32 combinations.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [10, 50, 100, None],
    'max_features': ['sqrt', 'log2'],
}
# Exhaustive search scored with 5-fold cross-validation, i.e. 32 x 5 = 160
# forest fits. The fits are independent of each other, so n_jobs=-1 runs them
# on all available CPU cores instead of one after another; the estimator is
# seeded, so the selected parameters are unaffected.
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters found by grid search:", grid_search.best_params_)
print("Best accuracy score found by grid search:", grid_search.best_score_)

# Evaluate the best candidate found by the search on the test split;
# precision, recall and F1 are macro-averaged.
y_pred = grid_search.predict(X_test)
print("Accuracy_grid:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1-score:", f1_score(y_test, y_pred, average='macro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Candidate values for the randomized search. Every entry is a plain list, so
# the search draws 10 of the 32 possible combinations.
param_dist = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [10, 50, 100, None],
    'max_features': ['sqrt', 'log2'],
}
# Randomized search scored with 5-fold cross-validation.
# random_state=42 pins which candidates are drawn so repeated runs evaluate
# the same 10 combinations (matching the seed used for the forest itself);
# n_jobs=-1 runs the independent fits on all available CPU cores.
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    cv=5,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
print("Best parameters found by randomized search:", random_search.best_params_)
print("Best accuracy score found by randomized search:", random_search.best_score_)

# Evaluate the best sampled candidate on the test split;
# precision, recall and F1 are macro-averaged.
y_pred = random_search.predict(X_test)
print("Accuracy_random:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1-score:", f1_score(y_test, y_pred, average='macro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))



print("_RF on entropy_")
# Baseline forest using the entropy criterion; the seed is fixed at 42.
rf = RandomForestClassifier(criterion='entropy', random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Metrics on the test split; precision, recall and F1 are macro-averaged.
print("Accuracy_rf:", accuracy_score(y_test, y_pred))
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred, average='macro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 5-fold cross-validation of the same estimator, first on the training split
# and then on the test split; each line reports the mean and twice the
# standard deviation of the fold scores.
for cv_template, cv_X, cv_y in (
    ("Accuracy_cross_on_train: %0.2f (+/- %0.2f)", X_train, y_train),
    ("Accuracy_cross_test: %0.2f (+/- %0.2f)", X_test, y_test),
):
    scores = cross_val_score(rf, cv_X, cv_y, cv=5)
    print(cv_template % (scores.mean(), scores.std() * 2))

# Candidate hyper-parameters for the grid search: 4 x 4 x 2 = 32 combinations.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [10, 50, 100, None],
    'max_features': ['sqrt', 'log2'],
}
# Exhaustive search scored with 5-fold cross-validation, i.e. 32 x 5 = 160
# forest fits. The fits are independent of each other, so n_jobs=-1 runs them
# on all available CPU cores instead of one after another; the estimator is
# seeded, so the selected parameters are unaffected.
grid_search = GridSearchCV(rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best parameters found by grid search:", grid_search.best_params_)
print("Best accuracy score found by grid search:", grid_search.best_score_)

# Evaluate the best candidate found by the search on the test split;
# precision, recall and F1 are macro-averaged.
y_pred = grid_search.predict(X_test)
print("Accuracy_grid:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1-score:", f1_score(y_test, y_pred, average='macro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# Candidate values for the randomized search. Every entry is a plain list, so
# the search draws 10 of the 32 possible combinations.
param_dist = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [10, 50, 100, None],
    'max_features': ['sqrt', 'log2'],
}
# Randomized search scored with 5-fold cross-validation.
# random_state=42 pins which candidates are drawn so repeated runs evaluate
# the same 10 combinations (matching the seed used for the forest itself);
# n_jobs=-1 runs the independent fits on all available CPU cores.
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    cv=5,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
print("Best parameters found by randomized search:", random_search.best_params_)
print("Best accuracy score found by randomized search:", random_search.best_score_)

# Evaluate the best sampled candidate on the test split;
# precision, recall and F1 are macro-averaged.
y_pred = random_search.predict(X_test)
print("Accuracy_random:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1-score:", f1_score(y_test, y_pred, average='macro'))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

_RF on gini_
Accuracy_rf: 0.9705
Precision: 0.9703666911082005
Recall: 0.9702072950839545
F1-score: 0.970260923866558
Confusion matrix:
 [[ 971    0    0    0    0    2    3    1    3    0]
 [   0 1127    2    2    0    1    2    0    1    0]
 [   6    0 1002    5    3    0    3    8    5    0]
 [   1    0    9  972    0    9    0    9    8    2]
 [   1    0    0    0  955    0    5    1    4   16]
 [   5    1    1    9    2  860    5    2    5    2]
 [   7    3    0    0    3    3  937    0    5    0]
 [   1    4   20    2    0    0    0  990    2    9]
 [   4    0    6    7    5    5    5    4  930    8]
 [   7    6    2   12   12    1    0    4    4  961]]
Accuracy_cross_on_train: 0.97 (+/- 0.01)
Accuracy_cross_test: 0.95 (+/- 0.04)
Best parameters found by grid search: {'max_depth': 50, 'max_features': 'sqrt', 'n_estimators': 200}
Best accuracy score found by grid search: 0.9672833333333333
Accuracy_grid: 0.9706
Precision: 0.9704388859765455
Recall: 0.9703212075507931
F1-score: 0.9