In [1]:
import numpy as np
import struct
from array import array
import random
import matplotlib.pyplot as plt
from os.path import join
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import time
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

In [2]:
class MnistDataloader(object):
    """Reader for an MNIST-style dataset stored as four raw IDX files.

    The constructor only records the four file paths; nothing is read from
    disk until load_data() or read_images_labels() is called.
    """

    def __init__(self, training_images_filepath, training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        """Remember the paths of the training and test image/label files."""
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Read one image file together with its label file.

        Both files begin with big-endian unsigned 32-bit header fields and
        are followed by a payload of unsigned bytes.

        Args:
            images_filepath: Path to the image file. Header: magic 2051,
                image count, rows, cols.
            labels_filepath: Path to the label file. Header: magic 2049,
                label count.

        Returns:
            Tuple ``(images, labels)``. ``images`` is a uint8 array of shape
            (count, rows, cols) and ``labels`` a 1-D uint8 array of length
            count. Both are built with ``np.frombuffer`` over the bytes read
            from disk, so they are read-only; copy them before modifying.

        Raises:
            ValueError: If a magic number is wrong, if a payload length does
                not match what its header declares, or if the two files hold
                a different number of samples.
            struct.error: If a file is shorter than its header (propagated
                from ``struct.unpack``).
        """
        # Read labels
        with open(labels_filepath, 'rb') as file:
            magic, label_count = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError(f'Magic number mismatch, expected 2049, got {magic}')
            labels = np.frombuffer(file.read(), dtype=np.uint8)
        # A truncated or padded label file would otherwise go unnoticed and
        # misalign labels and images further down the pipeline.
        if labels.size != label_count:
            raise ValueError(
                f'Label count mismatch, header declares {label_count}, '
                f'file contains {labels.size}')

        # Read images
        with open(images_filepath, 'rb') as file:
            magic, image_count, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError(f'Magic number mismatch, expected 2051, got {magic}')
            image_data = np.frombuffer(file.read(), dtype=np.uint8)
        expected_pixels = image_count * rows * cols
        if image_data.size != expected_pixels:
            raise ValueError(
                f'Image payload mismatch, header declares {image_count}x{rows}x{cols} '
                f'= {expected_pixels} bytes, file contains {image_data.size}')
        if image_count != labels.size:
            raise ValueError(
                f'Sample count mismatch, {image_count} images but {labels.size} labels')
        images = image_data.reshape(image_count, rows, cols)

        return images, labels

    def load_data(self):
        """Load both splits.

        Returns:
            ``((x_train, y_train), (x_test, y_test))`` where each pair is the
            result of read_images_labels() on the corresponding stored paths.
        """
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test   = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train), (x_test, y_test)


In [3]:
# Locations of the four raw MNIST IDX files.
# NOTE(review): the base path sits under /content/drive, presumably a Colab Drive mount — adjust when running elsewhere.
input_path = '/content/drive/MyDrive/MNIST/'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')

# Per-model results, keyed by a short model tag and filled in by the cells below:
# test accuracy, training wall time and prediction wall time (both in seconds).
acc, train_time, pred_time = {}, {}, {}

# Load the MNIST dataset (train and test splits).
mnist_dataloader = MnistDataloader(
    training_images_filepath=training_images_filepath,
    training_labels_filepath=training_labels_filepath,
    test_images_filepath=test_images_filepath,
    test_labels_filepath=test_labels_filepath,
)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()


In [4]:
# Preprocessing: flatten every image (rows x cols) into a single feature row,
# giving the 2-D (samples, features) matrices the classifiers are fitted on.
X_train = x_train.reshape(len(x_train), -1)
X_test = x_test.reshape(len(x_test), -1)

# Sanity check of the resulting shapes.
print(X_train.shape, y_train.shape)  # recorded run: (60000, 784) (60000,)
print(X_test.shape, y_test.shape)    # recorded run: (10000, 784) (10000,)



(60000, 784) (60000,)
(10000, 784) (10000,)


In [None]:
# --- Decision Tree Algorithm ---
# Single decision tree, depth capped at 20, seeded with random_state=42.
clf = DecisionTreeClassifier(random_state=42, max_depth=20)

# Wall-clock time of fitting on the full training set.
start_train = time.time()
clf.fit(X_train, y_train)
end_train = time.time()

# Wall-clock time of predicting the whole test set.
start_pred = time.time()
y_pred = clf.predict(X_test)
end_pred = time.time()

# Store the results under the 'DT' tag, then report straight from the stored values.
acc['DT'] = accuracy_score(y_test, y_pred)
train_time['DT'] = end_train - start_train
pred_time['DT'] = end_pred - start_pred

print("Accuracy:", acc['DT'])
print(classification_report(y_test, y_pred))
print(f"Training time: {train_time['DT']:.2f} seconds")
print(f"Prediction time: {pred_time['DT']:.2f} seconds")


In [None]:
# --- Naive bayes Gaussian ---
# GaussianNB with library defaults.
nb = GaussianNB()

# Fit, timed.
start_train = time.time()
nb.fit(X_train, y_train)
end_train = time.time()

# Predict the test set, timed.
start_pred = time.time()
y_pred = nb.predict(X_test)
end_pred = time.time()

# Compute each metric once, store it under the 'NBG' tag and reuse it for the report.
nbg_accuracy = accuracy_score(y_test, y_pred)
nbg_fit_seconds = end_train - start_train
nbg_predict_seconds = end_pred - start_pred
acc['NBG'], train_time['NBG'], pred_time['NBG'] = nbg_accuracy, nbg_fit_seconds, nbg_predict_seconds

print("Naive Bayes Gaussian Accuracy:", nbg_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {nbg_fit_seconds:.2f} seconds")
print(f"Prediction time: {nbg_predict_seconds:.2f} seconds")

In [None]:
# --- Naive bayes multinomial ---
# MultinomialNB with library defaults, fitted on the flattened pixel matrix.
mnb = MultinomialNB()

# Wall-clock time of fitting on the full training set.
start_train = time.time()
mnb.fit(X_train, y_train)
end_train = time.time()

# Wall-clock time of predicting the whole test set.
start_pred = time.time()
y_pred = mnb.predict(X_test)
end_pred = time.time()

# Store the results under the 'NBM' tag.
acc['NBM'] = accuracy_score(y_test, y_pred)
train_time['NBM'] = end_train - start_train
pred_time['NBM'] = end_pred - start_pred

# Label spelling fixed: the model is "Multinomial", not "Multinominal".
print("Naive Bayes Multinomial Accuracy:", acc['NBM'])
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {train_time['NBM']:.2f} seconds")
print(f"Prediction time: {pred_time['NBM']:.2f} seconds")

In [None]:
# --- KNN algorithm ---
# k-nearest-neighbours classifier voting over the 3 closest training samples.
knn = KNeighborsClassifier(n_neighbors=3)

# Fit, timed.
start_train = time.time()
knn.fit(X_train, y_train)
end_train = time.time()

# Predict the test set, timed.
start_pred = time.time()
y_pred = knn.predict(X_test)
end_pred = time.time()

# Record the results under the 'KNN' tag and report from the stored values.
acc['KNN'] = accuracy_score(y_test, y_pred)
train_time['KNN'] = end_train - start_train
pred_time['KNN'] = end_pred - start_pred

print("KNN Accuracy:", acc['KNN'])
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {train_time['KNN']:.2f} seconds")
print(f"Prediction time: {pred_time['KNN']:.2f} seconds")

In [None]:
# --- SVM classifier linear kernel ---
# Linear SVM with balanced class weights, up to 10000 solver iterations, seeded with 42.
svm = LinearSVC(
    class_weight='balanced',
    max_iter=10000,
    random_state=42,
)

# Wall-clock time of fitting on the full training set.
start_train = time.time()
svm.fit(X_train, y_train)
end_train = time.time()

# Wall-clock time of predicting the whole test set.
start_pred = time.time()
y_pred = svm.predict(X_test)
end_pred = time.time()

# Compute each metric once, store it under the 'SVM' tag and reuse it for the report.
svm_accuracy = accuracy_score(y_test, y_pred)
svm_fit_seconds = end_train - start_train
svm_predict_seconds = end_pred - start_pred
acc['SVM'] = svm_accuracy
train_time['SVM'] = svm_fit_seconds
pred_time['SVM'] = svm_predict_seconds

print("SVM Accuracy:", svm_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {svm_fit_seconds:.2f} seconds")
print(f"Prediction time: {svm_predict_seconds:.2f} seconds")

In [None]:
# --- SVM classifier rbf kernel ---
# Kernel SVM (RBF) with balanced class weights, seeded with 42.
# Note: this rebinds `svm`, which the linear-kernel cell also assigns; after
# this cell runs, `svm` refers to the RBF model.
svm = SVC(kernel='rbf', class_weight='balanced', random_state=42)

# Wall-clock time of fitting on the full training set.
start_train = time.time()
svm.fit(X_train, y_train)
end_train = time.time()

# Wall-clock time of predicting the whole test set.
start_pred = time.time()
y_pred = svm.predict(X_test)
end_pred = time.time()

# Store the results under the 'SVM_RBF' tag.
acc['SVM_RBF'] = accuracy_score(y_test, y_pred)
train_time['SVM_RBF'] = end_train - start_train
pred_time['SVM_RBF'] = end_pred - start_pred

# The printed label names the kernel so this output cannot be confused with
# the linear SVM cell, which prints "SVM Accuracy:".
print("SVM (RBF kernel) Accuracy:", acc['SVM_RBF'])
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {train_time['SVM_RBF']:.2f} seconds")
print(f"Prediction time: {pred_time['SVM_RBF']:.2f} seconds")

In [None]:
# --- bagging with decision tree as base estimator ---
# Base learner: the same depth-20, seed-42 tree configuration used for the single-tree model.
base_tree = DecisionTreeClassifier(random_state=42, max_depth=20)

# Ensemble of 10 such trees; each draws 80% of the training rows with
# replacement (bootstrap=True) and sees every feature (max_features=1.0).
bagging = BaggingClassifier(
    estimator=base_tree,
    n_estimators=10,
    max_samples=0.8,
    max_features=1.0,
    bootstrap=True,
    random_state=42,
)

# Fit, timed.
start_train = time.time()
bagging.fit(X_train, y_train)
end_train = time.time()

# Predict the test set, timed.
start_pred = time.time()
y_pred = bagging.predict(X_test)
end_pred = time.time()

# Record the results under the 'BAG' tag and report from the stored values.
acc['BAG'] = accuracy_score(y_test, y_pred)
train_time['BAG'] = end_train - start_train
pred_time['BAG'] = end_pred - start_pred

print("Bagging with Decision Trees Accuracy:", acc['BAG'])
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {train_time['BAG']:.2f} seconds")
print(f"Prediction time: {pred_time['BAG']:.2f} seconds")


In [None]:
# --- Random Forest ---
# 100 trees, depth capped at 20, sqrt(n_features) candidate features per split, seeded with 42.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    max_features="sqrt",
    random_state=42,
)

# Wall-clock time of fitting on the full training set.
start_train = time.time()
rf.fit(X_train, y_train)
end_train = time.time()

# Wall-clock time of predicting the whole test set.
start_pred = time.time()
y_pred = rf.predict(X_test)
end_pred = time.time()

# Compute each metric once, store it under the 'RF' tag and reuse it for the report.
rf_accuracy = accuracy_score(y_test, y_pred)
rf_fit_seconds = end_train - start_train
rf_predict_seconds = end_pred - start_pred
acc['RF'], train_time['RF'], pred_time['RF'] = rf_accuracy, rf_fit_seconds, rf_predict_seconds

print("Random Forest Accuracy:", rf_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {rf_fit_seconds:.2f} seconds")
print(f"Prediction time: {rf_predict_seconds:.2f} seconds")

In [None]:
# --- AdaBoost Classifier ---
# Weak learner: a depth-1 tree. This rebinds `base_tree`, which the bagging
# cell assigns to a depth-20 tree.
base_tree = DecisionTreeClassifier(random_state=42, max_depth=1)

# Boost 50 such weak learners with learning_rate=1.0, seeded with 42.
adaboost = AdaBoostClassifier(
    estimator=base_tree,
    n_estimators=50,
    learning_rate=1.0,
    random_state=42,
)

# Fit, timed.
start_train = time.time()
adaboost.fit(X_train, y_train)
end_train = time.time()

# Predict the test set, timed.
start_pred = time.time()
y_pred = adaboost.predict(X_test)
end_pred = time.time()

# Record the results under the 'AB' tag and report from the stored values.
acc['AB'] = accuracy_score(y_test, y_pred)
train_time['AB'] = end_train - start_train
pred_time['AB'] = end_pred - start_pred

print("AdaBoost Accuracy:", acc['AB'])
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {train_time['AB']:.2f} seconds")
print(f"Prediction time: {pred_time['AB']:.2f} seconds")

In [None]:
# --- XGBoost Classifier ---
xgb_model = XGBClassifier(
    # Task: 10-class classification that outputs the predicted class directly.
    objective="multi:softmax",
    num_class=10,
    # Ensemble size and shape: 100 boosting rounds of depth-6 trees, step size 0.1.
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    # Each tree is built on an 80% sample of rows and an 80% sample of columns.
    subsample=0.8,
    colsample_bytree=0.8,
    # Histogram-based tree construction, seeded with 42.
    tree_method="hist",
    random_state=42,
)

# Wall-clock time of fitting on the full training set.
start_train = time.time()
xgb_model.fit(X_train, y_train)
end_train = time.time()

# Wall-clock time of predicting the whole test set.
start_pred = time.time()
y_pred = xgb_model.predict(X_test)
end_pred = time.time()

# Compute each metric once, store it under the 'XG' tag and reuse it for the report.
xg_accuracy = accuracy_score(y_test, y_pred)
xg_fit_seconds = end_train - start_train
xg_predict_seconds = end_pred - start_pred
acc['XG'] = xg_accuracy
train_time['XG'] = xg_fit_seconds
pred_time['XG'] = xg_predict_seconds

print("XGBoost Accuracy:", xg_accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print(f"Training time: {xg_fit_seconds:.2f} seconds")
print(f"Prediction time: {xg_predict_seconds:.2f} seconds")

In [None]:
# Dump the raw result dictionaries: accuracy, then training time, then prediction time.
for metric_table in (acc, train_time, pred_time):
    print(metric_table)

In [None]:
# Test accuracy per model, in the order the results were recorded in `acc`.
models = list(acc)
accuracies = [acc[tag] for tag in models]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(models, accuracies, marker="o", linestyle="-", color="b")

ax.set_ylabel("Accuracy")
ax.set_title("Model Accuracies on MNIST (Line Graph)")

# Write each value slightly above its point. String categories are plotted at
# integer positions 0..n-1, so the list index doubles as the x coordinate.
for i in range(len(accuracies)):
    ax.text(i, accuracies[i] + 0.005, f"{accuracies[i]:.3f}", ha='center', fontsize=9)

ax.grid(True)
plt.show()

In [None]:
# Training time per model (seconds of wall-clock time recorded by the model
# cells), in the order the results were stored in `train_time`.
models1 = list(train_time.keys())
train = list(train_time.values())

plt.figure(figsize=(12,6))
plt.plot(models1, train, marker="o", linestyle="-", color="b")

plt.ylabel("Training Time (seconds)")
plt.title("Model training time on MNIST (Line Graph)")

# Place each value label a fixed 6 points above its marker. A data-unit offset
# such as +0.005 s is negligible on a time axis, so the labels would sit on
# top of the markers. String categories are plotted at positions 0..n-1, hence
# the index is used as the x coordinate.
for i, a in enumerate(train):
    plt.annotate(f"{a:.3f}", xy=(i, a), xytext=(0, 6), textcoords="offset points",
                 ha='center', fontsize=9)

plt.grid(True)
plt.show()

In [None]:
# Prediction time per model (seconds of wall-clock time recorded by the model
# cells), in the order the results were stored in `pred_time`.
models2 = list(pred_time.keys())
predict = list(pred_time.values())

plt.figure(figsize=(12,6))
plt.plot(models2, predict, marker="o", linestyle="-", color="b")

# The y axis shows prediction time; it was previously mislabelled "Accuracy".
plt.ylabel("Prediction Time (seconds)")
plt.title("Model prediction time on MNIST (Line Graph)")

# Place each value label a fixed 6 points above its marker. A data-unit offset
# such as +0.005 s is negligible on a time axis, so the labels would sit on
# top of the markers. String categories are plotted at positions 0..n-1, hence
# the index is used as the x coordinate.
for i, a in enumerate(predict):
    plt.annotate(f"{a:.3f}", xy=(i, a), xytext=(0, 6), textcoords="offset points",
                 ha='center', fontsize=9)

plt.grid(True)
plt.show()