# Project 1: Kannada MNIST - Classification Problem

In [1]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve


In [3]:
# Load the dataset
def load_data(data_dir='/Users/shivamtiwari/Desktop/Guvi_Projects/ Final_project'):
    """Load the Kannada MNIST arrays and flatten every image to one row.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the four ``*_kannada_MNIST_*.npz`` archives.
        Defaults to the location this notebook was originally written for,
        so ``load_data()`` keeps working unchanged; pass another directory to
        run the notebook on a different machine.

    Returns
    -------
    tuple
        ``(train_data, test_data, train_labels, test_labels)`` where the two
        data arrays have shape ``(n_samples, height * width)`` and the label
        arrays are returned exactly as stored.

    Raises
    ------
    ValueError
        If an image array is not 3-D, or if the number of images and labels
        in a split disagree.
    """
    import os  # local import so this cell does not depend on the import cell

    def _read_first_array(file_name):
        # np.load on an .npz returns an NpzFile that keeps the archive open;
        # the context manager closes it once the array has been read.
        with np.load(os.path.join(data_dir, file_name)) as archive:
            return archive['arr_0']

    train_images = _read_first_array('X_kannada_MNIST_train.npz')
    test_images = _read_first_array('X_kannada_MNIST_test.npz')
    train_labels = _read_first_array('y_kannada_MNIST_train.npz')
    test_labels = _read_first_array('y_kannada_MNIST_test.npz')

    # Flatten (n_samples, height, width) -> (n_samples, height * width).
    # The unpacking also enforces that the image arrays are 3-D.
    n_train, train_h, train_w = train_images.shape
    train_data = train_images.reshape((n_train, train_h * train_w))
    n_test, test_h, test_w = test_images.shape
    test_data = test_images.reshape((n_test, test_h * test_w))

    # Fail early on a mismatched dataset instead of inside a classifier.
    if len(train_labels) != n_train or len(test_labels) != n_test:
        raise ValueError(
            "Image/label count mismatch: "
            f"train {n_train} vs {len(train_labels)}, "
            f"test {n_test} vs {len(test_labels)}"
        )

    return train_data, test_data, train_labels, test_labels


In [4]:
# Smoke-test the loader: the notebook echoes the returned 4-tuple of arrays.
load_data()

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 array([0, 1, 2, ..., 7, 8, 9], dtype=uint8),
 array([0, 1, 2, ..., 7, 8, 9], dtype=uint8))

In [None]:
# Visual representation of classes with corresponding images.
# `load_data` is a function, so it must be called to get the arrays; the
# flattened training rows and their labels are what gets plotted here.
train_data, _, train_labels, _ = load_data()

fig, axes = plt.subplots(4, 5, figsize=(16, 16))
# 20 random rows drawn from the first 1000 training samples.
sample_indices = np.random.randint(0, 1000, 20)
for ax, sample_idx in zip(axes.ravel(), sample_indices):
    # Each row is a flattened image; reshape it back into a 28x28 image.
    ax.imshow(train_data[sample_idx].reshape(28, 28), cmap="gray")
    ax.axis("off")
    ax.set_title(f"Class  {train_labels[sample_idx]}")
# Lay the grid out once, after every subplot has been drawn.
fig.tight_layout()
plt.show()

In [5]:
# Apply PCA
def apply_pca(train_data, test_data, n_components):
    """Reduce both splits to ``n_components`` principal components.

    The PCA model is fitted on ``train_data`` only; ``test_data`` is then
    projected with that same fitted model.

    Returns
    -------
    tuple
        ``(train_data_pca, test_data_pca)``.
    """
    reducer = PCA(n_components=n_components)
    reduced_train = reducer.fit_transform(train_data)
    return reduced_train, reducer.transform(test_data)


In [6]:
# Train and evaluate classifiers
def train_and_evaluate_classifier(classifier, train_data, test_data, train_labels, test_labels):
    """Fit ``classifier`` and print its evaluation metrics on the test split.

    Prints the classification report and confusion matrix, followed by a
    ROC-AUC score:

    * two classes - ROC-AUC plus a plotted ROC curve, computed from
      continuous scores (``predict_proba``, else ``decision_function``, else
      the hard predictions as a last resort);
    * more than two classes - macro-averaged one-vs-rest ROC-AUC from
      ``predict_proba`` (skipped when the classifier exposes no probabilities).

    Parameters
    ----------
    classifier : estimator with ``fit`` and ``predict``
    train_data, test_data : array-like feature matrices
    train_labels, test_labels : array-like class labels

    Returns
    -------
    None
    """
    classifier.fit(train_data, train_labels)
    predictions = classifier.predict(test_data)

    # Metrics
    print("Classification Report:")
    print(classification_report(test_labels, predictions))

    print("\nConfusion Matrix:")
    print(confusion_matrix(test_labels, predictions))

    # Sorted unique training labels; predict_proba columns follow this order.
    class_labels = np.unique(train_labels)
    has_probabilities = hasattr(classifier, "predict_proba")

    if len(class_labels) == 2:
        # ROC analysis needs continuous scores: hard 0/1 predictions collapse
        # the curve to a single operating point.
        if has_probabilities:
            scores = classifier.predict_proba(test_data)[:, 1]
        elif hasattr(classifier, "decision_function"):
            scores = classifier.decision_function(test_data)
        else:
            scores = predictions

        roc_auc = roc_auc_score(test_labels, scores)
        print(f"\nROC-AUC Score: {roc_auc}")

        # Plot ROC curve for binary classification
        fpr, tpr, _ = roc_curve(test_labels, scores, pos_label=class_labels[1])
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curve')
        plt.show()
    elif len(class_labels) > 2 and has_probabilities:
        probabilities = classifier.predict_proba(test_data)
        try:
            roc_auc = roc_auc_score(
                test_labels, probabilities, multi_class="ovr", labels=class_labels
            )
        except ValueError as err:
            # e.g. a class missing from the test split; the report and
            # confusion matrix above are still valid, so do not abort the run.
            print(f"\nROC-AUC Score could not be computed: {err}")
        else:
            print(f"\nROC-AUC Score (one-vs-rest, macro-averaged): {roc_auc}")


In [7]:
# Main function
def main(component_sizes=(10, 15, 20, 25, 30), random_state=42):
    """Run every classifier on PCA-reduced data for each component count.

    Parameters
    ----------
    component_sizes : iterable of int, optional
        PCA dimensionalities to evaluate, in order.
    random_state : int or None, optional
        Seed passed to the stochastic models (decision tree, random forest
        and SVC) so repeated runs give the same metrics. Pass ``None`` to
        leave them unseeded.

    Returns
    -------
    None
        All results are printed by ``train_and_evaluate_classifier``.
    """
    # Load data
    train_data, test_data, train_labels, test_labels = load_data()

    # (heading, factory) pairs: a fresh, unfitted model is built for every
    # PCA size so no state carries over between experiments.
    classifier_factories = [
        ("Decision Tree", lambda: DecisionTreeClassifier(random_state=random_state)),
        ("Random Forest", lambda: RandomForestClassifier(random_state=random_state)),
        ("Naive Bayes", lambda: GaussianNB()),
        ("K-NN Classifier", lambda: KNeighborsClassifier()),
        ("SVM", lambda: SVC(probability=True, random_state=random_state)),
    ]

    # Perform PCA with different component sizes
    for n_components in component_sizes:
        print(f"\nPCA Components: {n_components}")
        train_data_pca, test_data_pca = apply_pca(train_data, test_data, n_components)

        for heading, make_classifier in classifier_factories:
            print(f"\n{heading}:")
            train_and_evaluate_classifier(
                make_classifier(), train_data_pca, test_data_pca, train_labels, test_labels
            )

# Run the full experiment only when this code is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


PCA Components: 10

Decision Tree:
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.72      0.75      1000
           1       0.76      0.81      0.78      1000
           2       0.95      0.93      0.94      1000
           3       0.76      0.77      0.77      1000
           4       0.78      0.86      0.82      1000
           5       0.83      0.79      0.81      1000
           6       0.77      0.77      0.77      1000
           7       0.76      0.68      0.71      1000
           8       0.81      0.86      0.84      1000
           9       0.83      0.84      0.83      1000

    accuracy                           0.80     10000
   macro avg       0.80      0.80      0.80     10000
weighted avg       0.80      0.80      0.80     10000


Confusion Matrix:
[[722 145   9  41   7   1   3   8  37  27]
 [ 79 810   5  24   6  16   6   9  31  14]
 [  4   3 928  11   4  20  12   7   5   6]
 [ 21  19   4 773  33  21  30  58 