Here We are going to preform Handwritten Character Recognition on Devanagari Image Dataset Using Naive Bayes Classifier through Scikit-Learn.

The Dataset is consists of two folder one is Train and the other is Test. Both Train and Test folders consists of 46 folders of  Devanagari Characters and Digits. The folders in Train folder consists of 1700 images of each Character and Digit whereas in Test folder, each character and Digit folder consists of 300 images.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# Paths to get images.
train_path = "./DevanagariHandwrittenCharacterDataset/Train"
test_path = "./DevanagariHandwrittenCharacterDataset/Test"

In [3]:
unique_labels = os.listdir(train_path)

In [4]:
# It will go to one folder path fetch all the images from one path and will return one list of all names of images of all folders.
def list_of_images(folder):
    
    return os.listdir(os.path.join(train_path,folder))

In [5]:
def read_image(folder,image):
    
    folder_path = os.path.join(train_path,folder) # path to each folder
    
    image_path = os.path.join(folder_path,image) # path to each image of each folder
    
    image = plt.imread(image_path) # read one image and will return 32X32 numpy array.
    
    return image.reshape(image.shape[0]*image.shape[1],) # Reshaping image from 32X32 into 1X1024.

In [6]:
# Stacking all the lists of image data to a matrix.
def stacking_row_vectors(folder):
    
    images_list = list_of_images(folder) # Calling list_of_Images Function
    
    images=[]
    
    for img in images_list:
        
        images.append(read_image(folder,img))
        
    # 1360 is the number of data we have in training and rest is for cross validation
    return np.array(images[0:1360]), np.array(images[1360:])

In [7]:
# We are appending training data and cross validation data into lists and then concatenating it.

train_data = []
cv_data = []

for folder in unique_labels:
    
    train_folder_matrix, cv_folder_matrix = stacking_row_vectors(folder)
    
    train_data.append(train_folder_matrix)
    
    cv_data.append(cv_folder_matrix)
    
train_data = np.concatenate(train_data,axis=0)
cv_data = np.concatenate(cv_data,axis=0)

In [8]:
# Getting labels for training data and cross validation data into lists.

train_labels = []

cv_labels = []

for folder_name in unique_labels:
    
    train_labels = train_labels + [folder_name]*1360
    
    cv_labels = cv_labels + [folder_name]*340

In [9]:
cv_data.shape

(15640, 1024)

In [10]:
len(cv_labels)

15640

In [11]:
len(train_labels)

62560

In [12]:
train_data = pd.DataFrame(data=train_data)

In [13]:
train_data['label'] = train_labels

In [14]:
train_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
62556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
62557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
62558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9


In [15]:
cv_data = pd.DataFrame(data=cv_data)

In [16]:
cv_data['label'] = cv_labels

In [17]:
cv_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,character_10_yna
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
15636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
15637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9
15638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,digit_9


In [18]:
# To get the complete data we are concatenating training and cross validation data to apply PCA on it.
data = pd.concat([train_data,cv_data])

In [19]:
X = np.array(data.iloc[:,0:1024])

In [20]:
cv_labels = np.array(cv_labels)

In [21]:
len(train_labels)

62560

In [22]:
train_labels = np.array(train_labels)

In [23]:
# Creating data to zero mean and one standard deviation.

scaler = StandardScaler()
X_s = scaler.fit_transform(X)

In [24]:
# This function will get take the eigen vectors from number_eig_vectors list to apply PCA and then tranfrom the data to
# new vector space. Then we'll apply the algorithm on it and will save the result accuracy into the dictionary "D_performance"
# with keys as the number of features taken and accuracy on those number of features as the values.

def cross_validation(n_eig_vectors):
    
    pca_obj = PCA(n_components=n_eig_vectors)
    
    X_new = pca_obj.fit_transform(X_s)
    
    X_train_new = X_new[0:62560,:]
    
    X_cv_new = X_new[62560:,:]
    
    obj = GaussianNB()
    
    obj.fit(X_train_new,train_labels)
    
    cv_predicted_category = obj.predict(X_cv_new) # Predicting the outcome we get.
    
    # Getting accuracy through comparing the output we get and the true labels
    return accuracy_score(cv_labels,cv_predicted_category)

In [25]:
# Here we are giving different values(number of features) to the cross validation function in order to get the best accuracy.

number_eig_vectors = [512, 256, 128, 64, 32, 16, 8, 4, 2, 1]

D_performance = {}

for number in number_eig_vectors:
    
    D_performance[number] = cross_validation(number)

In [26]:
D_performance

{512: 0.4377877237851662,
 256: 0.49578005115089513,
 128: 0.5535805626598466,
 64: 0.5429667519181586,
 32: 0.5174552429667519,
 16: 0.44782608695652176,
 8: 0.3055626598465473,
 4: 0.14584398976982096,
 2: 0.06975703324808184,
 1: 0.04469309462915601}

Here as we can see that the highest accuracy we get is of 55.35% when we passed 128 number of features in the model.

Please do check the next repository where we have achived higher accuracy on the same dataset, by performing the code from Scratch(Without using Scikit-Learn).