### **Preprocessing:**

In [None]:
import random

# Image dimensions handed to _load_data_file as its width/height arguments;
# DATA_HEIGHT is the number of text lines read per image.
DATA_WIDTH=28
DATA_HEIGHT=28
# Number of examples read from the training and validation files.
NUMBER_OF_TRAINING_EXAMPLES=5000
NUMBER_OF_VALIDATION_EXAMPLES=1000

# Module-level containers; they start empty and are rebound by _load_all_data().
ALL_TRAINING_IMAGES=[]
ALL_TRAINING_LABELS=[]
ALL_VALIDATION_IMAGES=[]
ALL_VALIDATION_LABELS=[]

'''
Convert ASCII pixels into numerical data and vice versa
    - ' ' is converted to 0, which means it's part of the background
    - '#' is converted to 1, part of the image interior
    - '+' is converted to 2, part of the edges
    
'''

def _pixel_to_value(character):
    if(character == ' '):
        return 0
    elif(character == '#'):
        return 1
    elif(character == '+'):
        return 2  
    
def _value_to_pixel(value):
    if(value == 0):
        return ' '
    elif(value == 1):
        return '#'
    elif(value == 2):
        return '+'
'''
Function for loading data and label files
'''

def _load_data_file(filename, n, width, height):
    """Read the first `n` ASCII-art images from a text file.

    Each image occupies `height` consecutive lines of the file. Every
    character of a line is converted with _pixel_to_value.

    Args:
        filename: path of the text file to read.
        n: number of images to load.
        width: accepted for interface compatibility but not used; each row
            keeps however many characters its line contains.
        height: number of lines that make up one image.

    Returns:
        A list of `n` images, each a list of `height` rows of pixel codes.

    Raises:
        IndexError: if the file holds fewer than n * height lines.
    """
    # The context manager guarantees the handle is closed. Only the newline is
    # stripped (instead of blindly dropping the last character) so the final
    # pixel survives when the last line has no trailing newline.
    with open(filename) as handle:
        lines = [line.rstrip('\n') for line in handle]

    items = []
    for i in range(n):
        first_line = i * height
        image = [
            list(map(_pixel_to_value, lines[first_line + j]))
            for j in range(height)
        ]
        items.append(image)
    return items
        
def _load_label_file(filename, n):
    fin = [l[:-1] for l in open(filename).readlines()]
    labels = []
    for i in range(n):
        labels.append(int(fin[i]))
    return labels

def _load_all_data():
    """Fill the module-level image and label lists from the four data files.

    Reads "trainingimages"/"traininglabels" and
    "validationimages"/"validationlabels", in that order, and rebinds the
    ALL_* globals to the loaded lists.
    """
    global ALL_TRAINING_IMAGES, ALL_TRAINING_LABELS
    global ALL_VALIDATION_IMAGES, ALL_VALIDATION_LABELS

    image_shape = (DATA_WIDTH, DATA_HEIGHT)

    ALL_TRAINING_IMAGES = _load_data_file(
        "trainingimages", NUMBER_OF_TRAINING_EXAMPLES, *image_shape)
    ALL_TRAINING_LABELS = _load_label_file(
        "traininglabels", NUMBER_OF_TRAINING_EXAMPLES)

    ALL_VALIDATION_IMAGES = _load_data_file(
        "validationimages", NUMBER_OF_VALIDATION_EXAMPLES, *image_shape)
    ALL_VALIDATION_LABELS = _load_label_file(
        "validationlabels", NUMBER_OF_VALIDATION_EXAMPLES)

'''
Function for printing digits
'''

def _print_digit_image(data):
    """Print an image as text, one output line per row of pixel codes."""
    for row_values in data:
        print(''.join(_value_to_pixel(value) for value in row_values))
    
_load_all_data()

# random.randint includes BOTH endpoints, so randint(0, N) could return N and
# index one past the end of the N-element lists. randrange(N) draws from
# 0..N-1, which is exactly the set of valid indices.
example_number = random.randrange(NUMBER_OF_TRAINING_EXAMPLES)
print("Printing digit example #" + str(example_number + 1) + " with label: " \
        + str(ALL_TRAINING_LABELS[example_number]))
_print_digit_image(ALL_TRAINING_IMAGES[example_number])

Printing digit example #464 with label: 5
                            
                            
                            
                            
                            
                            
                            
                            
              +++++++       
             +### ### #++   
            +###+ +++++++   
           ###+             
          ##++              
         +##+               
         +####+             
          ++####+           
             +##+           
              +#+           
             ++#+           
          ++###++           
     ##+# ### +             
     +++++++                
                            
                            
                            
                            
                            
                            


In [None]:
## Importing libraries:
import numpy as np

## Loading data:
# Image geometry and dataset sizes (same values as in the preprocessing cell).
DATA_WIDTH, DATA_HEIGHT = 28, 28
NUMBER_OF_TRAINING_EXAMPLES, NUMBER_OF_VALIDATION_EXAMPLES = 5000, 1000

# Training split: images as nested lists of pixel codes, labels as ints.
x_train = _load_data_file(
    "trainingimages", NUMBER_OF_TRAINING_EXAMPLES, DATA_WIDTH, DATA_HEIGHT)
y_train = _load_label_file(
    "traininglabels", NUMBER_OF_TRAINING_EXAMPLES)

# Validation split, used below as the test set.
x_test = _load_data_file(
    "validationimages", NUMBER_OF_VALIDATION_EXAMPLES, DATA_WIDTH, DATA_HEIGHT)
y_test = _load_label_file(
    "validationlabels", NUMBER_OF_VALIDATION_EXAMPLES)

In [None]:
# Printing a sample digit with the new format:
# every row of training image 50 is shown as a list of its pixel characters.
for pixel_values in x_train[50]:
    print(list(''.join(_value_to_pixel(value) for value in pixel_values)))

[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '+', '+', '+', '+', '#', '+', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '+', '+', '#', '#', '#', '#', ' ', '#', '#', '+', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '+', '+', '+', '+', '+', '+', '+', '+', '#', '+', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
[' ', ' ', ' 

### **Feature 1: Calculate the width and height of the digit**

In [None]:
def Feature1(data):
  """Measure the horizontal and vertical extent of the digit in an image.

  A pixel counts as ink when its code is non-zero (codes 1 and 2, i.e. the
  '#' and '+' characters).

  Args:
    data: a 2-D image given as a sequence of rows of pixel codes (nested lists
      or a 2-D NumPy array).

  Returns:
    (width, height) where
      * width is the largest per-row span, i.e. the maximum over rows of
        (last ink column - first ink column). A row with a single ink pixel
        therefore has span 0.
      * height is (last ink row - first ink row + 1).
    A blank or empty image returns (0, 0) instead of raising.
  """
  row_spans = []
  inked_rows = []
  for row_index, row in enumerate(data):
    # Work on the numeric codes directly; there is no need to convert every
    # pixel to a character first.
    inked_cols = [col for col, value in enumerate(row) if value != 0]
    if inked_cols:
      row_spans.append(inked_cols[-1] - inked_cols[0])
      inked_rows.append(row_index)
    else:
      row_spans.append(0)

  # No ink anywhere: the bounding box is empty.
  if not inked_rows:
    return 0, 0

  height = inked_rows[-1] - inked_rows[0] + 1
  width = max(row_spans)
  return width, height

### **Feature 2: Calculate the number of #s and +s in the top and bottom half**

In [None]:
def Feature2(data):
  """Count interior ('#', code 1) and edge ('+', code 2) pixels per image half.

  The image is split at row len(data) // 2: rows before that index form the
  top half, the remaining rows form the bottom half. With an odd number of
  rows the bottom half holds the extra row, and every one of its rows is
  counted (previously the last row was silently skipped).

  Args:
    data: a 2-D image given as a sequence of rows of pixel codes (nested lists
      or a 2-D NumPy array).

  Returns:
    (sharp_count_top, plus_count_top, sharp_count_bottom, plus_count_bottom)
  """
  half = len(data) // 2
  data_top = data[:half]
  data_bottom = data[half:]

  # Count the numeric codes directly rather than converting to characters.
  sharp_count_top = sum(1 for row in data_top for value in row if value == 1)
  plus_count_top = sum(1 for row in data_top for value in row if value == 2)
  sharp_count_bottom = sum(1 for row in data_bottom for value in row if value == 1)
  plus_count_bottom = sum(1 for row in data_bottom for value in row if value == 2)

  return sharp_count_top, plus_count_top, sharp_count_bottom, plus_count_bottom

### **Feature 3: Average of the line width**

In [None]:
def Feature3(data):
  """Average number of ink pixels per row of an image.

  A pixel counts as ink when its code is non-zero (codes 1 and 2, i.e. the
  '#' and '+' characters). The average is taken over ALL rows, including rows
  that contain no ink.

  Args:
    data: a 2-D image given as a sequence of rows of pixel codes (nested lists
      or a 2-D NumPy array).

  Returns:
    The mean ink count per row; 0.0 for an image with no rows (np.mean of an
    empty list would otherwise produce nan and a RuntimeWarning).
  """
  # Count the numeric codes directly rather than converting to characters.
  ink_per_row = [sum(1 for value in row if value != 0) for row in data]
  if not ink_per_row:
    return 0.0
  return np.mean(ink_per_row)

### **Implementing the algorithm:**

In [None]:
# A function for calculating the Gaussian distribution:
def Gaussian_dist(mu, sigma, x, min_sigma=1e-9):
  """Evaluate the normal probability density N(mu, sigma^2) at x.

  Args:
    mu: mean of the distribution.
    sigma: standard deviation of the distribution.
    x: point (or array of points) at which to evaluate the density.
    min_sigma: lower bound applied to sigma. A feature that is constant within
      a class has sigma == 0, which would otherwise divide by zero and yield
      nan/inf. With the floor, the density is a very tall, narrow peak at mu.

  Returns:
    The density value(s) at x.
  """
  # Values of sigma above the floor pass through unchanged.
  sigma = np.maximum(sigma, min_sigma)
  return (1/(sigma*np.sqrt(2*np.pi)))*np.exp(-0.5*((x-mu)/sigma)**2)

# ------------------------------------------------------------------------------
# A function for calculating the mean and standard deviation of the first and second features (width and height):
def F1_F2_parameters(x_train, y_train, class_num):
  """Mean and standard deviation of Feature1 (width, height) for one class.

  Selects the training images whose label equals class_num, applies Feature1
  to each of them and summarises the results with np.mean and np.std.

  Returns:
    (mu_width, mu_height, std_width, std_height)
  """
  members = [idx for idx, label in enumerate(y_train) if label == class_num]
  digits = np.array(x_train)[members]

  dims = [Feature1(digit) for digit in digits]
  widths = [w for w, _ in dims]
  heights = [h for _, h in dims]

  return np.mean(widths), np.mean(heights), np.std(widths), np.std(heights)

# ------------------------------------------------------------------------------
# A function for calculating features 3 through 6 parameters:
def F3_6_parameters(x_train, y_train, class_num):
  """Mean and standard deviation of the four Feature2 counts for one class.

  Selects the training images whose label equals class_num, applies Feature2
  to each of them and summarises every one of its four outputs ('#' top,
  '+' top, '#' bottom, '+' bottom) with np.mean and np.std.

  Returns:
    (mu3, mu4, mu5, mu6, std3, std4, std5, std6)
  """
  members = [idx for idx, label in enumerate(y_train) if label == class_num]
  digits = np.array(x_train)[members]

  counts = [Feature2(digit) for digit in digits]
  # One list per Feature2 output, in the order Feature2 returns them.
  columns = [[sample[k] for sample in counts] for k in range(4)]

  means = [np.mean(column) for column in columns]
  stds = [np.std(column) for column in columns]
  return (*means, *stds)

# ------------------------------------------------------------------------------
# A function for calculating feature 7 parameters:
def F7_parameters(x_train, y_train, class_num):
  """Mean and standard deviation of Feature3 (average line width) for one class.

  Selects the training images whose label equals class_num, applies Feature3
  to each of them and summarises the results with np.mean and np.std.

  Returns:
    (mu_line_width, std_line_width)
  """
  members = [idx for idx, label in enumerate(y_train) if label == class_num]
  digits = np.array(x_train)[members]

  line_widths = [Feature3(digit) for digit in digits]
  return np.mean(line_widths), np.std(line_widths)

# ------------------------------------------------------------------------------
# A function for calculating all the parameters of 7 features in each class:
def Calculate_parameters(x_train, y_train=None, num_classes=10):
  """Compute per-class mean and standard deviation for all seven features.

  Args:
    x_train: training images.
    y_train: training labels aligned with x_train. When omitted, the
      module-level `y_train` is used, which is what this function previously
      read implicitly. Passing it explicitly avoids the hidden dependency on
      notebook state.
    num_classes: number of classes; labels 0..num_classes-1 are processed.

  Returns:
    A 14-tuple of lists, each of length num_classes:
    (MU_F1, ..., MU_F7, STD_F1, ..., STD_F7), where F1/F2 are width/height
    (F1_F2_parameters), F3-F6 are the four half-image counts
    (F3_6_parameters) and F7 is the average line width (F7_parameters).
  """
  if y_train is None:
    # Backward-compatible fallback for callers that pass only x_train.
    y_train = globals()["y_train"]

  mus = [[] for _ in range(7)]
  stds = [[] for _ in range(7)]

  for class_num in range(num_classes):
    mu_width, mu_height, std_width, std_height = F1_F2_parameters(x_train, y_train, class_num)
    mu3, mu4, mu5, mu6, std3, std4, std5, std6 = F3_6_parameters(x_train, y_train, class_num)
    mu_line_width, std_line_width = F7_parameters(x_train, y_train, class_num)

    class_mus = (mu_width, mu_height, mu3, mu4, mu5, mu6, mu_line_width)
    class_stds = (std_width, std_height, std3, std4, std5, std6, std_line_width)
    for bucket, value in zip(mus, class_mus):
      bucket.append(value)
    for bucket, value in zip(stds, class_stds):
      bucket.append(value)

  return (*mus, *stds)

# ------------------------------------------------------------------------------
# A function for calculating the prior probabilities of each class:
def prior_probs(y_train):
  """Return the relative frequency of each label 0..9 in y_train.

  y_train must provide .count() and len() (e.g. a list of ints). The result is
  a list of ten floats, indexed by label.
  """
  total = len(y_train)
  return [y_train.count(digit) / total for digit in range(10)]

In [None]:
def Classifier(x_train, y_train, prior_prob, test_data, MU_F1, MU_F2, MU_F3, MU_F4, MU_F5, MU_F6, MU_F7, STD_F1, STD_F2, STD_F3, STD_F4, STD_F5, STD_F6, STD_F7):
  """Predict the label of one image by multiplying per-feature Gaussian densities.

  For every class 0..9 the score is
    prior_prob[c] * product over the 7 features of Gaussian_dist(mu, std, value)
  and the class with the highest score is returned.

  Args:
    x_train, y_train: not used by this function; kept so existing calls work.
    prior_prob: per-class prior probabilities, indexed by label.
    test_data: the image to classify.
    MU_F1..MU_F7: per-class feature means, indexed by label.
    STD_F1..STD_F7: per-class feature standard deviations, indexed by label.

  Returns:
    The index of the highest-scoring class (result of np.argmax).
  """
  # The features of the test image do not depend on the class, so they are
  # extracted once instead of once per class.
  width_test, height_test = Feature1(test_data)
  f3_test, f4_test, f5_test, f6_test = Feature2(test_data)
  avg_line_width = Feature3(test_data)

  observed = (width_test, height_test, f3_test, f4_test, f5_test, f6_test, avg_line_width)
  means = (MU_F1, MU_F2, MU_F3, MU_F4, MU_F5, MU_F6, MU_F7)
  stds = (STD_F1, STD_F2, STD_F3, STD_F4, STD_F5, STD_F6, STD_F7)

  classes_probs = []
  for i in range(10):
    # Multiply in the order prior, F1, ..., F7 so the floating-point result
    # is the same as before.
    score = prior_prob[i]
    for mu, sigma, value in zip(means, stds, observed):
      score = score * Gaussian_dist(mu[i], sigma[i], value)
    classes_probs.append(score)

  predicted_label = np.argmax(classes_probs)
  return predicted_label

### **Accuracy of the model:**

In [None]:
def accuracy(y_test, y_pred):
  """Print the percentage of positions at which y_pred equals y_test.

  The result is printed, not returned.
  """
  total = len(y_test)
  hits = sum(1 for idx in range(total) if y_test[idx] == y_pred[idx])
  print('The accuracy of the model is: {}%'.format(hits / total * 100))

### **Testing the model and finding the accuracy:**

In [None]:
# Per-class means and standard deviations of the seven features:
(MU_F1, MU_F2, MU_F3, MU_F4, MU_F5, MU_F6, MU_F7,
 STD_F1, STD_F2, STD_F3, STD_F4, STD_F5, STD_F6, STD_F7) = Calculate_parameters(x_train)

# Class priors estimated from the training labels:
Prior_Probabilities = prior_probs(y_train)

# Predict a label for every test image:
y_pred = [
  Classifier(x_train, y_train, Prior_Probabilities, test_digit,
             MU_F1, MU_F2, MU_F3, MU_F4, MU_F5, MU_F6, MU_F7,
             STD_F1, STD_F2, STD_F3, STD_F4, STD_F5, STD_F6, STD_F7)
  for test_digit in x_test
]

# Report the share of correct predictions:
accuracy(y_test, y_pred)

The accuracy of the model is: 34.699999999999996%
