# **Imports**

In [None]:
import math as M
import pandas as pd
from google.colab import drive
import numpy as np
import collections

# **Bayesian Classifier Functions:**

## **The Dataset Separation Function that separates any dataset by its different Classes:**

**This function takes the dataset and the index of the Output (Class or Label) as arguments, and returns a dictionary for a new dataset that includes all data separated by Class**

**This dictionary includes all data samples as lists, and the keys of the dictionary are the Class numbers.**

In [None]:
def SeparateClasses(DS, ClassIndx):
    """Group the rows of a dataset by their class label.

    Args:
        DS: Iterable of rows (e.g. a list of lists). Each row must support
            indexing with ``ClassIndx``.
        ClassIndx: Position of the class label inside every row; negative
            indices count from the end of the row.

    Returns:
        A dict mapping each class label to the list of rows carrying that
        label. Rows keep the order in which they appear in ``DS`` and are
        stored as-is (the label is not stripped from them).
    """
    DS_Separated = dict()
    for vector in DS:
        # setdefault creates the bucket the first time a label is seen
        DS_Separated.setdefault(vector[ClassIndx], []).append(vector)
    return DS_Separated

## **Calculate Statistics for each Class in the dataset**

**Mean Function and Standard Deviation Function**

In [None]:
# Arithmetic mean of a list of numbers
def Mean(List):
    """Return the arithmetic mean of ``List`` as a float.

    ``List`` must support ``sum()`` and ``len()``. An empty ``List`` raises
    ``ZeroDivisionError`` because the total is divided by a length of zero.
    """
    total = sum(List)
    count = float(len(List))
    return total / count

# Standard deviation of a list of numbers
def StanderdDeviation(List):
    """Return the standard deviation of ``List``.

    The squared deviations from the mean are divided by ``len(List) - 1``.
    When ``List`` holds exactly one value that divisor would be zero, so the
    sum is divided by ``len(List)`` instead, which yields 0.0.
    """
    center = Mean(List)
    squared_deviations = sum([(value - center) ** 2 for value in List])
    count = len(List)
    # A single sample leaves no (n - 1) divisor to use; fall back to n
    divisor = count if count == 1 else count - 1
    return M.sqrt(squared_deviations / float(divisor))

##**The Prepare For Classification Function:**

**This function gathers all of the values for each column into a list and calculates the mean and standard deviation of that list. Once calculated, the statistics are gathered together into a tuple. This process is then repeated for each column in the dataset, and a list of tuples of statistics is returned.**

**The zip() function will aggregate elements from each provided argument. It takes the dataset with the * operator as an argument, and takes the Index of the Output Column to delete it from the training data after calculating all means and standard deviations.**

**The "*" Operator separates the dataset (that is a list of lists) into separate lists for each row. The zip() function then iterates over each element of each row and returns a column from the dataset as a list of numbers. A tuple is created from these 3 numbers and a list of these tuples is stored.**

In [None]:
# Per-column (Mean, StanderdDeviation, count) for a DS, class column removed
def PrepareForClassification(DS, ClassesIndx):
    """Summarise every column of ``DS`` and drop the class column's summary.

    ``DS`` is transposed with ``zip(*DS)`` so each column is visited in turn.
    Each column produces one ``(mean, standard deviation, count)`` tuple.
    The tuple at position ``ClassesIndx`` is deleted before the list is
    returned, so the result holds one tuple per remaining column.
    """
    DataStatistics = []
    for column in zip(*DS):
        summary = (Mean(column), StanderdDeviation(column), len(column))
        DataStatistics.append(summary)
    # Statistics of the class column itself are not wanted in the result
    del DataStatistics[ClassesIndx]
    return DataStatistics

## **The Final Separation Function**

**This Function puts all of separated data and its calculated statistics together in a dataset organized by class values.**

**This function returns a dictionary that maps each class value to its list of tuples of statistics.**

In [None]:
# Split DS by class, then calculate the column statistics of each class
def FinalSeparation(DS, ClassesIndx):
    """Return the per-class column statistics of ``DS``.

    The rows are first grouped by the label found at ``ClassesIndx`` using
    ``SeparateClasses``. Each group is then summarised with
    ``PrepareForClassification``. The result is a dict keyed by class label
    whose values are those per-class statistics lists.
    """
    grouped = SeparateClasses(DS, ClassesIndx)
    return {
        label: PrepareForClassification(members, ClassesIndx)
        for label, members in grouped.items()
    }

## **The Gaussian Probability Distribution Function:**

**This Function calculates the Gaussian probability distribution for any x.**

In [None]:
def GaussianProbability(x, mean, stdev):
    """Return the Gaussian (normal) probability density at ``x``.

    Computes ``exp(-(x - mean)**2 / (2 * stdev**2)) / (sqrt(2 * pi) * stdev)``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density as a float. When ``stdev`` is 0 the distribution is
        degenerate (all mass sits on ``mean``) and the density formula would
        divide by zero, so the function returns 1.0 if ``x == mean`` and 0.0
        otherwise.
    """
    if stdev == 0:
        # Degenerate case: treat the distribution as a point mass at `mean`
        return 1.0 if x == mean else 0.0
    exponent = M.exp(-((x - mean) ** 2) / (2 * stdev ** 2))
    # Normalise by stdev itself (not a fixed constant) so the density
    # integrates to one for every stdev
    return exponent / (M.sqrt(2 * M.pi) * stdev)

## **Finally!! The Classification Function:**

**This Function takes a prepared dataset and a test data as input arguments.**

**For each input value in the row, a probability is calculated using the Gaussian probability density function and the statistics of that column for that class. The probabilities are accumulated as a sum of their logarithms, which is equivalent to multiplying them together.**

**This process is repeated for each class in the dataset.**

**This Function returns a dictionary of probabilities with one entry for each class.**

In [None]:
# Calculate the score of each class for a given Test Sample
def Classify(DataStatistics, TestSample):
    """Score every class for one sample with a Gaussian naive Bayes model.

    Args:
        DataStatistics: Dict mapping each class label to a list of
            ``(mean, standard deviation, count)`` tuples, one per feature
            column. The count of the first tuple is used as the number of
            training rows of that class.
        TestSample: Sequence of feature values; entry ``i`` is scored against
            the ``i``-th statistics tuple. Extra trailing entries are ignored.

    Returns:
        Dict mapping each class label to its log-score: the logarithm of the
        class prior plus the logarithms of the per-feature Gaussian
        likelihoods. The values are not normalised probabilities; only their
        relative order is meaningful (the largest one is the prediction).
    """
    total_train_rows = sum(stats[0][-1] for stats in DataStatistics.values())
    Propabilities = dict()
    for ClassNumber, class_DataStatistics in DataStatistics.items():
        prior = class_DataStatistics[0][-1] / float(total_train_rows)
        # Start from log(prior) so the prior is on the same (log) scale as
        # the log-likelihoods accumulated below
        score = M.log(prior)
        for i, (mean, stdev, _) in enumerate(class_DataStatistics):
            p = GaussianProbability(TestSample[i], mean, stdev)
            # A zero likelihood has no finite logarithm; such features are
            # skipped rather than driving the whole score to -infinity
            if p != 0:
                score += M.log(p)
        Propabilities[ClassNumber] = score
    return Propabilities

# **Testing on MNIST Dataset:**

In [None]:
# Mount Google Drive so the CSV file stored there can be read
drive.mount('/content/drive')
mnist = pd.read_csv("/content/drive/MyDrive/ML_Project1/MNIST.csv")
data = np.array(mnist)
m, n = data.shape

# The first 1000 rows form the development data; the rest is training data
data_development = data[:1000]
data_train = data[1000:]

# Plain Python lists of rows, as consumed by the classifier functions
data_developmentL = data_development.tolist()
data_trainL = data_train.tolist()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Build the per-class column statistics from the training rows; index -1
# (the last column of each row) is passed as the class column.
DataStatistics = FinalSeparation(data_trainL,-1)

In [None]:
# Count how many development rows are assigned their true label
Correct = 0
for i, sample in enumerate(data_developmentL):
    Propabilities = Classify(DataStatistics, sample)
    O_Propabilities = collections.OrderedDict(sorted(Propabilities.items()))
    # (score, label) pairs: max() picks the highest score and, on a tie,
    # the larger label
    inverse = [(score, label) for label, score in O_Propabilities.items()]
    Predicted = max(inverse)[1]
    # The true label is the last column of the development row
    if Predicted == data_development[i, -1]:
        Correct += 1

In [None]:
# Share of development rows predicted correctly, expressed as a percentage
Accuracy = 100* Correct / len(data_development)

In [None]:
# Report the accuracy measured on the development rows
print("Accuracy = " ,Accuracy, "%")

Accuracy =  75.5 %


In [None]:
# Display the last column of every development row (the values the
# predictions are compared against in the evaluation loop)
data_development[:,-1]

array([1, 0, 1, 4, 0, 0, 7, 3, 5, 3, 8, 9, 1, 3, 3, 1, 2, 0, 7, 5, 8, 6,
       2, 0, 2, 3, 6, 9, 9, 7, 8, 9, 4, 9, 2, 1, 3, 1, 1, 4, 9, 1, 4, 4,
       2, 6, 3, 7, 7, 4, 7, 5, 1, 9, 0, 2, 2, 3, 9, 1, 1, 1, 5, 0, 6, 3,
       4, 8, 1, 0, 3, 9, 6, 2, 6, 4, 7, 1, 4, 1, 5, 4, 8, 9, 2, 9, 9, 8,
       9, 6, 3, 6, 4, 6, 2, 9, 1, 2, 0, 5, 9, 2, 7, 7, 2, 8, 8, 5, 0, 6,
       0, 0, 2, 9, 0, 4, 7, 7, 1, 5, 7, 9, 4, 6, 1, 5, 7, 6, 5, 0, 4, 8,
       7, 6, 1, 8, 7, 3, 7, 3, 1, 0, 3, 4, 5, 4, 0, 5, 4, 0, 3, 5, 1, 0,
       8, 3, 7, 0, 9, 6, 6, 9, 5, 4, 6, 9, 3, 5, 4, 2, 4, 8, 7, 7, 5, 8,
       8, 8, 2, 6, 9, 3, 1, 0, 4, 1, 5, 9, 0, 6, 2, 1, 3, 0, 6, 0, 0, 8,
       3, 2, 0, 0, 6, 0, 0, 4, 7, 2, 7, 1, 9, 9, 3, 9, 8, 4, 6, 6, 5, 3,
       8, 1, 8, 7, 1, 3, 7, 6, 3, 6, 3, 6, 3, 2, 3, 2, 2, 7, 9, 2, 3, 2,
       7, 5, 5, 8, 8, 2, 0, 1, 4, 0, 6, 3, 7, 1, 1, 1, 4, 7, 0, 2, 9, 2,
       0, 5, 6, 0, 8, 9, 6, 2, 0, 0, 7, 2, 0, 4, 2, 0, 9, 1, 6, 9, 3, 0,
       0, 2, 0, 6, 8, 4, 0, 7, 2, 1, 9, 5, 2, 4, 8,