# Bayesian Classifier
***

### Import library's

In [3]:
import numpy as np
import pandas as pd

from math import sqrt
from math import pi
from math import exp

import sklearn
from sklearn.model_selection import train_test_split

In [4]:
# Load csv file 
all_df = pd.read_csv('./Iris.csv', index_col=False)

# drop ID column
all_df.drop('Id',axis=1,inplace=True)
all_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
# basic statistics for each column
all_df.describe()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
all_df["Species"].value_counts()#look at class count

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [7]:
# Assign features to X
X = all_df

# transform Species into integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['Species']= le.fit_transform(X['Species'])

# assign numerical label to y
#y = X['Species']
#X = X.drop('Species', axis=1)
X.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Gaussian distribution
$$P(x)=\frac{1}{\sigma \sqrt 2 \pi } e^{\frac{-(x-\mu)^2}{2\sigma^2}}$$

In [8]:
# Gaussian p.d.f
def gauss_pdf(x, mean, stdev):
    return (1 / (sqrt(2 * pi) * stdev))* exp(-((x-mean)**2 / (2 * stdev**2 )))

In [9]:
print(gauss_pdf(1.0,1.0,1.0))#test can delete

0.3989422804014327


## TODO
Split data into test set and training also split into classes, Get Mean & std dev for each feature in each class for both sets. try make a function to make this more tidy.

In [10]:
X_train, X_test = train_test_split(X, test_size=0.10, random_state=1)

In [11]:
X_train['Species'].value_counts()

2    46
0    45
1    44
Name: Species, dtype: int64

In [12]:
X_test['Species'].value_counts()

1    6
0    5
2    4
Name: Species, dtype: int64

In [13]:
def class_split(dataset):
    classes = {}
    by_class = X.groupby('Species')
    #by_class.head()

    for groups, data in by_class:
        classes[groups] = data
    return pd.DataFrame(classes[0]), pd.DataFrame(classes[1]), pd.DataFrame(classes[2])

In [52]:
Xt1, Xt2, Xt3 = class_split(X_train)


In [15]:
Xt1['SepalLengthCm'].mean(axis=0)

5.005999999999999

In [16]:
Xt1['SepalLengthCm'].std(axis=0)

0.3524896872134512

$$P(A|B) = \frac{P(B|A)P(B)}{P(A)}$$

## TODO
$P(B|A)$ is the product of the gaussian probabilty of each test set from each class multiplied by $P(B)$ which i THINK is $\frac{1}{3}$ test and see. $P(A)$ may not be needed. $P(A|B)$ is the probability of the 3 classes using maximium posteria

In [47]:
Xt2= Xt1.to_numpy()

In [30]:
mean = Xt2.mean(axis=0)
mean

array([5.006, 3.418, 1.464, 0.244, 0.   ])

In [26]:
Xt1.mean(axis=0)

SepalLengthCm    5.006
SepalWidthCm     3.418
PetalLengthCm    1.464
PetalWidthCm     0.244
Species          0.000
dtype: float64

In [31]:
std = Xt2.std(axis=0)
std

array([0.34894699, 0.37719491, 0.17176728, 0.10613199, 0.        ])

In [44]:
gauss_pdf(Xt2[1][0],mean[0], std[0])*gauss_pdf(Xt2[1][1],mean[1], std[1])*gauss_pdf(Xt2[1][2],mean[2], std[2])*gauss_pdf(Xt2[1][3],mean[3], std[3]*0.33)/Xt2.sum()

0.01382848968957493

In [66]:
Xt3 = Xt2.to_numpy()
Xt3

array([[7. , 3.2, 4.7, 1.4, 1. ],
       [6.4, 3.2, 4.5, 1.5, 1. ],
       [6.9, 3.1, 4.9, 1.5, 1. ],
       [5.5, 2.3, 4. , 1.3, 1. ],
       [6.5, 2.8, 4.6, 1.5, 1. ],
       [5.7, 2.8, 4.5, 1.3, 1. ],
       [6.3, 3.3, 4.7, 1.6, 1. ],
       [4.9, 2.4, 3.3, 1. , 1. ],
       [6.6, 2.9, 4.6, 1.3, 1. ],
       [5.2, 2.7, 3.9, 1.4, 1. ],
       [5. , 2. , 3.5, 1. , 1. ],
       [5.9, 3. , 4.2, 1.5, 1. ],
       [6. , 2.2, 4. , 1. , 1. ],
       [6.1, 2.9, 4.7, 1.4, 1. ],
       [5.6, 2.9, 3.6, 1.3, 1. ],
       [6.7, 3.1, 4.4, 1.4, 1. ],
       [5.6, 3. , 4.5, 1.5, 1. ],
       [5.8, 2.7, 4.1, 1. , 1. ],
       [6.2, 2.2, 4.5, 1.5, 1. ],
       [5.6, 2.5, 3.9, 1.1, 1. ],
       [5.9, 3.2, 4.8, 1.8, 1. ],
       [6.1, 2.8, 4. , 1.3, 1. ],
       [6.3, 2.5, 4.9, 1.5, 1. ],
       [6.1, 2.8, 4.7, 1.2, 1. ],
       [6.4, 2.9, 4.3, 1.3, 1. ],
       [6.6, 3. , 4.4, 1.4, 1. ],
       [6.8, 2.8, 4.8, 1.4, 1. ],
       [6.7, 3. , 5. , 1.7, 1. ],
       [6. , 2.9, 4.5, 1.5, 1. ],
       [5.7, 2

In [65]:

gauss_pdf(Xt3[1][0],mean[0], std[0])*gauss_pdf(Xt3[1][1],mean[1], std[1])*gauss_pdf(Xt3[1][2],mean[2], std[2])*gauss_pdf(Xt3[1][3],mean[3], std[3]*0.33)

0.0