# Bayesian_classifier_Iris

- Split data for test and training
- split data by class
- Calculate mean/stddev for each column
- Calculate Gaussian probability for each class
- test on unseen data

### Libraries

In [3]:
import numpy as np
import pandas as pd

from math import sqrt
from math import pi
from math import exp

import sklearn
from sklearn.model_selection import train_test_split

### Load Iris dataset
Load the Iris dataset (downloaded from Kaggle.com) and drop the Id column.

In [39]:
# Read the Iris CSV and discard the Id column in one chained expression
all_df = pd.read_csv('./Iris.csv', index_col=False).drop(columns='Id')
all_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


### Encode label column

In [40]:
# Work on a copy: a plain `X = all_df` is only an alias, so encoding the label
# below would also overwrite the Species strings inside all_df.
X = all_df.copy()

# transform Species into integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['Species'] = le.fit_transform(X['Species'])

# Species deliberately stays in X as the last column: class_split groups on it
# and dispersions discards the statistics of that last column.
X.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### Split dataset into training and testing set

In [41]:
# Hold out 10% of the rows as the test set; random_state pins the shuffle
# so the same split is produced on every run.
X_train, X_test = train_test_split(
    X,
    random_state=1,
    test_size=0.10,
)
# Number of training rows that ended up in each class
X_train.Species.value_counts()

2    46
0    45
1    44
Name: Species, dtype: int64

### Split dataset by Class
Split the data set by class and return NumPy arrays so that statistics can be computed on each class.

In [42]:
def class_split(dataset, label_col='Species'):
    """Split a labelled DataFrame into one NumPy array per class.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to split; must contain the column named by `label_col`.
    label_col : str, default 'Species'
        Column whose distinct values define the classes.

    Returns
    -------
    tuple of numpy.ndarray
        One 2-D array per distinct label found in `dataset`, ordered by
        ascending label value (groupby sorts its keys by default). Every
        array keeps all columns of `dataset`, including the label column.

    Notes
    -----
    The number of arrays follows the labels actually present instead of being
    fixed at three, so a data set with fewer or more classes no longer fails
    or loses classes. Position i corresponds to label i only when the labels
    present are exactly 0..k-1.
    """
    return tuple(rows.to_numpy() for _, rows in dataset.groupby(label_col))

In [43]:
# class_split needs the DataFrame produced by train_test_split. After the first
# run X_train holds the per-class arrays instead, so skip the split when this
# cell is executed again rather than failing on a missing .groupby.
if isinstance(X_train, pd.DataFrame):
    X_train = class_split(X_train)

In [44]:
def mu(data):
    """Return the arithmetic mean of the values in `data`."""
    count = float(len(data))
    total = sum(data)
    return total / count

In [45]:
def stdev(data):
    """Return the sample standard deviation of `data` (n - 1 in the denominator)."""
    centre = mu(data)
    squared_dev = 0
    for x in data:
        squared_dev += (x - centre) ** 2
    return sqrt(squared_dev / float(len(data) - 1))

In [46]:
def dispersions(dataset):
    """Return a (mean, standard deviation) pair for every column except the last.

    `dataset` is unpacked row by row and transposed with zip, each resulting
    column is summarised with mu() and stdev(), and the pair belonging to the
    final column is discarded before returning.
    """
    summary = []
    for column in zip(*dataset):
        summary.append((mu(column), stdev(column)))
    summary.pop()  # discard the statistics of the last column
    return summary

In [48]:
# One list of (mean, stdev) pairs per class array, in the order held by X_train
stats = [dispersions(class_rows) for class_rows in X_train]
stats

[[(4.975555555555555, 0.33720479045688356),
  (3.38, 0.37087121018684455),
  (1.4555555555555555, 0.15891043154093204),
  (0.23555555555555538, 0.10478453120474408)],
 [(5.927272727272728, 0.5159608946618881),
  (2.7477272727272735, 0.31214832220092764),
  (4.26590909090909, 0.45491792350349947),
  (1.3159090909090905, 0.19993392143077976)],
 [(6.534782608695653, 0.623687026677946),
  (2.9478260869565216, 0.30965151052596934),
  (5.532608695652173, 0.5545986928806519),
  (2.0130434782608697, 0.28016558251003704)]]

### Create Gaussian Distribution Function
$$P(x)=\frac{1}{\sigma\sqrt{2\pi}}\, e^{-\frac{(x-\mu)^2}{2\sigma^2}}$$

In [13]:
# Gaussian p.d.f
def gauss_pdf(x, mean, stdev):
    """Evaluate the Gaussian probability density at `x`.

    `mean` and `stdev` are the mean and standard deviation of the distribution.
    """
    normaliser = 1 / (sqrt(2 * pi) * stdev)
    exponent = -((x - mean) ** 2 / (2 * stdev ** 2))
    return normaliser * exp(exponent)

In [14]:
# Sanity check: a Gaussian evaluated at its own mean with stdev 1 equals 1/sqrt(2*pi)
assert abs(gauss_pdf(1.0, 1.0, 1.0) - 1 / sqrt(2 * pi)) < 1e-12
print(gauss_pdf(1.0,1.0,1.0))

0.3989422804014327


## Bayesian classifier

### Pseudocode
function bayes_classifier(testdata, training_stats):<br>
    -for every x in testdata:<br>
        -for every tuple in training_stats:<br>
            -multiply the gauss_pdf values of every column<br>
            -create the posterior probability for that tuple<br>
        -compare the maximum a posteriori (MAP) class with the target label<br>
    -return array of actual and predicted labels<br>

In [95]:
# np.asarray accepts a DataFrame as well as an ndarray, so this cell can be
# re-run safely; X_test.to_numpy() fails once X_test is already an array.
X_test = np.asarray(X_test)
X_test

array([[5.8, 4. , 1.2, 0.2, 0. ],
       [5.1, 2.5, 3. , 1.1, 1. ],
       [6.6, 3. , 4.4, 1.4, 1. ],
       [5.4, 3.9, 1.3, 0.4, 0. ],
       [7.9, 3.8, 6.4, 2. , 2. ],
       [6.3, 3.3, 4.7, 1.6, 1. ],
       [6.9, 3.1, 5.1, 2.3, 2. ],
       [5.1, 3.8, 1.9, 0.4, 0. ],
       [4.7, 3.2, 1.6, 0.2, 0. ],
       [6.9, 3.2, 5.7, 2.3, 2. ],
       [5.6, 2.7, 4.2, 1.3, 1. ],
       [5.4, 3.9, 1.7, 0.4, 0. ],
       [7.1, 3. , 5.9, 2.1, 2. ],
       [6.4, 3.2, 4.5, 1.5, 1. ],
       [6. , 2.9, 4.5, 1.5, 1. ]])

In [97]:
# First feature value of the first held-out row
X_test[0, 0]

5.8

In [119]:
# Take the first held-out row as a single sample to experiment with.
# NOTE(review): this rebinds X, which earlier held the full DataFrame passed to
# train_test_split; re-running the split cell after this one would operate on
# this single row instead. A distinct name would avoid that.
X = X_test[0]
X[0]


5.8

In [109]:
# (mean, standard deviation) pair of the first column for the first class array
stats[0][0]


(4.975555555555555, 0.33720479045688356)

In [177]:
# Likelihood of row X under the first class: product of the per-feature Gaussian densities
cp = 1
for j in range(len(X) - 1):  # the last entry of X is skipped
    mean_j, stdev_j = stats[0][j]
    cp *= gauss_pdf(X[j], mean_j, stdev_j)
cp

0.03922581952235828

In [172]:
c=0
for i in range(4):
    pass  # the loop body was left empty, which is a syntax error; `pass` keeps the cell runnable
print(c)
    

0


In [178]:
def bayes_classifier(X, stats, priors=None):
    """Score one sample against every class with Gaussian naive Bayes.

    Parameters
    ----------
    X : sequence of float
        Feature values of one sample. Only the first len(stats[i]) entries are
        read, so a trailing class label is ignored and an unlabeled sample
        keeps all of its features.
    stats : sequence
        One entry per class; each entry is a sequence of (mean, stdev) pairs,
        one pair per feature.
    priors : sequence of float, optional
        Prior probability of each class, in the same order as `stats`. When
        omitted no prior is applied and each score is the plain product of the
        per-feature densities.

    Returns
    -------
    list of float
        One unnormalised score per class, in the order of `stats`. The index
        of the largest value is the predicted class.

    Notes
    -----
    The scores are raw products of densities and can become extremely small
    when there are many features.
    """
    posterior_prob = []
    for i, class_stats in enumerate(stats):
        # Start from the class prior when one is supplied, otherwise from 1
        class_prob = 1 if priors is None else priors[i]
        # The feature count comes from the statistics, not from len(X) - 1,
        # so the result does not depend on whether X carries a label.
        for j, (mean, sd) in enumerate(class_stats):
            class_prob *= gauss_pdf(X[j], mean, sd)
        posterior_prob.append(class_prob)
    return posterior_prob
        
    

In [182]:
# Score the first held-out row against every class; pb has one entry per entry of stats
pb = bayes_classifier(X_test[0], stats)
pb

[0.03922581952235828, 1.2655162595009397e-20, 5.911549453700815e-26]

In [183]:
# TODO create a maximum a posteriori (MAP) step
# Index of the largest score in pb (the first one if several are equal)
max(range(len(pb)), key=pb.__getitem__)

0

In [None]:
# TODO create a predict class

In [None]:
# TODO Evaluate results