In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from scipy.stats import norm

## IMPORT

In [2]:
# Load the Iris bunch and pull out the pieces used below: the feature matrix
# (wrapped in a DataFrame with named columns), the class labels and the feature names.
data = load_iris()
column_names = data['feature_names']
y = data['target']
X = pd.DataFrame(data['data'], columns=column_names)
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


# Splitting data

In [3]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
)

# Estimating per-class statistics and class frequencies (priors)

In [4]:
# Parameters of the per-class Gaussian likelihoods: for every class label in
# y_train, the mean and standard deviation of each feature column.
by_class = X_train.groupby(y_train)
means = by_class.mean()
stds = by_class.std()

In [5]:
# Show the per-class feature means (one row per class, one column per feature).
means

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,4.997143,3.428571,1.46,0.245714
1,5.887179,2.751282,4.230769,1.315385
2,6.539474,2.978947,5.534211,2.002632


In [6]:
# Show the per-class feature standard deviations (one row per class, one column per feature).
stds

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.337365,0.386973,0.176901,0.109391
1,0.535173,0.316121,0.491054,0.208426
2,0.661505,0.313789,0.541907,0.293614


In [7]:
# Empirical class priors: the share of training rows that belong to each class.
class_counts = X_train.groupby(y_train).size()
probs = class_counts / len(X_train)

In [8]:
# Show the class priors (fraction of training rows per class).
probs

0    0.312500
1    0.348214
2    0.339286
dtype: float64

# Predicting

In [9]:
# Collects one predicted class per validation row; filled by the prediction loop.
y_pred = []

$$
X=(x_1,x_2,x_3,x_4)
$$
$$
p(y_i|X)\propto p(y_i) \cdot \displaystyle\prod_{j=1}^4 p(x_j|y_i)
$$

In [10]:
# Gaussian naive Bayes prediction for every validation row.
#
# For each class the score is  log p(y_i) + sum_j log p(x_j | y_i).  This has the
# same arg max as the product  p(y_i) * prod_j p(x_j | y_i),  but summing
# log-densities cannot underflow to 0 the way multiplying many small densities can.
classes = means.index.to_numpy()

# Select the columns by name so features line up with `means`/`stds` even if the
# column order of X_val ever differs.
X_val_values = X_val[means.columns].to_numpy()

# One column of log-posterior scores per class, one row per validation sample.
# Class parameters are looked up by label (.loc), not by position, so the code
# stays correct when the labels are not exactly 0..k-1.
log_posteriors = np.column_stack([
    np.log(probs.loc[cl])
    + norm.logpdf(
        X_val_values,
        means.loc[cl].to_numpy(),
        stds.loc[cl].to_numpy(),
    ).sum(axis=1)
    for cl in classes
])

# Rebuild the list from scratch so that re-running this cell never appends
# duplicates, and map each winning column back to its class label.
y_pred = classes[log_posteriors.argmax(axis=1)].tolist()

$$
\hat{y} = \arg\max_{y_i \in \{y_0, y_1, y_2\}} p(y_i|X)
$$

In [11]:
from sklearn.metrics import accuracy_score

# Validation accuracy of the hand-written naive Bayes predictions.
accuracy1 = accuracy_score(y_true=y_val, y_pred=y_pred)

# Comparison

In [12]:
from sklearn.naive_bayes import GaussianNB

# Reference implementation: fit scikit-learn's Gaussian naive Bayes on the same
# training split and score it on the same validation split.
model = GaussianNB().fit(X_train, y_train)
sklearn_pred = model.predict(X_val)
accuracy2 = accuracy_score(y_val, sklearn_pred)

In [13]:
# Hand-written classifier first, scikit-learn reference second, one value per line.
print(accuracy1, accuracy2, sep='\n')

1.0
1.0
