#### Importation des donnees

In [129]:
import numpy as np
import pandas as pd

In [130]:
data = pd.read_csv("./data/flowers_dataset.csv", index_col=False )
data = data.drop(columns=['id'], errors="ignore")
data.head()

Unnamed: 0,couleur_dominante,largeur_petale_cm,longueur_petale_cm,largeur_feuille_cm,longueur_tige_cm,parfum_intensite,espece
0,Violet,1.4,4.7,2.3,35.2,0.2,Iris d'Or
1,Violet,1.3,4.5,2.0,32.1,0.1,Iris d'Or
2,Violet,1.5,4.9,2.5,38.5,0.3,Iris d'Or
3,Bleu,0.8,2.1,1.2,15.4,0.8,Lys de Nuit
4,Bleu,0.9,2.3,1.1,12.8,0.7,Lys de Nuit


### Pretraitements

#### Encodage des donnees categorielles

In [131]:
data = pd.get_dummies(data, prefix="couleur", columns=["couleur_dominante"], dtype=int)

#### Decoupage en donner d'entrainement et de test

In [84]:
def train_test_split(X, y, test_size=0.2, random_state=None, shuffle=True):
    n = len(X)
    indices = np.arange(n)

    if random_state is not None:
        np.random.seed(random_state)

    # mellange des donnees
    if shuffle:
        np.random.shuffle(indices)

    # separateur
    split = int(n * (1 - test_size))

    train_idx = indices[:split]
    test_idx = indices[split:]

    return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]


In [132]:
X = data.drop(columns=["espece"], axis=0)
y = data["espece"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Entrainement du model

In [153]:
model = {}
classes = np.unique(y_train)

for c  in classes:
    X_c = X[y == c]
    
    # probabilite des classes P(C)
    prior = len(X_c) / len(X)

    # probabilite par features
    mean = X_c.mean(axis = 0)
    var = X_c.var(axis=0) + 1e-9
    
    model[c] = {
        "prior": prior,
        "mean": mean,
        "var": var
    }

### Predictions et evaluations

In [None]:
def gaussian_probability(x, mean, var):
    # return (1/np.sqrt(2 * np.pi * var)) * np.exp(-(x - mean)**2 / (2 * var))
    return -0.5 * np.log(2 * np.pi * var) - ((x - mean)**2 / (2 * var)) # avec logarithme ppour eviter underflow

def class_score(x, prior, mean, var):
    log_prior = np.log(prior)
    log_prob = np.sum(gaussian_probability(x, mean, var))
    return log_prob + log_prior

def predict(X):
    prediction = []
    
    for x in X:
        scores = {}

        for c in model:
            prior = model[c]["prior"]
            mean = model[c]["mean"]
            var = model[c]["var"]

            score = class_score(x, prior, mean, var)
            scores[c] = score
        
        prediction.append(max(scores, key=scores.get))
    
    return np.array(prediction)

In [154]:
### prediction du le jeux de test
y_pred = predict(np.array(X_test))

#### Evaluations

In [149]:
# calcul du accuracy score
def accuracy(y_pred, y_true):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    return np.sum(y_true == y_pred) / len(y_true)

In [155]:
score = accuracy(y_pred, y_test)
print(f"accuracy: {score*100:.2f}%")

accuracy: 0.00%
