In [2]:
from sklearn.datasets import load_wine
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import os
from scipy.stats import norm
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from scipy import stats


import warnings
warnings.filterwarnings("ignore")

In [3]:
def print_metrics(y_test, preds):
    print('Balanced accuracy: ', metrics.balanced_accuracy_score(y_test, preds))
    print('F1 Weighted', metrics.f1_score(y_test, preds, average='weighted'))
    # print(metrics.classification_report(y_test, preds))

In [4]:
# X, y = load_wine(return_X_y=True, as_frame=True)

In [5]:
cols = [f'X{i+1}' for i in range(9)]
cols = cols + ['TARGET']

In [6]:
data = pd.DataFrame(np.genfromtxt(os.path.join('data','DATA.txt')), columns=cols)

In [7]:
X = data.copy()

In [8]:
y = X.pop('TARGET')

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, shuffle=True, stratify=y, random_state=172312873)

In [10]:
number_estimators = 3

In [11]:
class BayesClassifier:
    def __init__(self, classes, for_ensemble=False):
        self.classes = classes
        self.for_ensemble = for_ensemble
        self.numeric_probs = pd.DataFrame()
        var_class = []
        stats = []
        for class_name in self.classes:
            var_class.append(class_name)
            var_class.append(class_name)
            stats.append('mean')
            stats.append('std')
        self.numeric_probs['class_name'] = var_class
        self.numeric_probs['stats'] = stats
        


    def fit(self, X, y):
        """ receives a pandas dataframe """

        # calculate classes probabilities
        self.class_probs = pd.DataFrame(y.value_counts() / len(y)).T

        # divides the feature in numeric and categorical
        self.numeric_features = X.select_dtypes(include=np.number).columns.to_list()
        self.categorical_features = [feature for feature in X.columns if feature not in self.numeric_features]
        X = X.copy()
        X['target'] = y

        # for numeric features
        for feature in self.numeric_features:
            self.numeric_probs[feature] = 0.0
            for class_name in self.classes:
                self.numeric_probs.loc[
                    (self.numeric_probs.class_name == class_name) & 
                    (self.numeric_probs.stats == 'mean' ), feature
                    ] = X.loc[X.target == class_name, feature].mean()
                # get the stds
                self.numeric_probs.loc[
                    (self.numeric_probs.class_name == class_name) & 
                    (self.numeric_probs.stats == 'std' ), feature
                    ] = X.loc[X.target == class_name, feature].std()

        # for categorical features
        self.classes_counts = pd.DataFrame(y.value_counts()).T
        self.categorical_probs = {}

        # TODO: add laplacian correction for categorical features
        for feature in X.columns:
            probs = pd.DataFrame(index=self.classes)
            for value in X[feature].unique():
                probs[value] = 0.000001
                for class_name in self.classes:
                    conditional_prob = len(X.loc[(y == class_name) & (X[feature] == value)]) / self.classes_counts[class_name].values[0]
                    probs.loc[class_name, value] = conditional_prob
            self.categorical_probs[feature] = probs



    def _predict_row(self, X):  
        predicted_probs = pd.DataFrame()
        for class_name in self.classes:
            print('Class', class_name)
            prob = 1
            # numeric features
            for feature in self.numeric_features:
                mean = self.numeric_probs.loc[(self.numeric_probs.class_name == class_name) & 
                                              (self.numeric_probs.stats == 'mean'), feature]
                std = self.numeric_probs.loc[(self.numeric_probs.class_name == class_name) & 
                                              (self.numeric_probs.stats == 'std'), feature]
                aux_prob = norm.pdf(X[feature], mean, std)
                prob *= aux_prob
                print(feature, aux_prob)
            # categorical features
            for feature in self.categorical_features:
                aux_probs = self.categorical_probs[feature][X[feature]].T
                print(feature, aux_probs)
                prob *= aux_probs[class_name].values

            print(prob)
            if not self.for_ensemble:
                prob *= self.class_probs[class_name].values[0]
            predicted_probs[class_name] = prob
        
        # softmax normalization
        if self.for_ensemble == False:
            self.predicted_probs = predicted_probs.apply(self.softmax_norm, axis=1)
        return predicted_probs

    def softmax_norm(self, X):
        return X / X.sum()
        
    def predict_proba(self, X):
        return self._predict_row(X)

    def predict(self, X):
        return self._predict_row(X).idxmax(axis=1)


In [12]:
class BayesEnsemblePredictions:
    def __init__(self, dataset, classes_probs):
        self.dataset = dataset
        self.classifiers = []
        self.classes_list = list(classes_probs.index)
        self.classes_probs = classes_probs

    def fit(self):
        for x_train, y_train in self.dataset:
            classifier = BayesClassifier(self.classes_list)
            classifier.fit(x_train, y_train)
            self.classifiers.append(classifier)


    def _predict_data(self, X):
        predictions = np.ones((len(X), len(self.classes_probs.index)))
        predictions *= self.classes_probs.to_numpy()
        save_preds = []

        for classifier in self.classifiers:
            # multiply the probability of the different estimators
            preds = classifier.predict_proba(X)
            save_preds.append(preds)
            predictions *= preds

        predictions.apply(self.softmax_norm, axis=1)
        return predictions
    
    def predict(self, X):
        return self._predict_data(X).idxmax(axis=1)
        
    def predict_proba(self, X):
        return self._predict_data(X)

    def softmax_norm(self, X):
        return X / X.sum()

In [13]:
predictions = []
predictions_probs = []
classifiers = []

dataset = []
kfold = StratifiedKFold(n_splits=number_estimators, shuffle=True)
for i, (train_indexes, test_indexes) in enumerate(kfold.split(x_train, y_train)):
    # sample data
    x_aux = x_train.iloc[train_indexes]
    y_aux = y_train.iloc[train_indexes]

    #save for ensemble
    dataset.append([x_aux, y_aux])

    classifier = BayesClassifier(list(y_train.unique()))
    classifier.fit(x_aux, y_aux)
    preds = classifier.predict(x_test)
    print_metrics(y_test, preds)    
    classifiers.append(classifier)
    predictions.append(preds)
    predictions_probs.append(classifier.predict_proba(x_test))

Class 1.0
X1 [0.74474489 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489
 0.74474489 0.23430683 0.74474489 0.74474489 0.74474489 0.74474489
 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489
 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489
 0.74474489 0.23430683 0.74474489 0.74474489 0.74474489 0.74474489
 0.23430683 0.74474489 0.74474489 0.74474489 0.74474489 0.23430683
 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489
 0.74474489 0.74474489 0.23430683 0.23430683 0.74474489 0.23430683
 0.74474489 0.23430683 0.74474489 0.74474489 0.23430683 0.74474489
 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489
 0.74474489 0.74474489 0.74474489 0.23430683 0.23430683 0.74474489
 0.74474489 0.74474489 0.23430683 0.74474489 0.74474489 0.74474489
 0.23430683 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489
 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489 0.74474489
 0.74474489 0.23430683 0.74474489 0.23430683 0.74

## Majority Voting

In [14]:
predictions = np.array(predictions) 
majority_preds = stats.mode(predictions, axis=0).mode.T
print_metrics(y_test, majority_preds)

Balanced accuracy:  0.7785714285714286
F1 Weighted 0.8079663394109398


## Baseline optimizer

In [15]:
baseline_classifier = BayesClassifier(list(y_train.unique()))

In [16]:
baseline_classifier.fit(x_train, y_train)
preds = baseline_classifier.predict(x_test)

Class 1.0
X1 [0.79780541 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541
 0.79780541 0.1916557  0.79780541 0.79780541 0.79780541 0.79780541
 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541
 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541
 0.79780541 0.1916557  0.79780541 0.79780541 0.79780541 0.79780541
 0.1916557  0.79780541 0.79780541 0.79780541 0.79780541 0.1916557
 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541
 0.79780541 0.79780541 0.1916557  0.1916557  0.79780541 0.1916557
 0.79780541 0.1916557  0.79780541 0.79780541 0.1916557  0.79780541
 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541
 0.79780541 0.79780541 0.79780541 0.1916557  0.1916557  0.79780541
 0.79780541 0.79780541 0.1916557  0.79780541 0.79780541 0.79780541
 0.1916557  0.79780541 0.79780541 0.79780541 0.79780541 0.79780541
 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541 0.79780541
 0.79780541 0.1916557  0.79780541 0.1916557  0.7978

In [17]:
print_metrics(y_test, preds)

Balanced accuracy:  0.7896825396825398
F1 Weighted 0.8179737882255045


## Ensemble Bayes

In [18]:
# calculate probability
value_counts = y_train.value_counts()
value_counts

TARGET
0.0    208
1.0    134
Name: count, dtype: int64

In [19]:
probs_classes = value_counts / len(y_train)

In [20]:
ensemble_model = BayesEnsemblePredictions(dataset, probs_classes)

In [21]:
ensemble_model.fit()

In [22]:
ensemble_preds = ensemble_model.predict(x_test)

Class 0.0
X1 [0.81398444 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444
 0.81398444 0.17956748 0.81398444 0.81398444 0.81398444 0.81398444
 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444
 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444
 0.81398444 0.17956748 0.81398444 0.81398444 0.81398444 0.81398444
 0.17956748 0.81398444 0.81398444 0.81398444 0.81398444 0.17956748
 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444
 0.81398444 0.81398444 0.17956748 0.17956748 0.81398444 0.17956748
 0.81398444 0.17956748 0.81398444 0.81398444 0.17956748 0.81398444
 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444
 0.81398444 0.81398444 0.81398444 0.17956748 0.17956748 0.81398444
 0.81398444 0.81398444 0.17956748 0.81398444 0.81398444 0.81398444
 0.17956748 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444
 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444 0.81398444
 0.81398444 0.17956748 0.81398444 0.17956748 0.81

In [23]:
print_metrics(y_test, ensemble_preds)

Balanced accuracy:  0.7785714285714286
F1 Weighted 0.8079663394109398


## Test from classes data

In [26]:
data = pd.read_csv(os.path.join('data', 'foot_size.csv'))
y = data.pop('GENDER')

In [28]:
model = BayesClassifier(list(y.unique()))

In [29]:
model.fit(data, y)

In [37]:
model.numeric_probs

Unnamed: 0,class_name,stats,HEIGHT (cm),WEIGHT (kg),FOOT SIZE (CM)
0,1,mean,178.4575,77.6675,42.8625
1,1,std,5.708744,8.75945,3.647797
2,0,mean,168.7825,58.9575,36.4775
3,0,std,12.023545,9.798263,1.656007


In [42]:
test = pd.DataFrame()
test['HEIGHT (cm)'] = [175]
test['WEIGHT (kg)'] = 75
test['FOOT SIZE (CM)'] = 38

In [43]:
test

Unnamed: 0,HEIGHT (cm),WEIGHT (kg),FOOT SIZE (CM)
0,175,75,38


In [45]:
model.predict_proba(test)

Class 1
HEIGHT (cm) [0.05817245]
WEIGHT (kg) [0.0434806]
FOOT SIZE (CM) [0.04498161]
[0.00011378]
Class 0
HEIGHT (cm) [0.02902764]
WEIGHT (kg) [0.01065755]
FOOT SIZE (CM) [0.15787089]
[4.88394982e-05]


Unnamed: 0,1,0
0,5.7e-05,2.4e-05
