In [27]:
from sklearn.datasets import load_wine
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import os
from scipy.stats import norm
from sklearn.naive_bayes import GaussianNB

In [28]:
# X, y = load_wine(return_X_y=True, as_frame=True)

In [29]:
data = pd.read_csv(os.path.join('data', 'computers.csv'))

In [30]:
for col in data:
    data[col] = data[col].str.title()

In [31]:
X = data.copy()
y = X.pop('BUY')

In [32]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [33]:
class BayesClassifier:
    def __init__(self, classes):
        self.classes = classes
        self.numeric_probs = pd.DataFrame()
        var_class = []
        stats = []
        for class_name in self.classes:
            var_class.append(class_name)
            var_class.append(class_name)
            stats.append('mean')
            stats.append('std')
        self.numeric_probs['class_name'] = var_class
        self.numeric_probs['stats'] = stats


    def fit(self, X, y):
        """ receives a pandas dataframe """

        # calculate classes probabilities
        self.class_probs = pd.DataFrame(y.value_counts() / len(y)).T

        # divides the feature in numeric and categorical
        self.numeric_features = X.select_dtypes(include=np.number).columns.to_list()
        self.categorical_features = [feature for feature in X.columns if feature not in self.numeric_features]
        X = X.copy()
        X['target'] = y

        # for numeric features
        for feature in self.numeric_features:
            self.numeric_probs[feature] = 0.0
            for class_name in self.classes:
                self.numeric_probs.loc[
                    (self.numeric_probs.class_name == class_name) & 
                    (self.numeric_probs.stats == 'mean' ), feature
                    ] = X.loc[X.target == class_name, feature].mean()
                # get the stds
                self.numeric_probs.loc[
                    (self.numeric_probs.class_name == class_name) & 
                    (self.numeric_probs.stats == 'std' ), feature
                    ] = X.loc[X.target == class_name, feature].std()

        # for categorical features
        self.classes_counts = pd.DataFrame(y.value_counts()).T
        self.categorical_probs = {}

        for feature in X.columns:
            probs = pd.DataFrame(index=self.classes)
            for value in X[feature].unique():
                probs[value] = 0.0
                for class_name in self.classes:
                    conditional_prob = len(X.loc[(y == class_name) & (X[feature] == value)]) / self.classes_counts[class_name].values[0]
                    probs.loc[class_name, value] = conditional_prob
            self.categorical_probs[feature] = probs



    def _predict_row(self, X):  
        predicted_probs = pd.DataFrame()
        for class_name in self.classes:
            prob = self.class_probs[class_name].values[0]
            # numeric features
            for feature in self.numeric_features:
                mean = self.numeric_probs.loc[(self.numeric_probs.class_name == class_name) & 
                                              (self.numeric_probs.stats == 'mean'), feature]
                std = self.numeric_probs.loc[(self.numeric_probs.class_name == class_name) & 
                                              (self.numeric_probs.stats == 'std'), feature]
                prob *= norm.pdf(X[feature], mean, std)
            # categorical features
            for feature in self.categorical_features:
                aux_probs = self.categorical_probs[feature][X[feature]].T
                prob *= aux_probs[class_name].values
                print(feature, prob)

            predicted_probs[class_name] = prob
        
        # softmax normalization
        predicted_probs = predicted_probs.apply(self.softmax_norm, axis=1)
        return predicted_probs

    def softmax_norm(self, X):
        return X / X.sum()
        
    def predict_proba(self, X):
        return self._predict_row(X)

    def predict(self, X):
        return self._predict_row(X).idxmax(axis=1)


In [34]:
classifier = BayesClassifier(list(y.unique()))

In [35]:
classifier.fit(x_train, y_train)

In [36]:
test = pd.DataFrame()
test['age'] = ['Youth']
test['income'] = ['Medium']
test['student'] = ['Yes']
test['credit'] = ['Fair']

In [37]:
test

Unnamed: 0,age,income,student,credit
0,Youth,Medium,Yes,Fair


In [38]:
preds_bayes = classifier.predict(x_test)

age [0.2 0.  0.  0. ]
income [0.1 0.  0.  0. ]
student [0.1 0.  0.  0. ]
credit [0.05 0.   0.   0.  ]
age [0.2 0.2 0.2 0.2]
income [0.05 0.05 0.1  0.05]
student [0.01875 0.03125 0.0375  0.03125]
credit [0.01171875 0.01171875 0.0140625  0.01953125]


In [39]:
preds_bayes

0     No
1    Yes
2    Yes
3    Yes
dtype: object