In [848]:
from sklearn.datasets import load_wine
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import os
from scipy.stats import norm
from sklearn.naive_bayes import GaussianNB

In [849]:
# X, y = load_wine(return_X_y=True, as_frame=True)

In [850]:
data = pd.read_csv(os.path.join('data', 'computers.csv'))

In [851]:
for col in data:
    data[col] = data[col].str.title()

In [852]:
X = data.copy()
y = X.pop('BUY')

In [853]:
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [906]:
class BayesClassifier:
    def __init__(self, classes):
        self.classes = classes
        self.numeric_probs = pd.DataFrame()
        var_class = []
        stats = []
        for class_name in self.classes:
            var_class.append(class_name)
            var_class.append(class_name)
            stats.append('mean')
            stats.append('std')
        self.numeric_probs['class_name'] = var_class
        self.numeric_probs['stats'] = stats


    def fit(self, X, y):
        """ receives a pandas dataframe """

        # calculate classes probabilities
        self.class_probs = pd.DataFrame(y.value_counts() / len(y)).T

        # divides the feature in numeric and categorical
        self.numeric_features = X.select_dtypes(include=np.number).columns.to_list()
        self.categorical_features = [feature for feature in X.columns if feature not in self.numeric_features]
        X = X.copy()
        X['target'] = y

        # for numeric features
        for feature in self.numeric_features:
            self.numeric_probs[feature] = 0.0
            for class_name in self.classes:
                self.numeric_probs.loc[
                    (self.numeric_probs.class_name == class_name) & 
                    (self.numeric_probs.stats == 'mean' ), feature
                    ] = X.loc[X.target == class_name, feature].mean()
                # get the stds
                self.numeric_probs.loc[
                    (self.numeric_probs.class_name == class_name) & 
                    (self.numeric_probs.stats == 'std' ), feature
                    ] = X.loc[X.target == class_name, feature].std()

        # for categorical features
        self.classes_counts = pd.DataFrame(y.value_counts()).T
        self.categorical_probs = {}

        for feature in X.columns:
            probs = pd.DataFrame(index=self.classes)
            for value in X[feature].unique():
                probs[value] = 0.000001
                for class_name in self.classes:
                    conditional_prob = len(X.loc[(y == class_name) & (X[feature] == value)]) / self.classes_counts[class_name].values[0]
                    probs.loc[class_name, value] = conditional_prob
            self.categorical_probs[feature] = probs



    def _predict_row(self, X):  
        predicted_probs = pd.DataFrame()
        for class_name in self.classes:
            prob = self.class_probs[class_name].values[0]
            # numeric features
            for feature in self.numeric_features:
                mean = self.numeric_probs.loc[(self.numeric_probs.class_name == class_name) & 
                                              (self.numeric_probs.stats == 'mean'), feature]
                std = self.numeric_probs.loc[(self.numeric_probs.class_name == class_name) & 
                                              (self.numeric_probs.stats == 'std'), feature]
                prob *= norm.pdf(X[feature], mean, std)
            # categorical features
            for feature in self.categorical_features:
                aux_probs = self.categorical_probs[feature][X[feature]].T
                print(aux_probs[class_name].values)
                prob *= aux_probs[class_name].values

            predicted_probs[class_name] = prob
        
        print(predicted_probs)
        # softmax normalization
        self.predicted_probs = predicted_probs.apply(self.softmax_norm, axis=1)
        return predicted_probs

    def softmax_norm(self, X):
        return X / X.sum()
        
    def predict_proba(self, X):
        return self._predict_row(X)

    def predict(self, X):
        return self._predict_row(X).idxmax(axis=1)


In [907]:
classifier = BayesClassifier(list(y.unique()))

In [908]:
classifier.fit(x_train, y_train)

In [857]:
test = pd.DataFrame()
test['age'] = ['Youth']
test['income'] = ['Medium']
test['student'] = ['Yes']
test['credit'] = ['Fair']

In [858]:
test

Unnamed: 0,age,income,student,credit
0,Youth,Medium,Yes,Fair


In [909]:
preds_bayes = classifier.predict(x_test)

[0.33333333 0.33333333 0.33333333 0.33333333]
[0.33333333 0.33333333 0.33333333 0.33333333]
[0.66666667 0.33333333 0.66666667 0.33333333]
[0. 1. 0. 0.]
[0. 0. 0. 0.]
[0.42857143 0.42857143 0.28571429 0.28571429]
[0.42857143 0.57142857 0.42857143 0.57142857]
[0.71428571 0.28571429 0.71428571 0.71428571]
         No  Yes
0  0.000000  0.0
1  0.011111  0.0
2  0.000000  0.0
3  0.000000  0.0


In [910]:
probs_predict = classifier.predict_proba(x_test)

[0.33333333 0.33333333 0.33333333 0.33333333]
[0.33333333 0.33333333 0.33333333 0.33333333]
[0.66666667 0.33333333 0.66666667 0.33333333]
[0. 1. 0. 0.]
[0. 0. 0. 0.]
[0.42857143 0.42857143 0.28571429 0.28571429]
[0.42857143 0.57142857 0.42857143 0.57142857]
[0.71428571 0.28571429 0.71428571 0.71428571]
         No  Yes
0  0.000000  0.0
1  0.011111  0.0
2  0.000000  0.0
3  0.000000  0.0


In [911]:
x_train

Unnamed: 0,age,income,student,credit
6,Middle-Aged,Low,Yes,Excellent
4,Senior,Low,Yes,Fair
3,Senior,Medium,No,Fair
5,Senior,Low,Yes,Excellent
13,Senior,Medium,No,Excellent
11,Middle-Aged,Medium,No,Excellent
2,Middle-Aged,High,No,Fair
12,Middle-Aged,High,Yes,Fair
9,Senior,Medium,Yes,Fair
1,Youth,High,No,Excellent


In [912]:
y_train

6     Yes
4     Yes
3     Yes
5      No
13     No
11    Yes
2     Yes
12    Yes
9     Yes
1      No
Name: BUY, dtype: object

In [913]:
x_test

Unnamed: 0,age,income,student,credit
7,Youth,Medium,No,Fair
10,Youth,Medium,Yes,Excellent
0,Youth,High,No,Fair
8,Youth,Low,Yes,Fair


In [891]:
classifier.categorical_probs

{'age':      Middle-Aged    Senior     Youth
 No      0.000000  0.666667  0.333333
 Yes     0.571429  0.428571  0.000000,
 'income':           Low    Medium      High
 No   0.333333  0.333333  0.333333
 Yes  0.285714  0.428571  0.285714,
 'student':           Yes        No
 No   0.333333  0.666667
 Yes  0.571429  0.428571,
 'credit':      Excellent      Fair
 No    1.000000  0.000000
 Yes   0.285714  0.714286,
 'target':      Yes   No
 No   0.0  1.0
 Yes  1.0  0.0}

In [872]:
classifier.class_probs

BUY,Yes,No
count,0.7,0.3


In [866]:
classifier.numeric_probs

Unnamed: 0,class_name,stats
0,No,mean
1,No,std
2,Yes,mean
3,Yes,std


In [914]:
preds_bayes

0    No
1    No
2    No
3    No
dtype: object