In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder
from ucimlrepo import fetch_ucirepo

### Get the data.

In [2]:
# Fetch the UCI dataset with id 19 (the "Car Evaluation" dataset, per its metadata).
car_evaluation = fetch_ucirepo(id=19)

# Separate the feature columns from the target column.
X, y = car_evaluation.data.features, car_evaluation.data.targets

In [3]:
# Dump the dataset's metadata record (name, size, feature types, citation info, ...).
print(car_evaluation.metadata)

{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

In [4]:
# List every variable with its role (feature/target), type and description.
print(car_evaluation.variables)

       name     role         type demographic  \
0    buying  Feature  Categorical        None   
1     maint  Feature  Categorical        None   
2     doors  Feature  Categorical        None   
3   persons  Feature  Categorical        None   
4  lug_boot  Feature  Categorical        None   
5    safety  Feature  Categorical        None   
6     class   Target  Categorical        None   

                                         description units missing_values  
0                                       buying price  None             no  
1                           price of the maintenance  None             no  
2                                    number of doors  None             no  
3              capacity in terms of persons to carry  None             no  
4                           the size of luggage boot  None             no  
5                        estimated safety of the car  None             no  
6  evaulation level (unacceptable, acceptable, go...  None             no  

In [5]:
# Count the samples per target class to see how balanced the classes are.
y.value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

### Split and preprocess the data.

In [6]:
# Shuffle and hold out 20% of the rows for testing; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=100,
    shuffle=True,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1382, 6), (346, 6), (1382, 1), (346, 1))

In [7]:
# Learn the category -> integer-code mapping from the training split only,
# then apply that same mapping to both splits.
encoder = OrdinalEncoder()
encoder.fit(X_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

In [8]:
# Flatten the single-column target frames, shape (n, 1), into 1-D label arrays.
y_train, y_test = y_train.values.ravel(), y_test.values.ravel()

### Let's check the predictions using sklearn's Naive Bayes Classifier.

In [9]:
# Parameters were adjusted to match the custom implementation:
# - alpha is made tiny so the estimates are (almost) unsmoothed relative
#   frequencies, like the custom model's.
# - fit_prior=True learns the class priors from the training labels.
# NOTE(review): per the sklearn docs force_alpha only matters when alpha < 1e-10,
# so it should have no effect at alpha=1e-6 - confirm.
naive_bayes_sklearn = CategoricalNB(alpha=0.000001, force_alpha=True, fit_prior=True)
naive_bayes_sklearn.fit(X_train, y_train)

In [10]:
# Evaluate the sklearn baseline on the held-out test split.
# (classification_report is imported in the notebook's top import cell.)
pred_sklearn = naive_bayes_sklearn.predict(X_test)
print(classification_report(y_test, pred_sklearn))

              precision    recall  f1-score   support

         acc       0.64      0.75      0.69        72
        good       0.67      0.24      0.35        17
       unacc       0.94      0.96      0.95       248
       vgood       1.00      0.11      0.20         9

    accuracy                           0.86       346
   macro avg       0.81      0.52      0.55       346
weighted avg       0.86      0.86      0.85       346



In [34]:
# Inspect the learned log P(feature value | class) tables: one array per feature,
# with one row per class and one column per category.
naive_bayes_sklearn.feature_log_prob_

[array([[ -1.32416258,  -1.37355534,  -1.1891263 ,  -1.73567   ],
        [-17.76675435,  -0.48550786,  -0.95551147, -17.76675435],
        [ -1.31218639,  -1.54600447,  -1.51242818,  -1.21302264],
        [-17.84086232,  -0.47000367,  -0.98082928, -17.84086232]]),
 array([[ -1.33628394,  -1.43893809,  -1.22121461,  -1.5841201 ],
        [-17.76675435,  -0.36772483,  -1.17865501, -17.76675435],
        [ -1.35156155,  -1.51242818,  -1.52190692,  -1.19569118],
        [ -1.6274564 ,  -0.8064759 ,  -1.02961944, -17.84086232]]),
 array([[-1.48032331, -1.38629436, -1.33628394, -1.34855403],
        [-1.31218639, -1.31218639, -1.46633706, -1.46633706],
        [-1.30066995, -1.41369334, -1.42227708, -1.41369334],
        [-1.82812707, -1.46040233, -1.25276298, -1.13497995]]),
 array([[-19.55851376,  -0.66159883,  -0.72572335],
        [-17.76675433,  -0.58394791,  -0.81574952],
        [ -0.7686955 ,  -1.35156155,  -1.28176579],
        [-17.8408623 ,  -0.76725517,  -0.62415433]]),
 array([

In [31]:
# Convert the stored log priors back to plain class probabilities.
np.exp(naive_bayes_sklearn.class_log_prior_)

array([0.22575977, 0.03762663, 0.69609262, 0.04052098])

### Create the custom Naive Bayes Classifier model.

In [13]:
class NaiveBayesClassifier:
    """Naive Bayes classifier for categorical (discrete-valued) features.

    Class priors and per-class conditional probabilities are estimated as
    (optionally smoothed) relative frequencies of the training data.

    Parameters
    ----------
    alpha : float, default 0.0
        Additive (Laplace/Lidstone) smoothing strength. The default of 0.0
        yields plain relative frequencies.

    Attributes
    ----------
    P_classes : dict
        Maps class label -> prior probability P(class).
    P_features_given_class : dict
        Maps class label -> {(feature_index, value): P(value | class)}.
    X_features_unique_values : dict
        Maps feature_index -> sorted array of the values seen during ``fit``.
    y_dtype : numpy.dtype or None
        dtype of the training labels; reused for the prediction array.
    """

    def __init__(self, alpha=0.0):
        if alpha < 0:
            raise ValueError("alpha must be non-negative")
        self.alpha = alpha
        self.P_classes = {}
        self.P_features_given_class = {}
        self.X_features_unique_values = {}
        self.y_dtype = None

    def fit(self, X, y):
        """Estimate class priors and conditional probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Categorical feature matrix.
        y : array-like with n_samples entries
            Class labels; column-shaped input such as (n_samples, 1) is flattened.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.shape[0] != y.size:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.size} labels"
            )

        # Start from a clean slate so that refitting never keeps classes or
        # probabilities left over from a previous call.
        self.P_classes = {}
        self.P_features_given_class = {}
        self.y_dtype = y.dtype
        self.X_features_unique_values = {
            feature: np.unique(X[:, feature]) for feature in range(X.shape[1])
        }

        for label, count in zip(*np.unique(y, return_counts=True)):
            self.P_classes[label] = count / y.size
            # Never empty: every label returned by np.unique occurs in y.
            X_label_subset = X[y == label]
            P_feature_val_given_class = {}
            for feature, values in self.X_features_unique_values.items():
                column = X_label_subset[:, feature]
                denominator = count + self.alpha * len(values)
                for value in values:
                    matches = np.count_nonzero(column == value)
                    P_feature_val_given_class[feature, value] = (
                        (matches + self.alpha) / denominator
                    )
            self.P_features_given_class[label] = P_feature_val_given_class

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Scores are accumulated as sums of log-probabilities, so the result
        stays meaningful with many features (a raw product would underflow
        to 0.0 for every class). A feature value that was never seen during
        ``fit`` provides no evidence for any class and is ignored. Ties are
        resolved in favour of the first class in ``P_classes``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,) with the dtype of the training labels.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if not self.P_classes:
            raise ValueError("NaiveBayesClassifier is not fitted yet; call fit() first")
        X = np.asarray(X)

        class_labels = list(self.P_classes)
        features = list(self.X_features_unique_values)
        # Take the logs once per call; log(0) = -inf is the intended score for
        # an impossible combination, so silence the divide-by-zero warning.
        with np.errstate(divide="ignore"):
            log_priors = [np.log(self.P_classes[label]) for label in class_labels]
            log_tables = [
                {
                    key: np.log(prob)
                    for key, prob in self.P_features_given_class[label].items()
                }
                for label in class_labels
            ]

        predictions = np.empty(shape=(X.shape[0],), dtype=self.y_dtype)
        for i in range(X.shape[0]):
            obs = X[i]
            class_scores = []
            for log_prior, log_table in zip(log_priors, log_tables):
                score = log_prior
                for feature in features:
                    # Unseen values add 0.0 to every class, i.e. the feature
                    # is marginalised out.
                    score += log_table.get((feature, obs[feature]), 0.0)
                class_scores.append(score)
            predictions[i] = class_labels[np.argmax(class_scores)]
        return predictions

In [14]:
# Train the custom classifier on the same encoded training split as the sklearn model.
naive_bayes_custom = NaiveBayesClassifier()
naive_bayes_custom.fit(X_train, y_train)

In [15]:
# Evaluate the custom classifier on the held-out test split.
pred_custom = naive_bayes_custom.predict(X_test)
print(classification_report(y_test, pred_custom))

              precision    recall  f1-score   support

         acc       0.64      0.75      0.69        72
        good       0.67      0.24      0.35        17
       unacc       0.94      0.96      0.95       248
       vgood       1.00      0.11      0.20         9

    accuracy                           0.86       346
   macro avg       0.81      0.52      0.55       346
weighted avg       0.86      0.86      0.85       346



In [16]:
# Re-print the sklearn report directly below the custom one for side-by-side comparison.
print(classification_report(y_test, pred_sklearn))

              precision    recall  f1-score   support

         acc       0.64      0.75      0.69        72
        good       0.67      0.24      0.35        17
       unacc       0.94      0.96      0.95       248
       vgood       1.00      0.11      0.20         9

    accuracy                           0.86       346
   macro avg       0.81      0.52      0.55       346
weighted avg       0.86      0.86      0.85       346



In [17]:
# Fraction of test samples on which both models predict the same label (1.0 = full agreement).
(pred_custom == pred_sklearn).mean()

1.0

#### The predictions are the same for sklearn's model and our custom Naive Bayes model.

In [30]:
# Class priors estimated by the custom model.
naive_bayes_custom.P_classes

{'acc': 0.22575976845151954,
 'good': 0.03762662807525326,
 'unacc': 0.6960926193921853,
 'vgood': 0.04052098408104197}

In [32]:
# sklearn's class priors (exponentiated log priors), shown for comparison with the custom model's.
np.exp(naive_bayes_sklearn.class_log_prior_)

array([0.22575977, 0.03762663, 0.69609262, 0.04052098])

In [18]:
# Conditional probability tables of the custom model, keyed per class by (feature index, encoded value).
naive_bayes_custom.P_features_given_class

{'acc': {(0, 0.0): 0.266025641025641,
  (0, 1.0): 0.2532051282051282,
  (0, 2.0): 0.30448717948717946,
  (0, 3.0): 0.1762820512820513,
  (1, 0.0): 0.26282051282051283,
  (1, 1.0): 0.23717948717948717,
  (1, 2.0): 0.2948717948717949,
  (1, 3.0): 0.20512820512820512,
  (2, 0.0): 0.22756410256410256,
  (2, 1.0): 0.25,
  (2, 2.0): 0.26282051282051283,
  (2, 3.0): 0.25961538461538464,
  (3, 0.0): 0.0,
  (3, 1.0): 0.5160256410256411,
  (3, 2.0): 0.483974358974359,
  (4, 0.0): 0.3717948717948718,
  (4, 1.0): 0.3525641025641026,
  (4, 2.0): 0.27564102564102566,
  (5, 0.0): 0.532051282051282,
  (5, 1.0): 0.0,
  (5, 2.0): 0.46794871794871795},
 'good': {(0, 0.0): 0.0,
  (0, 1.0): 0.6153846153846154,
  (0, 2.0): 0.38461538461538464,
  (0, 3.0): 0.0,
  (1, 0.0): 0.0,
  (1, 1.0): 0.6923076923076923,
  (1, 2.0): 0.3076923076923077,
  (1, 3.0): 0.0,
  (2, 0.0): 0.2692307692307692,
  (2, 1.0): 0.2692307692307692,
  (2, 2.0): 0.23076923076923078,
  (2, 3.0): 0.23076923076923078,
  (3, 0.0): 0.0,
  (3, 