### Install Required Packages

In [1]:
! pip install numpy pandas sklearn



### Imports

In [2]:
import pandas as pd
import numpy as np
import pprint

from sklearn.metrics import classification_report
from collections import defaultdict

### Preprocess methods

In [3]:
def remove_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Removes missing/corrupted values from a dataframe
    :param df: The given dataframe
    :return: The cleaned up dataframe
    """
    df = df.replace(r'\?', np.nan, regex=True)
    return df.dropna()


def binning(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Transforms string categorical feature values to integers
    :param df: The dataframe to process
    :param columns: The columns that contain categorical values
    :return: The normalized dataframe
    """
    df[columns] = df[columns].apply(lambda x: pd.factorize(x)[0])
    return df

### The Naive Bayes model

This Naive Bayes model accepts datasets that have mixed feature type, categorical & continuous. The model's engine
estimates the prior probabilities $\pi_k = p(C_k)$ for each given class & the densities
$f(\textbf{x})_k = p(\textbf{x}|C_k)$ of each feature. For the categorical attributes the densities are yielded by
dividing the count of each discrete value of the feature to the number of items that correspond to each class and for
continuous attributes the densities are extracted by calculating Gaussian Distributions, estimating mu and sigma
parameters. The parameters estimations are based to MLE (_Maximum Likelihood Estimation_).

In order to predict, initially we estimate the posterior probability of the given data point by applying the following:
$ \hat{\textbf{P}_k} = \log \left( p(C_k)\prod_{i=1}^{n}(p(\textbf{x}_{i}|C_k)) \right)$

Finally, the predicted class will be extracted by applying: $\underset{k\in {1,...,K}} {argmax} \hat{\textbf{P}_k}$

In [4]:
class CustomNaiveBayes:
    def __init__(self):
        self._labels = list()
        self._priors = dict()
        self._categorical_cols = list()
        self._continuous_cols = list()
        self._continuous_params = defaultdict(lambda: defaultdict(dict))
        self._categorical_params = defaultdict(lambda: defaultdict(dict))

    def __estimate_priors(self, Y: pd.DataFrame) -> None:
        """Estimates the prior probabilities for each class
        :param Y: The train's dataset labels
        :return: None
        """
        y_np = Y.to_numpy()
        labels, counts = np.unique(y_np, return_counts=True)
        labels = dict(zip(labels, counts))
        self._labels = labels.keys()
        for label, count in labels.items():
            self._priors.update({label: count / y_np.shape[0]})

    def __parameter_estimation(self, X: pd.DataFrame, Y: pd.DataFrame) -> None:
        """Main method that estimates the density parameters, based on the type of each
        feature (categorical or continuous)
        :param X: The train data set points
        :param Y: The train's dataset labels
        :return: None
        """
        y_np = Y.to_numpy()
        labels_w_indexes = dict()
        for label in self._labels:
            indexes = np.where(y_np == label)
            labels_w_indexes.update({label: indexes})

        for column in X:
            col_np = X[column].to_numpy()
            if column in self._categorical_cols:
                self.__categorical_parameter_estimation(col_np, labels_w_indexes, column)
            elif column in self._continuous_cols:
                self.__continuous_parameter_estimation(col_np, labels_w_indexes, column)

    def __categorical_parameter_estimation(self, x_np: np.array, labels_w_indexes: dict, column_name: str) -> None:
        """Estimates the density parameters for categorical features
        :param x_np: 1-D Numpy array that contains an entire feature column from the train dataframe
        :param labels_w_indexes: Dictionary that contains the label associated with the index of each data point
        :param column_name: The name of the feature
        :return: None
        """
        for label, indexes in labels_w_indexes.items():
            x_np_split = np.take(x_np, indexes[0])
            discrete_categorical_val, counts = np.unique(x_np_split, return_counts=True)
            idx = 0
            for val in discrete_categorical_val:
                self._categorical_params[column_name][label][val] = counts[idx] / x_np_split.shape[0]
                idx += 1

    def __continuous_parameter_estimation(self, x_np: np.array, labels_w_indexes: dict, column_name: str) -> None:
        """Estimates the density parameters for continuous features
        :param x_np: 1-D Numpy array that contains an entire feature column from the train dataframe
        :param labels_w_indexes: Dictionary that contains the label associated with the index of each data point
        :param column_name: The name of the feature
        :return: None
        """
        for label, indexes in labels_w_indexes.items():
            x_np_split = np.take(x_np, indexes[0])
            mu = np.mean(x_np_split)
            sigma = np.std(x_np_split)
            self._continuous_params[column_name][label]['mu'] = mu
            self._continuous_params[column_name][label]['sigma'] = sigma

    def __log_posterior(self, data_point: pd.Series, label: str):
        """Calculates the log posterior probability of a data point according to a label/class
        :param data_point: The data point values
        :param label: The class
        :return: The log posterior probability
        """
        densities = np.array([])
        for col_name, value in data_point.items():
            try:
                density = None
                if col_name in self._categorical_cols:
                    density = self._categorical_params[col_name][label][value]
                elif col_name in self._continuous_cols:
                    density = self.__calculate_normal_dist(data_point[col_name],
                                                           self._continuous_params[col_name][label]['mu'],
                                                           self._continuous_params[col_name][label]['sigma'])
                if density is not None:
                    densities = np.append(densities, density)
            except KeyError:
                continue

        return np.log10(np.prod(densities) * self._priors[label])

    @staticmethod
    def __calculate_normal_dist(x: float, mu: float, sigma: float) -> float:
        """Calculates the normal distribution's value for a test data point given the values of continuous attributes
        :param x: The value of the continuous attribute of the test data point
        :param mu: The mu factor of the trained Gaussian Distribution
        :param sigma: The sigma factor of the trained Gaussian Distribution
        :return: The calculated value/probability
        """
        return (2. * np.pi * sigma ** 2.) ** -.5 * np.exp(-.5 * (x - mu) ** 2. / sigma ** 2.)
    

    @staticmethod
    def __pretty_print(nested_dict: defaultdict, file) -> None:
        """Writes to a file a dict that contains the model's parameters
        :param nested_dict: The dict to print
        :return: None
        """
        def to_dict(d: defaultdict) -> dict:
            """Converts a defaultdict to dict
            :param d: The defaultdict
            :return: The dict
            """
            if not isinstance(d, dict):
                return d
            return {k: to_dict(v) for k, v in d.items()}
        
        pprint.pprint(to_dict(nested_dict), stream=file)

    def __print_parameters(self) -> None:
        """Print the parameters of the trained model
        :return: None
        """
        with open('nb_parameters.txt', 'w') as f:
            f.write(f'Priors\n{self._priors}\n')
            f.write('\nCategorical Parameters\n')
            self.__pretty_print(self._categorical_params, f)
            f.write('\nContinuous Parameters\n')
            self.__pretty_print(self._continuous_params, f)


    def fit(self, X: pd.DataFrame, Y: pd.DataFrame, categorical_cols: list, continuous_cols: list) -> None:
        """Fit/train method for a Gaussian model with mixed categorical & continuous attributes
        :param X: The train data set points
        :param Y: The train's dataset labels
        :param categorical_cols: The column names that are categorical features
        :param continuous_cols: The column names that are continuous features
        :return: None
        """
        self._categorical_cols = categorical_cols
        self._continuous_cols = continuous_cols
        self.__estimate_priors(Y)
        self.__parameter_estimation(X, Y)
        self.__print_parameters()

    def predict(self, X: pd.DataFrame) -> list:
        """Predicts a test set labels
        :param X: The data points to predict
        :return: List with the predicted labels
        """
        predictions = []
        for i, row in X.iterrows():
            extracted_probabilities = list()
            for label in self._labels:
                extracted_probabilities.append((label, self.__log_posterior(row, label)))

            predicted_class = max(extracted_probabilities, key=lambda item: item[1])[0]
            predictions.append(predicted_class)

        return predictions

### Read data and train model

In [5]:
train_df = pd.read_csv('dataset/adult.data')
train_df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
              'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
              'label']

categorical_cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex',
                    'native-country', 'label']
continuous_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

train_df = remove_missing_values(train_df)
train_df = binning(train_df, categorical_cols)
print(train_df)

train_X = train_df.loc[:, train_df.columns != 'label']
train_Y = train_df['label']

naive_bayes = CustomNaiveBayes()
priors = naive_bayes.fit(train_X, train_Y, categorical_cols=categorical_cols, continuous_cols=continuous_cols)

       age  workclass  fnlwgt  education  education-num  marital-status  \
0       50          0   83311          0             13               0   
1       38          1  215646          1              9               1   
2       53          1  234721          2              7               0   
3       28          1  338409          0             13               0   
4       37          1  284582          3             14               0   
...    ...        ...     ...        ...            ...             ...   
32555   27          1  257302          6             12               0   
32556   40          1  154374          1              9               0   
32557   58          1  151910          1              9               6   
32558   22          1  201490          1              9               3   
32559   52          5  287927          1              9               0   

       occupation  relationship  race  sex  capital-gain  capital-loss  \
0               0        

### Predict labels

In [6]:
test_df = pd.read_csv('dataset/adult.test')
test_df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                   'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                   'label']
test_df = remove_missing_values(test_df)
test_df = binning(test_df, categorical_cols)

test_X = test_df.loc[:, test_df.columns != 'label']
test_Y = test_df['label']

print('Predicting Train dataset data-points')
results = naive_bayes.predict(train_X)
print(classification_report(train_Y, results))

print('Predicting Test dataset data-points')
results = naive_bayes.predict(test_X)
print(classification_report(test_Y, results))

with open('predictions.txt', 'w') as f:
    f.write('\n'.join(map(str, results)))

Predicting Train dataset data-points


  return np.log10(np.prod(densities) * self._priors[label])


              precision    recall  f1-score   support

           0       0.85      0.93      0.89     22653
           1       0.71      0.51      0.60      7508

    accuracy                           0.83     30161
   macro avg       0.78      0.72      0.74     30161
weighted avg       0.82      0.83      0.82     30161

Predicting Test dataset data-points


  return np.log10(np.prod(densities) * self._priors[label])


              precision    recall  f1-score   support

           0       0.81      0.93      0.87     11359
           1       0.62      0.35      0.44      3700

    accuracy                           0.79     15059
   macro avg       0.72      0.64      0.66     15059
weighted avg       0.77      0.79      0.76     15059

