# HW-4 Naive Bayes
Wyatt Blair

SID: 10420296

3/10/24


In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 

## Apply same pre-processing as in HW-2-EDA

#### Note: I am using the median to fill NaN values here in order to keep the data categorical

In [2]:
data = pd.read_csv('../data/breast-cancer-wisconsin.csv')

# "?" entries are turned into NaN so that F6 can be cast to a numeric dtype.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
data = data.replace("?", np.nan)
data['F6'] = data['F6'].astype(float)

# Fill the remaining NaNs with each column's median. The result is assigned
# back to the frame: `data[column].fillna(..., inplace=True)` operates on a
# temporary Series and is not guaranteed to modify `data` (it does not under
# pandas copy-on-write).
for column in data:
    data[column] = data[column].fillna(data[column].median())

## Split dataset into train (70%) and test (30%)

In [3]:
# A fixed seed makes the 70/30 split, and every metric computed from it,
# reproducible under Restart Kernel -> Run All.
train = data.sample(frac=0.7, random_state=42)
# The test set is every row that was not sampled into the training set.
test = data.drop(index=train.index)

# Implement Naive-Bayes

$P(y | a) = \frac{P(a | y)P(y)}{P(a)}$

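$P(y | x_1, ..., x_j) = \frac{P(x_1, ..., x_j | y)P(y)}{P(x_1, ..., x_j)}$

The "naive" assumption is that the features are conditionally independent given the class, so the likelihood factorizes: $P(x_1, ..., x_j | y) = \prod_{i=1}^{j} P(x_i | y)$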

___

$a = \{x_1, ..., x_j\}$ : Data-Point

$y$ : Class

$P(y | a)$ : Posterior Probability

$P(a | y)$ : Likelihood of Features

$P(y)$ : Prior Probability

$P(a)$ : Marginal Probability

___



In [4]:
class NaiveBayes:
    """Categorical Naive Bayes classifier built from relative-frequency tables.

    The training frame must contain a ``'Class'`` column holding the labels;
    every column whose name starts with ``'F'`` is treated as a categorical
    feature. All probabilities are plain relative frequencies (no smoothing),
    so a feature value never observed for a class has likelihood 0.

    Attributes:
        data: The labelled training frame passed to the constructor.
        features: Names of the feature columns (those starting with ``'F'``).
        probabilities: Dict with three entries:
            ``'priors'``     - Series of P(y), indexed by class label.
            ``'marginal'``   - DataFrame of P(x_i = v), rows are the observed
                               feature values, columns are the features.
            ``'likelihood'`` - DataFrame of P(x_i = v | y) with a
                               (Class, Feature Values) MultiIndex.
    """

    def __init__(self, labelled_data):
        """Store the training data and pre-compute all probability tables."""

        self.data = labelled_data
        self.features = [col for col in labelled_data.columns if col.startswith('F')]
        self.probabilities = self.calculate_probabilities()

    @staticmethod
    def _frequency_table(frame, features, feature_values):
        """Return the relative frequency of each feature value, per feature.

        Rows are ``feature_values`` and columns are ``features``; values that
        never occur in ``frame`` for a given feature get probability 0.
        """

        num_rows = len(frame.index)
        columns = {
            feature: (frame[feature].value_counts() / num_rows).reindex(
                feature_values, fill_value=0.0
            )
            for feature in features
        }
        return pd.DataFrame(columns, index=feature_values, columns=features, dtype=float)

    def calculate_probabilities(self):
        """Build the prior, marginal and likelihood tables from ``self.data``.

        Returns:
            dict: ``{'priors': Series, 'marginal': DataFrame,
            'likelihood': DataFrame}`` as described on the class.
        """

        data = self.data

        num_data_points = len(data.index)
        features = [col for col in data.columns if col.startswith('F')]

        # Every distinct (non-missing) value that occurs in any feature column
        # of the training data itself; the table sizes follow from this rather
        # than from a hard-coded count or a notebook-level variable.
        observed = data[features].to_numpy().ravel()
        observed = observed[~pd.isna(observed)]
        feature_values = pd.Index(np.unique(observed), name='Feature Values')

        # P(y)
        priors = data['Class'].value_counts() / num_data_points

        # P(x_i = v) for every feature i and observed value v
        marginal_probability_table = self._frequency_table(data, features, feature_values)

        # P(x_i = v | y): one frequency table per class, stacked under a
        # (Class, Feature Values) MultiIndex.
        per_class_tables = {
            y: self._frequency_table(data[data['Class'] == y], features, feature_values)
            for y in data['Class'].unique()
        }
        if per_class_tables:
            likelihood_table = pd.concat(
                per_class_tables, names=['Class', 'Feature Values']
            ).sort_index()
        else:
            # No training rows: keep the documented table layout, just empty.
            empty_index = pd.MultiIndex.from_arrays(
                [[], []], names=['Class', 'Feature Values']
            )
            likelihood_table = pd.DataFrame(index=empty_index, columns=features, dtype=float)

        return {
            'priors': priors,
            'marginal': marginal_probability_table,
            'likelihood': likelihood_table,
        }

    def get_prior(self, y):
        """Return P(y), the relative frequency of class ``y`` in the training data."""

        return self.probabilities['priors'].loc[y]

    def get_marginal(self, x):
        """Return the product over all features of P(x_i), ignoring the class.

        Args:
            x: A row (pandas Series) containing at least the feature columns.

        Returns:
            The product of the per-feature marginal probabilities; a value that
            was never observed during training contributes probability 0.
        """

        marginal_table = self.probabilities['marginal']

        marginals = [
            marginal_table[feat].get(x[feat], 0.0)
            for feat in marginal_table.columns
        ]
        return np.prod(marginals)

    def get_likelihood(self, x, y):
        """Return the product over all features of P(x_i | y).

        Args:
            x: A row (pandas Series) containing at least the feature columns.
            y: A class label present in the training data.

        Returns:
            The product of the per-feature conditional probabilities; a value
            never observed for class ``y`` contributes probability 0.
        """

        # Selecting the outer MultiIndex level leaves a table indexed by
        # feature value for this class only.
        class_table = self.probabilities['likelihood'].loc[y]

        likelihoods = [
            class_table[feat].get(x[feat], 0.0)
            for feat in class_table.columns
        ]
        return np.prod(likelihoods)

    def predict(self, x):
        """Return the class label with the highest posterior score for row ``x``.

        The score is ``likelihood * prior / marginal``. The marginal does not
        depend on the class, so it is computed once. When it is 0 (``x``
        contains a value never seen in training) the division is skipped to
        avoid NaN; every score is then 0 and the first class in the priors
        table is returned, as for any other tie.
        """

        marginal = self.get_marginal(x)

        posterior_probabilities = {}
        for y in self.probabilities['priors'].index:

            # P(y | x1, x2, ...) = P(x1, x2, ... | y) * P(y) / P(x1, x2, ...)
            score = self.get_likelihood(x, y) * self.get_prior(y)
            if marginal > 0:
                score = score / marginal
            posterior_probabilities[y] = score

        return max(posterior_probabilities, key=posterior_probabilities.get)

    def __call__(self, x):
        """Alias for :meth:`predict`, so an instance can be passed to ``DataFrame.apply``."""
        return self.predict(x)

In [5]:
# Build the classifier from the training split; all probability tables are
# computed once, inside the constructor.
nb_model = NaiveBayes(train)

In [6]:
%%timeit

# NOTE: %%timeit executes this cell body several times. The two columns still
# persist afterwards because they are written into the existing `test` frame
# rather than bound to a new name.
# Classify every test row, then flag the rows whose prediction matches the label.
test['prediction'] = test.apply(nb_model, axis=1)
test['correct'] = test['Class'] == test['prediction']

354 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# Preview the first ten test rows: true class, predicted class, and whether they agree.
test[['Sample', 'Class', 'prediction', 'correct']].head(10)

Unnamed: 0,Sample,Class,prediction,correct
2,1015425,2,2,True
8,1033078,2,4,False
10,1035283,2,2,True
13,1043999,2,2,True
14,1044572,4,4,True
15,1047630,4,4,True
27,1066979,2,2,True
28,1067444,2,2,True
32,1072179,4,4,True
33,1074610,2,2,True


In [8]:
# Tally of correct vs. incorrect predictions, kept for inspection.
counts = test['correct'].value_counts()

# Accuracy is the fraction of True values in the boolean column. Taking the
# mean avoids the KeyError that `counts[True]` raises when no prediction is
# correct.
accuracy = test['correct'].mean()

print('Model Accuracy: %1.2f%%' % (100 * accuracy))

Model Accuracy: 98.10%
