# Naive Bayes and Decision Trees

In [1]:
# Import the libraries
from math import log

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Naive Bayes Classification

## Data Preprocessing

Let's first load the training dataset into a pandas dataframe

In [2]:
df = pd.read_csv('car_train.csv')
df.head()

Unnamed: 0,Buying Price,Maintenance Price,Number of Doors,Capacity,Size of Luggage Boot,Estimated Safety,Decision
0,low,low,3,2,small,high,unacc
1,low,high,4,2,big,high,unacc
2,vhigh,low,3,4,small,high,acc
3,vhigh,med,5more,more,small,med,unacc
4,low,vhigh,4,more,med,high,acc


Here we have 6 different features and a `Decision` target variable.

So every single feature here is a categorical feature. Let's try to see which all are the unique values in each of these features.

In [3]:
df.copy().apply(lambda x: x.unique())

Buying Price              [low, vhigh, high, med]
Maintenance Price         [low, high, med, vhigh]
Number of Doors                  [3, 4, 5more, 2]
Capacity                             [2, 4, more]
Size of Luggage Boot            [small, big, med]
Estimated Safety                 [high, med, low]
Decision                [unacc, acc, vgood, good]
dtype: object

In [4]:
# We'll convert this to a dictionary for later usage
unique_values = df.copy().apply(lambda x: x.unique()).to_dict()
unique_values = {k: v.tolist() for k, v in unique_values.items()}

In [5]:
# Following is the form of the dictionary
unique_values

{'Buying Price': ['low', 'vhigh', 'high', 'med'],
 'Maintenance Price': ['low', 'high', 'med', 'vhigh'],
 'Number of Doors': ['3', '4', '5more', '2'],
 'Capacity': ['2', '4', 'more'],
 'Size of Luggage Boot': ['small', 'big', 'med'],
 'Estimated Safety': ['high', 'med', 'low'],
 'Decision': ['unacc', 'acc', 'vgood', 'good']}

## Calculate the Prior and Likelihood values

In [6]:
# You can do this in a single line

prior_list = df['Decision'].value_counts(normalize=True).tolist()

In [7]:
# Let's convert this prior list to a pandas dataframe
prior_df = pd.DataFrame(prior_list, index=['acc', 'unacc', 'good', 'vgood'], columns=['Prior'])
prior_df

Unnamed: 0,Prior
acc,0.70239
unacc,0.222303
good,0.039826
vgood,0.035482


### **Likelihood Probabilities $P(X|Y)$**

In [8]:
likelihood_feature_1 = df.groupby(['Decision', 'Buying Price']).size().unstack().fillna(0).apply(lambda x: x / x.sum(), axis=1)
likelihood_feature_2 = df.groupby(['Decision', 'Maintenance Price']).size().unstack().fillna(0).apply(lambda x: x / x.sum(), axis=1)
likelihood_feature_3 = df.groupby(['Decision', 'Number of Doors']).size().unstack().fillna(0).apply(lambda x: x / x.sum(), axis=1)
likelihood_feature_4 = df.groupby(['Decision', 'Capacity']).size().unstack().fillna(0).apply(lambda x: x / x.sum(), axis=1)
likelihood_feature_5 = df.groupby(['Decision', 'Size of Luggage Boot']).size().unstack().fillna(0).apply(lambda x: x / x.sum(), axis=1)
likelihood_feature_6 = df.groupby(['Decision', 'Estimated Safety']).size().unstack().fillna(0).apply(lambda x: x / x.sum(), axis=1)

## Prediction on test set

### **Calculation of Posterior Probabilities $P(Y|X)$**

In [9]:
# Let's store the log prior

features = ['low', 'vhigh', '2', 'more', 'med', 'high']

log_prior = log(prior_df.loc['unacc', 'Prior'])

log_likelihood = 0

for i, feature in enumerate(features):
    log_likelihood += log(eval(f'likelihood_feature_{i+1}').loc['unacc', feature] + 1e-9) # Add the 1e-9 to avoid taking log(0)

log_posterior = log_prior + log_likelihood

print(f'The Log Posterior for class `unacc` is: {log_posterior:.3f}')



The Log Posterior for class `unacc` is: -9.580


In [10]:
def get_log_posterior(class_name: str, features: list):
    """Returns the log posterior for the class name given the features
    """

    log_prior = log(prior_df.loc[class_name, 'Prior'])

    log_likelihood = 0

    for i, feature in enumerate(features):
        log_likelihood += log(eval(f'likelihood_feature_{i+1}').loc[class_name, feature] + 1e-9) # Add the 1e-9 to avoid taking log(0)

    log_posterior = log_prior + log_likelihood

    return log_posterior

Based on the log-posterior, predict what the decision is

In [11]:
log_posterior_unacc = get_log_posterior('unacc', features)
log_posterior_acc = get_log_posterior('acc', features)
log_posterior_good = get_log_posterior('good', features)
log_posterior_vgood = get_log_posterior('vgood', features)

print(f'The Log Posterior for class `unacc` is: {log_posterior_unacc:.3f}')
print(f'The Log Posterior for class `acc` is: {log_posterior_acc:.3f}')
print(f'The Log Posterior for class `good` is: {log_posterior_good:.3f}')
print(f'The Log Posterior for class `vgood` is: {log_posterior_vgood:.3f}')

max_posterior = max(log_posterior_unacc, log_posterior_acc, log_posterior_good, log_posterior_vgood)

if max_posterior == log_posterior_unacc:
    pred = 'unacc'
elif max_posterior == log_posterior_acc:
    pred = 'acc'
elif max_posterior == log_posterior_good:
    pred = 'good'
else:
    pred = 'vgood'

print(f'The predicted class is: {pred}')

The Log Posterior for class `unacc` is: -9.580
The Log Posterior for class `acc` is: -7.718
The Log Posterior for class `good` is: -28.705
The Log Posterior for class `vgood` is: -27.708
The predicted class is: acc


### Prediction with Test set

In [12]:
df_test = pd.read_csv('car_test.csv')
df_test.head()

Unnamed: 0,Buying Price,Maintenance Price,Number of Doors,Capacity,Size of Luggage Boot,Estimated Safety,Decision
0,low,vhigh,2,more,med,high,acc
1,vhigh,high,2,4,big,high,unacc
2,high,med,2,2,small,med,unacc
3,vhigh,med,3,2,big,med,unacc
4,low,med,5more,2,big,low,unacc


Split the dataframe into features and target

In [13]:
df_x = df_test.drop('Decision', axis=1)
df_y = df_test['Decision']

Now using the features in `df_x`, predict the `Decision` using in the same way as above

In [14]:
predictions = []

for _, row in df_x.iterrows():
    features = list(row)
    
    class_names = ['unacc', 'acc', 'good', 'vgood']
    log_posterior_probs = [get_log_posterior(class_name, features) for class_name in class_names]

    prediction = class_names[np.argmax(log_posterior_probs)]
    predictions.append(prediction)

In [15]:
true_values = list(df_y)

correct = 0
for pred, true_val in zip(predictions, true_values):
    if pred == true_val:
        correct += 1

accuracy = correct / len(true_values)

print(f'Test Accuracy using Naive Bayes Classifier is: {accuracy:.4f}')

Test Accuracy using Naive Bayes Classifier is: 0.7914
