### Week 3: The Naive Bayes classifier - Bernoulli model

Instructor: Cornelia Ilin <br>
Email: cilin@ischool.berkeley.edu <br>

### Step 1: Import packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split

### Step 2: Define working directories

### Step 3: Define classes

### Step 4: Define functions

---
### Step 5: Read data
---

In [None]:
# Load the UCI Wine data set. The raw file has no header row (header=None),
# so the column names are assigned explicitly below.
df = pd.read_csv('https://archive.ics.uci.edu/'
                 'ml/machine-learning-databases/wine/wine.data',
                 header=None)

# First column is the class label (cultivar); the other 13 are measurements.
df.columns = ['class_label', 'alcohol', 'malic_acid', 'ash',
              'alcalinity_of_ash', 'magnesium', 'total_phenols',
              'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins',
              'color_intensity', 'hue', 'OD280/OD315_of_diluted_wines',
              'proline']

print('Shape of df wine:', df.shape)
print('Class labels:', df['class_label'].unique())
print()
df.head()

---
### Step 6: Data preprocessing (a.k.a., data cleaning)
---

#### Step 6.1 Data subsetting

Today we will work with all class labels but only four features: ['alcohol', 'malic_acid', 'ash', 'color_intensity'].

In [None]:
# Keep only the target column plus the four predictors used in this notebook.
labels = ['class_label']
features = ['alcohol', 'malic_acid', 'ash', 'color_intensity']
df = df.loc[:, labels + features]
df.head()

In [None]:
# Split the frame by position: the first column is the target (y),
# every remaining column is a predictor (X).
X, y = df.iloc[:, 1:], df.iloc[:, 0]

In [None]:
# Preview the first rows of the feature columns.
X.head()

In [None]:
# Preview the first rows of the class labels.
y.head()

Notice that the feature values are real-valued. Let's look at a histogram of each feature.

In [None]:
# A wide, short figure so that each of the 4 side-by-side panels is roughly square.
plt.figure(figsize=(15, 3))

# One panel per feature (subplot positions are 1-based), each showing a 10-bin histogram.
for panel, feature in enumerate(features, start=1):
    plt.subplot(1, 4, panel)
    X[feature].hist(bins=10, grid=False)
    plt.title(feature)

#### Step 6.2 Binarize feature values

To make things simple, let's binarize these feature values. That is, we'll treat each measurement as either "low" (0) or "high" (1). As the threshold for each feature we'll simply use that feature's mean: values at or above the mean become 1, values below it become 0.

In [None]:
# Binarize every feature against its own mean: 1 = "high" (>= mean), 0 = "low".
# X was sliced out of df, so assigning into it column by column can trigger
# pandas' SettingWithCopyWarning. Building a new frame with one vectorized
# comparison avoids that and gives the same 0/1 values.
X = (X[features] >= X[features].mean()).astype(int)
X.head()

#### Step 6.3 Split data into train and test sets

In [None]:
# Hold out 25% of the rows for testing, stratified on y, with a fixed seed
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, stratify=y
)

# Put the label and the features side by side again; the counting steps
# below work on this single training frame.
train_df = pd.concat([y_train, X_train], axis=1)
train_df.head()

In [None]:
# Show which class labels occur in the training split, then its dimensions.
print('Class labels:', train_df['class_label'].unique())
train_df.shape

#### Step 6.4 Compute stats needed for Bayes' probabilities

##### Step 6.4.1 Compute feature = 0 frequency (apply count() by column)

needed to compute conditional probabilities

In [None]:
# compute feature = 0 frequency
# For every class, count the training rows in which each feature equals 0.
# .where() blanks out (NaN) every cell that is not 0, and groupby().count()
# counts the remaining non-missing cells per class. Doing this for all
# features at once avoids growing a DataFrame with concat inside a loop, and
# a class with no matching rows gets an explicit 0 rather than going missing.
is_zero = train_df[features].eq(0)
feature0_count = (
    train_df[features]
    .where(is_zero)
    .groupby(train_df['class_label'])
    .count()
)

feature0_count

##### Step 6.4.2 Compute feature = 1 frequency (apply count() by column)

Note that it suffices to compute only the feature = 0 frequency. Why?

In [None]:
# compute feature = 1 frequency
# For every class, count the training rows in which each feature equals 1.
# .where() blanks out (NaN) every cell that is not 1, and groupby().count()
# counts the remaining non-missing cells per class. Doing this for all
# features at once avoids growing a DataFrame with concat inside a loop, and
# a class with no matching rows gets an explicit 0 rather than going missing.
is_one = train_df[features].eq(1)
feature1_count = (
    train_df[features]
    .where(is_one)
    .groupby(train_df['class_label'])
    .count()
)

feature1_count

In [None]:
# Sanity check: what should the total of all counts (feature0_count + feature1_count) be?
# We have 133 training examples, each with 4 features, so we should have counted 532 things
# (i.e. 133 per feature column).
per_feature_total = feature0_count.sum() + feature1_count.sum()
print(per_feature_total)

# Compare against the size of the feature part of the training frame (label column excluded).
feature_block_shape = train_df.iloc[:, 1:].shape
print('\ntrain_df.shape: ', feature_block_shape)
print('Total count: ', feature_block_shape[0] * feature_block_shape[1])

##### Step 6.4.3 Compute the number of training examples: all of y_train, and y_train=1, y_train=2, y_train=3

In [None]:
# Number of training examples overall and per class.
train_labels = train_df['class_label']
count_y_train_all = train_labels.count()
count_y_train_1 = train_labels[train_labels == 1].count()
count_y_train_2 = train_labels[train_labels == 2].count()
count_y_train_3 = train_labels[train_labels == 3].count()

# Same per-class counts as a Series indexed by class label, so they can be
# aligned against the per-class feature counts later on.
count_y_train = pd.Series(
    np.array([count_y_train_1, count_y_train_2, count_y_train_3]),
    index=[1, 2, 3],
)

print('count_y_train_all: ', count_y_train_all)
print('count_y_train_1: ', count_y_train_1)
print('count_y_train_2: ', count_y_train_2)
print('count_y_train_3: ', count_y_train_3)

---
### Step 7: Analysis - Naive Bayes as a classification algorithm
---

#### Step 7.1 Compute conditional probabilities

In [None]:
# P(feature = 0 | class): divide each class's row of counts by the number of
# training examples in that class (rows are aligned on the class label).
cond_prob0 = feature0_count.div(count_y_train, axis=0)

# Product of the four per-feature probabilities, i.e. the probability that
# all four features are 0 for a given class.
cond_prob0['cond_prob0'] = (
    cond_prob0['alcohol']
    * cond_prob0['malic_acid']
    * cond_prob0['ash']
    * cond_prob0['color_intensity']
)
cond_prob0

In [None]:
# P(feature = 1 | class): divide each class's row of counts by the number of
# training examples in that class (rows are aligned on the class label).
cond_prob1 = feature1_count.div(count_y_train, axis=0)

# Product of the four per-feature probabilities, i.e. the probability that
# all four features are 1 for a given class.
cond_prob1['cond_prob1'] = (
    cond_prob1['alcohol']
    * cond_prob1['malic_acid']
    * cond_prob1['ash']
    * cond_prob1['color_intensity']
)
cond_prob1

In [None]:
# Sanity check: which probabilities should sum to 1?
# For each class and feature, P(feature=1 | class) + P(feature=0 | class).
sum_cond_prob = cond_prob1.add(cond_prob0)
sum_cond_prob.loc[:, features]

#### Step 7.2 Compute prior probabilities

In [None]:
# Prior P(class) = share of the training examples that belong to that class.
prior_1, prior_2, prior_3 = (
    class_count / count_y_train_all
    for class_count in (count_y_train_1, count_y_train_2, count_y_train_3)
)
priors = pd.Series(np.array([prior_1, prior_2, prior_3]), index=[1, 2, 3])

print('P(class=1)=', prior_1)
print('P(class=2)=', prior_2)
print('P(class=3)=', prior_3)

# check to see if they add up to 1
print('sum prior probabilities:', prior_1 + prior_2 + prior_3)

#### Step 7.3 Make a prediction for the first example

Now that we have all the pieces, let's try making a prediction for the first test example. It looks like this is a cultivar 1 (class_label=1) example. Alcohol and malic acid features have high values, and ash and color intensity features have low values.

We start by assuming the prior distribution, which has a slight preference for cultivar 2, followed by cultivar 1. Of course, these estimates come from our training data, which might not be a representative sample. In practice, we may prefer to use a uniform prior.

In [None]:
# What does the feature vector look like? And what's the true label?
index = 0
print('Feature vector: \n', X_test.iloc[index, :])

# y_test still carries the original row labels from df, shuffled by the split.
# y_test[index] would look up the row *labelled* 0, which is not necessarily
# the first test row and may not be in the test split at all (KeyError).
# Use positional access so the label matches the row printed above.
print('\nClass label: ', y_test.iloc[index])

# Start with the prior distribution over labels.
print('\nPriors:\n', priors)

Let's find the predicted label for this test example by accounting for the **alcohol feature** only.

In [None]:
# Reminder: display the conditional probabilities for feature=1
# (one row per class, one column per feature, plus the product column).
cond_prob1

(a) Compute conditional probabilities * prior probabilities for each class

In [None]:
# For every class: P(alcohol = 1 | class) * P(class).
# Both arrays are in class order 1, 2, 3, so they can be paired positionally.
numerator = [
    likelihood * prior
    for likelihood, prior in zip(cond_prob1['alcohol'].values, priors.values)
]

print(numerator)

(b) compute evidence

In [None]:
# The evidence is the sum of the per-class numerators computed above.
denominator = 0
for joint_prob in numerator:
    denominator = denominator + joint_prob

denominator

(c) Compute posterior probability

In [None]:
# Normalize each per-class numerator by the evidence. The list is converted to
# an array explicitly so the division is element-wise.
posterior_prob = list(np.asarray(numerator) / denominator)
posterior_prob

So after accounting for the alcohol feature, our updated belief is that the test observation is part of cultivar 1.

Let's include the remaining features.

In [None]:
# compute conditional probabilities * prior probabilities, using ALL features
#
# The previous version multiplied only the alcohol and malic_acid terms and
# always used the feature=1 probabilities. For a Bernoulli Naive Bayes
# prediction every feature contributes one factor per class:
#   P(feature = 1 | class) if the example's value is 1,
#   P(feature = 0 | class) if the example's value is 0.
example = X_test.iloc[index]  # same test row that was printed earlier

numerator = []

for class_label in range(len(priors)):
    # start from the prior and multiply in one likelihood term per feature
    joint_prob = priors.values[class_label]
    for feature in features:
        likelihoods = cond_prob1 if example[feature] == 1 else cond_prob0
        joint_prob = joint_prob * likelihoods[feature].values[class_label]
    numerator.append(joint_prob)

numerator

In [None]:
# compute evidence: the sum of the per-class numerators computed above
denominator = 0
for joint_prob in numerator:
    denominator = denominator + joint_prob

denominator

In [None]:
# compute posterior probability: normalize each per-class numerator by the
# evidence (explicit array conversion makes the division element-wise)
posterior_prob = list(np.asarray(numerator) / denominator)
posterior_prob

So after accounting for all the features, our updated belief is that the test observation is part of cultivar 3. It looks like Naive Bayes came up with the wrong answer.

#### Step 7.4 Make a prediction using sklearn

Now we can compare our implementation with the sklearn implementation. Do the predictions agree? 

In [None]:
# Fit scikit-learn's Bernoulli Naive Bayes (default settings) on the same
# binarized training data, then predict a label for every test example.
bnb = BernoulliNB()
y_pred = bnb.fit(X_train, y_train).predict(X_test)
print('Predicted label for the first test example is: ', y_pred[0])
y_pred

Question: what's the role of alpha here?

Let's compute accuracy

In [None]:
# Mean accuracy of the fitted model on the held-out test set.
test_accuracy = bnb.score(X_test, y_test)
print('Prediction accuracy: %3.2f' % test_accuracy)

Let's compare our prior probabilities with sklearn's

In [None]:
# sklearn stores log-priors, so exponentiate them to compare with ours.
sklearn_priors = np.exp(bnb.class_log_prior_)
print('\nour prior probabilities:\n', list(priors))
print('\nsklearn prior probabilities:\n', sklearn_priors)

Let's compare our conditional probabilities for feature=1 with sklearn's

In [None]:
# sklearn stores log P(feature = 1 | class); exponentiate to compare with our
# feature=1 table (per-feature columns only, without the product column).
sklearn_cond_prob1 = np.exp(bnb.feature_log_prob_)
print('\nOur conditional probabilities\n', cond_prob1[features])
print('\nsklearn conditional probabilities\n', sklearn_cond_prob1)