# Notebook Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Constants

In [2]:

TOKEN_SPAM_PROB_FILE = 'SpamData/SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/SpamData/03_Testing/prob-all-tokens.txt'

TEST_FEATURE_MATRIX = 'SpamData/SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/SpamData/03_Testing/test-target.txt'


VOCAB_SIZE = 2500

# Load the data

In [3]:
# Features
X_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter=' ')
# target
y_test = np.loadtxt(TEST_TARGET_FILE, delimiter=' ')
# Token Probabilities
prob_token_spam = np.loadtxt(TOKEN_SPAM_PROB_FILE, delimiter=' ')
prob_token_ham = np.loadtxt(TOKEN_HAM_PROB_FILE, delimiter=' ')
prob_all_tokens = np.loadtxt(TOKEN_ALL_PROB_FILE, delimiter=' ')


# Joint Probabilty

### Dot product

In [4]:
X_test.shape

(5737, 2500)

In [5]:
prob_token_spam.shape


(2500,)

## Set the Prior

$$P(Spam \, | \, X) = \frac{P(X \, | \, Spam \,) \, P(Spam)} {P(X)} $$

In [6]:
PROB_SPAM = 0.3116

In [7]:
# Calculating the log probabilities of the tokens given that the email is spam
np.log(prob_token_spam)

array([ -4.40759195,  -5.25366675,  -4.99006919, ...,  -9.26099994,
        -9.38616308, -11.4010661 ])

# Joint Prob in log format

In [8]:
joint_log_spam = X_test.dot(np.log(prob_token_spam) - np.log(prob_all_tokens)) + np.log(PROB_SPAM)

In [9]:
joint_log_spam[:5]

array([-1.16603496, -1.16603496, -1.16603496, -1.16603496, -1.16603496])

 $$P(Ham \, | \, X) = \frac{P(X \, | \, Spam \,) \, (1- P(Spam))} {P(X)} $$

In [52]:
joint_log_ham = X_test.dot(np.log(prob_token_ham) - np.log(prob_all_tokens)) + np.log(1 - PROB_SPAM)

In [53]:
joint_log_ham[:5]

array([-0.37338521, -0.37338521, -0.37338521, -0.37338521, -0.37338521])

In [54]:
X_test

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [nan,  4., nan, ..., nan, nan, nan],
       [ 2.,  2., nan, ..., nan, nan, nan],
       [ 1.,  9.,  1., ..., nan, nan, nan]])

In [13]:
prob_token_spam


array([1.21844839e-02, 5.22831229e-03, 6.80519364e-03, ...,
       9.50602234e-05, 8.38766677e-05, 1.11835557e-05])

In [14]:
PROB_SPAM

0.3116

In [15]:
prob_token_spam

array([1.21844839e-02, 5.22831229e-03, 6.80519364e-03, ...,
       9.50602234e-05, 8.38766677e-05, 1.11835557e-05])

In [16]:
prob_all_tokens

array([1.78477825e-02, 8.20751047e-03, 7.59480106e-03, ...,
       8.15392751e-05, 6.52314201e-05, 6.52314201e-05])

# Making Predictions
 
## Checking for the higher joint probabilities

 $$P(Spam \, | \, X) \, > \, P(Ham \, | \, X) $$
 <br>
 <center>**OR**</center>
 <br>
 $$P(Spam \, | \, X) \, < \, P(Ham \, | \, X) $$

In [18]:
# Making a prediction
prediction = joint_log_spam > joint_log_ham

In [20]:
prediction[-5:]
# The the last 5 emails in our prediction vector are all non spam.

array([False, False, False, False, False])

In [22]:
y_test[-5:]

array([0., 0., 0., 0., 0.])

# Metrics and Evalutions

In [26]:
correct_docs = (y_test == prediction).sum()
print("The total number of correct predictions:", correct_docs)
numdocs_wrong = X_test.shape[0] - correct_docs
print('No of docs classified incorrectly:', numdocs_wrong)

The total number of correct predictions: 5148
No of docs classified incorrectly: 589


In [27]:
accuracy = correct_docs/len(X_test)
accuracy

0.8973331009238278

In [32]:
fraction_wrong = numdocs_wrong/len(X_test)
print('The fraction classifies incorrectly is {:.2%}'.format(fraction_wrong))

The fraction classifies incorrectly is 10.27%


In [47]:
joint_log_ham

array([-0.37338521, -0.37338521, -0.37338521, ...,         nan,
               nan,         nan])

### False Positives and False Negetives

In [48]:
np.unique(prediction, return_counts=True)

(array([False]), array([5737], dtype=int64))

In [55]:
true_pos = (y_test == 1) & (prediction == 1)

In [57]:
true_pos.sum()

0

In [59]:
false_pos = (y_test == 0) & (prediction == 1)
false_pos.sum()

0

In [60]:
false_neg = (y_test == 1) & (prediction == 0)
false_neg.sum()

589

## Recall Score

In [62]:
recall_score = true_pos.sum() / (true_pos.sum() + false_neg.sum())
print('The recall socre is {:.2%}'.format(recall_score))


The recall socre is 0.00%
