In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Test-set inputs: feature matrix and matching labels.
TEST_FEATURE_MATRIX = 'SpamData/03_Testing/test-features.txt'
TEST_TARGET_FILE = 'SpamData/03_Testing/test-target.txt'

# Per-token probability tables: spam, non-spam (ham) and all e-mails.
TOKEN_SPAM_PROB_FILE = 'SpamData/03_Testing/prob-spam.txt'
TOKEN_HAM_PROB_FILE = 'SpamData/03_Testing/prob-nonspam.txt'
TOKEN_ALL_PROB_FILE = 'SpamData/03_Testing/prob-all-tokens.txt'

# Vocabulary size; the feature matrix and probability vectors shown below have 2500 columns/entries.
VOCAB_SIZE = 2500

In [3]:
# Load every artefact with the same space-delimited parser.
# Features: one row per e-mail, one column per vocabulary token.
X_test = np.loadtxt(TEST_FEATURE_MATRIX, delimiter=' ')
# Target: one label per e-mail.
y_test = np.loadtxt(TEST_TARGET_FILE, delimiter=' ')
# Token probabilities (spam, ham, and over all e-mails).
prob_token_spam = np.loadtxt(TOKEN_SPAM_PROB_FILE, delimiter=' ')
prob_token_ham = np.loadtxt(TOKEN_HAM_PROB_FILE, delimiter=' ')
prob_all_tokens = np.loadtxt(TOKEN_ALL_PROB_FILE, delimiter=' ')

# Fail fast if the files do not belong together: every later cell relies on
# these shapes lining up (matrix columns == vocabulary == probability vectors,
# matrix rows == number of labels).
assert X_test.ndim == 2 and X_test.shape[1] == VOCAB_SIZE, \
    'feature matrix must have VOCAB_SIZE columns'
assert y_test.shape == (X_test.shape[0],), \
    'there must be exactly one target label per feature row'
assert all(
    probs.shape == (VOCAB_SIZE,)
    for probs in (prob_token_spam, prob_token_ham, prob_all_tokens)
), 'each token-probability vector must have VOCAB_SIZE entries'

In [4]:
# Display the test feature matrix (NumPy abbreviates the repr of large arrays).
X_test

array([[0., 0., 1., ..., 0., 0., 0.],
       [6., 1., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 4., 0., ..., 0., 0., 0.],
       [2., 2., 0., ..., 0., 0., 0.],
       [1., 9., 1., ..., 0., 0., 0.]])

In [5]:
# Dimensions of the feature matrix as (rows, columns).
X_test.shape

(1724, 2500)

In [6]:
# First row of the feature matrix.
X_test[0]

array([0., 0., 1., ..., 0., 0., 0.])

In [7]:
# Number of entries in a single feature row (the column count of the matrix).
X_test.shape[1]

2500

In [8]:
# Sum of all entries in the first feature row.
X_test[0, :].sum()

180.0

In [9]:
# Display the target labels loaded from TEST_TARGET_FILE.
y_test

array([1., 1., 1., ..., 0., 0., 0.])

In [10]:
# Number of target labels, as a shape tuple.
y_test.shape

(1724,)

In [11]:
# Display the token probabilities loaded from TOKEN_SPAM_PROB_FILE.
prob_token_spam

array([1.21863238e-02, 5.22910177e-03, 6.80622123e-03, ...,
       5.59262221e-06, 1.11852444e-05, 3.35557333e-05])

In [12]:
# Display the token probabilities loaded from TOKEN_HAM_PROB_FILE.
prob_token_ham

array([2.14715279e-02, 1.01406377e-02, 8.00679697e-03, ...,
       1.29205037e-04, 9.00519952e-05, 9.78826035e-05])

In [13]:
# Length of the spam token-probability vector.
prob_token_spam.shape

(2500,)

In [14]:
# Length of the ham token-probability vector.
prob_token_ham.shape

(2500,)

In [15]:
# Display the token probabilities loaded from TOKEN_ALL_PROB_FILE.
prob_all_tokens

array([1.78488636e-02, 8.20800764e-03, 7.59526112e-03, ...,
       7.45547103e-05, 5.35861981e-05, 6.75652062e-05])

In [16]:
# Length of the overall token-probability vector.
prob_all_tokens.shape

(2500,)

In [17]:
# Re-check the matrix dimensions ahead of the dot product taken a few cells further down.
X_test.shape

(1724, 2500)

In [18]:
# Re-check the vector length: it must equal the matrix column count for the dot product to work.
prob_token_spam.shape

(2500,)

In [19]:
# Matrix (rows, columns) dotted with a vector (columns,) yields one value per row.
print('shape of the dot product is ', np.dot(X_test, prob_token_spam).shape)

shape of the dot product is  (1724,)


In [20]:
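# Uncomment to see the error: the operands cannot be swapped, because a (2500,) vector
# dotted with a (1724, 2500) matrix has mismatched inner dimensions (2500 vs 1724).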
# print('shape of the dot product is ', prob_token_spam.dot(X_test).shape)

In [21]:
# Prior probability that an e-mail is spam, P(Spam); the ham prior is taken as 1 - PROB_SPAM below.
# NOTE(review): hard-coded value, presumably the spam share of the training set; confirm its origin.
PROB_SPAM = 0.3116

In [22]:
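# Suppose one of the e-mails in X_test consists of the tokens:
#     can  give  free  loan
# Using the per-token probabilities loaded above, its spam score is
#     [P(can|Spam)/P(can)] x [P(give|Spam)/P(give)] x [P(free|Spam)/P(free)] x [P(loan|Spam)/P(loan)] x P(Spam)
# and the ham score is built the same way from the ham probabilities and P(Ham) = 1 - P(Spam).
# Working with logarithms turns each product into a sum, which avoids numerical underflow.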

In [23]:
# Preview the first five spam token probabilities.
prob_token_spam[:5]

array([0.01218632, 0.0052291 , 0.00680622, 0.01130828, 0.00681741])

In [24]:
# Natural log of the first five spam token probabilities (log is element-wise, so slicing first is equivalent).
np.log(prob_token_spam[:5])

array([-4.40744096, -5.25351576, -4.9899182 , -4.48221989, -4.98827616])

In [25]:
# Spam log-score per e-mail: token counts weighted by log(P(token|Spam)) - log(P(token)),
# plus the log of the spam prior.
joint_log_spam = np.dot(
    X_test,
    np.log(prob_token_spam) - np.log(prob_all_tokens),
) + np.log(PROB_SPAM)

In [26]:
# Ham log-score per e-mail: token counts weighted by log(P(token|Ham)) - log(P(token)),
# plus the log of the ham prior (1 - PROB_SPAM).
joint_log_ham = np.dot(
    X_test,
    np.log(prob_token_ham) - np.log(prob_all_tokens),
) + np.log(1 - PROB_SPAM)

In [27]:
# Classify as spam (True) wherever the spam log-score strictly exceeds the ham log-score.
prediction = joint_log_ham < joint_log_spam

In [28]:
# Number of predictions, as a shape tuple.
prediction.shape

(1724,)

In [29]:
# First ten predictions as booleans (True where the spam score was higher).
prediction[:10]

array([ True,  True,  True,  True,  True, False,  True,  True,  True,
       False])

In [30]:
# Same ten predictions converted from booleans to integers (True -> 1, False -> 0).
prediction[:10].astype(int)

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 0])

In [31]:
# First ten true labels, for comparison with the first ten predictions.
y_test[:10]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [32]:
# Display the spam log-scores computed above.
joint_log_spam

array([  24.29988266,    2.1666715 ,   20.5990493 , ..., -374.59533866,
         -9.89844797, -111.99345913])

In [33]:
# Display the ham log-scores computed above.
joint_log_ham

array([-60.98484965, -11.01400734, -37.97355359, ...,  60.92799817,
        -0.06139541,  24.43342717])

In [34]:
# Homework
# X_test contains 1724 e-mails.
# 1. How many of them are predicted correctly?
# 2. How many of them are predicted wrongly?
# 3. How many spam e-mails are in X_test?
# 4. How many ham e-mails are in X_test?

In [35]:
# Count how many e-mails are predicted correctly and how many wrongly.
#
# Vectorised instead of a Python loop over every row: `prediction` is a boolean
# array and `y_test` holds 0./1. labels, so the element-wise equality is True
# exactly where the predicted class matches the label (True == 1.0, False == 0.0).
correctly = int((y_test == prediction).sum())
# Every e-mail is either right or wrong, so the remainder is the error count.
wrongly = y_test.shape[0] - correctly

print('predicted correctly :',correctly)
print('predicted wrongly :',wrongly)

predicted correctly : 1685
predicted wrongly : 39


In [36]:
# Class balance of the test labels.
# Spam is labelled 1 and ham 0, so summing the labels gives the spam count;
# whatever is left of the total is ham.
no_of_spam_mails = y_test.sum()
no_of_ham_mails = len(y_test) - no_of_spam_mails

print('How many spam mails in X_test :', no_of_spam_mails)
print('How many ham mails in X_test :', no_of_ham_mails)

How many spam mails in X_test : 589.0
How many ham mails in X_test : 1135.0


In [38]:
# Confusion matrix of true labels against predictions (booleans turned into 0/1 integers).
print(confusion_matrix(y_true=y_test, y_pred=1 * prediction))

[[1116   19]
 [  20  569]]


In [37]:
# Per-class precision, recall and F1 for the predictions (booleans turned into 0/1 integers).
print(classification_report(y_true=y_test, y_pred=1 * prediction))

              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      1135
         1.0       0.97      0.97      0.97       589

    accuracy                           0.98      1724
   macro avg       0.98      0.97      0.97      1724
weighted avg       0.98      0.98      0.98      1724

