In [24]:
def create_histogram(words):
    '''Create a histogram of words for spam/ham emails.'''
    hist = {}
    # make things lower case
    for phrase in words:
        for word in phrase.lower().split():
            # low = word.lower()
            if word in hist:
                hist[word] += 1
            else:  # low not in hist
                hist[word] = 1
    return hist

spam_text= ['Send us your password', 'review us', 'Send your password', 'Send us your account']
ham_text= ['Send us your review', 'review your password']

spam_hist, ham_hist = create_histogram(spam_text), create_histogram(ham_text)

In [25]:
spam_hist

{'send': 3, 'us': 3, 'your': 3, 'password': 2, 'review': 1, 'account': 1}

In [26]:
ham_hist

{'send': 1, 'us': 1, 'your': 2, 'review': 2, 'password': 1}

In [45]:
def cond_prob_word(words, target_word):
    hist = create_histogram(words)
    return hist[target_word] / len(hist.items())

cond_prob_word(spam_text, 'password')

0.3333333333333333

In [31]:
(2/13) * (2/3)

0.10256410256410256

In [32]:
0.10256410256410256 + (1/9)

0.21367521367521367

In [33]:
(4/3) /0.21367521367521367

6.24

In [38]:
1 - 0.6829268292682927

0.31707317073170727

In [39]:
p_ham_given_pass = .31707317073170727

In [40]:
p_spam_given_pass = 1 - p_ham_given_pass

In [43]:
import numpy as np
'''
Naive Bayes Pseudocode

1- Based on the given dataset above, create the following two dictionaries:

 Ham -> D_ham = {'Jos': 1,'ask':1, 'you':1,... }

 Spam- > D_spam= {'Did': 1, 'you':3, ... }

Each dictionary representes all words for the spam and ham emails and their frequency (as the value of dictionaries)

2- For any new given sentences, having $w_1$, $w_2$, ... $w_n$ words, assuming the sentence is ham, calculate the following:

 $P(w_1| ham)$, $P(w_2| ham)$, ..., $P(w_n| ham)$
 $log(P(w_1| ham))$, $log(P(w_2| ham))$, ..., $log(P(w_n| ham))$

then add them all together to create one value

3- Calculate what percentage of labels are ham -> $P(ham)$ -> then take the log -> $log(P(ham))$

4- Add the value from step (2) and (3)

5- Do Steps (2) - (4) again, but assume the given new sentence is spam

6- Compare the two values. The greater value indicates which label (class) the sentence should be given
'''
def naive_classifer()

"\nNaive Bayes Pseudocode\n\n1- Based on the given dataset above, create the following two dictionaries:\n\n Ham -> D_ham = {'Jos': 1,'ask':1, 'you':1,... }\n\n Spam- > D_spam= {'Did': 1, 'you':3, ... }\n\nEach dictionary representes all words for the spam and ham emails and their frequency (as the value of dictionaries)\n\n2- For any new given sentences, having $w_1$, $w_2$, ... $w_n$ words, assuming the sentence is ham, calculate the following:\n\n $P(w_1| ham)$, $P(w_2| ham)$, ..., $P(w_n| ham)$\n $log(P(w_1| ham))$, $log(P(w_2| ham))$, ..., $log(P(w_n| ham))$\n\nthen add them all together to create one value\n\n3- Calculate what percentage of labels are ham -> $P(ham)$ -> then take the log -> $log(P(ham))$\n\n4- Add the value from step (2) and (3)\n\n5- Do Steps (2) - (4) again, but assume the given new sentence is spam\n\n6- Compare the two values. The greater value indicates which label (class) the sentence should be given\n"

In [47]:
import os
import re
import string
import math
import pandas as pd

class SpamDetector(object):
    """Implementation of Naive Bayes for binary classification"""

    # clean up our string by removing punctuation
    def clean(self, s):
        translator = str.maketrans("", "", string.punctuation)
        return s.translate(translator)

    #  tokenize our string into words
    def tokenize(self, text):
        text = self.clean(text).lower()
        return re.split("\W+", text)

    # count up how many of each word appears in a list of words.
    def get_word_counts(self, words):
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0.0) + 1.0
        return word_counts

    def fit(self, X, Y):
        """Fit our classifier
        Arguments:
            X {list} -- list of document contents
            y {list} -- correct labels
        """
        self.num_messages = {}
        self.log_class_priors = {}
        self.word_counts = {}
        self.vocab = set()

        # Compute log class priors (the probability that any given message is spam/ham),
        # by counting how many messages are spam/ham, 
        # dividing by the total number of messages, and taking the log.
        n = len(X)
        self.num_messages['spam'] = sum(1 for label in Y if label == 'spam')
        self.num_messages['ham'] = sum(1 for label in Y if label == 'ham')
        self.log_class_priors['spam'] = ma.log(self.num_messages['spam'] / n )
        self.log_class_priors['ham'] = math.log(self.num_messages['ham'] / n )
        self.word_counts['spam'] = {}
        self.word_counts['ham'] = {}
        
        # for each (document, label) pair, tokenize the document into words.
        for x, y in zip(X, Y):
            c = 'spam' if y == 'spam' else 'ham'
            counts = self.get_word_counts(self.tokenize(x))
            # For each word, either add it to the vocabulary for spam/ham, 
            # if it isn’t already there, and update the number of counts. 
            for word, count in counts.items():
                # Add that word to the global vocabulary.
                if word not in self.vocab:
                    self.vocab.add(word)
                if word not in self.word_counts[c]:
                    self.word_counts[c][word] = 0.0

                self.word_counts[c][word] += count

    # function to actually output the class label for new data.
    def predict(self, X):
        result = []
        # Given a document...
        for x in X:
            counts = self.get_word_counts(self.tokenize(x))
            spam_score = 0
            ham_score = 0
            # We iterate through each of the words...
            for word, _ in counts.items():
                if word not in self.vocab: continue
                # ... and compute log p(w_i|Spam), and sum them all up. The same will happen for Ham
                # add Laplace smoothing
                # https://medium.com/syncedreview/applying-multinomial-naive-bayes-to-nlp-problems-a-practical-explanation-4f5271768ebf
                log_w_given_spam = math.log( (self.word_counts['spam'].get(word, 0.0) + 1) / (self.num_messages['spam'] + len(self.vocab)) )
                log_w_given_ham = math.log( (self.word_counts['ham'].get(word, 0.0) + 1) / (self.num_messages['ham'] + len(self.vocab)) )

                spam_score += log_w_given_spam
                ham_score += log_w_given_ham
            
            # Then we add the log class priors...
            spam_score += self.log_class_priors['spam']
            ham_score += self.log_class_priors['ham']

            # ... and check to see which score is bigger for that document.
            # Whichever is larger, that is the predicted label!
            if spam_score > ham_score:
                result.append('spam')
            else:
                result.append('ham')
        return result
        

# TODO: Fill in the below function to make a prediction, 
# your answer should match the final number in the below output (0.9641)
if __name__ == '__main__':
    # get the dataset
    data = pd.read_csv('./datasets/spam.csv',encoding='latin-1')
    data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
    data = data.rename(columns={"v1":'label', "v2":'text'})
    print(data.head())
    tags = data["label"]
    texts = data["text"]
    # make the classifier
    clf = SpamDetector()


In [48]:
# get the dataset
data = pd.read_csv('./datasets/spam.csv',encoding='latin-1')
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":'label', "v2":'text'})
print(data.head())
tags = data["label"]
texts = data["text"]
# make the cl
# clf = Spam

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [49]:
# make the model 
clf = SpamDetector()

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(texts, tags, random_state=1)

In [53]:
clf.fit(X_train, y_train)

In [54]:
y_pred = clf.predict(X_test)

In [55]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
cm

array([[1213,    0],
       [  50,  130]])

In [56]:
TN, FP, FN, TP = np.ravel(cm)

In [57]:
accuracy = (TN + TP) / len(y_test)

In [58]:
accuracy

0.9641062455132807

In [59]:
tags.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

# use sklearn CountVectorizer and MultinomialNB to spam email dataset

In [61]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Prepare the dataset
data = pd.read_csv('Datasets/spam.csv',encoding='latin-1')
data = data.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
data = data.rename(columns={"v1":'label', "v2":'text'})
print(data.head())
tags = data["label"]
texts = data["text"]

# create texts and tags
X, y = texts, tags

# split the data into train vs test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# transform text into numerical vectors
vectorizer = CountVectorizer()
X_train_dtm = vectorizer.fit_transform(X_train)
print(X_train_dtm)

# instantiate Multinomial Naive Bayes model
nb = MultinomialNB()
# fit to model, with the trained part of the dataset
nb.fit(X_train_dtm, y_train)
X_test_dtm = vectorizer.transform(X_test)
# make prediction
y_pred_class = nb.predict(X_test_dtm)
# test accurarcy of prediction
metrics.accuracy_score(y_test, y_pred_class)

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
  (0, 3286)	1
  (0, 4747)	2
  (0, 1896)	1
  (0, 875)	2
  (0, 6599)	2
  (0, 801)	1
  (0, 5258)	1
  (0, 7209)	3
  (0, 1559)	1
  (0, 913)	1
  (0, 6623)	3
  (0, 1050)	1
  (0, 5980)	1
  (0, 3530)	1
  (0, 919)	1
  (0, 802)	1
  (0, 819)	1
  (0, 5712)	1
  (0, 6727)	1
  (0, 2112)	1
  (0, 5065)	2
  (0, 7373)	1
  (0, 4176)	2
  (0, 1535)	2
  (0, 6604)	1
  :	:
  (4176, 4747)	1
  (4176, 3252)	1
  (4176, 3416)	1
  (4176, 2304)	1
  (4176, 6638)	1
  (4176, 4450)	1
  (4176, 7163)	1
  (4176, 4219)	1
  (4176, 1590)	1
  (4176, 3439)	1
  (4176, 4833)	1
  (4176, 4894)	1
  (4177, 3647)	1
  (4177, 3252)	1
  (4177, 6074)	1
  (4177, 4125)	1
  (4177, 3162)	1
  (4177

0.9856424982053122

In [69]:
cm = confusion_matrix(y_test, y_pred_class)

In [70]:
cm

array([[1205,    8],
       [  12,  168]])

In [74]:
TN, FP, FN, TP = np.ravel(cm)
print(TN, FP, FN, TP)

1205 8 12 168


In [72]:
accuracy = (TN + TP) / len(y_test)
accuracy

0.9856424982053122

In [73]:
precision = TP / (TP + FP)

recall = TP / (TP + FN)

print((2 * precision * recall)/ (precision + recall))

0.9438202247191012
