# Naive Bayes Classification

In [1]:
from collections import defaultdict # 데이터 모든 형태를 dictionary 형태로 묶는다. (문장 spam인지 아닌지)
import re  # regular expression 숫자나 문자를 받아서 data화
import math
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# `nunique` is a method: it has to be called to obtain the number of distinct
# labels. Without the parentheses the cell only displays the bound method.
df['v1'].nunique()

2

In [2]:
# Read the SMS spam CSV, decoding it as ISO-8859-1 (Latin-1), and drop every
# column that contains a missing value.
df = pd.read_csv('spam.csv', encoding="ISO-8859-1").dropna(axis=1)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.shape

(5572, 2)

In [7]:
# Binary target column: 1 when the label is 'spam', 0 otherwise.
# (pd.get_dummies would be an alternative way to encode the label.)
df['is_spam'] = [1 if label == 'spam' else 0 for label in df.v1]
df.head()

Unnamed: 0,v1,v2,is_spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# Keep only the message text and the label, then turn every row into a
# (message, is_spam) tuple.
subset = df[['v2', 'is_spam']]
tuples = list(map(tuple, subset.values))
data = tuples

In [9]:
subset.head()

Unnamed: 0,v2,is_spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
subset.describe()

Unnamed: 0,is_spam
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [11]:
data

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  0),
 ('Ok lar... Joking wif u oni...', 0),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  1),
 ('U dun say so early hor... U c already then say...', 0),
 ("Nah I don't think he goes to usf, he lives around here though", 0),
 ("FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv",
  1),
 ('Even my brother is not like to speak with me. They treat me like aids patent.',
  0),
 ("As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
  0),
 ('WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 0906170146

In [12]:
def split_data(data, prob):
    """Randomly partition ``data`` into two lists.

    One ``random.random()`` draw is made per row, in iteration order; the row
    goes to the first list when the draw is below ``prob`` and to the second
    list otherwise, so the expected fractions are ``[prob, 1 - prob]``.

    Returns a ``(first, second)`` tuple of lists.
    """
    first_part, second_part = [], []
    for item in data:
        if random.random() < prob:
            first_part.append(item)
        else:
            second_part.append(item)
    return first_part, second_part

In [14]:
# Seed the RNG so the random split below is reproducible.
random.seed(137)
# Send each row to the training set with probability 0.75, otherwise to the test set.
train_data, test_data = split_data(data, 0.75)
train_data

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  0),
 ('Ok lar... Joking wif u oni...', 0),
 ('U dun say so early hor... U c already then say...', 0),
 ("Nah I don't think he goes to usf, he lives around here though", 0),
 ("As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
  0),
 ("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.",
  0),
 ('SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info',
  1),
 ('URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18',
  1),
 ("I've been searching for the right words to thank you for this breather. I promise i wont take yo

In [19]:
# Word preprocessing
# import nltk
def tokenize(message):
    """Return the set of distinct lowercase alphanumeric tokens in ``message``.

    The message is lowercased and every maximal run of the characters
    ``a-z`` / ``0-9`` counts as one token; anything else is a separator.
    """
    lowered = message.lower()
    return {match.group(0) for match in re.finditer("[a-z0-9]+", lowered)}

In [20]:
def count_words(training_set):
    """Count, per word, how many spam and non-spam messages contain it.

    ``training_set`` is an iterable of ``(message, is_spam)`` pairs.
    Returns a ``defaultdict`` mapping each word to a two-element list
    ``[spam_count, non_spam_count]``. Each message contributes at most once
    per word because ``tokenize`` returns a set.
    """
    counts = defaultdict(lambda: [0, 0])
    for message, is_spam in training_set:
        # Slot 0 tallies spam messages, slot 1 tallies non-spam messages.
        slot = 0 if is_spam else 1
        for word in tokenize(message):
            counts[word][slot] += 1
    return counts

In [25]:
count_words(train_data)['free']

[123, 47]

## Calculate probability

In [26]:
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn per-word message counts into smoothed conditional probabilities.

    ``counts`` maps each word to a ``(spam_count, non_spam_count)`` pair.
    Returns a list of ``(word, p_word_given_spam, p_word_given_non_spam)``
    triplets, where each probability is ``(count + k) / (total + 2 * k)``;
    ``k`` is the additive smoothing pseudo-count.
    """
    spam_denominator = total_spams + 2 * k
    non_spam_denominator = total_non_spams + 2 * k
    triplets = []
    for word, (spam, non_spam) in counts.items():
        triplets.append((word,
                         (spam + k) / spam_denominator,
                         (non_spam + k) / non_spam_denominator))
    return triplets

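Multiplying many small probabilities can underflow to 0 in floating point, so the products are computed as sums of logarithms and exponentiated only at the end:

$a = 0.00000000000000001$

$a = \exp(\log(a))$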

In [27]:
# p(c), p(~c)
# Class priors taken from the label column: p(c) and p(~c).
prob_spam = df['is_spam'].mean()
prob_n_spam = 1 - prob_spam

from math import log
# Log-priors: working with sums of logs instead of products of probabilities
# keeps the result from collapsing to 0.
log_prob_spam = math.log(prob_spam)
log_prob_n_spam = math.log(prob_n_spam)

In [28]:
prob_n_spam

0.8659368269921034

In [29]:
def spam_probability(word_probs, message):
    """Return ``(P(spam | message), P(not spam | message))``.

    ``word_probs`` is a list of ``(word, p_word_given_spam,
    p_word_given_not_spam)`` triplets. Every word in that list contributes to
    the score: ``log(p)`` when the word occurs in the message and
    ``log(1 - p)`` when it does not. The module-level log-priors
    ``log_prob_spam`` and ``log_prob_n_spam`` are then added to obtain the two
    unnormalised log joint scores.

    The scores are normalised in log space: the larger one is subtracted from
    both before exponentiating. This yields the same ratios as exponentiating
    directly, but the larger weight is always exactly 1.0, so the denominator
    cannot underflow to 0 and raise ``ZeroDivisionError`` when the vocabulary
    is large.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0
    # Iterate through every word of the vocabulary.
    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            # The word is absent from the message: add the log probability of
            # NOT seeing it, i.e. log(1 - probability of seeing it).
            log_prob_if_spam += math.log(1 - prob_if_spam)
            log_prob_if_not_spam += math.log(1 - prob_if_not_spam)

    log_joint_spam = log_prob_if_spam + log_prob_spam
    log_joint_not_spam = log_prob_if_not_spam + log_prob_n_spam

    # Shift by the maximum so that at least one exponent is exp(0) == 1.
    shift = max(log_joint_spam, log_joint_not_spam)
    weight_spam = math.exp(log_joint_spam - shift)
    weight_not_spam = math.exp(log_joint_not_spam - shift)
    total = weight_spam + weight_not_spam
    return weight_spam / total, weight_not_spam / total

In [None]:
# Wrap the steps above in a class called NaiveBayesClassifier

In [30]:
class NaiveBayesClassifier:
    """Naive Bayes spam classifier built on the module-level helper functions.

    ``train`` derives per-word probabilities from labelled messages and
    ``classify`` scores a new message against them.
    """

    def __init__(self, k=0.5):
        # k is the smoothing parameter handed to word_probabilities (default 0.5).
        self.k = k
        # Filled in by train(); empty until then.
        self.word_probs = []

    def train(self, training_set):
        """Fit the classifier on a sequence of ``(message, is_spam)`` pairs."""
        # Number of spam messages, and the remainder as non-spam.
        spam_total = sum(1 for _, is_spam in training_set if is_spam)
        non_spam_total = len(training_set) - spam_total
        # Run the training data through the counting / probability pipeline.
        per_word_counts = count_words(training_set)
        self.word_probs = word_probabilities(
            per_word_counts, spam_total, non_spam_total, self.k
        )

    def classify(self, message):
        """Return whatever ``spam_probability`` yields for ``message``."""
        return spam_probability(self.word_probs, message)

In [31]:
# Build a classifier with the default smoothing parameter and fit it on the training split.
classifier = NaiveBayesClassifier()
classifier.train(train_data)

In [32]:
# Score every held-out message: (message, true label, classifier output).
classified = [
    (text, label, classifier.classify(text))
    for text, label in test_data
]

In [33]:
len(classified)

1388

In [35]:
classified[5:20] #message, true_spam, prob(p(spam|x), p(ham|x))

[("Oh k...i'm watching here:)",
  0,
  (9.625676033558943e-14, 0.9999999999999037)),
 ('Fine if thatåÕs the way u feel. ThatåÕs the way its gota b',
  0,
  (5.233787004820494e-13, 0.9999999999994766)),
 ('Is that seriously how you spell his name?',
  0,
  (3.3002979572690897e-10, 0.9999999996699701)),
 ('I\x89Û÷m going to try for 2 months ha ha only joking',
  0,
  (4.041921013645774e-09, 0.999999995958079)),
 ('So Ì_ pay first lar... Then when is da stock comin...',
  0,
  (1.2831849989537263e-13, 0.9999999999998717)),
 ('Aft i finish my lunch then i go str down lor. Ard 3 smth lor. U finish ur lunch already?',
  0,
  (1.431004826951556e-16, 0.9999999999999998)),
 ('K tell me anything about you.',
  0,
  (5.767693660995297e-12, 0.9999999999942323)),
 ('I see the letter B on my car',
  0,
  (2.6242622560076063e-11, 0.9999999999737574)),
 ('Pls go ahead with watts. I just wanted to be sure. Do have a great weekend. Abiola',
  0,
  (2.649875729972435e-10, 0.9999999997350125)),
 ('WHO ARE