# 1. Import dataset and check it.

In [4]:
import pandas as pd

# The file is tab-separated and has no header row, so the column names are supplied here.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'sms_message'],
)
# Display the first 5 rows as a quick sanity check of the load.
df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Convert the values in the 'label' column to numerical values using the map method with the mapping {'ham': 0, 'spam': 1}. This maps the 'ham' value to 0 and the 'spam' value to 1.

In [5]:
# Numeric encoding of the class labels: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so running
# this cell a second time (when the column already holds 0/1) would wipe out the labels.
# Only map while the raw string labels are still present, which makes the cell re-runnable.
if df['label'].isin(list(label_codes)).any():
    df['label'] = df['label'].map(label_codes)
df.head()

Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Check the size of the dataset as a (rows, columns) tuple.
df.shape

(5572, 2)


# 2. Data clean and create Bag of Words Model
1. Change all words into lower case.
2. Remove all punctuation.
3. Tokenization
4. Count frequencies of each word
I use a small 'documents' list to show how this process is done step by step.

In [7]:
# Toy corpus used to walk through the bag-of-words steps by hand.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Step 1: normalise case so that e.g. 'Hello' and 'hello' count as the same word.
lower_case_documents = [text.lower() for text in documents]

print('The original documents', documents)
print()
print('After lower case process', lower_case_documents)

The original documents ['Hello, how are you!', 'Win money, win from home.', 'Call me now.', 'Hello, Call hello you tomorrow?']

After lower case process ['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [15]:
import string

# Step 2: strip punctuation.
# The translation table does not depend on the document, so build it once
# instead of rebuilding it on every loop iteration.
translator = str.maketrans('', '', string.punctuation)
sans_punctuation_documents = [
    text.translate(translator) for text in lower_case_documents
]
print('After remove puctuation', sans_punctuation_documents)

After remove puctuation ['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [17]:
# Step 3: tokenise each document into a list of words.
# split() with no argument splits on any run of whitespace, so repeated spaces, tabs
# or newlines never produce empty-string tokens (split(' ') would emit '' for them).
preprocessed_documents = [text.split() for text in sans_punctuation_documents]
print('Tokenization split into word', preprocessed_documents)

Tokenization split into word [['hello', 'how', 'are', 'you'], ['win', 'money', 'win', 'from', 'home'], ['call', 'me', 'now'], ['hello', 'call', 'hello', 'you', 'tomorrow']]


In [18]:
import pprint
from collections import Counter

# Step 4: one Counter per document, mapping each token to how often it occurs there.
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]

pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


# 3. Create Bag of Words Model using sklearn
Or we can just use sklearn package to do the same thing!
CountVectorizer() has certain parameters which take care of these steps for us.
The get_feature_names() method (replaced by get_feature_names_out() from scikit-learn 1.0 onwards, and removed in 1.2) returns the feature names for this dataset.

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a vectorizer with default settings and show its configuration.
count_vector = CountVectorizer()
print(count_vector)

# Learn the vocabulary of the toy corpus.
count_vector.fit(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(). Use whichever this installation provides so the cell runs on both.
feature_names_getter = (
    getattr(count_vector, 'get_feature_names_out', None)
    or count_vector.get_feature_names
)
# list(...) keeps the displayed result a plain list on every version.
list(feature_names_getter())

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [20]:
# Encode the toy documents with the fitted vocabulary, then expand the result
# into a dense array so the individual counts can be printed.
doc_term_counts = count_vector.transform(documents)
doc_array = doc_term_counts.toarray()
print("Doc array: \n", doc_array)

Doc array: 
 [[1 0 0 1 0 1 0 0 0 0 0 1]
 [0 0 1 0 1 0 0 1 0 0 2 0]
 [0 1 0 0 0 0 1 0 1 0 0 0]
 [0 1 0 2 0 0 0 0 0 1 0 1]]


In [21]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use whichever this installation provides so the cell runs on both old and new versions.
column_names_getter = (
    getattr(count_vector, 'get_feature_names_out', None)
    or count_vector.get_feature_names
)

# Label each column of the count array with the vocabulary word it represents.
frequency_matrix = pd.DataFrame(doc_array, columns=list(column_names_getter()))
print(frequency_matrix)

   are  call  from  hello  home  how  me  money  now  tomorrow  win  you
0    1     0     0      1     0    1   0      0    0         0    0    1
1    0     0     1      0     1    0   0      1    0         0    2    0
2    0     1     0      0     0    0   1      0    1         0    0    0
3    0     1     0      2     0    0   0      0    0         1    0    1


Split the dataset into training and test sets.

In [22]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# With no test_size/train_size given, the default train:test ratio is 3:1.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'],
                                                    df['label'],
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393




In [23]:
# Instantiate a fresh CountVectorizer (rebinds the name used for the toy-corpus demo).
count_vector = CountVectorizer()

# Learn the vocabulary from the training messages and return their count matrix.
training_data = count_vector.fit_transform(X_train)

# Encode the test messages with the vocabulary learned above. The vectorizer is NOT
# re-fitted on the test data, so no information from the test set leaks into the features.
testing_data = count_vector.transform(X_test)

# 4. Training our model using Naive Bayes Model

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training count matrix and labels.
# fit() returns the estimator itself, so construction and training can be chained.
naive_bayes = MultinomialNB().fit(training_data, y_train)

# Display the fitted estimator as the cell output.
naive_bayes

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [25]:
# Predict a label for every message in the held-out test matrix.
predictions = naive_bayes.predict(testing_data)

# 5. Evaluating our model
Now that we have made predictions on our test set, our next goal is to evaluate how well our model is doing. There are various metrics for doing so, so let's first do a quick recap of them.

**Accuracy** measures how often the classifier makes the correct prediction. It’s the ratio of the number of correct predictions to the total number of predictions (the number of test data points).

**Precision** tells us what proportion of the messages we classified as spam actually were spam. It is the ratio of true positives (messages classified as spam that actually are spam) to all positives (all messages classified as spam, irrespective of whether that was the correct classification); in other words, it is the ratio of

[True Positives/(True Positives + False Positives)]

**Recall (sensitivity)** tells us what proportion of the messages that actually were spam were classified by us as spam. It is the ratio of true positives (messages classified as spam that actually are spam) to all the messages that actually were spam; in other words, it is the ratio of

[True Positives/(True Positives + False Negatives)]

In [26]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Each entry pairs a report line's template with the metric function that fills it in.
score_reports = [
    ('Accuracy score: {}', accuracy_score),
    ('Precision score: {}', precision_score),
    ('Recall score: {}', recall_score),
    ('F1 score: {}', f1_score),
]

# Every metric compares the true test labels against the model's predictions.
for template, metric in score_reports:
    print(template.format(metric(y_test, predictions)))

Accuracy score: 0.9885139985642498
Precision score: 0.9720670391061452
Recall score: 0.9405405405405406
F1 score: 0.9560439560439562


# 6. More details
I will use an easy example to show you how Naive Bayes works in the project.

Suppose we have 8 emails, classified into 2 groups: Spam and Ham.
In the Spam group, we have 3 emails. The contents are "Win money now", "Make cash easy!", "Cheap money, reply!".
In the Ham group, we have 5 emails. The contents are "How are you?", "There you are!",
"Can I borrow money?", "Say hi to grandma", "Was the exam easy?".
We receive a new email whose content is "easy money". We need to use Naive Bayes to decide
whether it is spam or not.

We can know that:
P(Spam) = 3/8
P(Ham) = 5/8

P('easy'|Spam) = 1/3
P('money'|Spam) = 2/3

P('easy'|Ham) = 1/5
P('money'|Ham) = 1/5
In this question, we need to calculate the P(Spam|'easy','money') and P(Ham|'easy','money').

According to Bayes' theorem,
P(Spam|'easy','money') ∝ P('easy','money'|Spam)*P(Spam)
Naive Bayes additionally assumes that the words are conditionally independent given the class. Naive assumption: P(A&B|C) = P(A|C)*P(B|C).
P('easy','money'|Spam) = P('easy'|Spam)*P('money'|Spam)
P(Spam|'easy','money') ∝ P('easy'|Spam)*P('money'|Spam)*P(Spam)
So the unnormalized scores are:
Spam: 1/3*2/3*3/8 = 1/12
Ham: 1/5*1/5*5/8 = 1/40
Dividing each score by their sum (1/12 + 1/40 = 13/120) normalizes them into probabilities:

P(Spam|'easy','money') = 10/13
P(Ham|'easy','money') = 3/13