In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [10]:
data=pd.read_table('SMSSpamCollection',sep='\t',header=None,names=['label','sms message'])

In [11]:
data.head()

Unnamed: 0,label,sms message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [28]:
data.drop('labels',axis=1,inplace=True)

In [29]:
data.head()

Unnamed: 0,label,sms message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [44]:
#Conversion to lower case letters
documents = ['Hello, how are you!',
             'Win money, win from home.',
             'Call me now.',
             'Hello, Call hello you tomorrow?']
lower_case_documents=[]
for word in documents:
    lower_case_documents.append(word.lower())
print(lower_case_documents)

['hello, how are you!', 'win money, win from home.', 'call me now.', 'hello, call hello you tomorrow?']


In [43]:
#Removing puntuations
import string
sans_punctuation_documents=[]
for w in lower_case_documents:
    sans_punctuation_documents.append(w.translate(str.maketrans('', '', string.punctuation)))
print(sans_punctuation_documents)

['hello how are you', 'win money win from home', 'call me now', 'hello call hello you tomorrow']


In [42]:

#Tokenization of words
preprocessed_words=[]
for i in lower_case_documents:
    preprocessed_words.append(i.split(' '))
    
print(preprocessed_words)

[['hello,', 'how', 'are', 'you!'], ['win', 'money,', 'win', 'from', 'home.'], ['call', 'me', 'now.'], ['hello,', 'call', 'hello', 'you', 'tomorrow?']]


In [49]:
#Counting frequency
frequency_list = []
import pprint
from collections import Counter
for i in preprocessed_words:
    frequency_count=Counter(i)
    frequency_list.append(frequency_count)
print(frequency_list)

[Counter({'hello,': 1, 'how': 1, 'are': 1, 'you!': 1}), Counter({'win': 2, 'money,': 1, 'from': 1, 'home.': 1}), Counter({'call': 1, 'me': 1, 'now.': 1}), Counter({'hello,': 1, 'call': 1, 'hello': 1, 'you': 1, 'tomorrow?': 1})]


In [52]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
print(vectorizer)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [55]:
vectorizer.fit(documents)
vectorizer.get_feature_names()

['are',
 'call',
 'from',
 'hello',
 'home',
 'how',
 'me',
 'money',
 'now',
 'tomorrow',
 'win',
 'you']

In [61]:
doc_vector=vectorizer.transform(documents).toarray()

In [62]:
print(doc_vector)

[[1 0 0 1 0 1 0 0 0 0 0 1]
 [0 0 1 0 1 0 0 1 0 0 2 0]
 [0 1 0 0 0 0 1 0 1 0 0 0]
 [0 1 0 2 0 0 0 0 0 1 0 1]]


In [65]:
pd.DataFrame(doc_vector,columns=vectorizer.get_feature_names())

Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


In [70]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data['sms message'], 
                                                    data['label'], 
                                                    random_state=101)

In [85]:
# Instantiate the CountVectorizer method
vectorizer = CountVectorizer()

# Fit the training data and then return the matrix
training_data = vectorizer.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = vectorizer.transform(X_test)

In [86]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [87]:
predictions = naive_bayes.predict(testing_data)

In [88]:
print(predictions)

['0' '1' '1' ..., '0' '0' '0']


In [90]:
from sklearn.metrics import accuracy_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))


Accuracy score:  0.9849246231155779
