### Spam Classification using bayes Classifier

In [1]:
import pandas

# Data downloaded from  https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection
# Loading Data
messages = pandas.read_csv('./data/SMSSpamCollection', sep='\t', 
                           names=["label", "message"])
print(messages)

     label                                            message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
5     spam  FreeMsg Hey there darling it's been 3 week's n...
6      ham  Even my brother is not like to speak with me. ...
7      ham  As per your request 'Melle Melle (Oru Minnamin...
8     spam  WINNER!! As a valued network customer you have...
9     spam  Had your mobile 11 months or more? U R entitle...
10     ham  I'm gonna be home soon and i don't want to tal...
11    spam  SIX chances to win CASH! From 100 to 20,000 po...
12    spam  URGENT! You have won a 1 week FREE membership ...
13     ham  I've been searching for the right words to tha...
14     ham                I HAVE A DATE ON SUNDAY WITH WILL!!
15    sp

In [2]:
messages.message.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: message, dtype: object

In [3]:
#number of unique words just for analysis purpose
wordsCorpus = set()
for text in messages.message:
    wordsCorpus.update(text.split())
len(wordsCorpus)

15691

In [4]:
# Word to Index. this dictionary will stores the index of all the words in word corpus
word2idx = {word : i for i, word in enumerate(wordsCorpus)}
print(word2idx['to'])

9072


In [5]:
# Convert messages to words Corpus for analysis purpose
from collections import Counter

def Words_Corpus_Count(data):
    all_words = []
    for line in data:
        line_words = line.split()
        all_words += line_words
    Word_to_Vec = Counter(all_words)
    return Word_to_Vec


# Testing words_Corpus method
data = ['this is first message', 'this is second message']
resDict = Words_Corpus_Count(data)
print(resDict)

Counter({'this': 2, 'is': 2, 'message': 2, 'first': 1, 'second': 1})


In [6]:
#Most Common 100 Words
vocab = Words_Corpus_Count(messages.message)
vocab = sorted(vocab, key=vocab.get, reverse=True)
print(vocab[:100])
len(vocab)

['to', 'you', 'I', 'a', 'the', 'and', 'in', 'is', 'i', 'u', 'for', 'my', 'of', 'your', 'me', 'on', 'have', '2', 'that', 'are', 'it', 'call', 'or', 'be', 'at', 'with', 'not', 'will', 'get', 'can', 'U', 'ur', 'so', "I'm", 'but', '&lt;#&gt;', 'You', 'from', '4', 'up', 'do', '.', 'if', 'just', 'go', 'when', 'know', 'like', 'this', 'we', 'all', 'out', 'got', 'was', 'come', 'now', '?', 'am', '...', 'want', 'by', 'Call', 'time', 'send', 'about', 'only', 'then', 'what', 'going', 'need', 'n', "I'll", 'How', 'still', 'as', 'If', 'one', 'But', 'its', 'he', 'our', 'text', 'No', 'no', 'been', 'Just', 'We', 'there', 'So', 'has', 'some', 'love', 'see', 'good', 'r', 'Do', "don't", 'think', 'how', '&']


15691

In [7]:
'to' in wordsCorpus

True

In [9]:
import numpy as np

def word_to_vector(text):
    word_vec = np.zeros(len(vocab), dtype=np.int)
    for word in text.split(' '):
        index = word2idx.get(word, None)
        if index is None:
            continue
        else:
            word_vec[index] += 1
            
    return np.array(word_vec)
            

#testing the method word_to_vector
resArray = word_to_vector('Today is the final day to finish the assignments. So please do Complete it! If you have any doubts regarding the assignment please consult me at any time to')
print(resArray[:100])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [10]:
#Converting entire messages to the arrays
word_vec_messages = np.zeros((len(messages.message), len(vocab)), dtype=np.int_)
for index, message in enumerate(messages.message):
    if index == 5:
        print("message input:", message)
    word_vec_messages[index] = word_to_vector(message)

val =  Counter(word_vec_messages[5])
print(val)

message input: FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
Counter({0: 15660, 1: 30, 2: 1})


In [11]:
word_vec_messages

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(word_vec_messages, messages.label, test_size=0.33, random_state=42)

In [13]:
from sklearn.naive_bayes import MultinomialNB

spam_detector = MultinomialNB().fit(X_train, y_train)
print("Using MultinomialNB : ",spam_detector.score(X_test, y_test))

from sklearn.naive_bayes import GaussianNB

spam_detector = GaussianNB().fit(X_train, y_train)
print("Using GaussianNB : ",spam_detector.score(X_test, y_test))

Using MultinomialNB :  0.971723762915
Using GaussianNB :  0.941272430669


In [15]:
#trying new method for improving the efficiency of the algorithm
from collections import Counter

positive_words = Counter()
negative_words = Counter()
total_words = Counter()

In [16]:
for i in range(len(messages.label)):
    if messages.label[i] == 'ham':
        for word in messages.message[i].split():
            positive_words[word] += 1
            total_words[word] += 1
    else:
        for word in messages.message[i].split():
            negative_words[word] += 1
            total_words[word] += 1

print("Total words : ", len(total_words))
print("Positive Words : ", len(positive_words))
print("negative words : ", len(negative_words))


Total words :  15691
Positive Words :  12583
negative words :  4320


In [17]:
#top positive words
pos_list = sorted(positive_words)[:500]
pos_list

['!',
 '!!',
 '!!!',
 '!!!!',
 "!!''.",
 '!1',
 '!:-)',
 '"',
 '".',
 '"1.U',
 '"A',
 '"Be',
 '"Because',
 '"Best',
 '"Boost',
 '"Checkmate"',
 '"DRINK".',
 '"DRIVE',
 '"Enjoy"',
 '"GOODMORNING',
 '"GUD',
 '"HELLO"',
 '"How',
 '"Hurt',
 '"I',
 '"It',
 '"Its',
 '"KUDI"yarasu',
 '"Margaret',
 '"Miss',
 '"Nver',
 '"OH',
 '"Oh',
 '"Our',
 '"POWER',
 '"Shah',
 '"She',
 '"Sometimes',
 '"Sweet"',
 '"This',
 '"Ur',
 '"VALENTINES',
 '"VALUED',
 '"Walk',
 '"Walk,',
 '"Wen',
 '"What',
 '"X"',
 '"You"',
 '"don\'t',
 '"find',
 '"get',
 '"how',
 '"jeevithathile',
 '"julianaland"',
 '"life',
 '"morning"',
 '"oh"',
 '"our',
 '"paths',
 '"sleep',
 '"smokes',
 '"song',
 '"the',
 '"usf',
 '"welcomes"',
 '"what',
 '"with',
 '"woah"',
 '"wow',
 '"wylie',
 '#',
 '$',
 '$1',
 '$140',
 '$180',
 '$2',
 '$50',
 '$50...',
 '$700',
 '$900',
 '$95/pax,',
 '%',
 '%.',
 '%of',
 '&',
 '&SAM',
 '&amp;',
 '&gt;:(',
 '&it',
 '&lt;#&gt;',
 '&lt;)',
 '&lt;3',
 '&lt;DECIMAL&gt;',
 '&lt;EMAIL&gt;',
 '&lt;TIME&gt;',
 '&lt;UR

In [18]:
#top negative words
neg_list = sorted(negative_words)[:500]
neg_list

['!',
 '!!!',
 '!This',
 '"3000',
 '"ADP"',
 '"Crazy"',
 '"Divorce',
 '"MIX"',
 '"POLYS"',
 '"STOP',
 '"go',
 '"suppliers",',
 '#150',
 '#5000',
 '$350',
 '$5.00',
 '&',
 '&Cs',
 '&XXX',
 "'Uptown",
 "'help'",
 '(100p/SMS)',
 '(10p/min)',
 '(10p/min).',
 '(150p/SMS)',
 '(18',
 '(18+)',
 '(18+).',
 '(2/3)',
 '(20/F)',
 '(25/F)',
 '(25p),',
 '(29/M)',
 '(32/F)',
 '(Bank',
 '(Book',
 '(Get',
 '(Henry,',
 '(More',
 '(Send',
 '(Txt',
 '(flights',
 '(mob',
 '(nat',
 '(quizclub',
 '(std',
 '(to',
 '(£4.50)',
 ')',
 '*',
 '*****',
 '***********',
 '***************',
 '*****UP',
 '**FREE',
 '*BILLING',
 '*turn*',
 '+',
 '+123',
 '+447797706009',
 '+449071512431',
 '+std',
 '+£400',
 ',',
 '-',
 '-)',
 '-Message',
 '-PLS',
 '-call',
 '-msg',
 '-sub',
 '.',
 '...',
 '.....',
 '/',
 '0',
 '008704050406',
 '0089(my',
 '0121',
 '01223585236',
 '01223585334',
 '02',
 '02/06/03!',
 '02/09/03!',
 '0207',
 '0207-083-6089',
 '02072069400.',
 '02073162414',
 '02085076972',
 '021',
 '050703',
 '0578',
 '06

In [19]:
#postivie to negative ratio
word_ratio = {}
for word in total_words:
    word_ratio[word] = (positive_words[word] + 1)/( negative_words[word] + 1)

sorted(word_ratio.items(), key=lambda x: x[1])

[('claim', 0.013513513513513514),
 ('won', 0.021739130434782608),
 ('prize', 0.022222222222222223),
 ('FREE', 0.022222222222222223),
 ('awarded', 0.02631578947368421),
 ('£1000', 0.029411764705882353),
 ('URGENT!', 0.030303030303030304),
 ('PO', 0.03125),
 ('Claim', 0.03225806451612903),
 ('150ppm', 0.034482758620689655),
 ('guaranteed', 0.038461538461538464),
 ('entry', 0.038461538461538464),
 ('4*', 0.04),
 ('500', 0.041666666666666664),
 ('Box', 0.041666666666666664),
 ('£100', 0.043478260869565216),
 ('18', 0.043478260869565216),
 ('Nokia', 0.043478260869565216),
 ('16+', 0.043478260869565216),
 ('STOP', 0.044444444444444446),
 ('8007', 0.045454545454545456),
 ('Holiday', 0.045454545454545456),
 ('Valid', 0.047619047619047616),
 ('£2000', 0.047619047619047616),
 ('weekly', 0.047619047619047616),
 ('tone', 0.05),
 ('WON', 0.05),
 ('£5000', 0.05),
 ('150p', 0.05),
 ('86688', 0.05),
 ('GUARANTEED.', 0.05),
 ('T&Cs', 0.05),
 ('750', 0.05263157894736842),
 ('Orange', 0.05263157894736842

In [20]:
# Don't use because MultinomialNB doesn't use negative continuos values.. 
# Use GaussianNB classifier for negative values
word_ratio_modified = { key : np.log(value) for key, value in word_ratio.items()}

#word_ratio_modified = {}

#for key, value in word_ratio.items():
#    if value > 1:
#        word_ratio_modified[key] = np.log(value)
#    else:
#        word_ratio_modified[key] = -1/(1 + np.log(value))

sorted(word_ratio_modified.items(), key=lambda x: x[1], reverse=True)

[('&lt;#&gt;', 5.6240175061873385),
 ("I'll", 4.9344739331306915),
 ('But', 4.8828019225863706),
 ('he', 4.8520302639196169),
 ('i', 4.6647858956624457),
 ('ü', 4.6634390941120669),
 ('come', 4.6001576441645469),
 ('And', 4.5951198501345898),
 ('Ok', 4.5849674786705723),
 ('d', 4.5538768916005408),
 ("i'm", 4.5217885770490405),
 ('him', 4.499809670330265),
 ('later', 4.4773368144782069),
 ('she', 4.4543472962535073),
 ('&amp;', 4.4426512564903167),
 ('ask', 4.4188406077965983),
 ('...', 4.4006030202468169),
 ('way', 4.3307333402863311),
 ('da', 4.290459441148391),
 ('e', 4.2766661190160553),
 ('going', 4.2626798770413155),
 ('them', 4.2484952420493594),
 ('lor.', 4.219507705176107),
 ('said', 4.1896547420264252),
 ('doing', 4.1743872698956368),
 ('pick', 4.1743872698956368),
 ('my', 4.1303549997451334),
 (':)', 4.0775374439057197),
 ('gonna', 4.0604430105464191),
 ('lor...', 4.0430512678345503),
 ('think', 4.0342406381523954),
 ('feel', 4.0253516907351496),
 ('buy', 4.0073331852324712)

In [21]:
len(word_ratio_modified)

15691

In [22]:
#Bug in Code, so getting extra words => Comparision can be done using the words Corpus and total_words set
for key, value in word_ratio_modified.items():
    if not key in wordsCorpus:
        print(key)

In [23]:
'Free' in wordsCorpus

True

In [24]:
word_ratio_modified['Free']

-1.9740810260220096

In [25]:
index = word2idx.get('Free', None)
index

7168

In [26]:
def word_to_vecRatio(message):
    word_vec = np.zeros(len(vocab), dtype=np.float)
    for word in message.split(' '):
        index = word2idx.get(word, None)
        # print("index :", index, " word : ", word )
        if index is None:
            continue
        else:
            word_vec[index] += word_ratio_modified[word]
    return np.array(word_vec)


In [27]:
#Converting entire messages to the arrays
word_vec_messages = np.zeros((len(messages.message), len(vocab)), dtype=np.float)
for index, message in enumerate(messages.message):
    if index == 2:
        print("message input:", message)
    word_vec_messages[index] = word_to_vecRatio(message)

val =  Counter(word_vec_messages[2])
print("\n ", val)

message input: Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

  Counter({0.0: 15667, -1.0986122886681098: 3, -1.3862943611198906: 3, -1.6094379124341003: 3, 0.99663171170193643: 1, -0.88238918019847368: 1, 1.6739764335716716: 1, -2.3025850929940455: 1, 2.7861397556181369: 1, -1.9740810260220096: 1, 2.4282006222936996: 1, 0.53062825106217038: 1, -0.40546510810816444: 1, -1.824549292051046: 1, -2.1546649629174235: 1, -6.5161930760429643: 1, -3.2188758248682006: 1, -2.3978952727983707: 1, -1.5404450409471491: 1})


In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(word_vec_messages, messages.label, test_size=0.33, random_state=42)

print("training data : ", len(X_train), "\ntesting data : ", len(X_test), "\notal num of records : ", len(X_train) + len(X_test))

training data :  3733 
testing data :  1839 
otal num of records :  5572


In [32]:
from sklearn.naive_bayes import GaussianNB

spam_detector = GaussianNB().fit(X_train, y_train)
spam_detector.score(X_test, y_test)

# Accuracy got decrease might be due to overfitting.
# ToDo: Remove the stop words and apply proper Stemming and lemmatization

0.94779771615008157

### Preprocessing Using Sklearn Library 

In [35]:
import numpy
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(messages.message)
targets = messages.label

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(counts, targets, test_size=0.33, random_state=42)
X_train = X_train.toarray()
X_test = X_test.toarray()

In [37]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB().fit(X_train, y_train)
print("Using MultiNomialNB : ", classifier.score(X_test, y_test))

from sklearn.naive_bayes import GaussianNB

spam_detector = GaussianNB().fit(X_train, y_train)
print("Using GaussianNB : ", spam_detector.score(X_test, y_test))

Using MultiNomialNB :  0.983686786297
Using GaussianNB :  0.907014681892


In [38]:
from sklearn.feature_extraction.text import TfidfTransformer

messages_tfidf = TfidfTransformer().fit(counts).transform(counts)

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(messages_tfidf, targets, test_size=0.33, random_state=42)
X_train = X_train.toarray()
X_test = X_test.toarray()

In [40]:
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB().fit(X_train, y_train)
print("Using MultiNomialNB : ", classifier.score(X_test, y_test))

from sklearn.naive_bayes import GaussianNB

spam_detector = GaussianNB().fit(X_train, y_train)
print("Using GaussianNB : ", spam_detector.score(X_test, y_test))

Using MultiNomialNB :  0.960848287113
Using GaussianNB :  0.907014681892
