In [1]:
import nltk
import pandas as pd
import numpy as np

In [2]:
# Load the tab-separated SMS corpus; the two column names are supplied
# explicitly through `names=` rather than taken from the file.
# NOTE(review): pandas' default quote handling may fold together lines that
# contain unbalanced double quotes - confirm the row count against the raw file.
message = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'message'],
    encoding="latin1",
)

In [3]:
message

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [5]:
import string
mess = 'Sample message ! Notice: it has punctuations.'

In [6]:
mess

'Sample message ! Notice: it has punctuations.'

In [7]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
# Keep only the characters of the sample that are not listed in
# string.punctuation. The result is a list of single characters, not a string.
nopunc = list(filter(lambda ch: ch not in string.punctuation, mess))

In [9]:
nopunc

['S',
 'a',
 'm',
 'p',
 'l',
 'e',
 ' ',
 'm',
 'e',
 's',
 's',
 'a',
 'g',
 'e',
 ' ',
 ' ',
 'N',
 'o',
 't',
 'i',
 'c',
 'e',
 ' ',
 'i',
 't',
 ' ',
 'h',
 'a',
 's',
 ' ',
 'p',
 'u',
 'n',
 'c',
 't',
 'u',
 'a',
 't',
 'i',
 'o',
 'n',
 's']

In [10]:
from nltk.corpus import stopwords

In [11]:
# Probe the stop-word corpus once. If it has never been downloaded on this
# machine, fetch it here instead of failing with a LookupError in a later cell.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# Collapse the list of kept characters back into a single string.
nopunc = ''.join(nopunc)

In [12]:
nopunc.split()

['Sample', 'message', 'Notice', 'it', 'has', 'punctuations']

In [13]:
# Build the stop-word lookup once. Leaving stopwords.words('english') inside the
# comprehension evaluates the call for every token and then scans a list.
english_stopwords = set(stopwords.words('english'))
clean_mess = [word for word in nopunc.split() if word.lower() not in english_stopwords]

In [14]:
clean_mess

['Sample', 'message', 'Notice', 'punctuations']

In [15]:
_ENGLISH_STOPWORDS = None  # lazily filled cache, see _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_process(mess, stop_words=None):
    """Tokenise a message: strip punctuation, split on whitespace, drop stop words.

    Parameters
    ----------
    mess : str
        Raw message text.
    stop_words : iterable of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used (loaded once and cached).

    Returns
    -------
    list of str
        The remaining tokens, in their original order and casing.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    elif not isinstance(stop_words, (set, frozenset)):
        # Hash-based membership keeps the per-token check O(1).
        stop_words = frozenset(stop_words)

    nopunc = ''.join(char for char in mess if char not in string.punctuation)
    # Compare case-insensitively, but return tokens with their original casing.
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [16]:
mess

'Sample message ! Notice: it has punctuations.'

In [17]:
text_process(mess)

['Sample', 'message', 'Notice', 'punctuations']

In [18]:
message['message'].head(5).apply(text_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# Learn the vocabulary from every message, using text_process as the analyzer
# so each message is reduced to its punctuation-free, non-stop-word tokens.
bow_transformer = CountVectorizer(analyzer=text_process).fit(message['message'])

In [21]:
mess4 = message['message'][3]

In [22]:
mess4

'U dun say so early hor... U c already then say...'

In [23]:
bow4 = bow_transformer.transform([mess4])

In [24]:
bow4

<1x11396 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [25]:
bow4.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
message_bow = bow_transformer.transform(message['message'])

In [27]:
message_bow

<5572x11396 sparse matrix of type '<class 'numpy.int64'>'
	with 50529 stored elements in Compressed Sparse Row format>

In [28]:
# NOTE: this densifies the whole 5572 x 11396 int64 matrix (roughly 0.5 GB)
# just to display it; a row slice such as message_bow[:5].toarray() is cheaper.
message_bow.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the TF-IDF weighting on the bag-of-words counts of the full corpus.
tfidf_transformer = TfidfTransformer().fit(message_bow)

In [30]:
bow4

<1x11396 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [31]:
tfidf4 = tfidf_transformer.transform(bow4)

In [32]:
tfidf4.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

In [33]:
message_tfidf = tfidf_transformer.transform(message_bow)

In [34]:
message_tfidf

<5572x11396 sparse matrix of type '<class 'numpy.float64'>'
	with 50529 stored elements in Compressed Sparse Row format>

In [35]:
# NOTE: this densifies the whole 5572 x 11396 float64 matrix (roughly 0.5 GB)
# just to display it; a row slice such as message_tfidf[:5].toarray() is cheaper.
message_tfidf.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
from sklearn.naive_bayes import MultinomialNB # it is best for sparse dataset
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Fit Naive Bayes on the TF-IDF features of *all* messages. Any prediction this
# model makes for a row of `message` is therefore on data it was trained on.
spam_detection_model = MultinomialNB().fit(message_tfidf,message['label'])

In [38]:
spam_detection_model.predict(tfidf4)

array(['ham'], dtype='<U4')

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for evaluation. Fixing random_state makes the
# split, and every metric computed from it, repeatable across kernel restarts.
# stratify keeps the ham/spam ratio the same in both partitions.
msg_train, msg_test, label_train, label_test = train_test_split(
    message['message'],
    message['label'],
    test_size=0.3,
    random_state=42,
    stratify=message['label'],
)

In [40]:
msg_train

3752        Haha... Sounds crazy, dunno can tahan anot...
3565      Do you always celebrate NY's with your family ?
2633                       I WILL CAL YOU SIR. In meeting
5385    How do friends help us in problems? They give ...
3366                               Sorry, I'll call later
                              ...                        
1443               Its ok., i just askd did u knw tht no?
24      Ffffffffff. Alright no way I can meet up with ...
3220    Me too baby! I promise to treat you well! I be...
800               Gimme a few was  &lt;#&gt;  minutes ago
4571    Wanna have a laugh? Try CHIT-CHAT on your mobi...
Name: message, Length: 3900, dtype: object

In [41]:
msg_train.shape

(3900,)

In [42]:
msg_test.shape

(1672,)

In [43]:
from sklearn.pipeline import Pipeline

In [44]:
# Chain tokenisation/counting, TF-IDF weighting and the classifier so the same
# preprocessing is applied at fit and predict time.
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),
    # random_state pins the forest's bootstrap/feature sampling so that
    # re-running the notebook reproduces the same model and metrics.
    ('classifier', RandomForestClassifier(n_estimators=10, random_state=42)),
])

In [None]:
# Fit every pipeline step on the training messages and labels only.
pipeline.fit(msg_train,label_train)

In [None]:
# Predict labels for the held-out messages with the fitted pipeline.
predictions = pipeline.predict(msg_test)

In [None]:
predictions

In [None]:
label_test

In [None]:
from sklearn.metrics import confusion_matrix
# Tabulate held-out labels (first argument) against the pipeline's predictions.
print(confusion_matrix(label_test,predictions))

In [None]:
from sklearn.metrics import classification_report
# Text summary comparing the held-out labels with the pipeline's predictions.
print(classification_report(label_test,predictions))