In [20]:
import pandas as pd
import numpy as np
import string

In [7]:
# Load the tab-separated SMS corpus. Because `names=` is supplied, pandas
# treats the first line of the file as data rather than as a header row.
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=['Classifier', 'Message'],
)

In [9]:
# Preview the first rows to check that both columns were parsed.
messages.head()

Unnamed: 0,Classifier,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
# Summary of the text columns: row count, distinct values, most frequent value.
messages.describe()

Unnamed: 0,Classifier,Message
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [14]:
# Same summary, computed separately for each label, to compare the two classes.
messages.groupby('Classifier').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Classifier,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [17]:
# Character count of each message. The vectorised string accessor replaces a
# per-row Python call and yields NaN (instead of raising) for a missing value.
messages['Length'] = messages['Message'].str.len()

In [25]:
# Preview again; the frame now carries the Length column added above.
messages.head()

Unnamed: 0,Classifier,Message,Length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [35]:
# Requires the NLTK stopwords corpus. One-time setup if it is missing:
#     import nltk; nltk.download('stopwords')
from nltk.corpus import stopwords

In [39]:
def clean_message(message, stop_words=None):
    """
    Tokenise a raw message for the bag-of-words model.

    Steps, in the order they are applied:
    1. Removes punctuation (every character in ``string.punctuation``)
    2. Splits the remaining text on whitespace into a word list
    3. Removes stopwords (case-insensitive match; kept words keep their case)

    Parameters
    ----------
    message : str
        The raw message text.
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted, NLTK's English stopword
        list is used; it is loaded once per call rather than once per word.

    Returns
    -------
    list of str
        The surviving words, in their original order and casing.
    """
    if stop_words is None:
        # A set gives constant-time membership checks in the filter below.
        stop_words = set(stopwords.words('english'))

    # NOTE(review): punctuation is stripped *before* the stopword comparison,
    # so a contraction such as "don't" is compared as "dont" -- confirm that
    # this is the intended behaviour.
    without_punctuation = message.translate(
        str.maketrans('', '', string.punctuation)
    )
    return [
        word
        for word in without_punctuation.split()
        if word.lower() not in stop_words
    ]

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
# Learn the vocabulary from every message, tokenising with clean_message.
bagofwords_transform = (
    CountVectorizer(analyzer=clean_message)
    .fit(messages['Message'])
)

In [48]:
# Vocabulary size: the number of distinct tokens the vectorizer learned.
len(bagofwords_transform.vocabulary_)

11425

In [49]:
# Turn every message into a row of token counts over the fitted vocabulary.
messages_bow = bagofwords_transform.transform(messages['Message'])

In [51]:
# Shape is (number of messages, vocabulary size).
print(messages_bow.shape)

(5572, 11425)


In [52]:
from sklearn.feature_extraction.text import TfidfTransformer

In [58]:
# Learn the inverse-document-frequency weights from the count matrix.
tfidf_transformer = (
    TfidfTransformer()
    .fit(messages_bow)
)

In [59]:
# Re-weight the raw token counts using the fitted TF-IDF transformer.
tfid_messages =tfidf_transformer.transform(messages_bow)

In [60]:
from sklearn.naive_bayes import MultinomialNB

# Fit a Naive Bayes classifier on the TF-IDF features of the full data set.
# This model has seen every message, so it is not the one scored below; the
# evaluated model is the pipeline fitted on the training split.
spam_detect_model = (
    MultinomialNB()
    .fit(tfid_messages, messages['Classifier'])
)

In [62]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation. A fixed random_state makes the
# split, and therefore the reported metrics, repeatable across runs.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['Message'],
    messages['Classifier'],
    test_size=0.2,
    random_state=42,
)

In [65]:
from sklearn.pipeline import Pipeline

# Chain token counting, TF-IDF weighting and the classifier into one
# estimator so that raw message text can be passed straight to fit/predict.
pipelinemodel = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=clean_message)),
    ('tfidf', TfidfTransformer()),
    ('naiveBaise', MultinomialNB()),
])

In [66]:
# Fit every pipeline step on the training split only.
pipelinemodel.fit(msg_train,label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function clean_message at 0x000002B9A889E048>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...f=False, use_idf=True)), ('naiveBaise', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [67]:
# Predict labels for the held-out messages.
predicted = pipelinemodel.predict(msg_test)

In [68]:
from sklearn.metrics import classification_report,confusion_matrix

In [71]:
# Per-class precision, recall and F1 on the held-out split.
print(classification_report(label_test,predicted))

             precision    recall  f1-score   support

        ham       0.96      1.00      0.98       958
       spam       1.00      0.73      0.85       157

avg / total       0.96      0.96      0.96      1115



In [72]:
# Confusion matrix: rows are the true labels, columns the predicted labels.
print(confusion_matrix(label_test,predicted))

[[958   0]
 [ 42 115]]
