In [1]:
#Using the Ham/Spam Text Messaging Corpus - Streamlined for testing and reconfiguring

#For Preprocessing
import nltk
import pandas as pd
import string
from nltk.corpus import stopwords

#For Vectorizing and Weighting
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#For Training
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split

#For Streamlining
from sklearn.pipeline import Pipeline

In [2]:
# Step 1: Importing Data
messages = pd.read_csv('SMSSpamCollection', sep = '\t', names = ["label", "message"]) 
messages.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Step 2: Text Preprocessing
def text_process(mess):
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english') ]

In [17]:
# Split data

msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['label'], test_size = 0.2)

In [18]:
# Step 3: Vectorize, Weight, and Classify
# All within a pipeline

pipeline = Pipeline([
        ('bow', CountVectorizer(analyzer = text_process)), # strings to token integer counts
        ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores
        ('classifier', MultinomialNB()), # train on TF-IDF vectors w/ Naive Bayes classifier
    ])

In [19]:
#Magic
pipeline.fit(msg_train, label_train)



Pipeline(steps=[('bow', CountVectorizer(analyzer=<function text_process at 0x00000000095899E8>,
        binary=False, decode_error=u'strict', dtype=<type 'numpy.int64'>,
        encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preproces...         use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [20]:
# Step_: Predictions
predictions = pipeline.predict(msg_test)



In [21]:
print classification_report(predictions, label_test)

             precision    recall  f1-score   support

        ham       1.00      0.96      0.98      1005
       spam       0.71      1.00      0.83       110

avg / total       0.97      0.96      0.96      1115



In [None]:
#End

In [27]:
phoney = pipeline.predict("somebody")
print phoney

['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham']


In [28]:
msg_test.head()

5460    December only! Had your mobile 11mths+? You ar...
4318    HIYA STU WOT U UP 2.IM IN SO MUCH TRUBLE AT HO...
376                               Can you say what happen
1818                    Well i will watch shrek in 3D!!B)
5479    Really do hope the work doesnt get stressful. ...
Name: message, dtype: object

In [4]:
messages['message'].head(5).apply(text_process)

0    [Go, jurong, point, crazy, Available, bugis, n...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, 2, wkly, comp, win, FA, Cup, fin...
3        [U, dun, say, early, hor, U, c, already, say]
4    [Nah, dont, think, goes, usf, lives, around, t...
Name: message, dtype: object

In [5]:
messages['message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [6]:
bow_transformer = CountVectorizer(analyzer = text_process)
bow_transformer.fit(messages['message'][0])
print len(bow_transformer.vocabulary_)

17


In [10]:
bow_transformer = CountVectorizer(analyzer = text_process)
bow_transformer.fit(messages['message'][3])
print len(bow_transformer.vocabulary_)

8
