In [3]:
import pandas as pd
import numpy as np

In [4]:
# the SMS file is tab-separated and has no header row, so the two column names are supplied here
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
sms = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

In [5]:
# (rows, columns) of the loaded DataFrame
sms.shape

(5572, 2)

In [6]:
# preview the first five rows
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# encode the text label as a number so the model can use it: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
sms['label_num'] = sms['label'].map(label_codes)
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# define X (the message text) and y (the numeric label) from the sms data set for use with CountVectorizer
X, y = sms['message'], sms['label_num']
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


- X is currently 1D because it will be passed to CountVectorizer, which turns it into a 2D matrix
- CountVectorizer must always be given a 1D object; it turns that into the 2D object the model is built on

In [10]:
# Split X and y into a training set and a testing set.
# train_test_split holds out 25% of the rows for testing by default (75% train / 25% test);
# test_size is spelled out here so the ratio is explicit, and random_state=1 makes the
# split reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


Why are we splitting into training and testing sets before vectorizing?

Background of train/test split

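- Train/test split is for model evaluation
    - Model evaluation is meant to simulate the future
    - Past data is exchangeable for future data
    - We pretend that some of our data is coming from the future
    - By training, predicting, and evaluating on the data, we can check the performance of our model
    - If we vectorized before splitting, the vocabulary would also be learned from the test messages, so the test set would no longer behave like unseen future data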
    


In [12]:
# vectorizing our dataset
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

# learn the vocabulary from the training data only and build the training
# document-term matrix in a single step (equivalent to vect.fit(X_train)
# followed by vect.transform(X_train), without processing the messages twice)
X_dtm_train = vect.fit_transform(X_train)

X_dtm_train

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [13]:
# transform the testing data into a document-term matrix using the vocabulary already fitted on X_train
X_dtm_test = vect.transform(X_test)
X_dtm_test

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

 # Building and Evaluating a Model

 We will use the Multinomial Naive Bayes model

 > The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.
 

In [14]:
# import and instantiate a Multinomial Naive Bayes classifier with its default parameters
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()

In [15]:
# train the model on the training document-term matrix (X_dtm_train) and its labels;
# the %time line magic reports how long the fit takes
%time nb.fit(X_dtm_train, y_train)

CPU times: user 6.2 ms, sys: 6.11 ms, total: 12.3 ms
Wall time: 31.9 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# make class predictions for the testing document-term matrix (X_dtm_test)
y_pred_class = nb.predict(X_dtm_test)

In [18]:
# calculate the accuracy of the class predictions against the true test labels
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)


0.9885139985642498

In [22]:
# class distribution of the test set
print(y_test.value_counts())
# there is a majority class of 0 here, hence the classes are skewed

# calculate null accuracy (for multi-class classification problems)
# .head(1) selects the most frequent class, because value_counts() sorts in descending order
null_accuracy = y_test.value_counts().head(1) / len(y_test)
print('Null accuracy:', null_accuracy)

# Manual calculation of null accuracy by always predicting the majority class.
# The counts are read from y_test rather than typed in by hand, so the figure
# stays correct if the data or the train/test split changes.
class_counts = y_test.value_counts()
print('Manual null accuracy:', (class_counts.max() / class_counts.sum()))

0    1208
1     185
Name: label_num, dtype: int64
Null accuracy: 0    0.867193
Name: label_num, dtype: float64
Manual null accuracy: 0.8671931083991385


In this case, we can see that our accuracy (0.9885) is higher than the null accuracy (0.8672)

In [23]:
# confusion matrix: rows are the true classes, columns are the predicted classes (ham = 0, spam = 1)
metrics.confusion_matrix(y_test, y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [24]:
# false positives: messages whose true label is ham (0) but which the model predicted as spam (1)
X_test.loc[(y_test == 0) & (y_pred_class == 1)]

574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [25]:
# false negatives: messages whose true label is spam (1) but which the model predicted as ham (0)
X_test.loc[(y_test == 1) & (y_pred_class == 0)]

3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: message, dtype: object

In [None]:
# calculate predicted probabilities for X_dtm_test
# TODO (Virendra): this is a test TODO; the probability calculation is not written yet
