In [1]:
import pandas as pd


In [2]:
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): `knn` is not referenced by any later cell visible in this notebook;
# the models actually trained below are MultinomialNB and LogisticRegression.
knn = KNeighborsClassifier()


In [51]:
# example text for model training (SMS messages)
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']


In [52]:
from sklearn.feature_extraction.text import CountVectorizer 
vect = CountVectorizer()

In [53]:
# learn the 'vocabulary' of the training data (occurs in-place)
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [54]:
# examine the fitted vocabulary (one entry per column of the document-term matrix)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs
get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
# normalise to a plain list of str so the display is the same on every version
list(map(str, get_names()))

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [55]:
# transform training data into a 'document-term matrix'
# rows = documents, columns = terms
# here we have 3 documents & 6 terms, so the result is a 3x6 matrix
# (transform returns it as a scipy sparse matrix)

simple_train_dtm = vect.transform(simple_train)
simple_train_dtm


<3x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [56]:
# convert sparse matrix to a dense matrix
# a sparse matrix treats every entry as zero by default and stores only
# the coordinates and values of the non-zero entries; a dense matrix
# stores every entry, zeros included, and therefore occupies
# more memory
simple_train_dtm.toarray()

array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]], dtype=int64)

In [57]:
# show the training document-term matrix with the vocabulary as column labels
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older installs
get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
pd.DataFrame(simple_train_dtm.toarray(), columns=get_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0


In [58]:
# check the type of the document-term matrix
type(simple_train_dtm)

scipy.sparse.csr.csr_matrix

In [59]:
# examine the sparse matrix contents
print(simple_train_dtm)

  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (1, 0)	1
  (1, 1)	1
  (1, 2)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	2


In [60]:
# example text for model testing
# ("don't" is not in the training vocabulary, so it will be ignored by transform)
simple_test = [" please don't call me "]

In [61]:
vect.transform(simple_test)

<1x6 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [62]:
# transform testing data into a document-term matrix (using existing vocabulary)
simple_test_dtm = vect.transform(simple_test)
simple_test_dtm.toarray()

array([[0, 1, 1, 1, 0, 0]], dtype=int64)

In [63]:
# examine the vocabulary and document-term matrix together
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older installs
get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
pd.DataFrame(simple_test_dtm.toarray(), columns=get_names())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,1,1,0,0


In [64]:
#==================================================
#==================================================
#==================================================

In [65]:
# read the tab-separated SMS dataset into pandas directly from a URL
# the file has no header row (header=None), so the two column names
# are supplied explicitly via `names`
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
sms = pd.read_table(url, header=None, names=['label', 'message'])

In [66]:
# examine the shape
sms.shape

(5572, 2)

In [67]:
# examine the first 5 rows (head() is called without an argument here)
sms.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [68]:
# examine the class distribution
sms.label.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [69]:
# encode the text label as an integer column: ham -> 0, spam -> 1
sms['label_num'] = sms['label'].map({'ham': 0, 'spam': 1})

In [70]:
sms.head(10)

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [71]:
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = sms.message
y = sms.label_num
print(X.shape)
print(y.shape)

(5572,)
(5572,)


In [72]:
# split X & y into training & testing sets
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection. The fallback keeps the
# cell working on very old installs that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# random_state fixes the shuffle so the split (and every score below) is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [73]:
# instantiate the vectorizer
vect = CountVectorizer()

In [74]:
# learn the training data vocabulary and build the training
# "document-term matrix" in a single pass; fit_transform(X) is the
# one-step equivalent of calling fit(X) followed by transform(X)
X_train_dtm = vect.fit_transform(X_train)

In [75]:
# examine the dtm matrix
X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [76]:
# transform testing data into a document-term matrix using the vocabulary
# already fitted on the training data (transform only; vect is not refit here)
X_test_dtm = vect.transform(X_test)
X_test_dtm

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [77]:
# import & instantiate a Multinomial Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [78]:
%time nb.fit(X_train_dtm, y_train)
# train on X_train_dtm rather than X_train: it is the numeric
# document-term matrix the vectorizer produced from the raw X_train text

Wall time: 0 ns


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [79]:
y_pred_class = nb.predict(X_test_dtm)

In [80]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.98851399856424982

In [81]:
metrics.confusion_matrix(y_test,y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [82]:
# [ true_neg   false_pos
#   false_neg  true_pos  ]

In [84]:
# print message text for the false positives: ham messages incorrectly
# classified as spam (predicted 1 while the true label is 0, hence pred > true)
X_test[ y_pred_class > y_test ]


574               Waiting for your call.
3375             Also andros ice etc etc
45      No calls..messages..missed calls
3415             No pic. Please re-send.
1988    No calls..messages..missed calls
Name: message, dtype: object

In [85]:
# print message text for the false negatives: spam messages incorrectly
# classified as ham (predicted 0 while the true label is 1, hence pred < true)
X_test[ y_pred_class < y_test ]

3132    LookAtMe!: Thanks for your purchase of a video...
5       FreeMsg Hey there darling it's been 3 week's n...
3530    Xmas & New Years Eve tickets are now on sale f...
684     Hi I'm sue. I am 20 years old and work as a la...
1875    Would you like to see my XXX pics they are so ...
1893    CALL 09090900040 & LISTEN TO EXTREME DIRTY LIV...
4298    thesmszone.com lets you send free anonymous an...
4949    Hi this is Amy, we will be sending you a free ...
2821    INTERFLORA - It's not too late to order Inter...
2247    Hi ya babe x u 4goten bout me?' scammers getti...
4514    Money i have won wining number 946 wot do i do...
Name: message, dtype: object

In [86]:
# show the full text of one false negative, looked up by its index label
X_test.loc[3132]

"LookAtMe!: Thanks for your purchase of a video clip from LookAtMe!, you've been charged 35p. Think you can do better? Why not send a video in a MMSto 32323."

In [87]:
nb.predict_proba(X_test_dtm)
# columns follow nb.classes_, so each row is
# [ probability of class 0 (ham), probability of class 1 (spam) ]

array([[  9.97122551e-01,   2.87744864e-03],
       [  9.99981651e-01,   1.83488846e-05],
       [  9.97926987e-01,   2.07301295e-03],
       ..., 
       [  9.99998910e-01,   1.09026171e-06],
       [  1.86697467e-10,   1.00000000e+00],
       [  9.99999996e-01,   3.98279868e-09]])

In [89]:
y_pred_prob = nb.predict_proba(X_test_dtm)[:,1]
y_pred_prob

array([  2.87744864e-03,   1.83488846e-05,   2.07301295e-03, ...,
         1.09026171e-06,   1.00000000e+00,   3.98279868e-09])

In [91]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob)

0.98664310005369615

In [94]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [95]:
%time logreg.fit(X_train_dtm, y_train)

Wall time: 29.1 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [96]:
# class predictions from logistic regression
# NOTE: this rebinds y_pred_class, which held the Naive Bayes predictions until now
y_pred_class = logreg.predict(X_test_dtm)

In [97]:
# predicted probability of class 1 (spam) from logistic regression
# NOTE: this rebinds y_pred_prob, which held the Naive Bayes probabilities until now
y_pred_prob = logreg.predict_proba(X_test_dtm)[:,1]
y_pred_prob

array([ 0.01269556,  0.00347183,  0.00616517, ...,  0.03354907,
        0.99725053,  0.00157706])

In [98]:
metrics.accuracy_score(y_test,y_pred_class)

0.9877961234745154

In [99]:
metrics.roc_auc_score(y_test,y_pred_prob)

0.99368176123143015