In [25]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
try:
    # scikit-learn >= 0.18 provides train_test_split in model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for old releases that only ship the since-removed cross_validation module.
    from sklearn.cross_validation import train_test_split

In [11]:
# Load the SMS collection straight from the tutorial's GitHub repository.
# Use the raw-content URL: the github.com '/blob/' page is an HTML view, not the TSV itself.
# To read a local copy instead, point this at a path such as 'data/sms.tsv'.
url = 'https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'
# The file is tab-separated and has no header row, so the column names are supplied here.
sms = pd.read_csv(url, sep='\t', header=None, names=['label', 'message'])

In [12]:
# Number of rows and columns in the loaded DataFrame.
sms.shape

(5572, 2)

In [13]:
# Peek at the first 10 rows to check that the labels and message text parsed correctly.
sms.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [17]:
# Class balance: number of messages carrying each label.
sms['label'].value_counts()

ham     4825
spam     747
dtype: int64

In [18]:
# Encode the text label as an integer target: ham -> 0, spam -> 1.
label_to_num = {'ham': 0, 'spam': 1}
sms['label_num'] = sms['label'].map(label_to_num)
# Series.map produces NaN for any value missing from the dict; fail loudly here
# rather than carrying NaN targets into the train/test split and the classifiers.
assert sms['label_num'].notnull().all(), 'unexpected value in sms.label'

In [20]:
# Confirm the label_num column now appears alongside label and message.
sms.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [22]:
# Feature series (raw message text) and target series (0 = ham, 1 = spam).
X = sms['message']
y = sms['label_num']
# Show both shapes, one per line; they are one-dimensional and should match in length.
print('\n'.join(str(part.shape) for part in (X, y)))

(5572,)
(5572,)


In [26]:
# Hold out a test set; random_state pins the shuffle so the split is repeatable.
# test_size=0.25 spells out scikit-learn's default 75/25 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1)

In [28]:
# Sizes of the four pieces produced by the split, one per line, in the order
# X_train, X_test, y_train, y_test.
print('\n'.join(str(part.shape) for part in (X_train, X_test, y_train, y_test)))

(4179,)
(1393,)
(4179,)
(1393,)


In [29]:
# Bag-of-words vectorizer with all default settings.
vect = CountVectorizer()

In [31]:
# Two-step form: fit() learns the vocabulary from the training messages and
# returns the vectorizer itself, so transform() can be chained onto it to build
# the training document-term matrix.
X_train_dtm = vect.fit(X_train).transform(X_train)

In [32]:
# One-call form: learn the vocabulary and build the training document-term matrix together.
X_train_dtm = vect.fit_transform(X_train)

In [33]:
# Display the sparse-matrix summary for the training set (documents x vocabulary terms).
X_train_dtm

<4179x7456 sparse matrix of type '<type 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [34]:
# Only transform() here (no fit): the test messages are encoded with the vocabulary
# already learned from the training set, so both matrices share the same columns.
X_test_dtm = vect.transform(X_test)

In [35]:
# Display the sparse-matrix summary for the test set.
X_test_dtm

<1393x7456 sparse matrix of type '<type 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

In [36]:
from sklearn.naive_bayes import MultinomialNB

In [37]:
# Multinomial Naive Bayes classifier with default hyperparameters.
nb = MultinomialNB()

In [39]:
# Train on the training document-term matrix; the %time magic reports how long the fit takes.
%time nb.fit(X_train_dtm, y_train)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 8.53 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [41]:
# Predicted class (0 = ham, 1 = spam) for each test message from the Naive Bayes model.
y_pred_class = nb.predict(X_test_dtm)

In [42]:
from sklearn import metrics

In [43]:
# Fraction of test messages the Naive Bayes model classified correctly.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.98851399856424982

In [44]:
# Confusion matrix for Naive Bayes: rows are true classes, columns are predicted
# classes (class order 0 = ham, 1 = spam).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
# Logistic regression classifier with default hyperparameters, for comparison with Naive Bayes.
lr = LogisticRegression()

In [47]:
# Train on the same training document-term matrix; the %time magic reports how long the fit takes.
%time lr.fit(X_train_dtm, y_train)

CPU times: user 68 ms, sys: 0 ns, total: 68 ms
Wall time: 139 ms


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
# Predicted class for each test message from logistic regression.
# NOTE: this reuses the name y_pred_class, so the Naive Bayes predictions are overwritten.
y_pred_class = lr.predict(X_test_dtm)

In [49]:
# Fraction of test messages the logistic regression model classified correctly.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.9877961234745154

In [50]:
# Confusion matrix for logistic regression: rows are true classes, columns are
# predicted classes (class order 0 = ham, 1 = spam).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1207,    1],
       [  16,  169]])

In [53]:
# predict_proba returns one column per class; keep column 1, the predicted
# probability of class 1 (spam), for each test message.
y_pred_prob = lr.predict_proba(X_test_dtm)[:,1]

In [54]:
# Area under the ROC curve, computed from the predicted spam probabilities
# rather than from the hard class predictions.
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

0.99368176123143015