### Reading a text-based dataset into pandas


In [None]:
# load the tab-separated SMS file from the working directory;
# the file has no header row, so the column names are supplied here
import pandas as pd
sms = pd.read_table(
    'sms.tsv',
    header=None,
    names=['label', 'message'],
)

In [None]:
# preview the first ten rows of the loaded data
sms.head(n=10)

In [3]:
# dimensions of the DataFrame as (number of rows, number of columns)
sms.shape

(5572, 2)

In [4]:
# examine the class distribution: how many messages carry each label
sms['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
# add a numeric version of the label column: ham -> 0, spam -> 1
sms['label_n'] = sms['label'].map({'ham': 0, 'spam': 1})

In [6]:
# check that the new numeric label column was added as expected
sms.head(n=5)

Unnamed: 0,label,message,label_n
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [8]:
# features are the raw message text; the target is the numeric label
X = sms['message']
y = sms['label_n']
# print the shape of each on its own line
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [12]:
# hold out part of the data for testing; random_state fixes the shuffle
# so the same split is produced on every run
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# print the shape of every piece, one per line
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(4179,)
(1393,)
(4179,)
(1393,)


#### Vectorizing our dataset

In [13]:
# import and instantiate CountVectorizer (no arguments, so default settings)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [14]:
# step 1: fit the vectorizer on the training messages only
vect.fit(X_train)
# step 2: transform the training messages into a document-term matrix
X_train_dtm = vect.transform(X_train)

In [16]:
# same fit and transform on X_train as the previous cell, combined into a
# single call; this re-fits vect and overwrites X_train_dtm
X_train_dtm = vect.fit_transform(X_train)
# examine the resulting document-term matrix
X_train_dtm

<4179x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 55209 stored elements in Compressed Sparse Row format>

In [17]:
# transform the testing messages into a document-term matrix; only
# transform() is called here, so vect stays fitted on the training data
X_test_dtm = vect.transform(X_test)
# examine the resulting document-term matrix
X_test_dtm

<1393x7456 sparse matrix of type '<class 'numpy.int64'>'
	with 17604 stored elements in Compressed Sparse Row format>

#### Building and evaluating a model

In [18]:
# The multinomial Naive Bayes classifier is suitable for classification with
# discrete features (e.g., word counts for text classification). The
# multinomial distribution normally requires integer feature counts. However,
# in practice, fractional counts such as tf-idf may also work.

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [19]:
%time 
nb.fit(X_train_dtm, y_train)

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 10.3 µs


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# make class predictions for the testing document-term matrix
y_pred_class = nb.predict(X_test_dtm)
# display the predicted class labels
y_pred_class

array([0, 0, 0, ..., 0, 1, 0])

In [22]:
# compare the predicted classes with the true test labels: accuracy
from sklearn import metrics
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.9885139985642498

In [23]:
# confusion matrix: rows are the true classes, columns are the predicted classes
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1203,    5],
       [  11,  174]])

In [24]:
# calculate predicted probabilities for X_test_dtm (poorly calibrated);
# [:, 1] keeps only the second column of predict_proba's output -- presumably
# the probability of class 1 (spam), given the 0/1 label mapping used above
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
# display the predicted probabilities
y_pred_prob

array([2.87744864e-03, 1.83488846e-05, 2.07301295e-03, ...,
       1.09026171e-06, 1.00000000e+00, 3.98279868e-09])

In [25]:
# area under the ROC curve, computed from the predicted probabilities
# rather than the hard class predictions
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

0.9866431000536962