In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('SMSSpamCollection', delimiter='\t', header=None)

In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
print('Number of spam messages:', df[df[0] == 'spam'][0].count())

Number of spam messages: 747


In [5]:
print('Number of ham messages:', df[df[0] == 'ham'][0].count())

Number of ham messages: 4825


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split

In [7]:
X_train_raw, X_test_raw, y_train, y_test = train_test_split(df[1], df[0])

In [8]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

In [9]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

In [10]:
for i, prediction in enumerate(predictions[:5]):
    print('{} {}'.format(prediction, X_test[i]))

ham   (0, 7296)	0.145741661642
  (0, 7192)	0.142998188879
  (0, 6856)	0.56867134943
  (0, 6746)	0.196321322542
  (0, 6719)	0.0844996593804
  (0, 6629)	0.179807254752
  (0, 6608)	0.20948719384
  (0, 6600)	0.102742927708
  (0, 6509)	0.332522588758
  (0, 5266)	0.229662770621
  (0, 4788)	0.130395902779
  (0, 4771)	0.155004238331
  (0, 4688)	0.137914266357
  (0, 3582)	0.332522588758
  (0, 3088)	0.17471642099
  (0, 3031)	0.142034805527
  (0, 1745)	0.284335674715
  (0, 921)	0.113841116267
  (0, 873)	0.155004238331
ham   (0, 6979)	0.160778016582
  (0, 6598)	0.140546158247
  (0, 4747)	0.136802253563
  (0, 3641)	0.245259921487
  (0, 3494)	0.357577295575
  (0, 3271)	0.138840237696
  (0, 2182)	0.357577295575
  (0, 1738)	0.590729155488
  (0, 1042)	0.189271320198
  (0, 918)	0.40999048579
  (0, 910)	0.21829681322
ham   (0, 6983)	0.481120846338
  (0, 3652)	0.282817893742
  (0, 2572)	0.54620102297
  (0, 2283)	0.329641006753
  (0, 2066)	0.417039363613
  (0, 1069)	0.32804906427
ham   (0, 6914)	0.46704492

In [11]:
scores = cross_val_score(classifier, X_train, y_train, cv=5)

In [12]:
scores

array([ 0.95340502,  0.9474313 ,  0.9508982 ,  0.96167665,  0.9508982 ])

In [13]:
np.mean(scores)

0.95286187481667484

In [14]:
precisions = cross_val_score(classifier, X_train, y_train, cv=5)

In [15]:
print(precisions)

[ 0.95340502  0.9474313   0.9508982   0.96167665  0.9508982 ]


In [21]:
print(np.mean(precisions))

0.952861874817
