In [145]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# `sklearn.cross_validation` was removed in scikit-learn 0.20; prefer the
# `model_selection` module and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

%matplotlib inline

## Let us utilize Bayesian classifiers to write a program for spam detection. 

## First, we will read in our dataset and use ".head" to visualize a sample. 

In [146]:
# header=None: treat the first line as data rather than column names, so
# pandas labels the columns with integers (0, 1, 2, ...).
spam = pd.read_csv('spamdata.txt', header=None)

In [147]:
# Preview the first five rows to check that the file parsed as expected.
spam.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


## Next, we will use ".pop" to remove the final column (the labels) from the main dataframe, then preview the updated dataframe.

In [149]:
# Split the target column (57, the last one) off from the feature columns.
# `pop` mutates `spam` in place, so guard it: re-running this cell after the
# column is already gone would otherwise raise KeyError. On a re-run
# `final_col` simply keeps the value assigned by the first run.
if 57 in spam.columns:
    final_col = spam.pop(57)

In [169]:
# Preview again: column 57 should no longer be part of the dataframe.
spam.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,50,51,52,53,54,55,56
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191


## Time to split the data into training and test sets (60/40, respectively)

In [187]:
# Keep 60% of the rows for training and hold out the rest for evaluation;
# the fixed random_state makes the shuffle (and so the split) reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    spam,
    final_col,
    train_size=0.6,
    random_state=50,
)

## Next, we will create the MultinomialNB classifier object and fit it with our training data to "teach" it spam detection.

In [211]:
# Multinomial naive Bayes classifier, created with its default hyperparameters.
nb_train = MultinomialNB()

In [212]:
# Fit on the training split only; the test split stays unseen for evaluation.
nb_train.fit(train_X, train_y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## As we can see below, our training score is only .79. This is much lower than ideal (0.95+)

In [225]:
# Mean accuracy on the training split, i.e. on rows the model was fitted on.
print(
    "Score:",
    nb_train.score(train_X, train_y),
)

Score: 0.790942028986


## Next, we will score our test data (the other 40% we didn't use for training)

In [230]:
# Mean accuracy on the held-out test split, i.e. on rows the model never saw.
print(
    "Score:",
    nb_train.score(test_X, test_y),
)

Score: 0.797392721347


As we can see, it's basically the same as the training score. Although we would prefer a much higher
test score, it's promising that the model did just as well on data it had not seen before.

## If we wanted, we could throw our test emails at our classifier and, utilizing `predict`, get an array of guesses.

In [232]:
# Predicted class label for every row of the held-out test split.
nb_train.predict(test_X)

array([1, 0, 0, ..., 1, 1, 1])

In [233]:
# Classify a single email (row 4). The list indexer `iloc[[4]]` keeps the row
# as a 1-row DataFrame; `iloc[4]` would return a 1-D Series, which
# scikit-learn estimators reject because they expect 2-D input of shape
# (n_samples, n_features).
nb_train.predict(spam.iloc[[4]])



array([1])