In [31]:
#import data
import pandas as pd
emails = pd.read_csv("enron.csv", sep=";")
emails.head()

Unnamed: 0,id,subject,text,hamSpam
0,1,christmas tree farm pictures,,1
1,2,vastar resources inc .,gary production from the high island larger ...,1
2,3,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,1
3,4,re : issue,fyi - see note below - already done . stella -...,1
4,5,meter 7268 nov allocation,fyi . - - - - - - - - - - - - - - - - - - - - ...,1


In [32]:
#drop unused data 
emails.drop(['id', 'subject'], axis="columns", inplace=True)
emails.head()

Unnamed: 0,text,hamSpam
0,,1
1,gary production from the high island larger ...,1
2,- calpine daily gas nomination 1 . doc,1
3,fyi - see note below - already done . stella -...,1
4,fyi . - - - - - - - - - - - - - - - - - - - - ...,1


In [33]:
#1412 spam and 3663 ham emails
emails.groupby("hamSpam").describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
hamSpam,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1412,1353,click here to be removed,19
1,3663,3415,> ricky a . archer fuel supply 700 louisiana ...,21


In [34]:
#split data into training and test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(emails.text, emails.hamSpam, test_size=0.1)

In [35]:
#transform text into numeric values
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values.astype('str'))
X_train_count.toarray()[:3]


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [37]:
#create Multinomial Naive Bayes model, multinomial because we have discrete data (count of each word in text)
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count, y_train)

MultinomialNB()

In [38]:
#test with 2 random emails. first spam second ham
email = [
    'try cortizyte ! with our pharmaceutical grade all - natural weight - loss formula   you will : - lose pounds and inches like crazy - jump start your metabolism - boost your energy level - lose your appetite and feel satisfied - reduce cholesterol levels - eliminate cellulite and excess fat pockets - stimulate your body  s natural fat - burning ability most important :',
    'gary   production from the high island larger block a - 1 # 2 commenced on saturday at 2 : 00 p . m . at about 6   500 gross . carlos expects between 9   500 and 10   000 gross for tomorrow . vastar owns 68 % of the gross production . george x 3 - 6992 - - - - - - - - - - - - - - - - - - - - - - forwarded by george weissman / hou / ect on 12 / 13 / 99 10 : 16'
]
emails_count = v.transform(email)
model.predict(emails_count)



array([0, 1], dtype=int64)

In [39]:
#accuracy
X_test_count = v.transform(X_test.values.astype('str'))
model.score(X_test_count, y_test)

0.9768339768339769

In [40]:
def get_score(model, X_train, X_test, y_train, y_test):
    v = CountVectorizer()
    X_train_count = v.fit_transform(X_train.values.astype('str'))
    X_test_count = v.transform(X_test.values.astype('str'))
    model.fit(X_train_count, y_train)
    return model.score(X_test_count, y_test)

In [41]:
# K-Folds cross-validator
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)

In [42]:
#create 10 folds and their score
scores = []
for train_index, test_index in kf.split(emails.text):
    X_train, X_test, y_train, y_test = emails.text[train_index], emails.text[test_index], \
                                        emails.hamSpam[train_index], emails.hamSpam[test_index]
    scores.append(get_score(model, X_train, X_test, y_train, y_test))

In [43]:
scores

[0.971042471042471,
 0.9903474903474904,
 0.9864603481624759,
 0.9903288201160542,
 0.9845261121856866,
 0.9787234042553191,
 0.9477756286266924,
 0.9729206963249516,
 0.9245647969052224,
 0.90715667311412]

In [46]:
#does same as the for loop above (doesnt work rn)
#from sklearn.model_selection import cross_val_score
#cross_val_score(model, emails.text, emails.hamSpam)

Traceback (most recent call last):
  File "C:\Users\O.Y\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\O.Y\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 615, in fit
    X, y = self._check_X_y(X, y)
  File "C:\Users\O.Y\anaconda3\lib\site-packages\sklearn\naive_bayes.py", line 480, in _check_X_y
    return self._validate_data(X, y, accept_sparse='csr')
  File "C:\Users\O.Y\anaconda3\lib\site-packages\sklearn\base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "C:\Users\O.Y\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 73, in inner_f
    return f(**kwargs)
  File "C:\Users\O.Y\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 796, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "C:\Users\O.Y\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 73, in inner_f
    

array([nan, nan, nan, nan, nan])