In [226]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

In [227]:
#Loading symbol
import itertools
import threading
import time
import sys

done = True
#here is the animation
def animate():
    for c in itertools.cycle(['|', '/', '-', '\\']):
        if done:
            break
        sys.stdout.write('\rloading ' + c)
        sys.stdout.flush()
        time.sleep(0.1)
    sys.stdout.write('\rDone!     ')

In [228]:
#ingest csv and preview dataframe
done = False
t = threading.Thread(target=animate)
t.start()
df = pd.read_csv('ham-spam.csv',engine='python')
done = True

Done!     

In [229]:
#Preview occurences of each category and most frequent items for each category
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


## Get Dummy Variables

In [230]:
df['spam'] = pd.get_dummies(df['Category'])['spam']
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


## Partition Data

In [231]:
X_train, X_test, y_train, y_test = train_test_split(df['Message'], df['spam'], test_size=0.3)

## Bag of Words

In [232]:
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Naive Bayes

In [233]:
model = MultinomialNB()
model.fit(X_train_count,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [234]:
#Test array
emails = [
    'Since your name matches the one under the account and you knew the email, I\'ll assume it\'s really you - I\'ve updated the email to testemail@gmail.com, so you should be able to perform a PW reset now.',
    'Congratulations! you are going to receive 250 emails/ day from all categories!'
]
#Counts occurences of words in email array
emails_count=v.transform(emails)
#Run model on our test array
model.predict(emails_count)

array([0, 1], dtype=uint8)

In [235]:
X_test_count = v.transform(X_test)
model.score(X_test_count, y_test)

0.9820574162679426

## Pipeline

In [236]:
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [237]:
#Retrain model using pipeline
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [238]:
clf.score(X_test,y_test)

0.9820574162679426

## Test on new data set

In [239]:
import os
import sys
import numpy

In [240]:
SOURCES = [
    ('data/spam', 'spam'),
    ('data/easy_ham', 'ham'),
    ('data/hard_ham', 'ham'),
    ('data/beck-s', 'ham'),
    ('data/farmer-d', 'ham'),
    ('data/kaminski-v', 'ham'),
    ('data/kitchen-l', 'ham'),
    ('data/lokay-m', 'ham'),
    ('data/williams-w3', 'ham'),
    ('data/BG', 'spam'),
    ('data/GP', 'spam'),
    ('data/SH', 'spam')
]

def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in 'cmds':
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    past_header, lines = False, []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        if past_header:
                            lines.append(line)
                        elif line == '\n':
                            past_header = True
                    f.close()
                    content = '\n'.join(lines)
                    yield file_path, content


def build_data_frame(l, path, classification):
    rows = []
    for i, (file_name, text) in enumerate(read_files(path)):
        rows.append({'Message': text, 'Category': classification})
   
    data_frame = DataFrame(rows)
    return data_frame, len(rows)

def load_data():
    data = DataFrame({'Message': [], 'Category': []})
    l = 0
    for path, classification in SOURCES:
        data_frame, nrows = build_data_frame(l, path, classification)
        data = data.append(data_frame)
        l += nrows
    return data

In [241]:
done = False
t = threading.Thread(target=animate)
t.start()
df2 = load_data()
done = True

loading |

In [242]:
df2['spam'] = pd.get_dummies(df2['Category'])['spam']

Done!     

## Test Model on New Data Frame

In [243]:
clf.score(df2['Message'],df2['spam'])

0.6500763877100662

## Append New Data to Data Frame

In [244]:
df3 = pd.concat([df, df2], ignore_index=True, sort=True).drop(columns='Category')
df3

Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
64477,"<html>\n\n<head>\n\n<meta http-equiv=""content-...",1
64478,This is a multi-part message in MIME format.\n...,1
64479,"Dear Subscriber,\n\n\n\nIf I could show you a ...",1
64480,****Mid-Summer Customer Appreciation SALE!****...,1


## Partition Data Frame

In [245]:
X_train, X_test, y_train, y_test = train_test_split(df3['Message'], df3['spam'], test_size=0.3)

## Retrain Model

In [246]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

## Check Performance

In [247]:
clf.score(X_test,y_test)

0.9208064099250453

In [253]:
emails = [
    'Since your name matches the one under the account and you knew the email, I\'ll assume it\'s really you - I\'ve updated the email to test@email.com, so you should be able to perform a PW reset now.',
    'Congratulations, you hav wone a new car.',
    'Save on phone plans, too\nSwappa is here to help you compare and save on your next phone plan. Check out reviews and prices to find the plan that fits your life.\nShare your experience\nLove (or hate) your carrier? Let other Swappa users know by writing a review. Your reviews help users like you save money on their phone plan.'
    ]
clf.predict(emails)

array([0, 1, 0], dtype=uint8)