Python for Data & Analytics<br>
Chapter 18, section 5

*** requires datafile: emails2.txt

In [1]:
# Read the data file of emails: one email per line, pipe-delimited.
# Only the isSpam label and the Message text columns are loaded.
import pandas as pd
data = pd.read_csv(
    'emails2.txt',
    sep='|',
    usecols=['isSpam', 'Message'],
)

In [2]:
# Count how many emails are spam vs. non-spam (isSpam: 1 = spam).
data['isSpam'].value_counts()

0    140
1    139
Name: isSpam, dtype: int64

In [3]:
# Look at the first 160 characters of an example (non-spam) message.
data['Message'].iloc[0][:160]

"just to update you on this project ' s status : based on a new report that scott mills ran for me from sitara , i have come up with the following counterparties"

In [4]:
# take out non-letters
def nonletterRemover(text):
    """Replace every non-alphabetic character in `text` with a space.

    Characters for which ``str.isalpha()`` is true are kept as they are;
    every other character (digits, punctuation, whitespace, ...) becomes
    one space, so the result has the same length as the input.

    Parameters
    ----------
    text : str
        The raw message text.

    Returns
    -------
    str
        The text with all non-letters blanked out.
    """
    return ''.join(ch if ch.isalpha() else ' ' for ch in text)
# Store the cleaned text in a new Message2 column; the original Message
# column is left as-is so the two can be compared later.
data['Message2'] = data['Message'].apply(nonletterRemover)

In [5]:
# Tokenize: turn each cleaned message string into a list of word tokens.
# NOTE: Message2 is overwritten in place, so this cell expects Message2 to
# still hold strings; re-running it would feed token lists to the tokenizer.
from nltk.tokenize import word_tokenize

def tokenizer(text):
    """Return the word tokens that NLTK's word_tokenize produces for `text`."""
    return word_tokenize(text)

data['Message2'] = data['Message2'].apply(tokenizer)

In [6]:
# Apply the Porter stemmer to every token, reducing each word to its stem
# (e.g. "update" -> "updat", as visible in the transformed text).
from nltk import PorterStemmer
porterStemmer = PorterStemmer()

def stemmer(words):
    """Return a list holding the Porter stem of each word in `words`."""
    return list(map(porterStemmer.stem, words))

data['Message2'] = data['Message2'].apply(stemmer)

In [7]:
# Rejoin each list of stemmed tokens into one space-separated string.
def rejoiner(words):
    """Join the tokens in `words` with single spaces."""
    return ' '.join(words)

data['Message2'] = data['Message2'].apply(rejoiner)

In [8]:
# Compare the initial (Message) and transformed (Message2) text for the
# first five rows, which are non-spam messages.
data.head(n=5)

Unnamed: 0,isSpam,Message,Message2
0,0,just to update you on this project ' s status ...,just to updat you on thi project s statu base ...
1,0,the above referenced meters need to be placed ...,the abov referenc meter need to be place on a ...
2,0,( see attached file : hpll 228 . xls ) - hpll ...,see attach file hpll xl hpll xl
3,0,"daren , it ' s in . bob - - - - - - - - - - - ...",daren it s in bob forward by robert cotten hou...
4,0,"daren , fyi . bob - - - - - - - - - - - - - - ...",daren fyi bob forward by robert cotten hou ect...


In [9]:
# Compare the initial (Message) and transformed (Message2) text for the
# last five rows, which are spam messages.
data.tail(n=5)

Unnamed: 0,isSpam,Message,Message2
274,1,we have very compet itive pricing on hundreds ...,we have veri compet itiv price on hundr of the...
275,1,"good day , your application has been pre - app...",good day your applic ha been pre approv on thu...
276,1,"dear sir or madam , would you refinance if you...",dear sir or madam would you refin if you knew ...
277,1,size = 1 > order confirmation . your order sho...,size order confirm your order should be ship b...
278,1,"if you ' re tired of traffic lights , speed ca...",if you re tire of traffic light speed camera a...


In [10]:
# Split into 25% test data and 75% train data.
# isSpam is the dependent variable, Message2 is the independent variable.
from sklearn.model_selection import train_test_split
train_text, test_text, train_labels, test_labels = train_test_split(
    data['Message2'],
    data['isSpam'],
    test_size=0.25,
    random_state=1,  # fixed seed so the split is the same on every run
)

In [11]:
# Peek at the training labels; the index shows which rows of `data` were
# selected for training.
train_labels

227    1
264    1
147    1
230    1
266    1
      ..
203    1
255    1
72     0
235    1
37     0
Name: isSpam, Length: 209, dtype: int64

In [12]:
# Build the bag-of-words vectorizer and extract count features.
# The vocabulary is learned from the training text only; the test text is
# then encoded with that same vocabulary.
from sklearn.feature_extraction.text import CountVectorizer
bow_vectorizer = CountVectorizer(
    ngram_range=(1, 1),  # single words (unigrams) only
    min_df=1,            # keep terms appearing in at least one document
)
bow_train_features = bow_vectorizer.fit_transform(train_text)
bow_test_features = bow_vectorizer.transform(test_text)

In [13]:
# Train a multinomial naive Bayes classifier on the bag-of-words counts.
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X=bow_train_features, y=train_labels)

MultinomialNB()

In [14]:
# Use the trained model to predict a label for each test email.
predictions = model.predict(X=bow_test_features)

In [15]:
# Number of emails in the test data.
test_labels.shape[0]

70

In [16]:
# Put the actual and predicted labels side by side, one row per test email.
# Both are passed as plain lists, so the rows get a fresh 0-based index
# rather than the original row labels from `data`.
test_results = pd.DataFrame({
    'actual': test_labels.tolist(),
    'predict': list(predictions),
})
test_results

Unnamed: 0,actual,predict
0,0,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
65,0,0
66,0,0
67,0,0
68,0,0


In [17]:
# Show only the test emails where the prediction differs from the actual label.
test_results[test_results['actual'].ne(test_results['predict'])]

Unnamed: 0,actual,predict
0,0,1
24,0,1
36,1,0
53,0,1


In [18]:
# Check accuracy using a confusion matrix
# (rows = actual class, columns = predicted class).
from sklearn import metrics
metrics.confusion_matrix(y_true=test_labels, y_pred=predictions)

array([[33,  3],
       [ 1, 33]])

Code from: Python for Data & Analytics, (c) 2023 Rose River Software, LLC