## 1. Import Datasets

In [1]:
import pandas as pd

In [2]:
# Load the raw email dataset from emails.csv (path is relative to the working directory)
datasets = pd.read_csv("emails.csv")

In [3]:
datasets.columns

Index(['text', 'spam'], dtype='object')

In [4]:
datasets.shape

(5728, 2)

In [5]:
# Drop exact duplicate rows. The surviving rows keep their original index
# labels, so the index of `data` is no longer a contiguous 0..n-1 range;
# prefer positional access (.iloc) when iterating over it.
data = datasets.drop_duplicates()

In [6]:
data.shape

(5695, 2)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5695 entries, 0 to 5727
Data columns (total 2 columns):
text    5695 non-null object
spam    5695 non-null int64
dtypes: int64(1), object(1)
memory usage: 133.5+ KB


In [8]:
data.columns

Index(['text', 'spam'], dtype='object')

In [9]:
data['text'].head(10)

0    Subject: naturally irresistible your corporate...
1    Subject: the stock trading gunslinger  fanny i...
2    Subject: unbelievable new homes made easy  im ...
3    Subject: 4 color printing special  request add...
4    Subject: do not have money , get software cds ...
5    Subject: great nnews  hello , welcome to medzo...
6    Subject: here ' s a hot play in motion  homela...
7    Subject: save your money buy getting this thin...
8    Subject: undeliverable : home based business f...
9    Subject: save your money buy getting this thin...
Name: text, dtype: object

In [10]:
type(data['text'][0])

str

In [11]:
# Label-based lookup: [0] selects the row whose index label is 0
data['text'][0]

"Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  ma

## 2. Clean the texts

In [68]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Build the stop-word lookup and the stemmer once, outside the loop.
# Rebuilding the stop-word list for every single word (as a list, with linear
# membership tests) makes the cleaning step needlessly slow.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []

# Only the first 2155 rows are cleaned here; the label vector built later must
# cover exactly the same rows.
# .iloc selects by position. `data` keeps its original index labels after
# drop_duplicates(), so a label lookup such as data['text'][i] raises KeyError
# for any label that was dropped.
for raw_email in data['text'].iloc[:2155]:
    ## Step 1: replace every non-letter character (digits, punctuation) with a space
    email = re.sub('[^a-zA-Z]', ' ', raw_email)
    ## Step 2: lowercase all letters
    email_lower = email.lower()
    ## Step 3: split the string into a list of words
    email_words = email_lower.split()
    ## Every email starts with 'Subject', so drop that first word from each email.
    email_words = email_words[1:]
    ## Step 4: remove stop words such as the, an, this, that
    email = [word for word in email_words if word not in stop_words]
    ## Step 5: stemming, keep only the root of each word
    email = [ps.stem(word) for word in email]
    ## Re-join the stems into one space-separated string per email
    email = ' '.join(email)
    corpus.append(email)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xinyang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Natural Language Toolkit - NLTK

## 3. Creating bag of words model 
A very popular NLP model. It is used to preprocess the texts to be classified before fitting classification algorithms on the observations containing those texts.

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of words: one row per cleaned email, one column per distinct token,
# each cell holding the token count. Converted to a dense array.
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
# Take labels for exactly the rows that were cleaned into `corpus`.
# The length comes from `corpus` rather than a repeated literal so X and y
# cannot drift apart; .iloc makes the positional selection explicit.
y = data['spam'].iloc[:len(corpus)].values

## 4. Split the data

In [70]:
# Hold out one third of the samples as the test set. The split is stratified
# on y and uses a fixed random_state so it is repeatable.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=0,
    stratify=y,
)


## 5. Applying Machine Learning Models

In [71]:
from sklearn.naive_bayes import GaussianNB
# fit() returns the estimator itself, so construction and training are chained
nb_clf = GaussianNB().fit(X_train, y_train)
# Predicted labels for the held-out test rows
y_pred = nb_clf.predict(X_test)

In [74]:
from sklearn.metrics import classification_report
# Text summary of per-class precision, recall and F1 on the test set
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.98      0.94       263
           1       0.99      0.94      0.96       456

   micro avg       0.95      0.95      0.95       719
   macro avg       0.94      0.96      0.95       719
weighted avg       0.96      0.95      0.95       719



In [73]:
# Clean one hand-written sentence with the same letter-only / lowercase /
# stop-word / stemming steps, and collect the result in `test`.
test = []
Xin = "Hello! Everyone! We don't have class next Thursday. It is spring break."
## Step 1: replace every non-letter character with a space
new = re.sub('[^a-zA-Z]', ' ', Xin)
## Step 2: lowercase all letters
new_lower = new.lower()
## Step 3: split the string into a list of words
new_words = new_lower.split()
## Step 4: remove stop words such as the, an, this, that
## (the stop-word set is built once instead of once per word)
stop_words = set(stopwords.words('english'))
new = [word for word in new_words if word not in stop_words]
## Step 5: stemming, keep only the root of each word
ps = PorterStemmer()
new = [ps.stem(word) for word in new]
new = ' '.join(new)
test.append(new)