In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Ham or Spam?

In [2]:
import pandas as pd

df = pd.read_csv("emails.csv")

df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


The dataset is made up of email that are classified as ham [0] or spam[1]. You need to clean the dataset before training a prediction model.

## Remove Punctuation

👇 Create a function to remove the punctuation. Apply it to the entire data and add the output as a new column in the dataframe called `clean_text`

In [3]:
def clean_text(df, column):
    try:
        df['clean_text'] = df[column].str.replace('[^\w\s]','')
    except:
        pass
    return df

clean_text(df, 'text').sample(3)

Unnamed: 0,text,spam,clean_text
4826,"Subject: hello all , the program for the 2000...",0,Subject hello all the program for the 2000 t...
5480,Subject: file cabinet for mike roberts vince ...,0,Subject file cabinet for mike roberts vince ...
3813,Subject: fw : mckinsey partner specializing in...,0,Subject fw mckinsey partner specializing in a...


## Lower Case

👇 Create a function to lower case the text. Apply it to `clean_text`

In [4]:
def lowercase(df, column):
    try:
        df[column] = df[column].str.lower()
    except:
        pass
    return df

lowercase(df, 'clean_text').sample(3)

Unnamed: 0,text,spam,clean_text
3050,"Subject: re : vince , sorry for getting back...",0,subject re vince sorry for getting back a ...
3402,"Subject: re : your visit to enron frank , gr...",0,subject re your visit to enron frank great...
541,Subject: returned mail : can ' t create output...,1,subject returned mail can t create output t...


## Remove Numbers

👇 Create a function to remove numbers from the text. Apply it to `clean_text`

In [5]:
def remove_num(df, column):
    try:
        df[column] = df[column].str.replace('[\d+]', '')
    except:
        pass
    return df

remove_num(df, 'clean_text').sample(3)

Unnamed: 0,text,spam,clean_text
4547,"Subject: re : vacation in march , april stins...",0,subject re vacation in march april stinson ...
1437,Subject: re : term papers please respond to h...,0,subject re term papers please respond to her...
386,Subject: from mrs . fatima rasheed dear belov...,1,subject from mrs fatima rasheed dear belovet...


## Remove StopWords

👇 Create a function to remove stopwords from the text. Apply it to `clean_text`.

In [6]:
def remove_stopwords(text):
    stop = set(stopwords.words('english'))
    word_tokens = word_tokenize(str(text))
    return [i for i in word_tokens if i not in stop]

df['clean_text'] = df['clean_text'].apply(remove_stopwords)
df.sample(3)

Unnamed: 0,text,spam,clean_text
2994,"Subject: re : dabhol report narottam , i hav...",0,"[subject, dabhol, report, narottam, fact, rece..."
183,Subject: what ' s going on hi u . this is sar...,1,"[subject, going, hi, u, sarah, gal, hiding, wa..."
2279,Subject: off work all i will be taking the f...,0,"[subject, work, taking, following, days, work,..."


## Lemmatize

👇 Create a function to lemmatize the text. Make sure the output is a single string, not a list of words. Apply it to `clean_text`.

In [7]:
def word_lemmatizer(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(w) for w in text])

df['clean_text'] = df['clean_text'].apply(word_lemmatizer)
df.sample(3)

Unnamed: 0,text,spam,clean_text
3437,Subject: re : information dear mr kaminski : ...,0,subject information dear mr kaminski thank muc...
757,Subject: http : / / www . shackleton . net he...,1,subject http www shackleton net hello visited ...
5581,Subject: foreign language lessons fyi ! - - ...,0,subject foreign language lesson fyi forwarded ...


## Bag-of-words Modelling

👇 Vectorize the `clean_text` to a Bag-of-Words representation with a default CountVectorizer . Save as `X_bow`.

In [8]:
vect = CountVectorizer()
count_matrix = vect.fit_transform(df['clean_text'])
count_array = count_matrix.toarray()
X_bow = pd.DataFrame(data = count_array, columns = vect.get_feature_names())
X_bow

Unnamed: 0,aa,aaa,aaaenerfax,aadedeji,aagrawal,aal,aaldous,aaliyah,aall,aanalysis,...,zwzm,zxghlajf,zyban,zyc,zygoma,zymg,zzmacmac,zzn,zzncacst,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5725,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5726,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


👇 Cross-validate a MultinomialNB model with the Bag-of-words. Score the model's accuracy.

In [9]:
X = X_bow
y = df['spam']

X_train, X_test, y_train, y_test = train_test_split(X, y)

nb = MultinomialNB()

nb.fit(X_train, y_train)

y_predict = nb.predict(X_test)

print(f"Précision : {round(metrics.accuracy_score(y_test, y_predict)*100,2)}%")

Précision : 99.65%


⚠️ Please push the exercise once you are done 🙃

## 🏁 