In [20]:
import pandas as pd
import numpy as np

In [21]:
# load file
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
# count the number of regular and spam email
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [23]:
def get_spam_number(x):
    if x == 'spam':
        return 1
    return 0

In [39]:
# build machine learning model without handling the imbalance in the dataset
# write transformation function which can convert the text into numbers
# spam: 1; other emails: 0
# df['spam'] = df['Category'].apply(get_spam_number)
df['spam'] = df['Category'].apply(lambda x: 1 if x =='spam' else 0)


In [40]:
df.shape 
# total 5572 emails

(5572, 3)

In [41]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [42]:
# ========== Train test split ===========
from sklearn.model_selection import train_test_split
# X: message body
# y: spam or no spam
# text_size=0.2 : 20% of samples in test, 80% remain in training dataset
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [36]:
# after split, there are 4457 in training dataset (5572 x 80%)
X_train.shape

(4457,)

In [34]:
# test: 1115 (5572 x 20%)
X_test.shape

(1115,)

In [30]:
type(X_train)

pandas.core.series.Series

In [54]:
# get the first 4 samples (sample[0-3])
X_train[:4]

1227    Reply with your name and address and YOU WILL ...
2347    Its posible dnt live in  &lt;#&gt; century cm ...
4201                              I will come tomorrow di
1799       That one week leave i put know that time. Why.
Name: Message, dtype: object

In [55]:
# get the first email in train (index 1227)
X_train[:4][1227]

'Reply with your name and address and YOU WILL RECEIVE BY POST a weeks completely free accommodation at various global locations www.phb1.com ph:08700435505150p'

In [47]:
type(y_train)

pandas.core.series.Series

In [48]:
y_train[:4]

1227    1
2347    0
4201    0
1799    0
Name: spam, dtype: int64

In [51]:
X_train.values

array(['Reply with your name and address and YOU WILL RECEIVE BY POST a weeks completely free accommodation at various global locations www.phb1.com ph:08700435505150p',
       'Its posible dnt live in  &lt;#&gt; century cm frwd n thnk different',
       'I will come tomorrow di', ...,
       'Cool. So how come you havent been wined and dined before?',
       'I wish things were different. I wonder when i will be able to show you how much i value you. Pls continue the brisk walks no drugs without askin me please and find things to laugh about. I love you dearly.',
       "I'm at bruce &amp; fowler now but I'm in my mom's car so I can't park (long story)"],
      dtype=object)

In [49]:
type(X_train.values)

numpy.ndarray

In [52]:
# ======= Create bag of words representation using CountVectorizer ======

# source: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
from sklearn.feature_extraction.text import CountVectorizer

# create the count vectorizer
v = CountVectorizer()

# create an instance of that class 
X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7839 sparse matrix of type '<class 'numpy.int64'>'
	with 59355 stored elements in Compressed Sparse Row format>

In [53]:
# convert to a NumPy array (a sparse matrix)
X_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [56]:
# get the first 2 samples
X_train_cv.toarray()[:2]
# a sparse metric

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [62]:
# the first sample
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [57]:
X_train_cv.shape

(4457, 7839)

In [58]:
# give entire vocabulary
v.get_feature_names_out()

array(['00', '000', '008704050406', ..., 'èn', 'ú1', '〨ud'], dtype=object)

In [60]:
# get words from 1000 to 1050
v.get_feature_names_out()[1000:1050]

array(['answer', 'answering', 'answers', 'answr', 'antelope', 'antha',
       'anthony', 'anti', 'antibiotic', 'any', 'anybody', 'anyhow',
       'anymore', 'anyone', 'anyones', 'anyplaces', 'anythiing',
       'anythin', 'anything', 'anythingtomorrow', 'anytime', 'anyway',
       'anyways', 'anywhere', 'aom', 'apart', 'apartment', 'apeshit',
       'aphex', 'apo', 'apologise', 'apologize', 'apology', 'app',
       'apparently', 'appendix', 'applausestore', 'applebees', 'apples',
       'application', 'apply', 'applyed', 'applying', 'appointment',
       'appointments', 'appreciate', 'approaches', 'approaching',
       'appropriate', 'apps'], dtype=object)

In [61]:
v.get_feature_names_out().shape
# My vocabulary folks: 7839

(7839,)

In [63]:
# give all the methods that support on a variable
dir(v)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',

In [None]:
v.vocabulary_ # print vocabulary in accordance with (ứng với) index

# Cell -> Current Outputs -> Clear (cuz it occupies a lot of vertical space)

In [68]:
v.get_feature_names_out()[4228] #the word 'locations'
v.get_feature_names_out()[1828] #the word 'christmas'

'christmas'

In [74]:
# Convert to array
X_train_np = X_train_cv.toarray()
# the first 4 emails
X_train_np[:4]
# Get my first email
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [76]:
# get the indexes where the value is not zero
np.where(X_train_np[0]!=0)

(array([  69,  798,  833,  978, 1127, 1608, 1921, 1954, 3007, 3179, 4228,
        4747, 5242, 5245, 5399, 5707, 5798, 7347, 7537, 7617, 7647, 7740,
        7802, 7807], dtype=int64),)

In [78]:
X_train[:4]
# get my first sample (at index 1227)
X_train[:4][1227]

'Reply with your name and address and YOU WILL RECEIVE BY POST a weeks completely free accommodation at various global locations www.phb1.com ph:08700435505150p'

In [87]:
# X_train_np is my vector
# 69th position is a non-zero value, the same at other position (798, 833,...)
X_train_np[0][69]
X_train_np[0][833]

1

In [86]:
v.get_feature_names_out()[833]

'address'

In [89]:
# ========== Train the naive bayes model ===========
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

# MultinomialNB()
# In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [90]:
# convert X_test (simple email, word email) into count vectorizer
X_test_cv = v.transform(X_test)

In [91]:
# ========= Evaluate Performance ===========
from sklearn.metrics import classification_report

# do a prediction on X_test_cv
y_pred = model.predict(X_test_cv)

# first give the truth (y_test), the give prediction
print(classification_report(y_test, y_pred))

# 0: non-spam; 1: spam

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       986
           1       0.98      0.92      0.95       129

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [92]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

# result means the first email is not a spam (0), the second one is a spam (1)

array([0, 1], dtype=int64)

In [96]:
# ======== Train the model using sklearn pipeline and reduce number of lines of code =========
from sklearn.pipeline import Pipeline

# a convenient  API that we can use
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

# Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])
# In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
# On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

In [97]:
clf.fit(X_train, y_train)

In [99]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       986
           1       0.98      0.92      0.95       129

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

