### Step 1: Import relevant libraries

In [2]:
import string 
import nltk # Natural Language Toolkit
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


### Step 2: Read dataset

In [38]:
DATASET_PATH = './2cls_spam_text_cls.csv'
df = pd.read_csv(DATASET_PATH)

messages = df['Message'].values.tolist()
labels = df['Category'].values.tolist()

In [39]:
len(messages)

5572

### Step 3: Preprocessing

#### Preprocess feature datas

<p align="center">
    <img src="./assets/preprocessing.png" alt="Preprocessing Step">
</p>

In [40]:
def lower_case(text):
    return text.lower()

def punctuation_removal ( text ) :
    translator = str.maketrans ('', '', string.punctuation)
    return text.translate(translator)

def tokenize(text):
    return text.split(' ')

def remove_stopword(token):
    stop_words = nltk.corpus.stopwords.words('english')
    return [word for word in token if word not in stop_words]

def stemming(token):
    stemmer = nltk.PorterStemmer()
    return [stemmer.stem(word) for word in token]

def preprocess_text(text):
    text = punctuation_removal(lower_case(text))
    token = stemming(remove_stopword(tokenize(text)))
    return token

messages = [preprocess_text(message) for message in messages]

#### Build dictionary, and create features 

In [41]:
def create_dictionary(messages):
    dictionary = []
    for token in messages:
        for word in token:
            if word not in dictionary:
                dictionary.append(word)
    return dictionary

dictionary = create_dictionary(messages)

In [42]:
len(dictionary)

8192

In [46]:
# features represent the frequency of each word in dictionary 
def create_features(token, dictionary):
    features = np.zeros(len(dictionary))
    for word in token:
        if word in dictionary:
            features[dictionary.index(word)] += 1
    return features

X = np.array([create_features(token, dictionary) for token in messages])

#### Preprocess labels

In [45]:
le = LabelEncoder()
y = le.fit_transform(labels)
print(f'Classes: {le.classes_}')
print(f'Encoded labels: {y}')

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


### Step 4: Split dataset to train/val/test

In [47]:
# split train/val/test to 7:2:1
VAL_SIZE = 0.2
TEST_SIZE = 0.125
SEED = 0

X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  test_size = VAL_SIZE, 
                                                  shuffle=True, 
                                                  random_state=SEED)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                    test_size=TEST_SIZE, 
                                                    shuffle=True, 
                                                    random_state=SEED)

### Step 5: Train and evaluate model

In [48]:
%%time
model = GaussianNB()
model = model.fit(X_train, y_train)

CPU times: total: 2.81 s
Wall time: 5.88 s


In [50]:
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Val accuracy: 0.8816143497757848
Test accuracy: 0.8620071684587813


### Step 6: Final classification

In [60]:
def predict(text, model, dictionary):
    token = preprocess_text(text)
    features = create_features(token, dictionary).reshape(1,-1)
    prediction = model.predict(features)
    prediction_cls = le.inverse_transform(prediction)[0] # inverse from 0 or 1 to 'ham' or 'spam'
    return prediction_cls

test_1 = 'Hello nice to meet u'
prediction_cls_1 = predict(test_1, model, dictionary)
print(f'Prediction 1: {prediction_cls_1}')
test_2 = 'you have WON a guaranteed £1000000 cash from Mr Beast!'
prediction_cls_2 = predict(test_2, model, dictionary)
print(f'Prediction 2: {prediction_cls_2}')

Prediction 1: ham
Prediction 2: spam
