#### Import libraries

In [3]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\trinh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\trinh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---

#### Read dataset

In [4]:
# Load the spam dataset from a CSV file located next to the notebook.
df = pd.read_csv('2cls_spam_text_cls.csv')
# Print column dtypes and non-null counts, then preview the first five rows.
df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Pull the raw message texts and their category labels out as NumPy arrays.
messages = df['Message'].to_numpy()
categories = df['Category'].to_numpy()

---

#### Preprocess

In [6]:
def lowercase(text):
    """Return a copy of ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered


def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    # A translation table that only has a "delete" set drops those characters.
    return text.translate(str.maketrans('', '', string.punctuation))


def tokenize(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    tokens = text.split()
    return tokens


def remove_stopword(tokens, stopwords=None):
    """Return the tokens that are not stopwords, preserving their order.

    Args:
        tokens: Iterable of word strings.
        stopwords: Optional iterable of words to drop. When omitted, NLTK's
            English stopword list is used.

    Returns:
        A new list containing only the tokens that are not stopwords.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')

    # A set gives constant-time membership tests; the corpus list would be
    # scanned linearly for every single token.
    stopword_set = set(stopwords)

    return [token for token in tokens if token not in stopword_set]


def stemming(tokens):
    """Return a list with the Porter stem of each token, in the same order."""
    stem = nltk.PorterStemmer().stem
    return list(map(stem, tokens))


def preprocess_text(text):
    """Turn a raw message string into a list of normalized tokens.

    The steps run in this order: lowercase, strip punctuation, split on
    whitespace, remove stopwords, stem.
    """
    cleaned = remove_punctuation(lowercase(text))
    kept = remove_stopword(tokenize(cleaned))
    return stemming(kept)

Apply the preprocessing steps to the message data

In [7]:
# Tokenize from the raw DataFrame column rather than from `messages` itself:
# re-running this cell would otherwise pass token lists back into
# preprocess_text, which calls `.lower()` and fails on a list.
messages = [preprocess_text(message) for message in df['Message'].to_numpy()]

In [8]:
# Sanity check: show the token list produced for the first message.
print(messages[0])

['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']


---

#### Build vocabulary

Message --preprocess--> a list of words <br>
Scan through all the lists and build the vocabulary

In [9]:
def build_vocabulary(messages):
    """Collect every distinct word across all messages, in first-seen order.

    Args:
        messages: Iterable of token lists (one list of hashable words per
            message).

    Returns:
        A list of the unique words, ordered by their first appearance.
    """
    # dict keys are unique and keep insertion order, so this de-duplicates in
    # one pass instead of scanning a growing list for every word.
    unique_words = dict.fromkeys(
        word for message in messages for word in message
    )

    return list(unique_words)

In [10]:
# Build the vocabulary from the tokenized messages and report how many
# distinct words it contains.
vocabulary = build_vocabulary(messages)
print(len(vocabulary))

8190


---

#### Create features (bag of words)

In [11]:
def create_features(tokens, vocabulary):
    """Build a bag-of-words count vector for one tokenized message.

    Args:
        tokens: Iterable of words from a single message.
        vocabulary: Sequence of words; a word's position is its column index.

    Returns:
        A 1-D float NumPy array of length ``len(vocabulary)`` whose i-th entry
        counts how often ``vocabulary[i]`` occurs in ``tokens``. Tokens that
        are not in the vocabulary are ignored.
    """
    features = np.zeros(len(vocabulary))

    # Resolve every word's column once, instead of scanning the vocabulary
    # list twice (`in` + `.index`) for each token. `setdefault` keeps the
    # first position of a repeated word, matching `list.index`.
    word_to_index = {}
    for position, word in enumerate(vocabulary):
        word_to_index.setdefault(word, position)

    for word in tokens:
        position = word_to_index.get(word)
        if position is not None:
            features[position] += 1

    return features

In [27]:
# Stack one bag-of-words vector per message into a single feature array.
x = np.array([create_features(message, vocabulary) for message in messages])
print(len(x))

5572


---

#### Label encoding 
Ham -> 0 <br>
Spam -> 1

In [13]:
# Encode the string categories as integer labels; the printed classes below
# show the mapping 'ham' -> 0, 'spam' -> 1.
le = LabelEncoder()
y = le.fit_transform(categories)
print(f'Classes: {le.classes_}')
print(f'Labels: {y}')

Classes: ['ham' 'spam']
Labels: [0 0 1 ... 0 0 0]


---

#### Train, test, validation split

In [14]:
# Fraction of the full dataset held out for validation.
VAL_SIZE = 0.2
# Fraction of the remaining 80% held out for testing
# (0.125 * 0.8 = 10% of the full dataset, leaving 70% for training).
TEST_SIZE = 0.125
SEED = 0
IS_SHUFFLE = True

# First split: carve the validation set out of the full dataset.
x_train, x_val, y_train, y_val = train_test_split(
    x, y,
    test_size=VAL_SIZE,
    shuffle=IS_SHUFFLE, 
    random_state=SEED
)

# Second split: carve the test set out of what is left; the remainder is the
# final training set.
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train,
    test_size=TEST_SIZE,
    shuffle=IS_SHUFFLE,
    random_state=SEED
)

In [15]:
# Report how many samples ended up in each split.
print(f'Number of training samples: {len(x_train)}')
print(f'Number of validation samples: {len(x_val)}')
print(f'Number of test samples: {len(x_test)}')

Number of training samples: 3899
Number of validation samples: 1115
Number of test samples: 558


#### Train model

In [16]:
# Fit a Gaussian Naive Bayes classifier on the training split
# (`fit` returns the fitted estimator itself).
model = GaussianNB().fit(x_train, y_train)

In [17]:
# Validation split: predict, then score.
y_val_pred = model.predict(x_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Test split: predict, then score.
y_test_pred = model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f'Val accuracy: {val_accuracy}')
print(f'Test accuracy: {test_accuracy}')

Val accuracy: 0.8816143497757848
Test accuracy: 0.8620071684587813


---

#### Predict a user's input text

In [18]:
def predict(text, model, vocabulary, le):
    """Classify a raw text string and return its decoded class label.

    The text is preprocessed into tokens, converted to a bag-of-words vector
    over ``vocabulary``, passed to ``model.predict`` as a one-row batch, and
    the predicted integer label is mapped back through ``le``.
    """
    features = create_features(preprocess_text(text), vocabulary)
    # model.predict is given a 2-D input: one row holding this sample.
    batch = np.array(features).reshape(1, -1)
    encoded = model.predict(batch)

    return le.inverse_transform(encoded)[0]

In [19]:
# Try the trained model on two hand-written example sentences.
for test_input in ("Do you free on Sunday?", "This is our new product"):
    prediction = predict(test_input, model, vocabulary, le)
    print(prediction)

ham
spam


---

#### Version 2: Don't use a library model

##### Create the probability tables so we can plug P(w|c) and P(c) into the Bayes formula

In [49]:
# `y` becomes a column vector so it can be stacked next to `x`; `xy` holds the
# feature columns with the label in the last column.
y = y.reshape(-1, 1)
xy = np.hstack((x, y))

# NOTE(review): the tables below are built from the full dataset (`x`, `y`),
# not only from the training split.

# Select the rows of each class with a boolean mask (label 0 = ham, 1 = spam).
labels = y.ravel()
x_ham = x[labels == 0]
x_spam = x[labels == 1]

# Laplace (add-one) smoothing: every word gets one extra count, so the
# denominator grows by the number of words in the vocabulary (the number of
# feature columns) and each class's P(w|c) sums to 1.
vocabulary_size = x.shape[1]

# Create ham vocabulary probability table
total_ham_words = x_ham.sum()
ham_vocabulary_probability = (
    (np.sum(x_ham, axis=0) + 1) / (total_ham_words + vocabulary_size)
)

# Create spam vocabulary probability table
total_spam_words = x_spam.sum()
spam_vocabulary_probability = (
    (np.sum(x_spam, axis=0) + 1) / (total_spam_words + vocabulary_size)
)

# Class priors P(c): the share of messages belonging to each class.
ham_prob = len(x_ham) / len(x)
spam_prob = len(x_spam) / len(x)

##### Use the Bayes formula to see which is bigger: P(ham|w1, w2, ...) or P(spam|w1, w2, ...)

In [51]:
# Use log space to prevent underflow
def predict_text(text):
    """Classify ``text`` with the hand-built Naive Bayes tables.

    Reads the module-level ``vocabulary``, ``ham_vocabulary_probability``,
    ``spam_vocabulary_probability``, ``x_ham`` and ``x_spam``.

    Args:
        text: Raw message string.

    Returns:
        0 if the ham score is greater than or equal to the spam score,
        otherwise 1.
    """
    tokens = preprocess_text(text)

    # Look up each token's vocabulary column once. Words that never appeared
    # in the vocabulary have no table entry, so they are skipped instead of
    # letting `vocabulary.index` raise ValueError on unseen input.
    word_to_index = {word: idx for idx, word in enumerate(vocabulary)}
    token_ids = [word_to_index[token] for token in tokens if token in word_to_index]

    # Priors are taken from the class row counts so that each class score is
    # guaranteed to use its own P(c).
    n_ham, n_spam = len(x_ham), len(x_spam)
    ham_prior = n_ham / (n_ham + n_spam)
    spam_prior = n_spam / (n_ham + n_spam)

    # log P(c) + sum of log P(w|c) over the known tokens
    full_ham = np.log(ham_prior) + np.log(ham_vocabulary_probability[token_ids]).sum()
    full_spam = np.log(spam_prior) + np.log(spam_vocabulary_probability[token_ids]).sum()

    return 0 if full_ham >= full_spam else 1
    
# Run the hand-built classifier on one ham-like and one spam-like message.
for text in (
    "Nah I don't think he goes to usf, he lives around here though",
    "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
):
    print(predict_text(text))

0
1
