## Libraries

In [35]:
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>
[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1006)>


### PREPROCESSING

In [41]:
DATASET_PATH = "./Data/cls_spam_text_cls.csv"
df = pd.read_csv(DATASET_PATH)
messages = df["Message"].values.tolist()
labels = df["Category"].values.tolist()

In [42]:
messages[:1]

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...']

In [43]:
import string

class TextPreprocessor:
    def __init__(self, stopwords=None):
        if stopwords is None:
            # Danh sách stopwords đơn giản (bạn có thể mở rộng)
            self.stopwords = set([
                'a', 'an', 'the', 'is', 'are', 'am', 'in', 'on', 'at',
                'this', 'that', 'and', 'or', 'not', 'to', 'of', 'for', 'with', 'as', 'by', 'be'
            ])
        else:
            self.stopwords = set(stopwords)

    def lowercase(self, text):
        return text.lower()

    def remove_punctuation(self, text):
        return text.translate(str.maketrans('', '', string.punctuation))

    def tokenize(self, text):
        # tách theo dấu cách
        return text.split()

    def remove_stopwords(self, tokens):
        return [word for word in tokens if word not in self.stopwords]

    def stem(self, tokens):
        # Stemming đơn giản bằng cách cắt hậu tố phổ biến 
        suffixes = ['ing', 'ed', 'ly', 's', 'es']
        stemmed = []
        for word in tokens:
            for suffix in suffixes:
                if word.endswith(suffix) and len(word) > len(suffix) + 2:
                    word = word[: -len(suffix)]
                    break
            stemmed.append(word)
        return stemmed

    def preprocess(self, text):
        text = self.lowercase(text)
        text = self.remove_punctuation(text)
        tokens = self.tokenize(text)
        tokens = self.remove_stopwords(tokens)
        tokens = self.stem(tokens)
        return tokens


preprocess = TextPreprocessor()
messages = [preprocess.preprocess(message) for message in messages]

In [44]:
messages[:1]

[['go',
  'until',
  'jurong',
  'point',
  'crazy',
  'available',
  'only',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'there',
  'got',
  'amore',
  'wat']]

### Create a Dictionary

In [48]:
def create_dictionary(messages):
    dictionary = []
    for tokens in messages:
        for token in tokens:
            if token not in dictionary:
                dictionary.append(token)

    return dictionary

dictionary = create_dictionary(messages)

#### Bag of words

In [64]:
def create_features(tokens, dictionary):
    features = np.zeros(len(dictionary))
    for token in tokens:
        if token in dictionary:
            features[dictionary.index(token)] += 1
    return features

X = np.array([create_features(tokens, dictionary) for tokens in messages])


In [65]:
le = LabelEncoder()
y = le.fit_transform(labels)
print(f"Classes: {le.classes_}")
print(f"Encoded labels: {y}")

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


### Split data preparation

In [66]:
VAL_SIZE = 0.2
TEST_SIZE = 0.125 # 0.1 / (1 - 0.2)
SEED = 0

X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  test_size= VAL_SIZE,
                                                  shuffle=True,
                                                  random_state=SEED)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train,
                                                    test_size=TEST_SIZE,
                                                    shuffle=True,
                                                    random_state=SEED)

### Model

In [67]:
model = GaussianNB()
print("Start Training ....")
model.fit(X_train, y_train)
print("Training completed ....")

Start Training ....
Training completed ....


### EVALUATION

In [68]:
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

val_accuracy = accuracy_score(y_val, y_val_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)


print(f"Val accuracy: {val_accuracy:.4f}")
print(f"Test accuracy: {test_accuracy:.4f}")


Val accuracy: 0.8888
Test accuracy: 0.8710


### Implement Prediction

In [72]:
def predict(text, model, dictionary, label_encoder):
    precessed_text = preprocess.preprocess(text)
    features = create_features(precessed_text, dictionary)
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = label_encoder.inverse_transform(prediction)[0]
    return prediction_cls


test_input = "I am actually thinking a way of doing something useful"
prediction_cls = predict(test_input, model, dictionary, le)
print(f"Prediction: {prediction_cls}")

Prediction: ham
