## **1. Import libraries**

In [51]:
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **2. Load Dataset**

In [52]:
# Location of the spam/ham text dataset, relative to this notebook.
DATA_PATH = '../data/2cls_spam_text_cls.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [53]:
# Pull the raw text and its category out of the frame as plain Python lists.
messages = df.loc[:, 'Message'].values.tolist()
labels = df.loc[:, 'Category'].values.tolist()

## **3. Preprocessing**

### **Features**

In [54]:
def lowercase(text):
    """Return the result of ``text.lower()``."""
    lowered = text.lower()
    return lowered

def punctuation_remover(text):
    """Return ``text`` with every character in ``string.punctuation`` removed."""
    # Mapping a code point to None tells str.translate to delete that character.
    deletion_table = {ord(char): None for char in string.punctuation}
    return text.translate(deletion_table)

def tokenize(text):
    """Split ``text`` into a list of tokens with ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens

def remove_stopwords(tokens, stop_words=None):
    """Drop stop words from a sequence of tokens.

    Args:
        tokens: Iterable of tokens to filter.
        stop_words: Optional collection of words to remove. When omitted, the
            NLTK English stop-word list is loaded.

    Returns:
        A list of the tokens that are not stop words, in their input order.
    """
    if stop_words is None:
        # The corpus file id is the lowercase "english"; "English" only
        # resolves on case-insensitive file systems.
        stop_words = nltk.corpus.stopwords.words("english")
    # A set gives constant-time membership tests instead of a list scan per token.
    stop_words = set(stop_words)
    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    """Return the stem of each token, computed with ``nltk.PorterStemmer``."""
    porter = nltk.PorterStemmer()
    return list(map(porter.stem, tokens))

In [55]:
def preprocessing_text(text):
    """Turn a raw message into a list of processed tokens.

    The steps run in this order: lowercase, strip punctuation, tokenize,
    remove stop words, stem.
    """
    cleaned = punctuation_remover(lowercase(text))
    return stemming(remove_stopwords(tokenize(cleaned)))

# Tokenize from the untouched DataFrame column rather than from `messages`
# itself, so re-running this cell does not feed already-tokenized lists back
# into preprocessing_text (which would fail on `.lower()`).
messages = [preprocessing_text(message) for message in df['Message'].values.tolist()]

In [56]:
# create dictionary
def create_dictionary(messages):
    """Build the vocabulary: distinct tokens in order of first appearance.

    Args:
        messages: Iterable of token sequences.

    Returns:
        A list of the distinct tokens, ordered by first occurrence.
    """
    # dict keys keep insertion order and de-duplicate in O(1) per token,
    # avoiding a linear scan of the growing vocabulary for every token.
    unique_tokens = dict.fromkeys(word for message in messages for word in message)
    return list(unique_tokens)

# Vocabulary built from every message (before any train/val/test split);
# a token's position in this list is used as its feature index.
dictionary = create_dictionary(messages)

In [57]:
# create features
def created_features(tokens, dictionary):
    """Encode one token list as a bag-of-words count vector.

    Args:
        tokens: Iterable of tokens for a single message.
        dictionary: Vocabulary, either as a sequence of tokens (position is
            the feature index) or as a mapping from token to feature index.

    Returns:
        A 1-D NumPy array of length ``len(dictionary)`` in which entry ``i``
        counts how often the token with index ``i`` occurs in ``tokens``.
        Tokens that are not in the vocabulary are ignored.
    """
    if isinstance(dictionary, dict):
        index_of = dictionary
    else:
        # Build the lookup once instead of scanning the list for every token.
        index_of = {}
        for position, word in enumerate(dictionary):
            # Keep the first position of a repeated word, matching list.index.
            index_of.setdefault(word, position)

    features = np.zeros(len(dictionary))
    for token in tokens:
        position = index_of.get(token)
        if position is not None:
            features[position] += 1

    return features

# One count vector per message, stacked into a dense 2-D array with one row
# per message and one column per vocabulary entry.
X = np.array([created_features(tokens, dictionary) for tokens in messages])

### **Labels**

In [58]:
# Encode the string categories as integer class ids.
le = LabelEncoder().fit(labels)
y = le.transform(labels)

print(f"Classes: {le.classes_}")
print(f"Encoded Labels: {y}")

Classes: ['ham' 'spam']
Encoded Labels: [0 0 1 ... 0 0 0]


## **4. Split data**

In [59]:
# Hold-out fractions: 20% of all rows for validation, then 12.5% of the
# remaining 80% (i.e. 10% of all rows) for test.
val_size = 0.2
test_size = 0.125
seed = 0

# Both splits use the same seed and shuffling settings.
split_options = {"random_state": seed, "shuffle": True}

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, **split_options)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=test_size, **split_options)

## **5. Model**

In [60]:
# Fit a Gaussian Naive Bayes classifier on the training split.
print("Start training...")
model = GaussianNB().fit(X_train, y_train)
print("Training completed")

Start training...
Training completed


## **6. Evaluation**

In [61]:
# Score the fitted model on the validation split.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Score the fitted model on the test split.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Validation Accuracy: {val_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

Validation Accuracy: 0.8816143497757848
Test Accuracy: 0.8602150537634409


## **7. Prediction**

In [62]:
def predict(text, model, dictionary, label_encoder=None):
    """Classify a single raw message.

    Args:
        text: Raw message string.
        model: Fitted classifier exposing ``predict``.
        dictionary: Vocabulary passed to ``created_features``.
        label_encoder: Fitted encoder exposing ``inverse_transform``. Defaults
            to the notebook-level ``le`` so existing calls keep working.

    Returns:
        A tuple ``(prediction, prediction_cls)``: the raw output of
        ``model.predict`` for the single row, and the decoded label of that
        row.
    """
    if label_encoder is None:
        # Fall back to the encoder fitted earlier in the notebook.
        label_encoder = le

    tokens = preprocessing_text(text)
    # The model is called on a 2-D array holding exactly one row.
    features = created_features(tokens, dictionary).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = label_encoder.inverse_transform(prediction)[0]

    return prediction, prediction_cls

# Try the full pipeline on a single hand-written message.
test_input = "I am actually thinking about you"
prediction, prediction_cls = predict(test_input, model, dictionary)

# Show the decoded label first, then the raw model output.
print(f"Prediction: {prediction_cls}")
print(prediction)

Prediction: ham
[0]
