## 1.0 Setting Up File

1.1 Importing Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.multiclass import unique_labels
from scipy.sparse import hstack
import nltk

1.2 Importing Data

In [2]:
messages = pd.read_csv("../Data/SMS_Processed.csv")

messages.head()

Unnamed: 0,Classification,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
messages.shape

(5572, 2)

1.3 Splitting Data into Train-Test (80/20)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(messages["Message"],
                                                    messages["Classification"],
                                                    test_size=0.2,
                                                    random_state=1)

1.4 Split Validation

In [5]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [6]:
print(5572 * 0.8)

4457.6


80% of 5572 is 4457.6, which rounds to 4458. This matches the shape of X_train and Y_train, confirming that the train-test split was done correctly.

## 2.0 Creating Vectorizer

2.1 Extract Unigram and Bigram Features

In [7]:
vectorizer = CountVectorizer(
    stop_words="english", # remove stop words which contain little semantic meaning
    ngram_range=(1, 2) # include unigrams and bigrams
)

2.2 Fit Model

In [8]:
X = vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_train_vectorized.toarray().shape

(4457, 31520)

In [13]:
model = MultinomialNB(alpha=0.1) # alpha is just a smoothing parameter
model.fit(X_train_vectorized, Y_train)

2.3 Observe Model

In [14]:
feature_names = vectorizer.get_feature_names_out()
print("Features:")
print(feature_names)

Features:
['00' '00 easter' '00 sub' ... 'zouk nichols' 'zyada' 'zyada kisi']


2.4 Test Initial Model Accuracy

In [15]:
predictions = model.predict(vectorizer.transform(X_test))
print("Accuracy:", 100 * sum(predictions == Y_test) / len(predictions), '%')

Accuracy: 99.01345291479821 %


## 3.0 Encoding Additional Features

3.1 Create Function for Feature Extraction

In [8]:
# Create function "def extract_features()"
def extract_features(messages):
    # Length of Message

    # Number of Digits

    # Number of Uppercase Letters

    # Number of Special Characters

    # Number of Indicator Words


SyntaxError: incomplete input (3209820300.py, line 11)

3.2 Extract features and combine with vectorizer

In [None]:
# potential code
X_text = vectorizer.fit_transform(messages)
X_features = extract_features(messages).values

X = hstack([X_text, X_features])

NameError: name 'vectorizer' is not defined

3.3 Refit MultinomialNB model

3.4 Test Model Performance