## 1.0 Setting Up File

1.1 Importing Packages

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.utils.multiclass import unique_labels
from scipy.sparse import hstack
import nltk

1.2 Importing Data

In [None]:
messages = pd.read_csv("../Data/SMS_Processed.csv")

messages.head()

Unnamed: 0,Classification,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
messages.shape

(5572, 2)

1.3 Splitting Data into Train-Test (80/20)

In [16]:
X_train, X_test, Y_train, Y_test = train_test_split(messages["Message"],
                                                    messages["Classification"],
                                                    test_size=0.2,
                                                    random_state=1)

1.4 Split Validation

In [18]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((4457,), (1115,), (4457,), (1115,))

In [19]:
print(5572 * 0.8)

4457.6


80% of 5572 is 4457.6, which rounds to 4458. This matches the shape of X_train and Y_train, confirming that the train-test split was done correctly.

## 2.0 Creating Vectorizer

2.1 Extract Unigram and Bigram Features

In [None]:
vectorizer = CountVectorizer(
    stop_words="english", # remove stop words which contain little semantic meaning
    ngram_range=(1, 2) # include unigrams and bigrams
)

2.2 Fit Model

In [23]:
X = vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_train_vectorized.toarray().shape

(4457, 31674)

In [None]:
X_train_vectorized

AttributeError: 'csr_matrix' object has no attribute 'head'

2.3 Observe Model

In [28]:
feature_names = vectorizer.get_feature_names_out()
print("Features:")
print(feature_names)
print("\nDocument-Term Matrix:")
print(X_train_vectorized)

Features:
['00' '00 easter' '00 sub' ... 'ûò stick' 'ûówell' 'ûówell û_']

Document-Term Matrix:
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 66788 stored elements and shape (4457, 31674)>
  Coords	Values
  (0, 9710)	1
  (0, 19364)	1
  (0, 19368)	1
  (0, 24598)	1
  (0, 24604)	1
  (1, 2118)	1
  (1, 2121)	1
  (1, 5681)	1
  (1, 5682)	1
  (1, 5946)	1
  (1, 5951)	1
  (1, 7388)	1
  (1, 7389)	1
  (1, 11841)	1
  (1, 11857)	1
  (1, 13368)	1
  (1, 16793)	1
  (1, 16795)	1
  (2, 23608)	1
  (3, 8699)	1
  (3, 21589)	1
  (3, 21595)	1
  (3, 31175)	1
  (3, 31204)	1
  (4, 4214)	1
  :	:
  (4453, 20883)	1
  (4453, 20888)	1
  (4453, 31152)	1
  (4453, 31153)	1
  (4454, 13456)	1
  (4454, 13466)	1
  (4454, 20015)	1
  (4454, 20018)	1
  (4454, 20173)	1
  (4454, 27171)	1
  (4454, 27193)	1
  (4455, 5946)	1
  (4455, 6050)	1
  (4455, 11841)	1
  (4455, 11858)	1
  (4455, 16793)	1
  (4455, 16797)	1
  (4455, 19584)	1
  (4455, 19617)	1
  (4455, 21526)	1
  (4455, 21528)	1
  (4455, 28670)	1
  (4456, 5298)	1

2.4 Test Initial Model Accuracy

## 3.0 Encoding Additional Features

3.1 Create Function for Feature Extraction

In [8]:
# Create function "def extract_features()"
def extract_features(messages):
    # Length of Message

    # Number of Digits

    # Number of Uppercase Letters

    # Number of Special Characters

    # Number of Indicator Words


SyntaxError: incomplete input (3209820300.py, line 11)

3.2 Extract features and combine with vectorizer

In [None]:
# potential code
X_text = vectorizer.fit_transform(messages)
X_features = extract_features(messages).values

X = hstack([X_text, X_features])

NameError: name 'vectorizer' is not defined

3.3 Refit MultinomialNB model

3.4 Test Model Performance