## Retrieve Classified Data

In [9]:
import pandas as pd

In [10]:
data = pd.read_csv("classified_spam.csv")

# Preview the columns of interest.
print(data[["text", "label_num", "classification"]])

# Expose each column of interest under its own name.
# TODO: the raw text is used as-is; some preprocessing should probably happen here.
text, is_spam, text_class = (
    data[column] for column in ("text", "label_num", "classification")
)

                                                    text  label_num  \
0      Subject: enron methanol ; meter # : 988291\r\n...          0   
1      Subject: hpl nom for january 9 , 2001\r\n( see...          0   
2      Subject: neon retreat\r\nho ho ho , we ' re ar...          0   
3      Subject: photoshop , windows , office . cheap ...          1   
4      Subject: re : indian springs\r\nthis deal is t...          0   
...                                                  ...        ...   
19602                           :( but your not here....          0   
19603  Becoz its  &lt;#&gt;  jan whn al the post ofic...          0   
19604  Its a valentine game. . . send dis msg to all ...          0   
19605                              We r outside already.          0   
19606  The Xmas story is peace.. The Xmas msg is love...          0   

      classification  
0                ham  
1                ham  
2                ham  
3       general spam  
4                ham  
...      

## Prepare Model Evaluation

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

In [12]:
def evaluate_model(model, X_test, Y_test):
    """Print overall, spam-only and ham-only accuracy of ``model`` on a test set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X_test)``.
    X_test : array-like
        Features, passed unchanged to ``model.predict``.
    Y_test : array-like of str
        True class labels (list, NumPy array or pandas Series). The label
        ``"ham"`` marks legitimate messages; every other label counts as spam.

    Returns
    -------
    None
        The three metrics are printed. A metric whose subset is empty is
        reported as ``nan`` instead of raising or emitting a warning.
    """
    predictions = np.asarray(model.predict(X_test))
    # Convert once and use the array everywhere, so lists, arrays and Series
    # all behave the same way in every metric.
    Y_test_array = np.asarray(Y_test)

    correct = predictions == Y_test_array
    is_ham = Y_test_array == "ham"

    def _accuracy(mask=None):
        # Fraction of correct predictions among the selected samples.
        selected = correct if mask is None else correct[mask]
        return selected.mean() if selected.size else float("nan")

    print("Overall accuracy of model:", _accuracy())
    # Per-class figures: share of truly-spam (resp. truly-ham) samples whose
    # exact label was predicted.
    print("Accuracy on classifying spam:", _accuracy(~is_ham))
    print("Accuracy on identifying ham from spam:", _accuracy(is_ham))

In [13]:
def vectorize_data(ngram_range=(1, 1), variation="count", corpus=None):
    """Fit a text vectorizer on a corpus and return the transformed matrix.

    Parameters
    ----------
    ngram_range : tuple of (int, int), default (1, 1)
        Passed as ``ngram_range`` to whichever vectorizer is selected.
    variation : {"count", "tfidf"}, default "count"
        ``"count"`` selects ``CountVectorizer``; ``"tfidf"`` selects
        ``TfidfVectorizer``.
    corpus : iterable of str, optional
        Documents to vectorize. When omitted, the notebook-level ``text``
        variable is used.

    Returns
    -------
    X : matrix
        Result of ``vectorizer.fit_transform`` on the corpus.
    vectorizer : CountVectorizer or TfidfVectorizer
        The fitted vectorizer.

    Raises
    ------
    ValueError
        If ``variation`` is neither ``"count"`` nor ``"tfidf"``.
    """
    if variation == "count":
        vectorizer = CountVectorizer(ngram_range=ngram_range)
    elif variation == "tfidf":
        # Honour ngram_range here too; it used to be silently ignored for TF-IDF.
        vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    else:
        # Fail with a clear message rather than an unbound-variable error at return.
        raise ValueError(
            f"Unknown variation {variation!r}; expected 'count' or 'tfidf'."
        )

    documents = text if corpus is None else corpus
    X = vectorizer.fit_transform(documents)
    return X, vectorizer

# Bag of Words (aka CountVectorizer)

In [14]:
# Bag-of-words features: unigram counts.
vec_X, vectorizer = vectorize_data(ngram_range=(1, 1), variation="count")
# Uncomment to inspect the fitted vocabulary and the resulting matrix:
# print(vectorizer.get_feature_names_out())
# print(vectorizer.vocabulary_)
# print(vec_X.shape)
# print(vec_X)

In [15]:
# Hold out 25% of the samples for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    vec_X,
    text_class,
    test_size=0.25,
    random_state=10,
)

### Applying Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the count features, then report accuracy on the held-out split.
lr_model = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, Y_train)
evaluate_model(lr_model, X_test, Y_test)

Overall accuracy of model: 0.9602203182374541
Accuracy on classifying spam: 0.8211640211640212
Accuracy on identifying ham from spam: 0.9934293656810715


In [17]:
from sklearn.naive_bayes import GaussianNB

# The feature matrices are converted with .toarray() before being handed to GaussianNB.
# NOTE(review): presumably they are sparse and this densifies them, which could be
# memory-hungry for a large vocabulary - confirm.
nb_model = GaussianNB().fit(X_train.toarray(), Y_train)
evaluate_model(nb_model, X_test.toarray(), Y_test)

Overall accuracy of model: 0.7195022439820481
Accuracy on classifying spam: 0.7111111111111111
Accuracy on identifying ham from spam: 0.7215061915592621


# TF-IDF (Term Frequency–Inverse Document Frequency)

In [18]:
# TF-IDF features; replaces the count-based vec_X and vectorizer from the previous section.
vec_X, vectorizer = vectorize_data(ngram_range=(1, 1), variation="tfidf")
# Uncomment to inspect the fitted vocabulary and the resulting matrix:
# print(vectorizer.get_feature_names_out())
# print(vectorizer.vocabulary_)
# print(vec_X.shape)
# print(vec_X)

In [19]:
# Same 25% hold-out and seed as the count-based split, now applied to the TF-IDF matrix.
X_train, X_test, Y_train, Y_test = train_test_split(
    vec_X,
    text_class,
    test_size=0.25,
    random_state=10,
)

### Applying Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the TF-IDF features, then report accuracy on the held-out split.
lr_model = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, Y_train)
evaluate_model(lr_model, X_test, Y_test)

Overall accuracy of model: 0.9320685434516524
Accuracy on classifying spam: 0.6613756613756614
Accuracy on identifying ham from spam: 0.9967146828405358


In [21]:
from sklearn.naive_bayes import GaussianNB

# The TF-IDF matrices are converted with .toarray() before being handed to GaussianNB.
# NOTE(review): presumably they are sparse and this densifies them, which could be
# memory-hungry for a large vocabulary - confirm.
nb_model = GaussianNB().fit(X_train.toarray(), Y_train)
evaluate_model(nb_model, X_test.toarray(), Y_test)

Overall accuracy of model: 0.7368421052631579
Accuracy on classifying spam: 0.7132275132275132
Accuracy on identifying ham from spam: 0.7424816780389184
