<a href="https://colab.research.google.com/github/Quinn-Caton/CS_5568_Project/blob/main/group_a1_5563.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Part 1: Entropy, Cross-Entropy, and KL Divergence Calculations Using NumPy

In [None]:
import numpy as np

# Distribution P over four outcomes.
P = np.array([0.441, 0.255, 0.132, 0.172])

# Reference distribution Q: uniform over the same outcomes.
Q = np.full_like(P, 1 / len(P))

# Restrict every sum to the support of P so that a zero-probability outcome
# contributes 0 (the 0 * log 0 = 0 convention) instead of producing NaN.
support = P > 0
P_s, Q_s = P[support], Q[support]

# Entropy: H(P) = -Σ p(x) log2 p(x)
entropy_P = -np.sum(P_s * np.log2(P_s))

# Cross-Entropy: H(P, Q) = -Σ p(x) log2 q(x)
cross_entropy = -np.sum(P_s * np.log2(Q_s))

# KL Divergence: D_KL(P || Q) = Σ p(x) log2 (p(x) / q(x))
kl_divergence = np.sum(P_s * np.log2(P_s / Q_s))

# All three quantities use base-2 logarithms, so they are measured in bits.
print(f"Entropy (H(P)): {entropy_P:.4f}")
print(f"Cross-Entropy (H(P, Q)): {cross_entropy:.4f}")
print(f"KL Divergence (D_KL(P || Q)): {kl_divergence:.4f}")

Entropy (H(P)): 1.8460
Cross-Entropy (H(P, Q)): 2.0000
KL Divergence (D_KL(P || Q)): 0.1540


In [None]:
# Fetch every NLTK resource this notebook relies on so that it also runs on a
# fresh kernel / fresh machine:
#   stopwords                       -> stopwords.words('english')
#   punkt_tab                       -> word_tokenize
#   averaged_perceptron_tagger_eng  -> pos_tag
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

### Loading dataset and preprocessing

In [None]:
# Load dataset (a csv with 'text' and 'label' columns)
data = pd.read_csv('spam.csv')

# Fail fast with a clear message if the file lacks the columns that the
# preprocessing and training cells read.
missing_columns = {'text', 'label'} - set(data.columns)
if missing_columns:
    raise KeyError(f"spam.csv is missing required column(s): {sorted(missing_columns)}")

In [None]:
# Filled lazily on first use so that defining this cell does not require the
# NLTK stop-word corpus to be installed yet.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Turn a raw message into a list of lowercase, stop-word-free tokens.

    Steps: lowercase the text, replace every non-letter character with a
    space, tokenize with ``word_tokenize``, then drop English stop words.

    Parameters
    ----------
    text : str
        Raw message text. Any non-string value (e.g. ``None`` or a NaN from a
        missing CSV cell) is treated as an empty message.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Missing values reach here as float NaN / None; they carry no tokens.
    if not isinstance(text, str):
        return []

    text = text.lower()  # make text lowercase
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # replace non-letters with space
    words = word_tokenize(text)

    # Look the stop words up once per call (cached across calls) instead of
    # rebuilding the list for every token.
    stop_words = _english_stopwords()

    # TODO: add lemmatization and/or stemming
    return [word for word in words if word not in stop_words]

In [None]:
# Run the preprocessing on every message and keep the resulting token lists
# in a new 'tokens' column next to the raw text.
token_lists = data['text'].apply(preprocess_text)
data['tokens'] = token_lists

In [None]:
# Display the frame to check the newly added 'tokens' column.
data

Unnamed: 0,label,text,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, think, goes, usf, lives, around, though]"
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[nd, time, tried, contact, u, u, pound, prize,..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[b, going, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity, mood, suggestions]"
5570,ham,The guy did some bitching but I acted like i'd...,"[guy, bitching, acted, like, interested, buyin..."


In [None]:
def filter_pos(tokens, target_pos):
    """Keep only the tokens whose part-of-speech tag begins with ``target_pos``.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single document, passed to ``pos_tag`` as one sequence.
    target_pos : str
        Tag prefix to match against each tag with ``str.startswith``
        (for example ``'NN'`` or ``'VB'``).

    Returns
    -------
    list of str
        The matching tokens, in their original order.
    """
    kept = []
    for token, tag in pos_tag(tokens):
        if tag.startswith(target_pos):
            kept.append(token)
    return kept

In [None]:
# Create different feature sets: keep only the tokens whose POS tag starts
# with 'NN' (stored as 'nouns') or with 'VB' (stored as 'verbs').
data['nouns'] = data['tokens'].apply(filter_pos, args=('NN',))
data['verbs'] = data['tokens'].apply(filter_pos, args=('VB',))

In [None]:
def train_and_evaluate(X_train, X_test, y_train, y_test, model):
    """Fit ``model`` on the training split, print test-set metrics, and return them.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        Labels for the training and test splits (binary classification).
    model
        Estimator exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``. It is fitted in place.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'classification_report'``,
        ``'confusion_matrix'`` and ``'roc_auc'``, holding the same values
        that are printed, so runs can be compared programmatically.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = {}

    metrics["accuracy"] = accuracy_score(y_test, y_pred)
    print("Accuracy:", metrics["accuracy"])

    metrics["classification_report"] = classification_report(y_test, y_pred)
    print("Classification Report:\n", metrics["classification_report"])

    metrics["confusion_matrix"] = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", metrics["confusion_matrix"])

    # ROC-AUC needs a continuous score. Prefer the second probability column
    # (presumably the class at model.classes_[1] -- verify for new label sets);
    # fall back to decision_function for models without predict_proba.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        y_score = model.decision_function(X_test)
    metrics["roc_auc"] = roc_auc_score(y_test, y_score)
    print("ROC-AUC Score:", metrics["roc_auc"])

    return metrics

In [None]:
# Vectorize text
def vectorize_and_train(feature_column, df=None):
    """Vectorize one token-list column and evaluate two classifiers on it.

    The rows are split 80/20 (``random_state=42``), a bag-of-words vocabulary
    is learned from the training rows only, and Multinomial Naive Bayes and
    Logistic Regression are each trained and evaluated with
    ``train_and_evaluate``.

    Parameters
    ----------
    feature_column : str
        Name of a column holding one list of tokens per row
        (e.g. ``'tokens'``, ``'nouns'``, ``'verbs'``).
    df : pandas.DataFrame, optional
        Frame containing ``feature_column`` and ``'label'``. Defaults to the
        notebook-level ``data`` frame.
    """
    frame = data if df is None else df

    # Split the raw documents BEFORE vectorizing so the vocabulary is built
    # from training rows only; fitting on the full frame leaks test-set
    # information into the features.
    docs_train, docs_test, y_train, y_test = train_test_split(
        frame[feature_column], frame['label'], test_size=0.2, random_state=42
    )

    # The documents are already token lists, so both hooks are identity
    # functions; token_pattern=None silences the "token_pattern will not be
    # used" warning that a custom tokenizer otherwise triggers.
    vectorizer = CountVectorizer(
        tokenizer=lambda x: x,
        preprocessor=lambda x: x,
        token_pattern=None,
    )
    X_train = vectorizer.fit_transform(docs_train)
    X_test = vectorizer.transform(docs_test)

    print(f'--- Training Naive Bayes for {feature_column} ---')
    nb_model = MultinomialNB()
    train_and_evaluate(X_train, X_test, y_train, y_test, nb_model)

    print(f'--- Training Logistic Regression for {feature_column} ---')
    lr_model = LogisticRegression(max_iter=1000)
    train_and_evaluate(X_train, X_test, y_train, y_test, lr_model)




In [None]:
# Train and evaluate both classifiers once per feature set
for feature_column in (
    'tokens',  # Baseline
    'nouns',   # Noun-only
    'verbs',   # Verb-only
):
    vectorize_and_train(feature_column)

# TODO: add 4th model - custom for Q2 and Q3



--- Training Naive Bayes for tokens ---
Accuracy: 0.9704035874439462
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.98      0.98       965
        spam       0.86      0.93      0.89       150

    accuracy                           0.97      1115
   macro avg       0.93      0.95      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
 [[943  22]
 [ 11 139]]
ROC-AUC Score: 0.9841899827288428
--- Training Logistic Regression for tokens ---
Accuracy: 0.97847533632287
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[965   0]
 [ 24 126]]
ROC-AUC Score: 0.9914749568221072
--



--- Training Naive Bayes for verbs ---
Accuracy: 0.9354260089686098
Classification Report:
               precision    recall  f1-score   support

         ham       0.94      0.98      0.96       965
        spam       0.86      0.62      0.72       150

    accuracy                           0.94      1115
   macro avg       0.90      0.80      0.84      1115
weighted avg       0.93      0.94      0.93      1115

Confusion Matrix:
 [[950  15]
 [ 57  93]]
ROC-AUC Score: 0.8873264248704662
--- Training Logistic Regression for verbs ---
Accuracy: 0.9192825112107623
Classification Report:
               precision    recall  f1-score   support

         ham       0.92      0.99      0.96       965
        spam       0.89      0.45      0.60       150

    accuracy                           0.92      1115
   macro avg       0.91      0.72      0.78      1115
weighted avg       0.92      0.92      0.91      1115

Confusion Matrix:
 [[957   8]
 [ 82  68]]
ROC-AUC Score: 0.9106563039723662
