In [1]:
import pandas as pd
import numpy as np
import random as rm
import nltk
import unicodedata
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import confusion_matrix

In [207]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')  # For POS tagging


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sabyrkabylbek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sabyrkabylbek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sabyrkabylbek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sabyrkabylbek/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
!wget https://lazyprogrammer.me/course_files/spam.csv

--2025-05-21 22:08:43--  https://lazyprogrammer.me/course_files/spam.csv
Resolving lazyprogrammer.me (lazyprogrammer.me)... 2606:4700:3030::ac43:d5a6, 2606:4700:3031::6815:17d2, 172.67.213.166, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|2606:4700:3030::ac43:d5a6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 503663 (492K) [text/csv]
Saving to: ‘spam.csv’


2025-05-21 22:08:46 (539 KB/s) - ‘spam.csv’ saved [503663/503663]



In [3]:
df = pd.read_csv("spam.csv", encoding='ISO-8859-1')

## Data Preparation

In [6]:
df['v2'] = df.v2.apply(lambda x: unicodedata.normalize('NFKC',str(x)))
df['v2_token'] = df.v2.apply(lambda x: nltk.word_tokenize(x))
df['v2_token'] = df.v2_token.apply(lambda x: [token for token in x if token.lower() not in set(stopwords.words('english'))])


In [7]:
# Function to map POS tags to WordNet POS categories
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'  # Adjective
    elif treebank_tag.startswith('V'):
        return 'v'  # Verb
    elif treebank_tag.startswith('N'):
        return 'n'  # Noun
    elif treebank_tag.startswith('R'):
        return 'r'  # Adverb
    else:
        return 'n'  # Default to noun

lemmatizer = WordNetLemmatizer()
df['v2_token'] = df['v2_token'].apply(lambda x: [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tag(x)])

In [8]:
df['v1_bool'] = df.v1.apply(lambda x: x == 'spam')

In [9]:
df['v2_token_set'] = df.v2_token.apply(lambda x: set(x))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.v2_token, df.v1_bool, test_size = 0.2, random_state = 42, 
                                                   stratify = df.v1_bool)

In [12]:
all_train_vocab_set = set()
spam_vocab_set = set()
ham_vocab_set = set()
for x, y in zip(X_train, y_train):
    all_train_vocab_set |= set(x)
    if y == True:
        spam_vocab_set |= set(x)
    else:
        ham_vocab_set |= set(x)

In [13]:
# calculate priors
P_s = sum(y_train == 1)/ len(y_train)
P_h = sum(y_train == 0)/ len(y_train)
P_s, P_h

(0.13417096701817366, 0.8658290329818263)

In [20]:
cw_s = {} # count of word in spam
cw_h = {} # count of word in ham

for token_line in X_train:
    for word_ in token_line:
        if word_ in spam_vocab_set:
            cw_s[word_] = cw_s.get(word_, 0) + 1
    
        if word_ in ham_vocab_set:
            cw_h[word_] = cw_h.get(word_, 0) + 1

# Print some sample counts for verification
print("Sample counts for spam words:", {k: cw_s[k] for k in list(cw_s)[:5]})
print("Sample counts for ham words:", {k: cw_h[k] for k in list(cw_h)[:5]})

Sample counts for spam words: {'nothing': 14, '.': 3906, 'wat': 67, "'s": 379, 'guy': 43}
Sample counts for ham words: {'Going': 10, 'nothing': 14, 'great.bye': 1, 'wont': 27, '.': 3906}


In [22]:
# input_list - is the list of tokens to classify
# P_prior - is the priors
# class_word_cnt - dictionary of word: count in the given class

def calculate_probability(input_list, P_prior, class_word_cnt):
    log_sum_p = 0
    all_words_count = sum(class_word_cnt.values())
    for word in input_list:
        term = (class_word_cnt.get(word, 0)+1)/(all_words_count)
        log_sum_p += math.log(term)
    log_sum_p += math.log(P_prior)
    return log_sum_p

In [24]:
def predict_spam(X_input):
    y_train_predict = np.zeros(len(X_input))
    for i in range(0, len(X_input)):
        input_set = X_input.iloc[i]
        spam = calculate_probability(input_set, P_s, cw_s)
        ham = calculate_probability(input_set, P_h, cw_h)
        if spam > ham:
            y_train_predict[i] = 1
        else:
            y_train_predict[i] = 0
    return y_train_predict

In [64]:
def calculate_confustion_matrixt(y_actual, y_pred):
    cm_train = confusion_matrix(y_actual, y_pred)
    # Define the labels for the rows and columns
    row_labels = ['Actual Negative', 'Actual Positive']
    column_labels = ['Predicted Negative', 'Predicted Positive']
    # Create a DataFrame with labeled rows and columns
    cm_train_df = pd.DataFrame(cm_train, index=row_labels, columns=column_labels)
    return cm_train_df

In [66]:
y_train_pred = predict_spam(X_train)
cm_train = calculate_confustion_matrixt(y_train, y_train_pred)

In [68]:
cm_train

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,3834,25
Actual Positive,1,597


In [70]:
y_test_pred = predict_spam(X_test)
cm_test = calculate_confustion_matrixt(y_test, y_test_pred)

In [72]:
cm_test

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,943,23
Actual Positive,9,140


In [74]:
precision_train = cm_train.loc['Actual Positive', 'Predicted Positive'] / (cm_train.loc['Actual Positive', 'Predicted Positive'] +
                                                                       cm_train.loc['Actual Negative', 'Predicted Positive'])
recall_train = cm_train.loc['Actual Positive', 'Predicted Positive'] / (cm_train.loc['Actual Positive', 'Predicted Positive'] +
                                                                       cm_train.loc['Actual Positive', 'Predicted Negative'])


precision_test = cm_test.loc['Actual Positive', 'Predicted Positive'] / (cm_test.loc['Actual Positive', 'Predicted Positive'] +
                                                                       cm_test.loc['Actual Negative', 'Predicted Positive'])
recall_test = cm_test.loc['Actual Positive', 'Predicted Positive'] / (cm_test.loc['Actual Positive', 'Predicted Positive'] +
                                                                       cm_test.loc['Actual Positive', 'Predicted Negative'])

F1_train = 2 * ((precision_train * recall_train) / (precision_train + recall_train))
F1_test = 2 * ((precision_test * recall_test) / (precision_test + recall_test))

In [76]:
F1_train

0.978688524590164

In [78]:
F1_test

0.8974358974358976

## Old version of code (Errors)

In [510]:
cw_s = {}
cw_h = {}

for word_ in all_train_vocab_set:
    if word_ in spam_vocab_set:
        cw_s[word_] = cw_s.get(word_, 0) + 1

    if word_ in ham_vocab_set:
        cw_h[word_] = cw_h.get(word_, 0) + 1

# Print some sample counts for verification
print("Sample counts for spam words:", {k: cw_s[k] for k in list(cw_s)[:5]})
print("Sample counts for ham words:", {k: cw_h[k] for k in list(cw_h)[:5]})

Sample counts for spam words: {}
Sample counts for ham words: {'fortune': 1, 'kappa': 1, 'sapna': 1, 'sofa': 1, 'breezy': 1}


In [532]:
def calculate_probability(input_set, all_train_vocab_set, cls_vocab_set, cnt_wrd_dict, P_prior):
    tl_wrds_class = len(cls_vocab_set)
    tl_wrds_trainset = len(all_train_vocab_set)
    log_sum_p = 0
    for word in input_set:
        term = (cnt_wrd_dict.get(word, 0)+1)/(tl_wrds_class+tl_wrds_trainset)
        log_sum_p += math.log(term)
    log_sum_p += math.log(P_prior)
    return log_sum_p

In [556]:
y_train_predict = np.zeros(len(y_train))
for i in range(0, len(X_train)):
    input_set = X_train.iloc[i]
    spam = calculate_probability(input_set, all_train_vocab_set, spam_vocab_set, cw_s, P_s)
    ham = calculate_probability(input_set, all_train_vocab_set, ham_vocab_set, cw_h, P_h)
    if spam > ham:
        y_train_predict[i] = 1
    else:
        y_train_predict[i] = 0

In [558]:
y_train_predict = np.zeros(len(y_train))
for i in X_train:
    spam = calculate_probability(i, all_train_vocab_set, spam_vocab_set, cw_s, P_s)
    ham = calculate_probability(i, all_train_vocab_set, ham_vocab_set, cw_h, P_h)