## Nikshay Jain | MM21B044
### DA5400: Assign 2

In [156]:
import re, os
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [112]:
data = pd.read_csv('enron_spam_data.csv')

In [113]:
data

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


### EDA + preprocessing

In [114]:
data = data.drop(['Message ID', 'Date'], axis=1)
data['Spam/Ham'].replace({'spam': 1, 'ham': 0}, inplace=True)
data = data.rename(columns={'Spam/Ham': 'label'})
data.fillna("", inplace = True)

In [118]:
data

Unnamed: 0,Subject,Message,label
0,christmas tree farm pictures,,0
1,"vastar resources , inc .","gary , production from the high island larger ...",0
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0
3,re : issue,fyi - see note below - already done .\nstella\...,0
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0
...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1


In [85]:
data['Spam/Ham'] = 1 if "spam" else 0

In [45]:
sum(data['Spam/Ham']=='spam')/sum(data['Spam/Ham']=='ham')   # ratio of spam:ham

1.0378362042913267

In [119]:
def make_usable(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)    # Remove special characters and digits
    return text.split()

data['body'] = (data['Message']).apply(make_usable)

In [152]:
data

Unnamed: 0,Subject,Message,label,body
0,christmas tree farm pictures,,0,[]
1,"vastar resources , inc .","gary , production from the high island larger ...",0,"[gary, production, from, the, high, island, la..."
2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,0,"[calpine, daily, gas, nomination, doc]"
3,re : issue,fyi - see note below - already done .\nstella\...,0,"[fyi, see, note, below, already, done, stella,..."
4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,0,"[fyi, forwarded, by, lauri, a, allen, hou, ect..."
...,...,...,...,...
33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",1,"[hello, welcome, to, gigapharm, onlinne, shop,..."
33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,1,"[i, got, it, earlier, than, expected, and, it,..."
33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,1,"[are, you, ready, to, rock, on, let, the, man,..."
33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,1,"[learn, how, to, last, times, longer, in, bed,..."


In [121]:
X_train, X_test, y_train, y_test = train_test_split(data['body'], data['label'], test_size=0.2)

### Naive Bayes Algo

In [None]:
def train_naive_bayes(train_data, train_labels):
    """Trains a Naive Bayes classifier by calculating word probabilities for spam and ham."""
    global spam_word_probs, ham_word_probs, total_spam_words, total_ham_words, vocab
    
    spam_counts = defaultdict(int)
    ham_counts = defaultdict(int)

    # Count words for each class
    for words, label in zip(train_data, train_labels):
        for word in words:
            vocab.add(word)
            if label == 1:  # Spam
                spam_counts[word] += 1
                total_spam_words += 1
            else:  # Ham
                ham_counts[word] += 1
                total_ham_words += 1
    
    vocab_size = len(vocab)

    # Calculate word probabilities with Laplace smoothing
    spam_word_probs = {word: (spam_counts[word] + alpha) / (total_spam_words + alpha * vocab_size) for word in vocab}
    ham_word_probs = {word: (ham_counts[word] + alpha) / (total_ham_words + alpha * vocab_size) for word in vocab}

In [122]:
# Get word frequencies for each class
sp_word = []
hm_word = []

for words, label in zip(X_train, y_train):
    if label == 1:
        sp_word.extend(words)
    else:
        hm_word.extend(words)

sp_word_counts = Counter(sp_word)
hm_word_counts = Counter(hm_word)

In [125]:
# Calculate probabilities
total_spam_words = sum(spam_word_counts.values())
total_ham_words = sum(ham_word_counts.values())
vocab = list(set(spam_word_counts.keys()).union(set(ham_word_counts.keys())))
vocab_size = len(vocab)

# Calculate prior probabilities for each class
p_spam = y_train.mean()
p_ham = 1 - p_spam

# Laplace smoothing
alpha = 1
spam_word_probs = {word: (spam_word_counts[word] + alpha) / (total_spam_words + alpha * vocab_size) for word in vocab}
ham_word_probs = {word: (ham_word_counts[word] + alpha) / (total_ham_words + alpha * vocab_size) for word in vocab}

In [144]:
def predict(words):
    spam_score = np.log(p_spam)
    ham_score = np.log(p_ham)

    vocab_set = set(vocab)

    # Precompute log probabilities for known words
    for word in words:
        if word in vocab_set:  # Check membership in the set
            spam_word_prob = spam_word_probs.get(word, alpha / (total_spam_words + alpha * vocab_size))
            ham_word_prob = ham_word_probs.get(word, alpha / (total_ham_words + alpha * vocab_size))
            spam_score += np.log(spam_word_prob)
            ham_score += np.log(ham_word_prob)

    return 1 if spam_score > ham_score else 0

In [151]:
# Predict on test set
y_pred = X_test.apply(predict)

Accuracy: 98.36892052194544%


In [168]:
# Evaluate over metrics
accuracy = (y_pred == y_test).mean()
f1 = f1_score(y_pred,y_test)
print(f'f1-score: {f1:.4f}')
print(f'Accuracy: {accuracy * 100:.2f}%')

f1-score: 0.9840
Accuracy: 98.37%


In [171]:
# Parameters for Laplace smoothing
alpha = 1

# Placeholder for spam and ham word probabilities
spam_word_probs = {}
ham_word_probs = {}

# Prior probabilities (could be based on your training dataset)
p_spam = 0.5
p_ham = 0.5

# Vocabulary and total word counts (to be computed during training)
vocab = set()
total_spam_words = 0
total_ham_words = 0

def preprocess_text(text):
    """ Preprocesses the email text by converting to lowercase, removing special characters, and tokenizing."""
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text.split()

def train_naive_bayes(train_data, train_labels):
    """Trains a Naive Bayes classifier by calculating word probabilities for spam and ham."""
    global spam_word_probs, ham_word_probs, total_spam_words, total_ham_words, vocab

    spam_counts = defaultdict(int)
    ham_counts = defaultdict(int)
    
    # Count words for each class
    for text, label in zip(train_data, train_labels):
        words = preprocess_text(text)
        for word in words:
            vocab.add(word)
            if label == 1:  # Spam
                spam_counts[word] += 1
                total_spam_words += 1
            else:  # Ham
                ham_counts[word] += 1
                total_ham_words += 1
    
    vocab_size = len(vocab)

    # Calculate word probabilities with Laplace smoothing
    spam_word_probs = {word: (spam_counts[word] + alpha) / (total_spam_words + alpha * vocab_size) for word in vocab}
    ham_word_probs = {word: (ham_counts[word] + alpha) / (total_ham_words + alpha * vocab_size) for word in vocab}

def predict_email(words):
    """Predicts if an email is spam (+1) or ham (0) based on the words in the email."""
    spam_score = np.log(p_spam)
    ham_score = np.log(p_ham)

    for word in words:
        if word in vocab:
            spam_score += np.log(spam_word_probs.get(word, alpha / (total_spam_words + alpha * len(vocab))))
            ham_score += np.log(ham_word_probs.get(word, alpha / (total_ham_words + alpha * len(vocab))))

    return 1 if spam_score > ham_score else 0

# def classify_emails_in_folder(folder_path='test'):
#     """Classifies each email in the specified folder as spam (1) or ham (0) and outputs results."""
#     results = {}
#     for filename in os.listdir(folder_path):
#         if filename.endswith(".txt"):
#             filepath = os.path.join(folder_path, filename)
#             with open(filepath, 'r') as file:
#                 email_content = file.read()
#                 words = preprocess_text(email_content)
#                 prediction = predict_email(words)
#                 results[filename] = prediction
#                 print(f"{filename}: {'Spam' if prediction == 1 else 'Ham'}")
#     return results

# Sample usage
if __name__ == "__main__":
    # Training data: List of email texts and corresponding labels (1 for spam, 0 for ham)
    train_data = [
        "Win a $1000 prize now!", "Hello friend, let's catch up",
        "Get rich quick by investing in stocks", "Meeting at 3 PM tomorrow",
    ]
    train_labels = [1, 0, 1, 0]

    # Train the Naive Bayes model
    train_naive_bayes(train_data, train_labels)

#     # Classify emails in the "test" folder
#     classify_emails_in_folder()