# **Step 1)** Preprocessing data

In [14]:
import re
import nltk
# Download the NLTK stop-word list so the corpus import below can load it.
nltk.download("stopwords")
from nltk.corpus import stopwords
# English stop words kept in a set; later cells filter tokens with
# `word.casefold() not in stop_words`.
stop_words = set(stopwords.words("english"))

# Parse emails.csv into two parallel lists: the cleaned text of every email
# and its integer label (the last comma-separated field of each row).
documents = []
labels = []
# Explicit encoding keeps the cell platform-independent; undecodable bytes become
# U+FFFD, which the letters-only regex below turns into a space anyway.
with open('emails.csv', encoding='utf-8', errors='replace') as f:
    next(f, None)  # skip the header row (no error if the file is empty)
    for row in f:
        if not row.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        # Split once, from the right: the last field is the label and everything
        # before it is the email text, which may itself contain commas. Keeping
        # those commas (instead of deleting them) lets the regex below turn them
        # into spaces, so "free,money" stays two tokens rather than "freemoney".
        text, _, raw_label = row.rpartition(',')
        label = int(raw_label.strip())
        # Drop the first 8 characters of the text field
        # (presumably a leading "Subject:" prefix - TODO confirm against the data).
        line = text[8:]
        # Lower-case and replace every non-letter character with a space.
        sentence = re.sub(r"[^a-zA-Z]", " ", line.lower())
        documents.append(sentence)
        labels.append(label)

# **Step 2)** Splitting data into random train and test subsets

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the emails for testing; the fixed seed makes the split reproducible.
train_docs, test_docs, train_labels, test_labels = train_test_split(
    documents,
    labels,
    test_size=0.2,
    random_state=42,
)

# **Step 3)** Processing training documents

In [16]:
from collections import defaultdict  

# Model state shared by training and prediction; train_naive_bayes() fills these in.
total_spam_words = total_not_spam_words = 0
P_spam = P_not_spam = 0
P_word_given_spam = {}
P_word_given_not_spam = {}

# Every distinct token of the training documents that is not a stop word.
vocabulary = {
    word
    for doc in train_docs
    for word in doc.split()
    if word.casefold() not in stop_words
}

def train_naive_bayes(documents, labels):
    """Estimate the Naive Bayes parameters from labelled documents.

    The results are stored in module-level state rather than returned:
    ``P_spam`` / ``P_not_spam`` (class priors), ``total_spam_words`` /
    ``total_not_spam_words`` (non-stop-word token counts per class) and
    ``P_word_given_spam`` / ``P_word_given_not_spam`` (Laplace-smoothed
    per-word likelihoods). The smoothing denominator uses the size of the
    module-level ``vocabulary`` set.

    Args:
        documents: iterable of whitespace-tokenisable strings.
        labels: iterable of labels paired with ``documents``; ``1`` means
            spam, any other value is counted as not spam.

    Raises:
        ZeroDivisionError: if ``documents``/``labels`` yield no pairs.
    """
    global total_not_spam_words
    global total_spam_words
    global P_spam
    global P_not_spam

    # Start from a clean slate so that calling this function again (for example
    # re-running the cell) does not double-count words or keep stale entries.
    total_spam_words = 0
    total_not_spam_words = 0
    P_word_given_spam.clear()
    P_word_given_not_spam.clear()

    freq = {'spam': defaultdict(int), 'not_spam': defaultdict(int)}
    doc_counts = {'spam': 0, 'not_spam': 0}

    for doc, label in zip(documents, labels):
        cls = 'spam' if label == 1 else 'not_spam'
        doc_counts[cls] += 1
        for word in doc.split():
            if word.casefold() not in stop_words:
                freq[cls][word] += 1

    total_spam_words = sum(freq['spam'].values())
    total_not_spam_words = sum(freq['not_spam'].values())

    total_documents = doc_counts['spam'] + doc_counts['not_spam']
    P_spam = doc_counts['spam'] / total_documents
    P_not_spam = doc_counts['not_spam'] / total_documents

    # Laplace (add-one) smoothing: +1 on each count, +|vocabulary| on the total.
    vocab_size = len(vocabulary)
    spam_denominator = total_spam_words + vocab_size
    not_spam_denominator = total_not_spam_words + vocab_size
    for word, count in freq['spam'].items():
        P_word_given_spam[word] = (count + 1) / spam_denominator
    for word, count in freq['not_spam'].items():
        P_word_given_not_spam[word] = (count + 1) / not_spam_denominator


# Fit the model on the training split; this fills the module-level priors and word-probability tables.
train_naive_bayes(train_docs, train_labels)

<a id='step3'></a>

# **Step 4)** Classification and prediction

In [17]:
import numpy as np

def predict_by_multiplication(newEmail):
    """Classify an email by multiplying raw probabilities.

    Starts from the class priors and multiplies in the likelihood of every
    non-stop-word token. Tokens missing from a class table fall back to the
    smoothed value ``1 / (class word total + len(vocabulary))``.

    Args:
        newEmail: the email text; it is tokenised on whitespace.

    Returns:
        1 if the spam score is strictly greater than the not-spam score,
        otherwise 0 (ties included).
    """
    spam_score = P_spam
    ham_score = P_not_spam

    content_tokens = (tok for tok in newEmail.split() if tok.casefold() not in stop_words)
    for tok in content_tokens:
        # Fallback likelihoods for a token never seen in the respective class.
        unseen_in_spam = 1 / (total_spam_words + len(vocabulary))
        unseen_in_ham = 1 / (total_not_spam_words + len(vocabulary))
        spam_score = spam_score * P_word_given_spam.get(tok, unseen_in_spam)
        ham_score = ham_score * P_word_given_not_spam.get(tok, unseen_in_ham)

    if spam_score > ham_score:
        return 1
    return 0

def predict_by_summation(newEmail):
    """Classify an email by summing log-probabilities.

    Starts from the log of each class prior and adds the log-likelihood of
    every non-stop-word token. Tokens missing from a class table fall back to
    the smoothed value ``1 / (class word total + len(vocabulary))``.

    Args:
        newEmail: the email text; it is tokenised on whitespace.

    Returns:
        1 if the spam log-score is strictly greater than the not-spam
        log-score, otherwise 0 (ties included).
    """
    spam_log_score = np.log(P_spam)
    ham_log_score = np.log(P_not_spam)

    for token in newEmail.split():
        if token.casefold() in stop_words:
            continue
        spam_likelihood = P_word_given_spam.get(token, 1 / (total_spam_words + len(vocabulary)))
        spam_log_score = spam_log_score + np.log(spam_likelihood)
        ham_likelihood = P_word_given_not_spam.get(token, 1 / (total_not_spam_words + len(vocabulary)))
        ham_log_score = ham_log_score + np.log(ham_likelihood)

    return int(spam_log_score > ham_log_score)


# Score both classifiers on the held-out emails and report the fraction classified correctly.
correct_predictions_by_multiplication = correct_predictions_by_summation = 0
for email, label in zip(test_docs, test_labels):
    # A bool adds as 1 (match) or 0 (mismatch).
    predicted_label = predict_by_multiplication(email)
    correct_predictions_by_multiplication += predicted_label == label

    predicted_label = predict_by_summation(email)
    correct_predictions_by_summation += predicted_label == label

test_set_size = len(test_labels)
print("Accuracy through multiplication:", correct_predictions_by_multiplication / test_set_size)
print("Accuracy through summation(logarithm):", correct_predictions_by_summation / test_set_size)

Accuracy through multiplication: 0.912739965095986
Accuracy through summation(logarithm): 0.9921465968586387


<a id='step4'></a>

# Questions:

## 1. What happens if there exists a word in the email that was not previously processed in the BoW matrix?

Assuming a probability of zero for a word that is not present in the training documents has several disadvantages. For example, if a word has not been seen in the training data for a particular class, assuming its probability to be zero means that any document containing that word will be assigned a probability of zero for that class, because the per-word probabilities are multiplied together. This can lead to incorrect classifications, especially when the dataset is limited or the vocabulary is large. Moreover, the absence of a word from the training set would be interpreted as evidence that the word is entirely unrelated to the class. This can lead to overly confident but incorrect predictions, as the model effectively ignores potentially relevant information.

Not considering the probabilities of unseen words, on the other hand, makes the model less flexible and reduces its ability to generalize to new, unseen examples. This can result in poor performance on real-world data, which inevitably contains words not present in the training data. What is more, the model may misclassify documents based solely on the absence of certain words, leading to significant errors. This is particularly concerning in applications like spam detection or sentiment analysis, where specific keywords are critical to classification.

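To avoid this, Laplace smoothing is applied in the [train_naive_bayes](#step3) function: 1 is added to every word count and the vocabulary size is added to the class word total, so an unseen word receives a small non-zero probability instead of zero.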

## 2. What happens if the length of the email message is too long? What is the solution?

When dealing with long documents, multiplying probabilities can lead to very small numbers, potentially causing underflow issues where the result is too small for the computer to represent. To avoid this, we use logarithms. Logarithms transform multiplications into additions, making calculations more manageable. The function "predict_by_summation" is the proper method that predicts a better label when dealing with long texts.
You can see the difference when using this method in this [cell](#step4)

## 3. Incorporating Stop Words

In the code provided, stop words are omitted from the BoW matrix, which yields more accurate label predictions (the removal is applied directly in the program in order to avoid any repetition). 
It is important to note that this program uses the ready-made collection of stop words from the nltk library (the stop words have to be downloaded from the library first).

**However, without Stop words' removal, the result will be:**

* Accuracy through multiplication: 0.7993019197207679
* Accuracy through summation(logarithm): 0.8272251308900523

which shows the strong impact of stop words on the classification. In this program, the accuracy of the model falls by roughly 11 percentage points (multiplication) and 16 percentage points (log-summation) when stop words are not removed.