In [None]:
#1. Loading and Preprocessing the Data:

'''import statements: Import necessary libraries like pandas, numpy, sklearn, string, and re.
Loading the dataset: Loads a CSV file containing emails labeled as spam or not spam using pd.read_csv.'''

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer
import string
import re
import os

# Loading the Data set

In [None]:

# Location of the labelled e-mail corpus (CSV with the e-mail text and a spam label).
DATA_PATH = "/content/spam_or_not_spam.csv"

data = pd.read_csv(DATA_PATH)
# Preview the first rows as the cell's output.
data.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


# 2) Preprocess the data (e.g., lowercasing, removing punctuation and special characters) and handle any missing values.

In [None]:
# 2) Preprocess the data
# Handle missing values FIRST. Casting to str turns NaN into the literal text
# "nan", after which dropna() can no longer detect the missing e-mails.
# re.escape keeps "\", "]", "^" and "-" literal inside the character class;
# without it the "\]" pair is read as an escaped bracket and backslashes are
# never removed.
punctuation_pattern = f"[{re.escape(string.punctuation)}]"

# .assign builds a new frame, so re-running this cell never mutates a frame
# that an earlier cell already displayed.
data = data.dropna().assign(
    email=lambda frame: (
        frame["email"]
        .astype(str)
        .str.lower()                                       # lowercase all text
        .str.replace(punctuation_pattern, "", regex=True)  # drop punctuation / special characters
        .str.replace(r"\d+", "", regex=True)               # drop numeric characters
    )
)

# Check preprocessing
print("Preprocessing completed. Sample data:")
print(data.head())

Preprocessing completed. Sample data:
                                               email  label
0   date wed number aug number number number numb...      0
1  martin a posted tassos papadopoulos the greek ...      0
2  man threatens explosion in moscow thursday aug...      0
3  klez the virus that won t die already the most...      0
4   in adding cream to spaghetti carbonara which ...      0


# 3) Make unigrams and bigrams of the preprocessed data.

In [None]:
# 3) Build the unigram + bigram count matrix
NGRAM_RANGE = (1, 2)  # (min_n, max_n): single words and adjacent word pairs
vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE)
X = vectorizer.fit_transform(data["email"])

# Names of the extracted n-gram features
features = vectorizer.get_feature_names_out()
feature_total = len(features)
print(f"Extracted {feature_total} features (unigrams and bigrams).")

Extracted 264143 features (unigrams and bigrams).


# 4) Calculate the probability of these unigrams and bigrams.

In [None]:
# 4) Calculate probabilities for unigrams and bigrams
# Work on the sparse count matrix directly. Converting X to a dense array
# (documents x ~264k features) needs several GB of RAM and can kill the
# kernel; the sparse column sums give exactly the same counts.
# NOTE: the former intermediate `X_dense` is intentionally no longer created.

# Boolean row selectors as plain NumPy arrays (rows of X follow the row order of `data`).
spam_rows = (data["label"] == 1).to_numpy()
not_spam_rows = (data["label"] == 0).to_numpy()

# Per-feature occurrence counts within each class; sparse .sum(axis=0) yields a
# 1 x n_features result, flattened here to a 1-D array.
spam_counts = np.asarray(X[spam_rows].sum(axis=0)).ravel()
not_spam_counts = np.asarray(X[not_spam_rows].sum(axis=0)).ravel()

# Get probabilities by class: each n-gram's share of all n-gram occurrences in that class
spam_probs = spam_counts / spam_counts.sum()
not_spam_probs = not_spam_counts / not_spam_counts.sum()

# Map probabilities to feature names
probabilities = {
    "spam": dict(zip(features, spam_probs)),
    "not_spam": dict(zip(features, not_spam_probs))
}

# 5) Define a function that:
1. Takes a user-defined sentence:
The function should accept a sentence from the user (at least 5 words).
2. Preprocesses and tokenizes the sentence:
We need to clean the text (lowercase, remove punctuation) and then split the sentence into individual words (unigrams) and word pairs (bigrams).
3. Computes the posterior probability:
Using Bayes' theorem, we compute the probability of the sentence belonging to each class (spam or not spam) based on the unigrams and bigrams.
4. Predicts the class:
The class with the highest posterior probability is the predicted class for the sentence.


In [None]:
# 5) Define the prediction function
def predict_email_class(email, probs=None, unseen_prob=1e-5):
    """Classify a sentence as spam (1) or not spam (0) from n-gram probabilities.

    The text is cleaned the same way as the training e-mails (lowercased,
    punctuation and digits removed), split into unigrams and bigrams, and
    scored per class by summing log-probabilities of its n-grams. No class
    prior is applied, so the decision compares the two likelihoods only.

    Parameters
    ----------
    email : str
        The sentence / e-mail text to classify.
    probs : dict, optional
        Mapping with keys "spam" and "not_spam", each a dict of
        n-gram -> probability. Defaults to the notebook-level `probabilities`.
    unseen_prob : float, optional
        Probability (must be > 0) used for an n-gram that is missing from a
        class table or has probability 0 in it.

    Returns
    -------
    int
        1 if the spam score is strictly greater than the not-spam score,
        otherwise 0 (ties, including empty input, return 0).
    """
    import math  # local import: the notebook's import cell does not provide math

    if probs is None:
        probs = probabilities

    # Preprocess email. re.escape keeps "\", "]", "^" and "-" literal inside
    # the character class so every punctuation character is removed.
    text = email.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\d+", "", text)

    # Tokenize with CountVectorizer's default token pattern (tokens of two or
    # more word characters) so the n-grams line up with the fitted vocabulary;
    # plain split() would keep one-letter words and form bigrams that the
    # vectorizer never produces.
    tokens = re.findall(r"\b\w\w+\b", text)
    bigrams = [f"{left} {right}" for left, right in zip(tokens, tokens[1:])]
    ngrams = tokens + bigrams

    fallback_log = math.log(unseen_prob)

    def _log_score(class_probs):
        """Sum of log-probabilities of the sentence's n-grams for one class."""
        total = 0.0
        for ngram in ngrams:
            p = class_probs.get(ngram, 0.0)
            # An n-gram seen only in the other class is stored as 0.0; treat it
            # like an unseen one instead of wiping out the whole score.
            total += math.log(p) if p > 0 else fallback_log
        return total

    # Log-space sums avoid the float underflow that a product of many small
    # probabilities hits on longer inputs.
    spam_score = _log_score(probs["spam"])
    not_spam_score = _log_score(probs["not_spam"])

    # Predict class
    return 1 if spam_score > not_spam_score else 0

# 6) Test the function with at least three user-defined sentences.

In [None]:
# 6) Sample sentences used to exercise predict_email_class.
test_sentences = [
    "Congratulations! You've won a free vacation to the Bahamas. Claim your prize now!",
    "Hello, your bank account has been compromised. Click here to secure your account.",
    "Reminder: Your subscription will be renewed automatically tomorrow.",
    "Hey! I found a great deal on smartphones, check it out!",
    "Hey, can we discuss the new project timeline tomorrow?",
    "Urgent! Your account will be locked unless you update your details within 24 hours.",
    "Get rich quick with this amazing investment opportunity. Don't miss out!",
    "Your email storage is almost full. Upgrade now to avoid losing data.",
]

In [None]:
# Pair every test sentence with its predicted class (1 = spam, 0 = not spam).
results = dict(zip(test_sentences, map(predict_email_class, test_sentences)))

# Report one line per sentence with a human-readable label.
for sentence, result in results.items():
    label = "Not Spam" if result != 1 else "Spam"
    print(f"Sentence: {sentence} => Predicted Label: {label}")

Sentence: Congratulations! You've won a free vacation to the Bahamas. Claim your prize now! => Predicted Label: Spam
Sentence: Hello, your bank account has been compromised. Click here to secure your account. => Predicted Label: Not Spam
Sentence: Reminder: Your subscription will be renewed automatically tomorrow. => Predicted Label: Not Spam
Sentence: Hey! I found a great deal on smartphones, check it out! => Predicted Label: Not Spam
Sentence: Hey, can we discuss the new project timeline tomorrow? => Predicted Label: Not Spam
Sentence: Urgent! Your account will be locked unless you update your details within 24 hours. => Predicted Label: Not Spam
Sentence: Get rich quick with this amazing investment opportunity. Don't miss out! => Predicted Label: Spam
Sentence: Your email storage is almost full. Upgrade now to avoid losing data. => Predicted Label: Not Spam
