In [1]:
import pandas as pd 
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Define your paths
import os
import email  


spam_path = "C:\\Users\\HP\\OneDrive\\NLP PROJECT\\EMAIL-SPAM-DETECTION\\archive (13)\\spam_2\\spam_2"

easy_ham_path = "C:\\Users\\HP\\OneDrive\\NLP PROJECT\\EMAIL-SPAM-DETECTION\\archive (13)\\easy_ham\\easy_ham"
hard_ham_path = "C:\\Users\\HP\\OneDrive\\NLP PROJECT\\EMAIL-SPAM-DETECTION\\archive (13)\\hard_ham\\hard_ham"

raw_data = []
labels = []
invalid_list = []

def processemail(body):
    # Basic processing, for example:
    body = body.strip()  # Remove leading and trailing whitespace
    # Additional processing can be added here (e.g., further cleaning, tokenization, etc.)
    return body  # Return the processed body


def safe_decode(payload, charset):
    try:
        return payload.decode(charset, errors='ignore')
    except Exception:
        # Fallback to utf-8 if the charset is unknown
        return payload.decode('utf-8', errors='ignore')

def processfolder(path, label):
    for filename in os.listdir(path):
        if filename.startswith('.'):
            continue  # Skip hidden files

        try:
            with open(os.path.join(path, filename), 'r', errors='ignore') as file:
                content = file.read()
                msg = email.message_from_string(content)

                # Initialize an empty list to hold the message body parts
                body_parts = []

                if msg.is_multipart():
                    for part in msg.walk():
                        if part.get_content_type() == 'text/plain':
                            charset = part.get_content_charset()
                            if not charset:
                                charset = 'utf-8'  # Default fallback charset

                            # Decode using a safe method
                            body_parts.append(safe_decode(part.get_payload(decode=True), charset))
                else:
                    charset = msg.get_content_charset()
                    if not charset:
                        charset = 'utf-8'  # Default fallback charset

                    body_parts.append(safe_decode(msg.get_payload(decode=True), charset))

                # Join all parts into a single body
                body = ' '.join(body_parts)
                body = processemail(body)

                if body:  # Check if the body is not empty
                    raw_data.append(body)
                    labels.append(label)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            invalid_list.append(filename)

# Process the email folders
processfolder(spam_path, 1)
processfolder(easy_ham_path, 0)
processfolder(hard_ham_path, 0)

# Create a DataFrame from the collected email data
df = pd.DataFrame({'Email': raw_data, 'Label': labels})

# Display the head of the DataFrame
print("Total email count: {}".format(len(raw_data)))
print("Total labels: {}".format(len(labels)))
print("Invalid files: {}".format(invalid_list))
print("\nDataFrame head:")
print(df.head())

# Label distribution and email length statistics
label_distribution = df['Label'].value_counts()
email_length_stats = df['Email'].apply(len).describe()

print("\nLabel distribution:\n", label_distribution)
print("\nEmail length statistics:\n", email_length_stats)

Total email count: 4143
Total labels: 4143
Invalid files: []

DataFrame head:
                                               Email  Label
0  Greetings!\n\nYou are receiving this letter be...      1
1  <html>\n<body>\n<center>\n<h3>\n<font color="b...      1
2  <html>\n<body>\n<center>\n<b>\n<font color="bl...      1
3  <html>\n<body>\n<center>\n<b>\n<font color="bl...      1
4  <html><xbody>\n<hr width = "100%">\n<center><h...      1

Label distribution:
 Label
0    2799
1    1344
Name: count, dtype: int64

Email length statistics:
 count      4143.000000
mean       3092.830558
std        7028.323554
min          29.000000
25%         598.000000
50%        1139.000000
75%        2397.000000
max      194761.000000
Name: Email, dtype: float64


easy-ham: non-spam messages typically quite easy to differentiate from spam messages which is labelled 0

hard-ham: non-spam messages more difficult to differentiate also labelled 0

spam: spam messages. it is labelled 1


In [3]:
df.head()

Unnamed: 0,Email,Label
0,Greetings!\n\nYou are receiving this letter be...,1
1,"<html>\n<body>\n<center>\n<h3>\n<font color=""b...",1
2,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",1
3,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",1
4,"<html><xbody>\n<hr width = ""100%"">\n<center><h...",1


# CLEANING THE TEXT COLUMN

In [4]:
import pandas as pd
from bs4 import BeautifulSoup


# Function to extract text from HTML using BeautifulSoup
def processemail(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    text_content = soup.get_text(separator='\n')
    cleaned_text = "\n".join(line.strip() for line in text_content.splitlines() if line.strip())
    return cleaned_text

# Apply the function to each row in the 'email_body' column
df['cleaned_email_body'] = df['Email'].apply(processemail)

In [5]:
df.head()

Unnamed: 0,Email,Label,cleaned_email_body
0,Greetings!\n\nYou are receiving this letter be...,1,Greetings!\nYou are receiving this letter beca...
1,"<html>\n<body>\n<center>\n<h3>\n<font color=""b...",1,"The Need For Safety Is Real In 2002, You Might..."
2,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",1,"*****Bonus Fat Absorbers As Seen On TV, Includ..."
3,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",1,"*****Bonus Fat Absorbers As Seen On TV, Includ..."
4,"<html><xbody>\n<hr width = ""100%"">\n<center><h...",1,"Government Grants E-Book 2002\nedition, Just $..."


In [6]:
# replacing the asterisks in the cleaned email columns with spaces.
df['cleaned_email_body'] = df['cleaned_email_body'].str.replace("*","").str.replace("\n", "").str.replace("__", "")

In [7]:
df['cleaned_email_body'] = df['cleaned_email_body'].str.replace("\'", "").str.replace("\'", "").str.replace("---","")

In [8]:
df['cleaned_email_body'] = df['cleaned_email_body'].str.replace("--","").str.replace("-","").str.replace("\xa0", "")

In [9]:
df['cleaned_email_body'] = df['cleaned_email_body'].str.replace("=", "")

In [10]:
df.head()

Unnamed: 0,Email,Label,cleaned_email_body
0,Greetings!\n\nYou are receiving this letter be...,1,Greetings!You are receiving this letter becaus...
1,"<html>\n<body>\n<center>\n<h3>\n<font color=""b...",1,"The Need For Safety Is Real In 2002, You Might..."
2,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",1,"Bonus Fat Absorbers As Seen On TV, Included Fr..."
3,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",1,"Bonus Fat Absorbers As Seen On TV, Included Fr..."
4,"<html><xbody>\n<hr width = ""100%"">\n<center><h...",1,"Government Grants EBook 2002edition, Just $15...."


In [11]:
import re

# Function to remove the 'Original Message From: "..." >' part including names
def remove_original_message(text):
    # Use regex to find and remove 'Original Message  From: "..." >' including any name in quotes
    return re.sub(r'Original Message\s+From:\s*".*?"\s*>', '', text).strip()

# Apply the function to your DataFrame column
df['cleaned_email_body'] = df['cleaned_email_body'].astype(str).apply(remove_original_message)

# Check the output
print(df['cleaned_email_body'])


0       Greetings!You are receiving this letter becaus...
1       The Need For Safety Is Real In 2002, You Might...
2       Bonus Fat Absorbers As Seen On TV, Included Fr...
3       Bonus Fat Absorbers As Seen On TV, Included Fr...
4       Government Grants EBook 2002edition, Just $15....
                              ...                        
4138    Greetings from Geocaching.com Recent caches in...
4139    I am trying to secure three of four virtual ho...
4140    Filled with useful examples and the depth, cla...
4141    LinuxAnnounce Digest #180, Volume #4          ...
4142    This is an official mailing from SourceForge.n...
Name: cleaned_email_body, Length: 4143, dtype: object


In [12]:
df['cleaned_email_body'] = df['cleaned_email_body'].str.replace(">", "")

In [13]:
df['cleaned_email_body'][2097]

'I went out and drew some chalk circles on my sidewalk just so I wouldnt miss out on the experience. Ive collected half a dozen passwords & access to email accounts so far. Greghahaha!It takes a thief to catch a thief?'

In [14]:
# replacing the links in the text in order to avoid noise in our model
def replace_links(text):
    return re.sub(r'http\S+|www\S+|https\S+', '[URL]', text)

# Apply this function to the text data
df['cleaned_email_body'] = df['cleaned_email_body'].apply(replace_links)


In [15]:
df['cleaned_email_body'][2097]

'I went out and drew some chalk circles on my sidewalk just so I wouldnt miss out on the experience. Ive collected half a dozen passwords & access to email accounts so far. Greghahaha!It takes a thief to catch a thief?'

# Preprocessing and building Model


In [16]:
# Preprocessing the dataset by removing stopwords and punctuation
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if not token.is_stop and not token.is_punct:
            filtered_tokens.append(token.lemma_.lower())
            
    return " ".join(filtered_tokens)


In [17]:
df["preprocessed_email"] = df["cleaned_email_body"].apply(preprocess)

In [18]:
df.head()

Unnamed: 0,Email,Label,cleaned_email_body,preprocessed_email
0,Greetings!\n\nYou are receiving this letter be...,1,Greetings!You are receiving this letter becaus...,greetings!you receive letter express interest ...
1,"<html>\n<body>\n<center>\n<h3>\n<font color=""b...",1,"The Need For Safety Is Real In 2002, You Might...",need safety real 2002 chance ready!free ship...
2,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",1,"Bonus Fat Absorbers As Seen On TV, Included Fr...",bonus fat absorbers see tv include free purcha...
3,"<html>\n<body>\n<center>\n<b>\n<font color=""bl...",1,"Bonus Fat Absorbers As Seen On TV, Included Fr...",bonus fat absorbers see tv include free purcha...
4,"<html><xbody>\n<hr width = ""100%"">\n<center><h...",1,"Government Grants EBook 2002edition, Just $15....",government grants ebook 2002edition $ 15.95 su...


In [19]:
# Splitting the dataset into train and text data

from sklearn.model_selection import train_test_split

X = df["preprocessed_email"]
y = df["Label"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# Converting the text to vectors and also building the model using traditional machine learning
# in order to handle data imbalance in our labels, we will be using the SMOTE technique

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC 
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.base import BaseEstimator, TransformerMixin


class_prior = [sum(y_train == 0) / len(y_train), sum(y_train == 1) / len(y_train)]


Model_NB =Pipeline([
    ["Vectorizer", TfidfVectorizer()],
    ["MultinomialNB", MultinomialNB(class_prior=class_prior)]
])

In [21]:
# train the model

Model_NB.fit(X_train, y_train)

In [22]:
# predicting on unseen data and calculating the accuracy score
from sklearn.metrics import accuracy_score, classification_report

y_pred = Model_NB.predict(X_test)


score = accuracy_score(y_test,y_pred)
confusion_score = classification_report(y_test, y_pred)


print("Accuracy_Score:", score)
print("Confusion_matrix:", confusion_score)

Accuracy_Score: 0.9047044632086851
Confusion_matrix:               precision    recall  f1-score   support

           0       0.87      1.00      0.93       536
           1       1.00      0.73      0.84       293

    accuracy                           0.90       829
   macro avg       0.93      0.87      0.89       829
weighted avg       0.92      0.90      0.90       829



In [23]:
# Trying another model on SUPPORT VECTOR CLASSIFIER
Model_SVC =Pipeline([
    ["Vectorizer", TfidfVectorizer()],
    ["SVC", SVC(class_weight="balanced")]
])


# train the model

Model_SVC.fit(X_train, y_train)

In [24]:
# predicting on unseen data and calculating the accuracy score
from sklearn.metrics import accuracy_score, classification_report

y_pred = Model_SVC.predict(X_test)


score = accuracy_score(y_test,y_pred)
confusion_score = classification_report(y_test, y_pred)


print("Accuracy_Score:", score)
print("Confusion_matrix:", confusion_score)

Accuracy_Score: 0.9650180940892642
Confusion_matrix:               precision    recall  f1-score   support

           0       0.96      0.99      0.97       536
           1       0.99      0.91      0.95       293

    accuracy                           0.97       829
   macro avg       0.97      0.95      0.96       829
weighted avg       0.97      0.97      0.96       829



Which Model is Better?
SVC is overall better based on:

1. Higher accuracy.
2. Better balance between precision and recall for both classes.
3. Higher F1-score, especially for Class 1.

MultinomialNB has perfect precision for Class 1 (it predicts Class 1 without false positives) but suffers in recall, meaning it misses quite a few actual Class 1 instances (27%).


Conclusion: Based on the metrics provided, SVC is the superior model overall.








In [25]:
# saving the svc model using pickle
import pickle

with open("SVC_model.pkl", "wb") as file:
    pickle.dump(Model_SVC, file)
