## Importing the Libraries

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import mysql.connector
import pickle
import os

In [3]:
def get_db_connection():
    """Open a new MySQL connection to the spam-detection database.

    Connection settings are taken from environment variables so that no
    credential lives in the notebook:

    * ``SPAM_DB_HOST``     - server host (default ``"localhost"``)
    * ``SPAM_DB_USER``     - account name (default ``"root"``)
    * ``SPAM_DB_PASSWORD`` - account password (required, no default)
    * ``SPAM_DB_NAME``     - schema name (default ``"spam_detection"``)

    Returns:
        The connection object returned by ``mysql.connector.connect``.

    Raises:
        RuntimeError: if ``SPAM_DB_PASSWORD`` is not set.
    """
    password = os.environ.get("SPAM_DB_PASSWORD")
    if password is None:
        # Fail loudly instead of falling back to a password committed in source.
        raise RuntimeError(
            "Database password not configured: set the SPAM_DB_PASSWORD "
            "environment variable before connecting."
        )

    return mysql.connector.connect(
        host=os.environ.get("SPAM_DB_HOST", "localhost"),
        user=os.environ.get("SPAM_DB_USER", "root"),
        password=password,
        database=os.environ.get("SPAM_DB_NAME", "spam_detection"),
        auth_plugin="mysql_native_password"
    )

In [4]:
def insert_prediction_to_db(email_text, nb_pred, xgb_pred):
    """Insert spam detection result into MySQL database.

    Args:
        email_text: The email body that was classified.
        nb_pred: Naive Bayes prediction; converted with ``int()`` before storing.
        xgb_pred: XGBoost prediction; converted with ``int()`` before storing.

    Raises:
        Whatever ``int()``, ``get_db_connection()`` or the database driver
        raises; the cursor and connection are closed in every case.
    """
    sql = """
    INSERT INTO spam_results (email_text, naive_bayes_pred, xgboost_pred)
    VALUES (%s, %s, %s)
    """
    # Convert before connecting so a bad value never leaves a connection open.
    values = (email_text, int(nb_pred), int(xgb_pred))  # store predictions as 0/1

    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        try:
            # Parameterized query: the driver handles quoting of email_text.
            cursor.execute(sql, values)
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when execute/commit fails.
        conn.close()

In [5]:
# Locations of the persisted model artifacts (directory is created on first run)
MODEL_DIR = "./ml_model"
os.makedirs(MODEL_DIR, exist_ok=True)
NB_PATH, XGB_PATH, VEC_PATH = (
    os.path.join(MODEL_DIR, filename)
    for filename in (
        "naive_bayes_model.pkl",
        "xgboost_model.pkl",
        "vectorizer.pkl",
    )
)

## Load the dataset

In [6]:
# Load data
# The CSV location can be overridden with the VALIDATED_EMAILS_CSV environment
# variable so the notebook also runs on machines without this exact folder
# layout; the original path remains the default.
DATA_CSV_PATH = os.environ.get(
    "VALIDATED_EMAILS_CSV",
    'C:/CGI/Project/Email-Validation/uploads/validated_emails.csv',
)
df = pd.read_csv(DATA_CSV_PATH)

In [7]:
# Summarize the raw frame: row count, columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199582 entries, 0 to 199581
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    199580 non-null  object
 1   label   199580 non-null  object
dtypes: object(2)
memory usage: 3.0+ MB


In [8]:
# Preview the first rows of the raw data (labels are still strings here)
df.head()

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,spam
1,Subject: the stock trading gunslinger fanny i...,spam
2,Subject: unbelievable new homes made easy im ...,spam
3,Subject: 4 color printing special request add...,spam
4,"Subject: do not have money , get software cds ...",spam


## Remove the NaN values

In [9]:
# Discard every row that has a missing value in any column
df = df.dropna(axis=0, how="any")

## Mapping label to integer values

In [10]:
# Keep only valid labels and map: ham -> 0, spam -> 1
valid_labels = {"ham": 0, "Ham": 0, "spam": 1, "Spam": 1}

# Filter and convert in one chained expression. `.assign` builds a new frame,
# so the label column is never written into a slice of the previous `df`
# (which is what can trigger pandas' SettingWithCopyWarning).
df = (
    df.loc[df['label'].isin(list(valid_labels))]
    .assign(label=lambda kept: kept['label'].map(valid_labels).astype(int))
)

In [11]:
# Remove rows whose text is missing, then rows whose text is only whitespace
has_text = df['text'].notna()
df = df.loc[has_text]
not_blank = df['text'].str.strip().ne("")
df = df.loc[not_blank]

In [12]:
# Plain-text dump of the cleaned frame (pandas truncates it to head/tail rows)
print(df)

                                                     text  label
0       Subject: naturally irresistible your corporate...      1
1       Subject: the stock trading gunslinger  fanny i...      1
2       Subject: unbelievable new homes made easy  im ...      1
3       Subject: 4 color printing special  request add...      1
4       Subject: do not have money , get software cds ...      1
...                                                   ...    ...
199577  on escapenumber escapenumber escapenumber rob ...      0
199578  we have everything you need escapelong cialesc...      1
199579  hi quick question say i have a date variable i...      0
199580  thank you for your loan request which we recie...      1
199581  this is an automatically generated delivery st...      0

[199576 rows x 2 columns]


In [13]:
# Count how many rows carry each label (0 = ham, 1 = spam)
label_counts = df['label'].value_counts()
print("Label distribution:\n", label_counts)

Label distribution:
 label
0    106517
1     93059
Name: count, dtype: int64


In [14]:
# Preview the first rows again now that labels are mapped to integers
df.head()

Unnamed: 0,text,label
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [15]:
# Drop rows where text is NaN or empty BEFORE splitting
# df = df.dropna(subset=['text', 'label'])
# df = df[df['text'].str.strip() != ""]

In [16]:
# Features are the email text as strings; targets are the integer labels
X, y = df['text'].astype(str), df['label'].astype(int)

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Ensure text column has no NaN or empty values.
# The same row mask is applied to the labels: filtering only X would leave
# X_* and y_* with different lengths/indices as soon as a single row is dropped,
# which breaks model fitting and evaluation.
train_keep = X_train.notna() & (X_train.astype(str).str.strip() != "")
X_train = X_train.loc[train_keep].astype(str)
y_train = y_train.loc[train_keep]

test_keep = X_test.notna() & (X_test.astype(str).str.strip() != "")
X_test = X_test.loc[test_keep].astype(str)
y_test = y_test.loc[test_keep]

In [26]:
# Train the vectorizer and both classifiers once, cache them on disk,
# and reuse the cached artifacts on later runs.

artifact_paths = (NB_PATH, XGB_PATH, VEC_PATH)

if all(os.path.exists(path) for path in artifact_paths):
    print("Loading models from ./ml_model/")
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # artifact files that this notebook itself produced.
    restored = []
    for path in artifact_paths:
        with open(path, "rb") as f:
            restored.append(pickle.load(f))
    nb, xgb, vectorizer = restored

else:
    print("Training models...")

    # TF-IDF features limited to the 1000 highest-ranked terms, English stop words removed
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Naive Bayes
    nb = MultinomialNB()
    nb.fit(X_train_vec, y_train)
    y_pred_nb = nb.predict(X_test_vec)
    print("Naive Bayes Results:")
    print(classification_report(y_test, y_pred_nb))
    print("Accuracy:", accuracy_score(y_test, y_pred_nb))

    # XGBoost
    xgb = XGBClassifier(
        eval_metric='logloss',
        max_depth=3,
        min_child_weight=2,
        subsample=0.8,
        reg_alpha=1,
        reg_lambda=1,
    )
    xgb.fit(X_train_vec, y_train)
    y_pred_xgb = xgb.predict(X_test_vec)
    print("\nXGBoost Results:")
    print(classification_report(y_test, y_pred_xgb))
    print("Accuracy:", accuracy_score(y_test, y_pred_xgb))

    # Persist each artifact to its .pkl file
    for path, artifact in ((NB_PATH, nb), (XGB_PATH, xgb), (VEC_PATH, vectorizer)):
        with open(path, "wb") as f:
            pickle.dump(artifact, f)

    print("Models trained & saved in ./ml_model/")

Loading models from ./ml_model/


In [27]:
# Evaluate both models on the held-out test set.
# Each model predicts once and the result is reused for the report and the
# accuracy score (previously every model ran inference twice on the same matrix).
X_test_vec = vectorizer.transform(X_test)

for heading, model in (("Naive Bayes Results:", nb), ("\nXGBoost Results:", xgb)):
    y_pred = model.predict(X_test_vec)
    print(heading)
    print(classification_report(y_test, y_pred))
    print("Accuracy:", accuracy_score(y_test, y_pred))

Naive Bayes Results:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91     21430
           1       0.92      0.88      0.90     18486

    accuracy                           0.91     39916
   macro avg       0.91      0.90      0.91     39916
weighted avg       0.91      0.91      0.91     39916

Accuracy: 0.9065036576811304

XGBoost Results:
              precision    recall  f1-score   support

           0       0.97      0.92      0.95     21430
           1       0.92      0.97      0.94     18486

    accuracy                           0.94     39916
   macro avg       0.94      0.95      0.94     39916
weighted avg       0.95      0.94      0.94     39916

Accuracy: 0.9443330995089688


In [28]:
# --- Prediction Function ---
def classify_and_store_email(email_text: str):
    """Classify one email with both models, store the result, and report it.

    The text is vectorized with the module-level ``vectorizer``, scored by the
    module-level ``nb`` and ``xgb`` models, and the raw predictions are passed
    to ``insert_prediction_to_db``.

    Args:
        email_text: The email content to classify.

    Returns:
        A dict with the original text and, per model, "Spam" when the
        prediction equals 1 and "Ham" otherwise.
    """
    features = vectorizer.transform([email_text])
    nb_label, xgb_label = (model.predict(features)[0] for model in (nb, xgb))

    insert_prediction_to_db(email_text, nb_label, xgb_label)

    def _as_text(flag):
        # Only a prediction equal to 1 counts as spam.
        if flag == 1:
            return "Spam"
        return "Ham"

    return dict(
        email_text=email_text,
        naive_bayes_prediction=_as_text(nb_label),
        xgboost_prediction=_as_text(xgb_label),
    )

In [29]:
# --- Main Execution ---
# Prompt for one email, classify and store it, then print both verdicts.
if __name__ == "__main__":
    raw_input_text = input("Enter the email content/text to classify: ")
    user_email = raw_input_text.strip()
    result = classify_and_store_email(user_email)

    print("\nClassification Result:")
    report_rows = (
        ("Email Text:", "email_text"),
        ("Naive Bayes Prediction:", "naive_bayes_prediction"),
        ("XGBoost Prediction:", "xgboost_prediction"),
    )
    for caption, key in report_rows:
        print(caption, result[key])

Enter the email content/text to classify:  Congratulations! You have won $10,000 in our lucky draw. Click this link immediately to claim your prize!!!



Classification Result:
Email Text: Congratulations! You have won $10,000 in our lucky draw. Click this link immediately to claim your prize!!!
Naive Bayes Prediction: Spam
XGBoost Prediction: Spam
