# In [None]:
import re

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier

# Read CEAS_08.csv and discard rows whose 'body' column is missing
df = pd.read_csv('CEAS_08.csv').dropna(subset=['body'])

# Limit dataset size to reduce memory usage
# df = df.sample(n=10000, random_state=42)  

# Clean text: remove URLs, then keep only lower-cased ASCII letters and whitespace
# URL schemes are case-insensitive, so "HTTP://..." must be matched as well.
_URL_PATTERN = re.compile(r'https?://\S+', re.IGNORECASE)
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def clean_email(email: str) -> str:
    """Return *email* stripped of URLs and reduced to lower-case letters.

    Processing happens in two passes:

    1. Every ``http://`` / ``https://`` URL (scheme matched
       case-insensitively, up to the next whitespace) is removed.
    2. Every character that is not an ASCII letter or whitespace is dropped,
       and the result is lower-cased.

    Whitespace is left untouched, so the gap around a removed URL or number
    survives as consecutive spaces.

    Args:
        email: Raw email body text.

    Returns:
        The cleaned, lower-cased text.
    """
    without_urls = _URL_PATTERN.sub('', email)
    return _NON_LETTER_PATTERN.sub('', without_urls).lower()

df['cleaned_body'] = df['body'].apply(clean_email)


def _sender_domain(sender):
    """Return the lower-cased domain part of a sender field, or 'unknown'.

    The domain is read from the text after the last '@', keeping only the
    leading run of word characters, dots and hyphens, so a display-name form
    such as ``Name <user@host.com>`` yields ``host.com`` rather than
    ``host.com>``. Values without an '@' (NaN included, via ``str``) map to
    'unknown'.
    """
    text = str(sender)
    if '@' not in text:
        return 'unknown'
    match = re.match(r'[\w.-]+', text.rsplit('@', 1)[-1])
    return match.group(0).lower() if match else 'unknown'


# Integer id per distinct sender domain, kept on the frame for inspection.
# The models consume the one-hot encoding built below, not this raw id.
df['sender_domain'] = pd.factorize(df['sender'].apply(_sender_domain))[0]

y = df['label']

# Pick the train/test rows *before* fitting any transformer, so the
# vocabulary and the set of known domains are learned from training rows only.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, stratify=y, random_state=42
)

# Convert text into numerical features (Bag of Words).
# The matrix is kept sparse: densifying n_rows x 5000 counts is what drives
# memory usage up on the full dataset.
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
vectorizer.fit(df['cleaned_body'].iloc[train_idx])
X_text = vectorizer.transform(df['cleaned_body'])

# One-hot encode the sender domain. MultinomialNB interprets every feature
# value as a count, so an arbitrary integer id would act like a huge count
# and swamp the word features. Domains not seen in training encode as all
# zeros (handle_unknown='ignore').
domain_encoder = OneHotEncoder(handle_unknown='ignore')
domain_encoder.fit(df[['sender_domain']].iloc[train_idx])
X_domain = domain_encoder.transform(df[['sender_domain']])

# Feature set (CSR so rows can be selected by position)
# NOTE(review): XGBoost is documented to treat unstored sparse entries as
# missing rather than 0 -- scores may differ slightly from a dense run.
X = sparse.hstack([X_text, X_domain], format='csr')

# Train-test split
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Fit a multinomial Naïve Bayes classifier on the training split
nb = MultinomialNB().fit(X_train, y_train)

# Fit an XGBoost classifier (logistic objective, AUC as evaluation metric)
xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
)
xgb.fit(X_train, y_train)

# Report held-out metrics for each fitted classifier
models = {'XGBoost': xgb, 'Naïve Bayes': nb}

for name in models:
    model = models[name]
    y_pred = model.predict(X_test)

    # Probability of the second class (column 1), when the estimator has one
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_pred_proba = None

    print(f"\n{name} Model Results:")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    # ROC-AUC needs scores, so it is skipped when no probabilities exist
    if y_pred_proba is None:
        continue
    print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))
