# In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from urllib.parse import urlparse

# Read the e-mail dataset from CSV, then discard (in place) every row whose
# 'body' column is missing so later text processing only sees real values.
df = pd.read_csv(filepath_or_buffer='CEAS_08.csv')
df.dropna(axis='index', subset=['body'], inplace=True)

# Clean text & extract URLs
def clean_email(email):
    """Strip links and non-letter characters from an e-mail body.

    Args:
        email: The raw message body as a string.

    Returns:
        A ``(text, url_count)`` tuple. ``text`` is the body with every
        ``http://`` / ``https://`` link removed, everything except ASCII
        letters and whitespace dropped, and the remainder lower-cased.
        ``url_count`` is the number of links that were removed.
    """
    # subn deletes each link and reports how many replacements it made, so
    # the count comes from the same scan that removes the links.
    without_links, url_count = re.subn(r'http[s]?://\S+', '', email)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_links)
    return letters_only.lower(), url_count

# clean_email yields one (cleaned_text, url_count) pair per message; unzip the
# pairs into two new columns.
df['cleaned_body'], df['num_urls'] = zip(*df['body'].apply(clean_email))

# Bag of Words Vectorization
# Counts are emitted as float32 rather than the default int64: the combined
# feature matrix must be floating point for the scaling step below, and
# float32 halves the memory of the dense array.
# NOTE(review): the vocabulary is fit on every row before the train/test
# split, so test-set text influences which 5000 tokens are kept. Fit on the
# training rows only if a strictly held-out estimate is required.
vectorizer = CountVectorizer(stop_words='english', max_features=5000, dtype=np.float32)
X_text = vectorizer.fit_transform(df['cleaned_body']).toarray()


def _sender_domain(sender):
    """Return the normalized domain of *sender*, or 'unknown' if it has none."""
    text = str(sender)  # tolerate NaN / non-string cells
    if '@' not in text:
        return 'unknown'
    # Drop surrounding whitespace and a closing '>' (as in a
    # 'Name <user@host>' display form, if present); lower-case because domain
    # names are case-insensitive.
    domain = text.rsplit('@', 1)[-1].strip().rstrip('>').strip().lower()
    return domain or 'unknown'


# Extract sender domain and encode each distinct domain as an integer id.
# The ids from pd.factorize are arbitrary labels with no ordinal meaning.
df['sender_domain'] = pd.factorize(df['sender'].apply(_sender_domain))[0]

# Feature set: token counts followed by the two numeric columns (last two).
numeric_features = df[['num_urls', 'sender_domain']].to_numpy(dtype=np.float32)
X = np.hstack((X_text, numeric_features))
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize numerical features (the last two columns), fitting the scaler
# on the training rows only. X is floating point, so the z-scores are stored
# as-is; writing them into an integer array would truncate each value toward
# zero and wipe out most of the signal.
scaler = StandardScaler()
X_train[:, -2:] = scaler.fit_transform(X_train[:, -2:])
X_test[:, -2:] = scaler.transform(X_test[:, -2:])

# Train XGBoost
xgb = XGBClassifier(
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    objective='binary:logistic',
    eval_metric='auc',
)
xgb.fit(X_train, y_train)

# Predictions & Evaluation
y_pred = xgb.predict(X_test)
y_pred_proba = xgb.predict_proba(X_test)[:, 1]  # probability of class 1

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

# :