In [None]:
#ASS18. Implement an Email Spam Detection model using a Support Vector Machine (SVM) for
#binary classification, where emails are categorized as Normal (Not Spam) or Abnormal (Spam).
#Apply oversampling or undersampling techniques to handle class imbalance and analyze model
#performance using appropriate evaluation metrics.

# ===============================================================
# Email Spam Detection using SVM with Oversampling / Undersampling
# ===============================================================

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import (
    confusion_matrix, precision_score, recall_score,
    f1_score, roc_auc_score, roc_curve, accuracy_score, ConfusionMatrixDisplay
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# -----------------------------------------------------------
# 1. Load Dataset
# -----------------------------------------------------------
# The CSV is looked up relative to the current working directory.
df = pd.read_csv("email.csv")
print("Dataset Loaded Successfully! Shape:", df.shape)

# -----------------------------------------------------------
# 2. Separate Features and Target
# -----------------------------------------------------------
# "Prediction" holds the class label. "Email No." is removed from the feature
# matrix along with it (presumably a row identifier -- verify against the CSV).
y = df["Prediction"]
X = df.drop(["Prediction", "Email No."], axis=1)

# -----------------------------------------------------------
# 3. Handle Missing Values (if any)
# -----------------------------------------------------------
# Report the feature-matrix shape on both sides of the cleanup so the two
# numbers are directly comparable. df.shape would also count the label and
# "Email No." columns, making it look as if columns had been dropped.
print("\nBefore dropping NaN:", X.shape)

# Join features and label so that a row is removed from both together.
# A single dropna() covers unlabeled rows as well as rows with NaN features,
# because it checks every column, including "Prediction".
combined = pd.concat([X, y], axis=1).dropna()

# Split back
X = combined.drop(columns=["Prediction"])
y = combined["Prediction"]

print("After dropping NaN:", X.shape)
print("Unique Target Values:", np.unique(y, return_counts=True))

# -----------------------------------------------------------
# 4. Split Data into Train/Test
# -----------------------------------------------------------
# 30% of the rows are held out for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

print(f"\nTrain size: {X_train.shape}, Test size: {X_test.shape}")

# -----------------------------------------------------------
# 5. Feature Scaling
# -----------------------------------------------------------
# The scaling statistics are learned from the training rows only; the same
# fitted scaler is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# -----------------------------------------------------------
# 6. Handle Class Imbalance (Choose One)
# -----------------------------------------------------------
# Only the scaled training partition is resampled; the test partition is not
# touched in this step.

# Option 1: Oversampling using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
resampled_class_counts = np.unique(y_resampled, return_counts=True)
print("\nAfter SMOTE Oversampling:", resampled_class_counts)

# Option 2: Undersampling (uncomment to use instead)
# rus = RandomUnderSampler(random_state=42)
# X_resampled, y_resampled = rus.fit_resample(X_train_scaled, y_train)
# print("After Random Undersampling:", np.unique(y_resampled, return_counts=True))

# -----------------------------------------------------------
# 7. Train SVM Model on Resampled Data
# -----------------------------------------------------------
print("\nTraining SVM Model...")
# probability=True is required so that predict_proba can be called below.
svm_model = SVC(
    kernel="rbf",
    probability=True,
    random_state=42,
).fit(X_resampled, y_resampled)
print("Training Complete!")

# -----------------------------------------------------------
# 8. Make Predictions
# -----------------------------------------------------------
# Column 1 of predict_proba is the second entry of svm_model.classes_
# (presumably the spam label -- verify against the label encoding).
class_probabilities = svm_model.predict_proba(X_test_scaled)
y_pred_proba = class_probabilities[:, 1]
y_pred = svm_model.predict(X_test_scaled)

# -----------------------------------------------------------
# 9. Evaluate Model Performance
# -----------------------------------------------------------
# Threshold-based metrics use the hard predictions; AUC uses the scores.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)
cm = confusion_matrix(y_test, y_pred)

print("\nConfusion Matrix (Normal / Spam):", cm, sep="\n")

print(
    "\nEvaluation Metrics:",
    f"Accuracy:  {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall:    {recall:.4f}",
    f"F1-Score:  {f1:.4f}",
    f"AUC:       {auc:.4f}",
    sep="\n",
)

# -----------------------------------------------------------
# 10. Plot Confusion Matrix
# -----------------------------------------------------------
# Render the matrix computed above as a heat map with integer cell counts.
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=["Normal", "Spam"],
)
disp.plot(values_format="d", cmap="Blues")
plt.title("Confusion Matrix for SVM Spam Classifier (with SMOTE)")
plt.show()

# -----------------------------------------------------------
# 11. Plot ROC Curve
# -----------------------------------------------------------
# roc_curve returns (fpr, tpr, thresholds); the thresholds are not plotted,
# so only the first two arrays are kept.
fpr, tpr = roc_curve(y_test, y_pred_proba)[:2]

plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'ROC Curve (AUC = {auc:.3f})')
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - SVM Spam Classifier (SMOTE)')
plt.legend()
plt.grid(True)
plt.show()




FileNotFoundError: [Errno 2] No such file or directory: 'cancer.csv'