# In [None]:
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# In [None]:
# ----------------------------
# 1️⃣ Load Email Dataset
# ----------------------------
# The CSV is expected to provide a 'text' column and a 'label' column.
try:
    df_email = pd.read_csv("emails.csv")
except FileNotFoundError:
    # Explain the problem in plain words, then stop the script.
    print("❌ emails.csv not found. Please place it in the same folder as this script.")
    raise SystemExit("Exiting: Dataset file is missing.")
else:
    # Quick summary of what was loaded.
    print("✅ Email dataset loaded successfully!")
    print("Shape of dataset:", df_email.shape)
    print("Columns:", df_email.columns)

# Stop early when either required column is absent.
required_columns = ('text', 'label')
if any(column not in df_email.columns for column in required_columns):
    raise ValueError("❌ Dataset must contain 'text' and 'label' columns.")

# ----------------------------
# 2️⃣ Features and Labels
# ----------------------------
# Blank CSV cells are read as NaN. TfidfVectorizer refuses NaN documents and
# the classifier refuses NaN targets, so rows missing either value are left
# out of the feature/label series instead of crashing the run later.
complete_rows = df_email[['text', 'label']].notna().all(axis=1)
n_incomplete = int((~complete_rows).sum())
if n_incomplete:
    print(f"⚠️ Dropping {n_incomplete} row(s) with missing text or label.")

X_email = df_email.loc[complete_rows, 'text']   # raw email text
y_email = df_email.loc[complete_rows, 'label']  # class label for each email

# ----------------------------
# 3️⃣ Train-Test Split (on raw text)
# ----------------------------
# Split BEFORE vectorizing: fitting TF-IDF on the full corpus would let the
# vocabulary and IDF weights absorb information from the held-out emails
# (data leakage) and make the evaluation overly optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_email, y_email, test_size=0.2, random_state=42
)

# ----------------------------
# 4️⃣ Vectorize Text Data (TF-IDF)
# ----------------------------
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = vectorizer.fit_transform(X_train_text)  # learn vocabulary/IDF from training emails only
X_test = vectorizer.transform(X_test_text)        # reuse the fitted vocabulary for the test emails

# Whole-corpus matrix kept under its original name in case other code
# refers to it; it is produced with the training-fitted vectorizer.
X_email_tfidf = vectorizer.transform(X_email)

# ----------------------------
# 5️⃣ Train Random Forest Model
# ----------------------------
# fit() hands back the fitted estimator, so construction and training
# can be written as a single expression.
email_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# ----------------------------
# 6️⃣ Predictions
# ----------------------------
# Predicted labels for the held-out emails.
y_pred = email_model.predict(X_test)

# ----------------------------
# 7️⃣ Evaluation
# ----------------------------
print("\n📊 Model Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
# The class order is pinned to [0, 1] so the matrix is always 2x2 and lines
# up with the hard-coded tick labels below. Without `labels`, the classes are
# inferred from the data, and a class missing from the test split would
# produce a smaller matrix that no longer matches the two tick labels.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Not Spam', 'Spam'],
    yticklabels=['Not Spam', 'Spam'],
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Email Spam Confusion Matrix")
plt.show()

# ----------------------------
# 8️⃣ Sample Predictions
# ----------------------------
y_test_np = np.array(y_test)

# Positions inside the test set for each class (0 = not spam, 1 = spam).
# Either array can be empty when the random split left a class out of the
# test set, so each lookup is guarded instead of indexing [0] blindly.
not_spam_positions = np.where(y_test_np == 0)[0]
spam_positions = np.where(y_test_np == 1)[0]

print("\n🔎 Sample Predictions:")

if not_spam_positions.size:
    # First not-spam email of the test set.
    idx_not_spam = not_spam_positions[0]
    sample_not_spam = X_test[idx_not_spam]
    print("Sample Not Spam -->", "Spam" if email_model.predict(sample_not_spam)[0] == 1 else "Not Spam")
else:
    print("Sample Not Spam --> (no not-spam email in the test set)")

if spam_positions.size:
    # First spam email of the test set.
    idx_spam = spam_positions[0]
    sample_spam = X_test[idx_spam]
    print("Sample Spam     -->", "Spam" if email_model.predict(sample_spam)[0] == 1 else "Not Spam")
else:
    print("Sample Spam     --> (no spam email in the test set)")

# ----------------------------
# 9️⃣ Optional: Function to predict any new email text
# ----------------------------
def predict_email(text):
    """Classify one email text with the module-level vectorizer and model.

    The text is wrapped in a one-element list, transformed by ``vectorizer``
    and passed to ``email_model``.

    Returns:
        "Spam" when the predicted label equals 1, otherwise "Not Spam".
    """
    features = vectorizer.transform([text])
    predicted_label = email_model.predict(features)[0]
    if predicted_label == 1:
        return "Spam"
    return "Not Spam"

# Example usage: classify one hand-written message and show the verdict.
new_email = "Congratulations! You won a free ticket. Click here to claim."
print("\n📧 New Email Prediction:")
print(f"{new_email} -> {predict_email(new_email)}")