<a href="https://colab.research.google.com/github/Sarleymwaka/Community-members-/blob/main/emailphishing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Setup ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Initial email dataset
emails = {
    "text": [
        "Win a free iPhone now! Click here.",
        "Your package is on the way.",
        "Urgent! Your account has been locked. Reset now!",
        "Meeting rescheduled to 3PM.",
        "Claim your lottery prize now!",
        "Invoice for your recent purchase.",
        "Security alert! Unusual login detected."
    ],
    "label": [1, 0, 1, 0, 1, 0, 1]
}

# Convert to DataFrame
df = pd.DataFrame(emails)

# Vectorize and train model
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["text"])
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)
# --- Add Your Own Emails ---
more_emails = {
    "text": [
        "Please find attached your interview schedule.",
        "You've been selected for a reward! Act now!",
        "Reminder: Your doctor's appointment is tomorrow.",
        "Verify your Apple ID to avoid suspension.",
        "Update your billing info to continue your subscription.",
        "Hi team, here are the minutes from today's meeting.",
        "You have a new voicemail, click to listen.",
        "Action needed: confirm your PayPal login.",
        "Lunch tomorrow at 1PM?",
        "Get rich quick by joining this crypto investment!"
    ],
    "label": [0, 1, 0, 1, 1, 0, 1, 1, 0, 1]
}

# Add to dataset
df2 = pd.DataFrame(more_emails)
df = pd.concat([df, df2], ignore_index=True)

# Re-vectorize and retrain the model
X = vectorizer.fit_transform(df["text"])
y = df["label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Show updated accuracy
print("âœ… New Accuracy after adding custom emails:", round(accuracy_score(y_test, y_pred)*100, 2), "%")
# --- Visualize Phishy Words ---
# Get word list and class log probabilities from the trained model
feature_names = vectorizer.get_feature_names_out()
class_probs = model.feature_log_prob_

# Calculate which words are most strongly associated with phishing
phishing_word_weights = class_probs[1] - class_probs[0]
top_phishy_indices = np.argsort(phishing_word_weights)[-10:]

print("ðŸš¨ Top suspicious (phishy) words the AI has learned:")
for idx in reversed(top_phishy_indices):  # Most phishy at the top
    print(f"- {feature_names[idx]}")

âœ… New Accuracy after adding custom emails: 66.67 %
ðŸš¨ Top suspicious (phishy) words the AI has learned:
- now
- been
- you
- unusual
- selected
- ve
- urgent
- suspension
- this
- listen
