<a href="https://colab.research.google.com/github/Sharmila-ks/Phishing-Email-Detection/blob/main/phishingemailDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset and keep a small, reproducible sample (at most 100 rows).
data = pd.read_csv("phishing_email.csv")
# Cap the sample size at the row count so a CSV with fewer than 100 rows does
# not raise, and seed the draw so repeated runs report comparable metrics.
data = data.sample(n=min(100, len(data)), random_state=42)
data.rename(columns={'text_combined':'email_text'}, inplace=True)

# Text preprocessing (lowercase, remove punctuation).
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave all punctuation in
# place. The raw string avoids invalid-escape warnings for \w and \s.
# Missing texts are replaced with '' so only real strings reach the vectorizer.
data['cleaned_text'] = (
    data['email_text']
    .fillna('')
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Convert text to numbers: TF-IDF over unigrams and bigrams, English stop
# words removed, vocabulary capped at 5000 features.
# NOTE(review): the vectorizer is fitted on all rows before the split, so IDF
# statistics also see the test rows; fit on the training split only if a
# strictly held-out evaluation is needed.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X = vectorizer.fit_transform(data['cleaned_text'])
y = data['label']

# Train model on 80% of the sample; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)

# Test model on the held-out 20%
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Print detailed performance metrics
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

In [None]:
def predict_email(email_text: str) -> str:
    """Classify a single email as phishing or legitimate.

    The text is lowercased and every character that is neither a word
    character nor whitespace is removed (the same ``[^\\w\\s]`` rule used to
    build the training column), then it is transformed with the module-level
    ``vectorizer`` and labelled by the module-level ``model``.

    Args:
        email_text: Raw email text to classify.

    Returns:
        ``"Phishing Email"`` when the predicted label equals 1, otherwise
        ``"Legitimate Email"``.
    """
    # Imported locally: the file's import cell does not provide a regex module,
    # and the previous `string.punctuation` lookup raised NameError because
    # `string` was never imported.
    import re

    email_cleaned = re.sub(r'[^\w\s]', '', email_text.lower())
    # transform() expects an iterable of documents, hence the one-item list.
    email_vectorized = vectorizer.transform([email_cleaned])
    prediction = model.predict(email_vectorized)[0]
    return "Phishing Email" if prediction == 1 else "Legitimate Email"

# Interactive check: classify an email typed in by the user.
# Read the raw email text from standard input.
user_input = input("Please enter the email text for phishing detection: ")

# Run the classifier, then show its verdict.
verdict = predict_email(user_input)
print(verdict)