In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

# Phishing-email classifier: TF-IDF word features plus a VADER sentiment
# score, fed to logistic regression and evaluated on a held-out 30% split.

# Load and preprocess dataset
df = pd.read_csv('emails.csv')  # Replace with actual path

# Fail early with a clear message if the columns used below are absent.
_missing_cols = {'email_body', 'label'} - set(df.columns)
if _missing_cols:
    raise KeyError(
        f"emails.csv is missing required column(s): {sorted(_missing_cols)}"
    )

# Empty CSV cells are read as NaN (a float), which has no .lower(); treat
# them as empty text instead of crashing.
raw_body = df['email_body'].fillna('').astype(str)

# Lower-case and collapse every run of non-word characters to one space.
_NON_WORD_RUN = re.compile(r'\W+')
df['clean_text'] = raw_body.apply(
    lambda text: _NON_WORD_RUN.sub(' ', text.lower())
)

# Add sentiment scores as features
# VADER is scored on the raw body rather than clean_text: the cleaning step
# removes the capitalisation, punctuation and "n't" contractions that VADER
# relies on for emphasis and negation.
sia = SentimentIntensityAnalyzer()
df['sentiment_score'] = raw_body.apply(
    lambda text: sia.polarity_scores(text)['compound']
)

# Split data
# Split BEFORE fitting the vectorizer so that vocabulary and IDF weights are
# learned from training emails only (no test-set leakage into the features).
# Stratify to keep class proportions equal in both splits; this needs at
# least two samples per class, so fall back to a plain split otherwise.
_label_counts = df['label'].value_counts()
_stratify_on = df['label'] if _label_counts.min() >= 2 else None
train_df, test_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=_stratify_on
)

# Vectorize text data
tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(train_df['clean_text']).toarray()
X_test_tfidf = tfidf.transform(test_df['clean_text']).toarray()  # transform only

# Append the sentiment score as one extra column after the TF-IDF columns.
X_train = np.hstack((X_train_tfidf, train_df[['sentiment_score']].values))
X_test = np.hstack((X_test_tfidf, test_df[['sentiment_score']].values))
y_train = train_df['label']
y_test = test_df['label']

# Train and evaluate model
# max_iter is raised from the default of 100 so the solver has room to
# converge on ~1000 features instead of stopping early with a warning.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print("Phishing Detector Performance:")
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))