In [None]:
# Uncomment to install the notebook's dependencies into the active kernel's environment:
# %pip install pandas nltk scikit-learn joblib

In [None]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

In [None]:
# Fetch the NLTK stop-word corpus (nltk.download handles the retrieval), then
# keep the English entries in a set for fast membership tests during cleaning.
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [None]:
# Read the dataset from CSV. The path is relative to the notebook's working
# directory; point it at the file's real location if this one does not resolve.
csv_path = './spam_or_not_spam.csv'
df = pd.read_csv(csv_path)

# Preview the top rows as a quick check that the load worked
df.head()

In [None]:
# Drop rows that are missing either the email text or its label.
# A missing 'email' cannot be preprocessed, and a missing 'label' would put NaN
# into the target vector, which the classifier's fit() rejects.
df = df.dropna(subset=['email', 'label'])

In [None]:
# Function to preprocess text data
def preprocess_text(text, stopword_set=None):
    """Lower-case ``text`` and remove stop-word tokens.

    Tokens are obtained by splitting on whitespace only, so punctuation stays
    attached to its word (e.g. ``"the,"`` is not matched by the stop word
    ``"the"``). Runs of whitespace collapse to single spaces in the result.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN are treated as empty text; any other
        non-string value is converted with ``str()`` before processing.
    stopword_set : collection of str, optional
        Lower-case tokens to remove. When omitted, the module-level
        ``stop_words`` set is used.

    Returns
    -------
    str
        The surviving lower-cased tokens joined by single spaces.
    """
    # Missing values (None, or NaN which is the only float unequal to itself)
    # become empty text instead of raising AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    if stopword_set is None:
        # Resolved at call time so the default always follows the global set.
        stopword_set = stop_words
    tokens = text.lower().split()
    return ' '.join(token for token in tokens if token not in stopword_set)

# Clean every email with preprocess_text, then write the cleaned text back
# over the original 'email' column.
cleaned_emails = df['email'].apply(preprocess_text)
df['email'] = cleaned_emails

In [None]:
# Target vector: the 'label' column
y = df['label']

# TF-IDF features for the cleaned emails.
# NOTE(review): fit_transform here sees every row of df, so X is computed from
# the whole dataset rather than from a training subset.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['email'])

In [None]:
# Split the dataset into training and testing sets.
# The raw (cleaned) text is split rather than an already-vectorized matrix, and
# the TF-IDF vectorizer is then fitted on the training emails only. Fitting on
# all rows would let the held-out emails shape the vocabulary and IDF weights,
# which makes the test-set scores optimistic.
emails_train, emails_test, y_train, y_test = train_test_split(
    df['email'], y, test_size=0.2, random_state=42
)

# fit_transform re-learns the vocabulary/IDF from the training emails alone;
# the test emails are only projected onto that fitted vocabulary.
X_train = vectorizer.fit_transform(emails_train)
X_test = vectorizer.transform(emails_test)

In [None]:
# Fit a logistic-regression classifier with default settings on the training split
model = LogisticRegression().fit(X_train, y_train)
model  # display the fitted estimator, as the bare fit() call does

In [None]:
# Score the held-out split: predict labels, then print the per-class
# classification report for predictions vs. true labels.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Persist both fitted objects with joblib. The vectorizer is saved alongside
# the model because predictions need text transformed by the same vectorizer.
model_path = 'phishing_model.pkl'
vectorizer_path = 'vectorizer.pkl'
joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)

In [None]:
# Classify a single email as phishing or legitimate
def predict_phishing(text):
    """Classify one email and report the model's probability for class column 1.

    The text is cleaned with ``preprocess_text``, vectorized with the
    module-level ``vectorizer`` and scored with the module-level ``model``.

    Parameters
    ----------
    text : str
        Raw email text.

    Returns
    -------
    tuple
        ``(label, probability)`` where ``label`` is ``'Phishing'`` when the
        predicted class value is truthy and ``'Legitimate'`` otherwise, and
        ``probability`` is the entry in column 1 of ``model.predict_proba``
        for this email.

    Notes
    -----
    Assumes the training labels are encoded so that the falsy value (e.g. 0)
    means legitimate and column 1 of ``predict_proba`` is the phishing class.
    TODO: confirm against the dataset's label encoding.
    """
    cleaned = preprocess_text(text)
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    predicted_class = model.predict(features)[0]
    phishing_probability = model.predict_proba(features)[0, 1]
    if predicted_class:
        label = 'Phishing'
    else:
        label = 'Legitimate'
    return label, phishing_probability

In [None]:
# Hand-written example messages used to try out predict_phishing
sample_emails = [
    "Congratulations! You've won a $1000 gift card. Click here to claim your prize.",
    "Your account has been compromised. Please reset your password immediately by clicking this link.",
    "Important: Your account will be suspended unless you verify your information here.",
    "Urgent: Update your payment information to avoid service interruption.",
    "You've been selected for a chance to win a brand new iPhone! Click to enter now.",
    "Get free access to our exclusive membership by clicking this link.",
    "Immediate action required: Your bank account is locked. Click here to unlock.",
    "Verify your email address now to receive a special offer.",
    "Win a free vacation by providing your email address here.",
    "Your computer is infected with a virus. Click here to download antivirus software."
]

# Score each sample and print its label and probability.
# map() is lazy, so each email is scored just before its own line is printed.
predictions = map(predict_phishing, sample_emails)
for email, (result, probability) in zip(sample_emails, predictions):
    print(f"Email: {email}\nPrediction: {result}\nProbability of Phishing: {probability:.2f}\n")