In [11]:
# Let's generate a minimal phishing email detection example using scikit-learn and a small dataset

import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib


In [12]:
# Path of the e-mail dataset CSV, kept in one named constant
DATASET_CSV = '/content/drive/MyDrive/Colab Notebooks/CEAS_08.csv'
df = pd.read_csv(DATASET_CSV)

In [13]:
# Preview the first rows to check which columns were loaded
df.head()

Unnamed: 0,sender,receiver,date,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,user4@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 16:31:02 -0700",Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 18:31:03 -0500",Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,user2.9@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 20:28:00 -1200",CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,SpamAssassin Dev <xrh@spamassassin.apache.org>,"Tue, 05 Aug 2008 17:31:20 -0600",Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,user2.2@gvc.ceas-challenge.cc,"Tue, 05 Aug 2008 19:31:21 -0400",SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


In [14]:
# Step 1: Clean the Text in the 'body' column
def clean_text(text):
    """Normalise one e-mail body before TF-IDF vectorisation.

    The text is lower-cased, then URL-like tokens, digit runs and ASCII
    punctuation are removed, and surrounding whitespace is trimmed.

    Args:
        text: Raw body. Non-string values are converted with ``str``; missing
            scalars (``None``, ``NaN``) are treated as an empty body.

    Returns:
        str: The cleaned text (possibly empty).
    """
    # A missing body must not be stringified into the literal token "nan"/"none",
    # which would otherwise end up in the vocabulary.
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    text = str(text).lower()
    # `http\S+` already covers https links, so no separate alternative is needed
    text = re.sub(r"http\S+|www\S+", '', text)           # remove links
    text = re.sub(r'\d+', '', text)                      # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    return text.strip()

# Run the cleaner over every body and keep the result as a new column
cleaned_bodies = df['body'].map(clean_text)
df['cleaned_body'] = cleaned_bodies

# Step 3: Labels come straight from the 'label' column
y = df['label']

# Step 2: TF-IDF features (max_features=5000), fitted on every cleaned body in df
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(cleaned_bodies)

In [15]:
# Split the cleaned text rather than an already-vectorised matrix: the TF-IDF
# vocabulary and IDF weights must be learned from the training rows only,
# otherwise the held-out e-mails leak into the features and inflate the scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_body'], y, test_size=0.2, random_state=42
)

# Refit the vectorizer on the training split, then only *transform* the test
# split. `vectorizer` keeps its name so the fitted object that gets saved
# matches the features the model was trained on.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out 20% that neither the vectorizer nor the model has seen
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3490
           1       0.99      0.99      0.99      4341

    accuracy                           0.99      7831
   macro avg       0.99      0.99      0.99      7831
weighted avg       0.99      0.99      0.99      7831



In [16]:
# Persist the classifier and the fitted vectorizer as two separate joblib files
model_path = '/content/drive/MyDrive/phishing_model.pkl'
vectorizer_path = '/content/drive/MyDrive/vectorizer.pkl'

joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)


['/content/drive/MyDrive/vectorizer.pkl']

In [18]:
# Generate a basic Streamlit app (app.py) that uses the saved model and vectorizer.
# The template is a raw string so the regex backslashes reach app.py unchanged.

streamlit_code = r"""
import re
import string
from pathlib import Path

import joblib
import streamlit as st

# Resolve the artifacts next to this script so the app works from any working directory
APP_DIR = Path(__file__).resolve().parent

# Load the model and vectorizer
model = joblib.load(APP_DIR / "phishing_model.pkl")
vectorizer = joblib.load(APP_DIR / "vectorizer.pkl")


def clean_text(text):
    # Must mirror the cleaning used at training time: the model only ever saw
    # text that was lower-cased and stripped of links, digits and punctuation.
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove links
    text = re.sub(r'\d+', '', text)                      # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    return text.strip()


st.set_page_config(page_title="Phishing Email Detector", page_icon="🛡️")
st.title("🛡️ AI-Powered Phishing Email Detector")
st.write("Paste an email below to check if it's likely phishing or legitimate using an AI model.")

# User input
email_text = st.text_area("✉️ Enter email text here:")

if st.button("🔍 Check Email"):
    if email_text.strip() == "":
        st.warning("Please enter some email text.")
    else:
        # Apply the training-time cleaning before vectorizing the pasted text
        input_vec = vectorizer.transform([clean_text(email_text)])
        prediction = model.predict(input_vec)
        result = "🚨 This is likely a **Phishing** email." if prediction[0] == 1 else "✅ This email looks **Legitimate**."
        st.markdown(result)
"""

# Save to file
app_path = "/content/drive/MyDrive/app.py"
# The template contains emoji, so write it as UTF-8 instead of relying on the
# platform's default encoding.
with open(app_path, "w", encoding="utf-8") as f:
    f.write(streamlit_code)

app_path

'/content/drive/MyDrive/app.py'

In [None]:
# Dataset file read by this notebook: CEAS_08.csv