# Clasificación de Emails: ¿Phishing o Seguro?

In [9]:
# Sample email text used as the "safe" example; it is passed to predict_email() later in the notebook.
safe = "re : kenneth parkhill stinson : norma has checked the internal equity in the group , and kenneth is fine in a senior specialist spot at that salary . i will be happy to extend an offer to him . did you discuss anything concerning the relocation package with kenneth ? molly x 34804 stinson gibner 11 / 01 / 2000 06 : 18 pm to : molly magee / hou / ect @ ect cc : vince j kaminski / hou / ect @ ect subject : kenneth parkhill molly , we would like to go ahead with an offer to kenneth . after talking to him again , i think he will accept . we would like to offer him the equivalent package to an incoming associate , which i understand would be $ 76 k base and a signing bonus of $ 20 k . he position would be a specialist or senior specialist ( whichever fits the salary ) reporting to me . thanks for your help , stinson x 34748"

In [10]:
# Sample email text used as the "phishing" example; it is passed to
# predict_email() later in the notebook. The literal is kept verbatim
# (including its leading and trailing newlines).
phishing = """
Subject: 🛑 Urgent: Your Account Has Been Suspended
From: support@secure-paypal.com

Dear Customer,

We have detected unusual activity on your PayPal account and have temporarily limited your access for your protection.

To restore your account, please verify your information by clicking the secure link below:

👉 http://paypal.security-alert123.com

Failure to do so within 24 hours may result in permanent suspension of your account.

We apologize for the inconvenience and thank you for your cooperation.

Sincerely,  
PayPal Security Team
"""


In [11]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from pycaret.classification import *
import pandas as pd

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sofia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:

# Single shared WordNetLemmatizer instance; lematize() looks it up as a global.
lemmer = WordNetLemmatizer()
def lematize(text):
    """Lemmatize every whitespace-separated token of ``text``.

    The input is split on runs of whitespace, each token is passed through the
    module-level ``lemmer``, and the results are joined back together with a
    single space between tokens.

    Args:
        text: The string to process.

    Returns:
        A single string of lemmatized tokens separated by one space each.
    """
    tokens = text.split()
    lemmas = map(lemmer.lemmatize, tokens)
    return ' '.join(lemmas)


In [13]:
# Fit the TF-IDF vocabulary on the 'Cleaned' column of the training CSV so
# that unseen emails can be projected onto those feature columns.
# NOTE(review): the vectorizer is refit here at inference time; presumably it
# must match the settings used when the saved model was trained — confirm, or
# persist the fitted vectorizer and load it alongside the model.
vectorizer = TfidfVectorizer(max_features=5000)
train = pd.read_csv("Phishing_Email_Train.csv")
# Only the fitted state is needed; fit() avoids building a TF-IDF matrix for
# the whole training set that would be thrown away immediately.
vectorizer.fit(train['Cleaned'])
final_model = load_model('./pkl/phishing_model_final')

Transformation Pipeline and Model Successfully Loaded


In [14]:
def predict_email(email):
    """Classify one email as phishing or safe and print the verdict.

    The text is lemmatized with ``lematize``, vectorized with the module-level
    ``vectorizer``, and scored by ``final_model`` through ``predict_model``.
    The predicted label array is printed first, followed by a human-readable
    message.

    Args:
        email: Raw email text (a string).

    Returns:
        None. The result is written to stdout only.
    """
    cleaned = lematize(email)
    # transform() takes an iterable of documents, so a one-element list is
    # enough; no intermediate single-column DataFrame is needed.
    X_unseen = vectorizer.transform([cleaned])
    X_unseen_df = pd.DataFrame(
        X_unseen.toarray(), columns=vectorizer.get_feature_names_out()
    )

    unseen_predictions = predict_model(final_model, data=X_unseen_df)
    prediction = unseen_predictions['prediction_label'].values
    print(prediction)
    # Exactly one email was scored, so compare its scalar label instead of
    # relying on the truth value of a NumPy array.
    if prediction[0] == 1:
        print("This email is likely a phishing attempt.")
    else:
        print("This email is likely safe.")

In [15]:
# Run the classifier on the `safe` sample email.
predict_email(safe)

[0]
This email is likely safe.


In [16]:
# Run the classifier on the `phishing` sample email.
predict_email(phishing)

[1]
This email is likely a phishing attempt.
