In [1]:
from pathlib import Path

import datasets
import pandas as pd
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the raw phishing-email CSV (path is relative to the notebook's working directory).
raw_csv_path = "../raw_data/Phishing_Email.csv"
raw_df = pd.read_csv(raw_csv_path)

# Show the first rows as the cell's rich output.
raw_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [3]:
# Report the raw frame's dimensions and column labels.
row_count, column_count = raw_df.shape
print(f"Num Rows:{row_count}")
print(f"Num Columns:{column_count}")
print(f"Column names:{raw_df.columns}")

Num Rows:18650
Num Columns:3
Column names:Index(['Unnamed: 0', 'Email Text', 'Email Type'], dtype='object')


In [4]:
# Switch the text and label headers to snake_case; columns not listed keep their names.
snake_case_names = {"Email Text": "email_text", "Email Type": "email_type"}
raw_df = raw_df.rename(columns=snake_case_names)

In [5]:
# Recode the two label strings as the literal completions "false" / "true";
# values not listed in the mapping are left as they are.
label_to_completion = {"Safe Email": "false", "Phishing Email": "true"}
recoded_labels = raw_df["email_type"].replace(label_to_completion)
raw_df["email_type"] = recoded_labels

In [6]:
# Remove the "Unnamed: 0" column. errors="ignore" keeps this cell re-runnable:
# without it, a second execution raises KeyError because the column is already gone.
raw_df = raw_df.drop(columns=["Unnamed: 0"], errors="ignore")

In [7]:
# Sanity-check the cleaned frame: headers, label vocabulary, and shape.
label_values = raw_df["email_type"].unique()
print(f"Updated column names: {raw_df.columns}")
print(f"Unique values in email_type: {label_values}")
print(f"DataFrame shape: {raw_df.shape}")

Updated column names: Index(['email_text', 'email_type'], dtype='object')
Unique values in email_type: ['false' 'true']
DataFrame shape: (18650, 2)


In [8]:
# Build one chat-style record (system / user / assistant) per labelled email.

system_prompt = "You are a classification system designed to catch phishing messages to protect people from fraudsters and criminals. You will receive a message for review and if it is a phishing email, you MUST respond with only 'true' if it is phishing, or 'false' if it is not phishing. Making a mistake or failing to comply with the output format can result in serious harm to vulnerable people."


def user_prompt(text):
    """Wrap one email body in the user-turn template."""
    return f"Message for review: {text}"


def llm_response(completion):
    """Return the target completion unchanged; the label is used as the assistant turn as-is."""
    return completion


# Rows with no text or no label cannot form a training example: an f-string would
# render a missing text as the literal "nan" inside the user prompt. Drop them up front.
complete_rows = raw_df.dropna(subset=["email_text", "email_type"])

# Iterate the two columns directly instead of iterrows(), which builds a Series per row.
formatted_data = [
    {
        "system": system_prompt,
        "user": user_prompt(text),
        "assistant": llm_response(label),
    }
    for text, label in zip(complete_rows["email_text"], complete_rows["email_type"])
]

# Explicit columns keep the three-column schema even when there are no records.
training_df = pd.DataFrame(formatted_data, columns=["system", "user", "assistant"])


In [9]:
# Preview the first five formatted records.
training_df.head(n=5)

Unnamed: 0,system,user,assistant
0,You are a classification system designed to ca...,"Message for review: re : 6 . 1100 , disc : uni...",False
1,You are a classification system designed to ca...,Message for review: the other side of * galici...,False
2,You are a classification system designed to ca...,Message for review: re : equistar deal tickets...,False
3,You are a classification system designed to ca...,Message for review: \nHello I am your hot lil ...,True
4,You are a classification system designed to ca...,Message for review: software at incredibly low...,True


In [10]:
# Report the formatted frame's dimensions and column labels.
record_count, field_count = training_df.shape
print(f"Num Rows:{record_count}")
print(f"Num Columns:{field_count}")
print(f"Column names:{training_df.columns}")

Num Rows:18650
Num Columns:3
Column names:Index(['system', 'user', 'assistant'], dtype='object')


In [11]:
# Share of each label over the full formatted set, before splitting.
label_share = training_df["assistant"].value_counts(normalize=True)
print(f"Original Distribution: {label_share}")

Original Distribution: assistant
false    0.607078
true     0.392922
Name: proportion, dtype: float64


In [12]:
# 80/20 split, stratified on the label so both parts keep the same class balance.
# The fixed random_state makes the split reproducible across runs.
# NOTE(review): identical emails, if the dataset contains any, could land on both
# sides of the split — confirm whether de-duplication is needed.
split_labels = training_df["assistant"]
train_df, test_df = train_test_split(
    training_df, test_size=0.2, stratify=split_labels, random_state=69
)

In [13]:
# Label balance of each split, printed under its own heading.
for heading, split_df in (
    ("Train set distribution:", train_df),
    ("Test set distribution:", test_df),
):
    print(heading)
    print(split_df["assistant"].value_counts(normalize=True))

# Row counts: the full set and each split.
total_count = len(training_df)
train_count = len(train_df)
test_count = len(test_df)
print(f"\nTotal samples: {total_count}")
print(f"Training samples: {train_count}")
print(f"Test samples: {test_count}")

Train set distribution:
assistant
false    0.607105
true     0.392895
Name: proportion, dtype: float64
Test set distribution:
assistant
false    0.606971
true     0.393029
Name: proportion, dtype: float64

Total samples: 18650
Training samples: 14920
Test samples: 3730


In [14]:
# Make sure the output directory exists first: DataFrame.to_csv does not create
# missing parent directories and fails with an OSError instead.
processed_dir = Path("../processed_data")
processed_dir.mkdir(parents=True, exist_ok=True)

# Write both splits without the pandas index column.
# NOTE(review): the `assistant` labels are the strings 'true'/'false'; readers that
# infer dtypes (e.g. pandas.read_csv with defaults) may load them back as booleans —
# confirm the downstream loader keeps them as strings.
train_df.to_csv(processed_dir / "train.csv", index=False)
test_df.to_csv(processed_dir / "test.csv", index=False)