Importer les bibliothèques et les méthodes nécessaires 

Corriger le path

In [1]:
import sys,os
sys.path.append(os.path.abspath(".."))

In [2]:
import pandas as pd
from preprocessing.text_preprocessing import clean_text
from features.feature_extraction import extract_all_features

Chargement du dataset Phishing Email de Kaggle

In [3]:
df_kaggle = pd.read_csv("../data/raw/Phishing_Email.csv")

# Vérifier les colonnes
print(df_kaggle.columns)
df_kaggle.head()

Index(['Unnamed: 0', 'Email Text', 'Email Type'], dtype='object')


Unnamed: 0.1,Unnamed: 0,Email Text,Email Type
0,0,"re : 6 . 1100 , disc : uniformitarianism , re ...",Safe Email
1,1,the other side of * galicismos * * galicismo *...,Safe Email
2,2,re : equistar deal tickets are you still avail...,Safe Email
3,3,\nHello I am your hot lil horny toy.\n I am...,Phishing Email
4,4,software at incredibly low prices ( 86 % lower...,Phishing Email


In [4]:
"""df_kaggle = df_kaggle.rename(columns={
    "Email Text": "text",  
    "Email Type": "label"
})"""

'df_kaggle = df_kaggle.rename(columns={\n    "Email Text": "text",  \n    "Email Type": "label"\n})'

Chargement d'autre dataset SpamAssassin

In [5]:
def load_spamassassin(base_path):
    texts, labels = [], []

    for folder, label in [("easy_ham", 0), ("hard_ham", 0), ("spam", 1)]:
        folder_path = os.path.join(base_path, folder)

        for file in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file)
            if os.path.isfile(file_path):
                with open(file_path, "r", encoding="latin1", errors="ignore") as f:
                    texts.append(f.read())
                    labels.append(label)

    return pd.DataFrame({"text": texts, "label": labels})

df_spamassassin = load_spamassassin("../data/raw/spamassassin")

Nettoyage Indépendant des 2 datasets

In [6]:
def prepare_dataset(df, text_col, label_col, label_mapping=None):
    df = df[[text_col, label_col]].copy()
    df.columns = ["text", "label"]

    # Suppression NaN
    df.dropna(subset=["text", "label"], inplace=True)

    # FORCER le texte en string (IMPORTANT pour SpamAssassin)
    df["text"] = df["text"].astype(str)

    # Nettoyage du texte
    df["text"] = df["text"].apply(clean_text)

    # Suppression textes vides
    df = df[df["text"].str.len() > 0]

    # Mapping labels si nécessaire
    if label_mapping is not None:
        df["label"] = df["label"].map(label_mapping)

    # Suppression doublons APRÈS nettoyage
    df.drop_duplicates(subset=["text"], inplace=True)

    df.reset_index(drop=True, inplace=True)
    return df

Nettoyage indépendant des datasets

In [7]:
label_map_kaggle = {
    "Safe Email": 0,
    "Phishing Email": 1
}

df_kaggle_clean = prepare_dataset(
    df_kaggle,
    text_col="Email Text",
    label_col="Email Type",
    label_mapping=label_map_kaggle
)

df_spamassassin_clean = prepare_dataset(
    df_spamassassin,
    text_col="text",
    label_col="label"
)

Fusion des 2 datasets

In [8]:
final_text_df = pd.concat(
    [df_kaggle_clean, df_spamassassin_clean],
    ignore_index=True
)

# Sécurité finale
final_text_df.drop_duplicates(subset=["text"], inplace=True)
final_text_df.reset_index(drop=True, inplace=True)

print(final_text_df.shape)
print(final_text_df["label"].value_counts())

(17425, 2)
label
0.0    10960
1.0     6465
Name: count, dtype: int64


In [10]:
final_text_df = pd.concat(
    [df_kaggle_clean, df_spamassassin_clean],
    ignore_index=True
)

print("Doublons AVANT :", final_text_df.duplicated(subset=["text"]).sum())


# Suppression définitive des doublons texte
final_text_df.drop_duplicates(subset=["text"], inplace=True)

print("Doublons APRÈS :", final_text_df.duplicated(subset=["text"]).sum())
# 2. (OPTIONNEL)
if "final_df" in globals():
    del final_df

Doublons AVANT : 0
Doublons APRÈS : 0


Extraction automatique des features

In [11]:
features_list = []

for _, row in final_text_df.iterrows():
    email_data = {
        "Body_Text": row["text"],
        "Attachments": [],
        "Received-SPF": None,
        "DKIM-Signature": None,
        "Authentication-Results": None
    }

    features = extract_all_features(email_data, row["text"])
    features["label"] = row["label"]

    features_list.append(features)

final_df = pd.DataFrame(features_list)

Vérification des valeurs manquantes

In [12]:
print(final_df.isnull().sum())

text_length              0
word_count               0
suspicious_word_count    0
url_count                0
https_url_count          0
unique_domains           0
spf_present              0
dkim_present             0
dmarc_present            0
attachment_count         0
dangerous_attachment     0
label                    0
dtype: int64


Vérification des doublons

In [13]:
print(final_df.duplicated().sum())

1716


Vérification de la répartition des classes

In [14]:
print(final_df["label"].value_counts())

label
0.0    10960
1.0     6465
Name: count, dtype: int64


In [15]:
print(final_df.head())
print(final_df.describe())

   text_length  word_count  suspicious_word_count  url_count  https_url_count  \
0          919         180                      0          0                0   
1          414          69                      0          0                0   
2         1053         209                      0          0                0   
3          500         107                      0          0                0   
4          387          64                      0          0                0   

   unique_domains  spf_present  dkim_present  dmarc_present  attachment_count  \
0               0            0             0              0                 0   
1               0            0             0              0                 0   
2               0            0             0              0                 0   
3               0            0             0              0                 0   
4               0            0             0              0                 0   

   dangerous_attachment  l

In [16]:
final_df.to_csv("../data/processed/emails_features.csv", index=False)