In [30]:
import glob
import os
import csv
import pandas as pd

# Source-specific column names mapped onto the shared schema
# (id, subject, sender, receiver, date, body, label).
_RENAME_MAP = {
    "MessageID": "id",
    "SubjectLine": "subject",
    "Sender": "sender",
    "Receiver": "receiver",
    "ReceivedDate": "date",
    "EmailBody": "body",
    "Email Text": "body",
    "Category": "label",
    "Email Type": "label",
}

# Placeholder written into metadata columns when a value is missing.
_PLACEHOLDERS = {
    "sender": "(unknown sender)",
    "subject": "(no subject)",
    "receiver": "(unknown receiver)",
    "date": "(unknown date)",
}

# Known label spellings -> binary class (0 for ham/legitimate/safe, 1 for
# phishing/spam). Keys are lower-case because labels are lower-cased and
# stripped before lookup, which makes the match case-insensitive.
_LABEL_MAP = {
    "ham": 0,
    "legitimate": 0,
    "0.0": 0,
    "safe email": 0,
    "phishing": 1,
    "spam": 1,
    "1.0": 1,
    "phishing email": 1,
}


def _normalize_text(series):
    """Return the series as lower-cased, single-line, whitespace-trimmed strings."""
    return (
        series
        .astype(str)
        .str.lower()
        # Treat \r\n, a lone \r and \n all as line breaks so that files with
        # Windows line endings do not keep a stray carriage return.
        .str.replace(r"\r\n?|\n", " ", regex=True)
        .str.strip()
    )


def _standardize_labels(series):
    """Convert textual or numeric labels to a nullable Int64 series.

    String labels are matched against _LABEL_MAP ignoring case and
    surrounding whitespace. Values that are not in the map are kept and
    parsed as numbers; anything that still cannot be parsed becomes <NA>.
    """
    lookup_keys = series.map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )
    resolved = lookup_keys.map(_LABEL_MAP).fillna(series)
    return pd.to_numeric(resolved, errors="coerce").astype("Int64")


def clean_csv(file_path):
    """
    Read a CSV from file_path and apply the basic cleaning steps.

    Steps, in order:
      1. Read the file with the python engine, skipping malformed lines.
      2. Rename source-specific columns to the shared schema.
      3. Create any of sender/subject/receiver/date/label/body that are
         missing, and fill missing metadata values with placeholders.
      4. Drop rows that have no label or no body, and drop the helper
         columns "Unnamed: 0" and "urls" when present.
      5. Lower-case, trim and flatten subject/body to a single line.
      6. Convert labels to nullable integers (see _standardize_labels).

    Parameters
    ----------
    file_path : str or path-like
        Location of the raw CSV file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Rows whose label is present but not recognised
        are kept with label <NA>.

    Raises
    ------
    Whatever pandas.read_csv raises for an unreadable or missing file.
    """
    df = pd.read_csv(file_path, engine='python', on_bad_lines='skip')
    df = df.rename(columns=_RENAME_MAP)

    # Guarantee the full schema so the steps below never hit a missing column.
    for col in ["sender", "subject", "receiver", "date", "label", "body"]:
        if col not in df.columns:
            df[col] = None

    for col, placeholder in _PLACEHOLDERS.items():
        df[col] = df[col].fillna(placeholder)

    # Rows without a label or body are dropped *before* the text is cast to
    # str, otherwise missing bodies would turn into the literal "nan".
    df = df.dropna(subset=["label", "body"])
    df = df.drop(columns=["Unnamed: 0", "urls"], errors="ignore")

    # assign() builds a new frame, so the caller never receives a view of an
    # intermediate result.
    return df.assign(
        subject=_normalize_text(df["subject"]),
        body=_normalize_text(df["body"]),
        label=_standardize_labels(df["label"]),
    )

# Batch step: clean every raw CSV directly under ../data and write each result
# to ../data/processed_data/<name>_cleaned.csv.
output_dir = "../data/processed_data/"

# Create the destination up front. Without it, every to_csv call below fails
# on a fresh checkout and the loop only prints one error per file.
os.makedirs(output_dir, exist_ok=True)

# glob returns files in filesystem order; sorting keeps the log reproducible.
csv_files = sorted(glob.glob("../data/*.csv"))

for file_path in csv_files:
    print(f"Processing {file_path}...")
    try:
        cleaned_df = clean_csv(file_path)

        # Build the output filename and place it in the processed-data directory.
        base_name = os.path.basename(file_path)        # filename without its directory
        name_no_ext = os.path.splitext(base_name)[0]   # filename without its extension
        cleaned_name = f"{name_no_ext}_cleaned.csv"    # append the "_cleaned" suffix
        output_path = os.path.join(output_dir, cleaned_name)

        # Quote every field so bodies containing commas or quotes round-trip safely.
        cleaned_df.to_csv(output_path, index=False, quoting=csv.QUOTE_ALL)
        print(f"Saved cleaned file to {output_path}")
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not stop the batch.
        print(f"Error processing {file_path}: {e}")

print("Done!")

Processing ../data\CEAS_08.csv...
Saved cleaned file to ../data/processed_data/CEAS_08_cleaned.csv
Processing ../data\Enron.csv...
Saved cleaned file to ../data/processed_data/Enron_cleaned.csv
Processing ../data\Ling.csv...
Saved cleaned file to ../data/processed_data/Ling_cleaned.csv
Processing ../data\Nazario.csv...
Saved cleaned file to ../data/processed_data/Nazario_cleaned.csv
Processing ../data\Nigerian_Fraud.csv...
Saved cleaned file to ../data/processed_data/Nigerian_Fraud_cleaned.csv
Processing ../data\Phishing_Email2.csv...
Saved cleaned file to ../data/processed_data/Phishing_Email2_cleaned.csv
Processing ../data\SpamAssasin.csv...
Saved cleaned file to ../data/processed_data/SpamAssasin_cleaned.csv
Done!


In [27]:
# Sanity check for one cleaned file: column names, dtypes and a preview.
import pandas as pd

# Widen the display first so long email bodies and every column are visible
# in the preview (use None for max_colwidth to disable truncation entirely).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

# Point this at whichever processed file you want to inspect.
df = pd.read_csv("../data/processed_data/Phishing_Email2_cleaned.csv")

print(df.columns)   # all column names
print(df.dtypes)    # dtype of each column
df.head()

Index(['Unnamed: 0', 'body', 'label', 'sender', 'subject', 'receiver', 'date'], dtype='object')
Unnamed: 0    object
body          object
label         object
sender        object
subject       object
receiver      object
date          object
dtype: object


Unnamed: 0.1,Unnamed: 0,body,label,sender,subject,receiver,date
0,0,"re : 6 . 1100 , disc : uniformitarianism , re : 1086 ; sex / lang dick hudson 's observations on us use of 's on ' but not 'd aughter ' as a vocative are very thought-provoking , but i am not sure that it is fair to attribute this to "" sons "" being "" treated like senior relatives "" . for one thing , we do n't normally use ' brother ' in this way any more than we do 'd aughter ' , and it is hard to imagine a natural class comprising senior relatives and 's on ' but excluding ' brother ' . for...",0.0,(unknown sender),(no subject),(unknown receiver),(unknown date)
1,1,"the other side of * galicismos * * galicismo * is a spanish term which names the improper introduction of french words which are spanish sounding and thus very deceptive to the ear . * galicismo * is often considered to be a * barbarismo * . what would be the term which designates the opposite phenomenon , that is unlawful words of spanish origin which may have crept into french ? can someone provide examples ? thank you joseph m kozono < kozonoj @ gunet . georgetown . edu >",0.0,(unknown sender),(no subject),(unknown receiver),(unknown date)
2,2,"re : equistar deal tickets are you still available to assist robert with entering the new deal tickets for equistar ? after talking with bryan hull and anita luong , kyle and i decided we only need 1 additional sale ticket and 1 additional buyback ticket set up . - - - - - - - - - - - - - - - - - - - - - - forwarded by tina valadez / hou / ect on 04 / 06 / 2000 12 : 56 pm - - - - - - - - - - - - - - - - - - - - - - - - - - - from : robert e lloyd on 04 / 06 / 2000 12 : 40 pm to : tina valade...",0.0,(unknown sender),(no subject),(unknown receiver),(unknown date)
3,3,"hello i am your hot lil horny toy. i am the one you dream about, i am a very open minded person, love to talk about and any subject. fantasy is my way of life, ultimate in sex play. ummmmmmmmmmmmmm i am wet and ready for you. it is not your looks but your imagination that matters most, with my sexy voice i can make your dream come true... hurry up! call me let me cummmmm for you.......................... toll-free: 1-877-451-teen (1-...",1.0,(unknown sender),(no subject),(unknown receiver),(unknown date)
4,4,"software at incredibly low prices ( 86 % lower ) . drapery seventeen term represent any sing . feet wild break able build . tail , send subtract represent . job cow student inch gave . let still warm , family draw , land book . glass plan include . sentence is , hat silent nothing . order , wild famous long their . inch such , saw , person , save . face , especially sentence science . certain , cry does . two depend yes , written carry .",1.0,(unknown sender),(no subject),(unknown receiver),(unknown date)
