Recommended Data Pre-processing Examples

Take two columns from a CSV and save them as a .npy file

The .npy files are saved in a dedicated folder (`../Datasets/NumpyData/`); a later function combines them all into separate spam and ham CSV files.

In [None]:
import pandas as pd
import numpy as np

def rename_columns(filename, ID, text_column_name, class_column_name, tags_to_replace,
                   output_dir='../Datasets/NumpyData/'):
    """Extract the text and class columns of a CSV and save them as a .npy file.

    The two selected columns are written, in the order (text, class), as a
    2-D NumPy array to ``<output_dir>/<ID>.npy``. The first rows of the
    exported data are printed as a quick sanity check.

    Args:
        filename: Path of the CSV file to read.
        ID: Base name (without extension) of the .npy file to write.
        text_column_name: Name of the CSV column holding the message text.
        class_column_name: Name of the CSV column holding the class label.
        tags_to_replace: Replacement spec handed to ``Series.replace`` for the
            class column, e.g. ``{1: 'spam', 0: 'ham'}``.
        output_dir: Folder the .npy file is written to. Defaults to the
            folder the original implementation always used.

    Raises:
        KeyError: If either column name is not present in the CSV.
    """
    import os

    csv_data = pd.read_csv(filename)
    export_data = csv_data[[text_column_name, class_column_name]].copy()

    # Assign the result back instead of calling replace(inplace=True) on the
    # selected column: the in-place form operates on a temporary Series and is
    # not guaranteed to update the DataFrame (it does not under Copy-on-Write).
    export_data[class_column_name] = export_data[class_column_name].replace(tags_to_replace)

    np.save(os.path.join(output_dir, ID + '.npy'), export_data.to_numpy())
    print(export_data.head())

How to extract the body text from a raw email

In [None]:
import email
from email import policy
from email.parser import BytesParser

def extract_body_from_email(raw_email, file_path):
    """Return the decoded plain-text body of a raw email.

    Args:
        raw_email: The complete email message as bytes.
        file_path: Where the email came from; only forwarded to
            ``decode_payload`` so it can be named in failure messages.

    Returns:
        For a single-part message, the decoded payload whatever its content
        type. For a multipart message, the decoded payload of the first
        ``text/plain`` part that is not an attachment, or ``None`` when the
        message has no such part.
    """
    msg = BytesParser(policy=policy.default).parsebytes(raw_email)

    if not msg.is_multipart():
        return decode_payload(msg.get_payload(decode=True), msg.get_content_charset(), file_path)

    # walk() visits the whole MIME tree, so a text/plain body nested inside
    # e.g. multipart/alternative within multipart/mixed is found as well;
    # iterating only the direct children would miss it.
    for part in msg.walk():
        if part.get_content_type() != 'text/plain':
            continue
        # A text/plain *attachment* (e.g. an attached .txt file) is not the body.
        if part.get_content_disposition() == 'attachment':
            continue
        return decode_payload(part.get_payload(decode=True), part.get_content_charset(), file_path)

    return None

#This is really only necessary if you have emails that cover multiple languages. Most emails are UTF-8.
def decode_payload(payload, charset, file_path):
    """Decode an email payload to text, trying several encodings in turn.

    The declared ``charset`` is tried first (when given), then UTF-8, then
    Latin-1. Latin-1 maps every byte value to a character, so for any bytes
    payload the final fallback always yields a string.

    Args:
        payload: The raw payload as bytes, or ``None`` when there is none.
        charset: Charset declared by the email, or ``None``/empty if unknown.
        file_path: Source of the email, used only in the failure message.

    Returns:
        The decoded text, or ``None`` if there was nothing to decode.
    """
    if payload is None:
        # Nothing to decode; report it the same way as a decoding failure
        # instead of crashing on None.decode().
        print(f"Failed to decode email in file: {file_path}")
        return None

    # No separate 'ascii' attempt: any payload that is valid ASCII has already
    # decoded identically as UTF-8.
    for enc in (charset, 'utf-8', 'latin-1'):
        if not enc:
            continue
        try:
            return payload.decode(enc)
        except (UnicodeDecodeError, LookupError):
            # Wrong encoding for these bytes, or an unknown charset name.
            continue

    print(f"Failed to decode email in file: {file_path}")
    return None


Combine all the .npy files into a Spam and Ham CSV

In [None]:
import pandas as pd
import numpy as np
import os

def load_npy_files_and_combine_to_csv(directory, spam_output_csv, ham_output_csv):
    """Merge every .npy file in a directory into one spam CSV and one ham CSV.

    Each .npy file is loaded as a two-column array and interpreted as
    ``(text, label)`` rows. Rows labelled ``'spam'`` go to ``spam_output_csv``
    and rows labelled ``'ham'`` go to ``ham_output_csv``; rows with any other
    label are dropped. Exact duplicate rows are removed from each output.

    Args:
        directory: Folder that is scanned (non-recursively) for .npy files.
        spam_output_csv: Path of the CSV to write the spam rows to.
        ham_output_csv: Path of the CSV to write the ham rows to.
    """
    columns = ["text", "label"]

    # Sort so the row order of the outputs does not depend on the arbitrary
    # order in which the OS lists the directory.
    npy_files = sorted(f for f in os.listdir(directory) if f.endswith('.npy'))
    print(f"Found .npy files: {npy_files}")

    # NOTE(security): allow_pickle=True is needed to load object arrays, but
    # unpickling can execute arbitrary code. Only point this function at .npy
    # files you created yourself.
    frames = [
        pd.DataFrame(np.load(os.path.join(directory, npy_file), allow_pickle=True), columns=columns)
        for npy_file in npy_files
    ]

    if frames:
        combined = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list; fall back to header-only outputs.
        combined = pd.DataFrame(columns=columns)

    combined_spam_df = combined[combined['label'] == 'spam'].drop_duplicates()
    combined_ham_df = combined[combined['label'] == 'ham'].drop_duplicates()

    combined_spam_df.to_csv(spam_output_csv, index=False)
    combined_ham_df.to_csv(ham_output_csv, index=False)

    print(f"Spam CSV saved to: {spam_output_csv}")
    print(f"Ham CSV saved to: {ham_output_csv}")

Remove Stopwords and HTML from Email bodies

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import string

def clean_text(text, stopwords):
    """Strip HTML, punctuation and stopwords from an email body.

    Args:
        text: The email body; may contain HTML markup. A missing value
            (``None``/NaN) is treated as an empty body.
        stopwords: Collection of words to drop. Words are lower-cased before
            the lookup, so the entries are expected to be lower-case.

    Returns:
        The remaining words joined by single spaces, or ``""`` when the body
        is missing or an error occurs while cleaning it.
    """
    try:
        # Skip emails with no body text
        if pd.isna(text):
            return ""

        # Hash-based lookup instead of scanning a list once per word.
        if not isinstance(stopwords, (set, frozenset)):
            stopwords = set(stopwords)

        # Remove HTML. The separator keeps text from adjacent tags apart;
        # without it "<p>Free</p><p>money</p>" collapses into "Freemoney".
        soup = BeautifulSoup(text, "html.parser")
        text = soup.get_text(separator=" ")

        # Remove punctuation
        text = text.translate(str.maketrans("", "", string.punctuation))

        # split() with no arguments also collapses the runs of whitespace
        # left behind by the tag separator.
        return " ".join(word for word in text.split() if word.lower() not in stopwords)
    except Exception as e:
        # Deliberately best-effort: one malformed email must not abort a
        # whole-dataset cleaning run.
        print(f"Error processing text: {e}")
        return ""

Convert the "spam" and "ham" labels to a consistent mapping.

In [None]:
def map_labels_and_save(input_file, output_file, label_mapping):
    """Rewrite the ``label`` column of a CSV through a mapping and save the result.

    Args:
        input_file: Path of the CSV to read; it must contain a ``label`` column.
        output_file: Path of the CSV to write.
        label_mapping: Mapping passed to ``Series.map``, e.g.
            ``{'spam': 1, 'ham': 0}``.

    Labels that the mapping turns into a missing value are written as empty
    cells, exactly as ``Series.map`` produces them; a warning listing them is
    printed so that this loss does not go unnoticed.
    """
    df = pd.read_csv(input_file)

    original_labels = df['label']
    mapped_labels = original_labels.map(label_mapping)

    # Series.map yields NaN for values missing from the mapping; surface them
    # (for instance when the file was already mapped, or a label is misspelled).
    unmapped = original_labels[mapped_labels.isna() & original_labels.notna()]
    if not unmapped.empty:
        distinct = sorted({str(value) for value in unmapped})
        print(f"Warning: {len(unmapped)} row(s) in {input_file} have labels that map to no value: {distinct}")

    df['label'] = mapped_labels
    df.to_csv(output_file, index=False)