<a href="https://colab.research.google.com/github/S-Delowar/LLM-Email-Subjector/blob/main/process_email_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
print(torch.cuda.is_available())
print(torch.__version__)


True
2.5.1+cu124


In [2]:
!pip install datasets -q

In [3]:
# dependencies
import pandas as pd
import random
from datasets import load_dataset, Dataset
import os

## Load the Dataset
Enron email parsed Dataset from Huggingface: [Hellisotherpeople/enron_emails_parsed](https://huggingface.co/datasets/Hellisotherpeople/enron_emails_parsed)

In [5]:
dataset = load_dataset("Hellisotherpeople/enron_emails_parsed", split="train")
dataset

Dataset({
    features: ['date', 'from', 'to', 'subject', 'body'],
    num_rows: 535703
})

## Process the dataset


In [6]:
df = dataset.to_pandas()

In [7]:
df.head(5)

Unnamed: 0,date,from,to,subject,body
0,"Mon, 14 May 2001 16:39:00 -0700 (PDT)",phillip.allen@enron.com,tim.belden@enron.com,,Here is our forecast
1,"Fri, 4 May 2001 13:51:00 -0700 (PDT)",phillip.allen@enron.com,john.lavorato@enron.com,Re:,Traveling to have a business meeting takes th...
2,"Wed, 18 Oct 2000 03:00:00 -0700 (PDT)",phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,test successful. way to go!!!
3,"Mon, 23 Oct 2000 06:13:00 -0700 (PDT)",phillip.allen@enron.com,randall.gay@enron.com,,"Randy, Can you send me a schedule of the sala..."
4,"Thu, 31 Aug 2000 05:07:00 -0700 (PDT)",phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,Let's shoot for Tuesday at 11:45.


In [8]:
# Keep only subject and body columns
df = df[["subject", "body"]]

In [9]:
# Clean subject: remove prefixes like "Re:", "FW:".
df["subject"] = df["subject"].str.replace(r'^\s*(Re:|FW:|Fw:)\s*', '', case=False, regex=True)

# Clean body: remove forwarded sections
df["body"] = df["body"].str.split("---------------------- Forwarded by").str[0].str.strip()

# Remove patterns like "Cc:" or closings like "Regards"
pattern = r'^(Cc:|Bcc:).*@|(Regards|Best),?\s+\w+'
df = df[~df["body"].str.contains(pattern, regex=True, case=False)].reset_index(drop=True)


  df = df[~df["body"].str.contains(pattern, regex=True, case=False)].reset_index(drop=True)


In [10]:
df.head(10)

Unnamed: 0,subject,body
0,,Here is our forecast
1,,Traveling to have a business meeting takes the...
2,test,test successful. way to go!!!
3,,"Randy, Can you send me a schedule of the salar..."
4,Hello,Let's shoot for Tuesday at 11:45.
5,Hello,"Greg, How about either next Tuesday or Thursda..."
6,,Please cc the following distribution list with...
7,PRC review - phone calls,any morning between 10 and 11:30
8,High Speed Internet Access,1. login: pallen pw: ke9davis I don't think t...
9,fixed forward or other Collar floor gas price ...,


In [11]:
df = df.dropna()  # Drop NaN values
# removing whitespaces or empty string
df = df[
    df["subject"].str.strip().astype(bool) &
    df["body"].str.strip().astype(bool)
]

In [12]:
df.shape

(325920, 2)

In [13]:
# Apply length filters
df["sub_len_char"] = df["subject"].str.len()
df["body_len_char"] = df["body"].str.len()


df.head(10)

Unnamed: 0,subject,body,sub_len_char,body_len_char
2,test,test successful. way to go!!!,4,30
4,Hello,Let's shoot for Tuesday at 11:45.,5,33
5,Hello,"Greg, How about either next Tuesday or Thursda...",5,56
7,PRC review - phone calls,any morning between 10 and 11:30,24,32
8,High Speed Internet Access,1. login: pallen pw: ke9davis I don't think t...,26,202
10,FW: fixed forward or other Collar floor gas pr...,"Mr. Buckner, For delivered gas behind San Dieg...",55,205
15,2001 Margin Plan,"Paula, 35 million is fine Phillip",16,33
21,Not business related..,I think Fletch has a good CPA. I am still doi...,22,56
22,Original Sept check/closing,"Brenda, Please use the second check as my Octo...",27,140
24,San Juan Index,"Liane, As we discussed yesterday, I am concern...",14,2130


In [14]:
import numpy as np
np.min(df["sub_len_char"]), np.max(df["sub_len_char"])

(1, 2009607)

In [15]:
np.min(df["body_len_char"]), np.max(df["body_len_char"])

(1, 2010630)

In [16]:
# For taking best emails, keep rows where subject length in between 20 to 80 characters
# and body length in between 50 to 1500

df = df[(df["sub_len_char"] >= 20) & (df["sub_len_char"] <= 80)]
df = df[(df["body_len_char"] >= 50) & (df["body_len_char"] <= 1500)]
df.reset_index(drop=True, inplace=True)

In [17]:
df.shape

(100923, 4)

In [18]:
df.tail()

Unnamed: 0,subject,body,sub_len_char,body_len_char
100918,EcoElectrica LNG Cargo in January 2002,Based on today's closing NYMEX prices there is...,39,875
100919,Reminder: Risk Management Simulation Questions,"Hi Andy, Thank you for briefly meeting with m...",47,538
100920,Non-Exempt Scorecard,The following was developed in an effort to as...,21,162
100921,Weekly Reports are Due,REMINDER: Your weekly update for Philippe and...,23,146
100922,Latest Marketing List,"Please find attached the latest, and what shou...",22,1093


In [19]:
# Create subset with randomly 15000 rows taking body & subject columns
df_subset = df[["body", "subject"]].sample(n=15_000, random_state=42).reset_index(drop=True)


In [20]:
df_subset.shape

(15000, 2)

# Including possible prompt instructions

In [21]:
# Define possible prompt instructions
instructions = [
    "Create a concise subject line for this email:",
    "Generate an email subject for this content:",
    "What would be a good subject for this message?",
    "Summarize this email into a subject line:",
    "Write a professional subject for:",
    "The following is the body of an email. Write a good subject for it:",
    "What would be an appropriate subject for this content?",
    "Suggest a subject line for the email below:",
]

In [22]:
# Function for adding prompt instructions with the body
def format_instruction(body):
    instruction = random.choice(instructions)
    return f"{instruction}\n\n{body}"

In [23]:
email_subjects = pd.DataFrame()

In [24]:
# Apply prompt formatting
email_subjects["input"] = df_subset["body"].apply(format_instruction)
email_subjects["output"] = df_subset["subject"]

In [25]:
email_subjects.head(10)

Unnamed: 0,input,output
0,Summarize this email into a subject line:\n\nK...,EES Daily for October 16th
1,Write a professional subject for:\n\nCotton Va...,Purchase and Sale Nominations - Eastrans Pipel...
2,Create a concise subject line for this email:\...,revised daily energy letter 1/22
3,What would be an appropriate subject for this ...,Video Message from Ken Lay on Up Front
4,Suggest a subject line for the email below:\n\...,Team Development Training
5,The following is the body of an email. Write a...,FANTASY FOOTBALL 2000
6,What would be an appropriate subject for this ...,Request Submitted: Access Request for patti.su...
7,Create a concise subject line for this email:\...,Weekly Activity Report-Roswell Area
8,Generate an email subject for this content:\n\...,Flight back from Super Bowl
9,Suggest a subject line for the email below:\n\...,ICE physical volumes


## Spliting into train, valid and test data

In [26]:
from sklearn.model_selection import train_test_split

In [32]:
# Split into train/validation/test as (80/10/10)
train_df, temp_df = train_test_split(email_subjects, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)


In [33]:
# Shapes
print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (12000, 2)
Validation shape: (1500, 2)
Test shape: (1500, 2)


# Saving final data


In [34]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
# Define directories
save_path = "/content/drive/MyDrive/Colab Notebooks/email_subject/processed_data"

# Create the directory if it doesn't exist
os.makedirs(save_path, exist_ok=True)

In [36]:
# Save the dataframes to CSV files in Google Drive

train_df.to_csv(os.path.join(save_path, 'email_subject_train.csv'), index=False)
val_df.to_csv(os.path.join(save_path, 'email_subject_val.csv'), index=False)
test_df.to_csv(os.path.join(save_path, 'emai_subject_test.csv'), index=False)


## Final data sample

In [37]:
# Sample of final data
print(f"Input (body): {email_subjects['input'][0]}")
print(f"\nOutput (Subject): {email_subjects['output'][0]}")

Input (body): Summarize this email into a subject line:

Kysa M. Alport Enron North America (O) 503-464-7486 (C) 503-706-5308

Output (Subject): EES Daily for October 16th


In [38]:

print(f"Input (body): {email_subjects['input'][1100]}")
print(f"\nOutput (Subject): {email_subjects['output'][1100]}")

Input (body): The following is the body of an email. Write a good subject for it:


Output (Subject): FBI information regarding possible terrorist threat on West
