<a href="https://colab.research.google.com/github/S-Delowar/LLM-Email-Subjector/blob/main/process_email_subjectline_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install datasets --upgrade --quiet

In [5]:
import os
import pandas as pd
from datasets import load_dataset, Dataset

## Load dataset from Hugging Face Datasets
Source: [Yale-LILY/aeslc](https://huggingface.co/datasets/Yale-LILY/aeslc)

In [7]:
# Load the Yale-LILY/aeslc dataset from the Hugging Face Hub.
dataset = load_dataset("Yale-LILY/aeslc")
# Bare last expression: lets the notebook display the loaded object
# (its splits, feature names and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['email_body', 'subject_line'],
        num_rows: 14436
    })
    validation: Dataset({
        features: ['email_body', 'subject_line'],
        num_rows: 1960
    })
    test: Dataset({
        features: ['email_body', 'subject_line'],
        num_rows: 1906
    })
})

## Process Data

In [8]:
# Convert each split with the Dataset's own Arrow-to-pandas conversion rather
# than handing the Dataset to pd.DataFrame, which walks it row by row.
train_df = dataset["train"].to_pandas()
val_df = dataset["validation"].to_pandas()
test_df = dataset["test"].to_pandas()

In [9]:
# Preview the first rows of the training split (email_body / subject_line).
train_df.head()

Unnamed: 0,email_body,subject_line
0,"Greg/Phillip, Attached is the Grande Communic...",Service Agreement
1,Phillip & Keith Attached is the first draw re...,Bishops Corner
2,Your Internet Banking accounts are now setup a...,Internet Banking
3,To our IBS Customers that are still hanging in...,Internet Banking
4,Phillip Good Morning!\nI hope you had a wonder...,SMEs for expert stories


### Cleaning

In [10]:
# Report the size of every split before any rows are removed.
print("Before cleaning:\n===============")
for split_name, split_df in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
  print(f"Shape of {split_name} data: {split_df.shape}")

Before cleaning:
Shape of Train data: (14436, 2)
Shape of Validation data: (1960, 2)
Shape of Test data: (1906, 2)


In [11]:
def clean_df(df, min_subject_len=6, max_subject_len=80):
  """Return a cleaned copy of an email / subject-line DataFrame.

  Cleaning steps:
    - drop rows where 'email_body' or 'subject_line' is missing;
    - drop rows where either field is empty or whitespace-only;
    - drop exact duplicate rows (the first occurrence is kept);
    - keep only rows whose 'subject_line' length (in characters, measured on
      the unstripped string) is within [min_subject_len, max_subject_len];
    - reset the index to 0..n-1.

  Args:
    df: DataFrame with string columns 'email_body' and 'subject_line'.
    min_subject_len: Smallest accepted subject length, inclusive (default 6).
    max_subject_len: Largest accepted subject length, inclusive (default 80).

  Returns:
    A new DataFrame; the input frame is not modified.
  """
  df = df.dropna(subset=["email_body", "subject_line"])

  # Whitespace-only text counts as empty.
  has_body = df["email_body"].str.strip().ne("")
  has_subject = df["subject_line"].str.strip().ne("")
  df = df[has_body & has_subject].drop_duplicates()

  # Series.between is inclusive on both ends by default.
  subject_len = df["subject_line"].str.len()
  df = df[subject_len.between(min_subject_len, max_subject_len)]

  return df.reset_index(drop=True)

In [12]:
# Run every split through the same cleaning routine.
train_df, val_df, test_df = (
  clean_df(split) for split in (train_df, val_df, test_df)
)

In [13]:
# Report the size of every split once cleaning has been applied.
print("After Cleaning:\n===============")
for split_name, split_df in (("Train", train_df), ("Validation", val_df), ("Test", test_df)):
  print(f"Shape of {split_name} data: {split_df.shape}")

After Cleaning:
Shape of Train data: (12794, 2)
Shape of Validation data: (1734, 2)
Shape of Test data: (1718, 2)


### Add Instruction

In [14]:
import random

# Define possible prompt instructions

# Alternative phrasings of the task; one is picked per email so the prompts
# vary in wording. Built once instead of on every call.
PROMPT_INSTRUCTIONS = (
  "Generate a concise subject line for this email:",
  "What would be a good subject for this message?",
  "Write a professional subject for this mail:",
  "Suggest a subject line for the email below:",
)


def generate_prompt(row, rng=None):
  """Build the model input for one email.

  The result is a randomly chosen instruction, an empty line, and then the
  email body.

  Args:
    row: Mapping-like record (e.g. a DataFrame row) with an 'email_body' entry.
    rng: Optional object with a `choice(seq)` method, such as a
      `random.Random` instance, used to pick the instruction. When omitted,
      the module-level `random` generator is used, so `random.seed(...)`
      makes the default path reproducible.

  Returns:
    The prompt string.
  """
  chooser = random if rng is None else rng
  instruction = chooser.choice(PROMPT_INSTRUCTIONS)
  return f"{instruction}\n\n{row['email_body']}"


def format_with_instruction(df):
  """Turn a cleaned split into an instruction-tuning table.

  Args:
    df: DataFrame with 'email_body' and 'subject_line' columns.

  Returns:
    DataFrame with two columns: 'input' (the prompt produced by
    generate_prompt for each row) and 'output' (the row's subject line).
  """
  prompts = df.apply(generate_prompt, axis=1)
  targets = df["subject_line"]
  return pd.DataFrame({"input": prompts, "output": targets})

In [15]:
# Seed the global RNG that generate_prompt draws from, so each row gets the
# same instruction on every Restart & Run All.
PROMPT_SEED = 42
random.seed(PROMPT_SEED)

final_train_df = format_with_instruction(train_df)
final_val_df = format_with_instruction(val_df)
final_test_df = format_with_instruction(test_df)

In [16]:
# Show one formatted training example (row label 3000) as a sanity check.
sample_idx = 3000
sample = final_train_df.loc[sample_idx]
print(f"Sample Input:\n=========\n{sample['input']}")
print(f"Sample Output:\n=========\n{sample['output']}")

Sample Input:
Suggest a subject line for the email below:

Houston Attendees:  Please convene in Conference Room 791 for the meeting to discuss ETS Risk Management Activities.
This videoconference is scheduled for Monday, October 1, 1:00 - 3:00 p.m.
If you have not yet responded to the meeting invitation (sent on September 19), please do so at your earliest convenience.
Please let me know if you have any questions or require additional information.
Thanks, everyone.

Sample Output:
Meeting Location in Houston


## Save the final data

In [17]:
# Mount Google Drive at /content/drive so the processed files can be saved there.
# NOTE(review): google.colab is presumably only importable inside Colab, which
# would explain why this import lives here rather than in the top import cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# Output folder for the processed CSV files, under the /content/drive mount point.
save_path = "/content/drive/MyDrive/email_subjectline/processed_data"

# Create the folder (and any missing parents); exist_ok=True makes re-running
# this cell harmless when it already exists.
os.makedirs(save_path, exist_ok=True)

In [19]:
# Save the dataframes to CSV files in Google Drive

# One CSV per split, all named from the same pattern. The test file used to be
# written with the misspelled prefix "emai_"; any reader still expecting the
# old name must be pointed at "email_subjectline_test.csv".
for split_name, split_df in (("train", final_train_df), ("val", final_val_df), ("test", final_test_df)):
  csv_path = os.path.join(save_path, f"email_subjectline_{split_name}.csv")
  split_df.to_csv(csv_path, index=False)
