# Process Enron Email Data to Select 300 Users with Balanced Sent and Received Emails for Virtual Donkey Project
You don't need to run this notebook; the processed data is already available in the Datasets folder.

In [4]:


import pandas as pd
import os

# Read the raw Enron export and keep only the usable messages
data = (
    pd.read_csv('')  # Path to Enron data
    # The existing 'person' column is renamed to 'person_ent'
    .rename(columns={'person': 'person_ent'})
    # Drop rows whose 'from' and 'to' fields are identical
    .loc[lambda frame: frame['from'] != frame['to']]
    # Drop rows whose subject is shorter than 10 characters
    # (a missing subject fails the comparison, so those rows are dropped too)
    .loc[lambda frame: frame['Subject'].str.len() >= 10]
)

# Count the number of emails sent by each person
sent_counts = data['from'].value_counts()

# Count the number of emails per exact 'to' value.
# NOTE(review): the captured output shows 'to' values holding several
# comma-separated addresses; such rows are counted under the combined string,
# not under each individual recipient - confirm this is intended.
received_counts = data['to'].value_counts()

# Keep the people who have sent AND received more than 50 emails.
# sent_counts and received_counts are indexed by different address sets, so the
# mask is built explicitly on sent_counts' own index instead of relying on
# implicit alignment of two differently-indexed boolean Series. The resulting
# order (that of sent_counts) is unchanged, so the seeded sample below still
# picks the same people.
frequent_recipients = received_counts.index[received_counts > 50]
eligible_mask = (sent_counts > 50) & sent_counts.index.isin(frequent_recipients)
eligible_people = sent_counts[eligible_mask].index

print(f"Number of eligible people: {len(eligible_people)}")


# Randomly select 300 people from this filtered list (fixed seed for reproducibility).
# sample() raises ValueError if fewer than 300 people are eligible.
selected_people = pd.Series(eligible_people).sample(n=300, random_state=42)

# One DataFrame per selected person, keyed by e-mail address
# (insertion order follows selected_people)
person_dfs = {}

for person in selected_people:
    # 50 randomly drawn e-mails written by this person
    outgoing = (
        data.loc[data['from'] == person]
        .sample(n=50, random_state=42)
        .assign(direction='sent')
    )

    # 50 randomly drawn e-mails whose 'to' field is exactly this person
    incoming = (
        data.loc[data['to'] == person]
        .sample(n=50, random_state=42)
        .assign(direction='received')
    )

    # Sent rows first, then received rows, each tagged with the owning person
    person_dfs[person] = pd.concat([outgoing, incoming]).assign(person=person)

# The same per-person frames as a list, in selection order
reshaped_data = list(person_dfs.values())

# Stack every selected person's frame into one table, ordered by person and
# then direction so it is easy to read
reshaped_df = pd.concat(reshaped_data).sort_values(by=['person', 'direction'])

# NOTE: a bare `reshaped_df.head(100)` used to sit here; in the middle of a cell
# it displays nothing, so it was removed. Inspect the frame in its own cell.

combined_csv = '../Datasets/300_Enron_Users_For_VirtualDonkey/Filtered50People.csv'
output_dir = '../Datasets/300_Enron_Users_For_VirtualDonkey/individual_emails'

# Create the target directories BEFORE writing anything: to_csv does not create
# missing parent directories, so the combined export would otherwise fail on a
# fresh checkout.
os.makedirs(os.path.dirname(combined_csv), exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# Save the reshaped DataFrame to a CSV file
reshaped_df.to_csv(combined_csv, index=False)

# Save individual DataFrames to separate CSV files with sequential naming
# (Person1_emails.csv, Person2_emails.csv, ... in person_dfs insertion order)
for counter, (person, df) in enumerate(person_dfs.items(), start=1):
    filename = f'Person{counter}_emails.csv'
    df.to_csv(os.path.join(output_dir, filename), index=False)

# Display the first individual DataFrame for verification
if person_dfs:
    person, df = next(iter(person_dfs.items()))
    print(f"Data for {person}:")
    print(df.head())  # Show the first 5 rows for example
    print("-" * 50)


Number of eligible people: 517
Data for eric.boyt@enron.com:
                                                  payload  \
508468  See below. No e-dash necessary..... I have alr...   
342933  Scott, Thanks again for taking the time to tal...   
142886  I just heard back from credit.... No e-dash is...   
497942  -----Original Message----- From: Tanya Rohauer...   
304942  Here is the latest on this issue: On Monday, A...   

                       from  \
508468  eric.boyt@enron.com   
342933  eric.boyt@enron.com   
142886  eric.boyt@enron.com   
497942  eric.boyt@enron.com   
304942  eric.boyt@enron.com   

                                                       to  \
508468                               troy.black@enron.com   
342933                               scott.neal@enron.com   
142886        dave.fuller@enron.com, jeffrey.oh@enron.com   
497942                           charles.weldon@enron.com   
304942  billy.lemmons@enron.com, james.ducote@enron.co...   

                  

In [13]:
# Show the combined frame; the notebook display truncates it to the first and last rows
reshaped_df

Unnamed: 0,payload,from,to,entity_list,entity_count,Subject,person_ent,phone number,address,passport number,...,credit card brand,iban,credit card expiration date,medical condition,medication,blood type,health insurance number,cvv,direction,person
453333,"Jim, would it be logical to go back to March 2...",c..williams@enron.com,d..steffes@enron.com,"'Jim' : 'person', 'Steffes, James D.' : 'perso...",3,RE: Discussion of California Background & Issues,1,0,0,0,...,0,0,0,0,0,0,0,0,received,d..steffes@enron.com
454557,You're receiving this email as a ConsumerInfo....,tracy@consumerinfo.com,d..steffes@enron.com,"'You' : 'person', 'email' : 'email', 'Responsy...",3,"Friend, Get your complete picture.",1,0,0,0,...,1,0,0,0,0,0,0,0,received,d..steffes@enron.com
454380,"Jim, I called Elizabeth Sager on this and am w...",ray.alvarez@enron.com,d..steffes@enron.com,"'Jim' : 'person', 'Elizabeth Sager' : 'person'...",3,On the lookout for Sierra Pacific,1,0,0,0,...,0,0,0,0,0,0,0,0,received,d..steffes@enron.com
397951,Can you guys handle this in my absence -----Or...,b..sanders@enron.com,d..steffes@enron.com,"'Seabron Adamson' : 'person', 'seabron.adamson...",7,FW: Portland General PRIVILEGED AND CONFIDENTIAL,1,0,0,0,...,0,0,0,0,0,0,0,0,received,d..steffes@enron.com
456517,This is from Pat Keene - SWEPCO case. Rubena -...,patrick.keene@enron.com,d..steffes@enron.com,"'Pat Keene' : 'person', 'Chris Reeder' : 'pers...",26,FW: Statement of Position,1,0,0,0,...,0,0,0,0,0,0,0,0,received,d..steffes@enron.com
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370456,fyi... -----Original Message----- From: Custom...,victor.lamadrid@enron.com,"lia.halstead@enron.com, l..kelly@enron.com, ro...",'Margaret E. Kleiner' : 'person',1,FW: Transit on the Web Up Grade,1,0,0,0,...,0,0,0,0,0,0,0,0,sent,victor.lamadrid@enron.com
36948,fyi -----Original Message----- From: navigator...,victor.lamadrid@enron.com,"chuck.ames@enron.com, f..brawner@enron.com, ch...","'navigator@nisource.com' : 'email', 'navigator...",2,FW: LINE SM-123,0,0,0,0,...,0,0,0,0,0,0,0,0,sent,victor.lamadrid@enron.com
489788,"All, A fix to prevent the occurrence of deals ...",victor.lamadrid@enron.com,"robert.allwein@enron.com, airam.arteaga@enron....","'Truong, Dat' : 'person', 'Wei, Zhiyong' : 'pe...",22,FW: Request for Migration of Sitara EOLBridge ...,1,0,0,0,...,0,0,0,0,0,0,0,0,sent,victor.lamadrid@enron.com
342043,-----Original Message----- From: navigator@nis...,victor.lamadrid@enron.com,"natalie.baker@enron.com, robert.allwein@enron....","'navigator@nisource.com' : 'email', 'navigator...",2,FW: CGT Retainage Reduction Effective Feruary ...,0,0,0,0,...,0,0,0,0,0,0,0,0,sent,victor.lamadrid@enron.com
