In [0]:
%pip install python-docx
%restart_python

In [0]:
from docx import Document
from docx.shared import Pt, Inches
from datetime import datetime, timedelta
import random
import os
import tempfile
import shutil

# Pools of made-up values that the rest of this script samples from at random.
first_names = [
    "Emma",
    "Liam",
    "Olivia",
    "Noah",
    "Ava",
    "Ethan",
    "Sophia",
    "Mason",
    "Isabella",
    "William",
]
last_names = [
    "Johnson",
    "Smith",
    "Williams",
    "Brown",
    "Jones",
    "Garcia",
    "Miller",
    "Davis",
    "Rodriguez",
    "Martinez",
]
# Domain part used when building sender/recipient email addresses.
domains = [
    "techcorp.com",
    "globalinc.net",
    "innovate.io",
    "solutions.biz",
    "enterprise.org",
]
departments = [
    "Finance",
    "HR",
    "Operations",
    "IT",
    "Sales",
    "Marketing",
    "Legal",
    "Procurement",
]

def generate_ssn():
    """Return a made-up SSN-shaped string of the form NNN-NN-NNNN.

    The three parts are drawn independently, in order: a 3-digit number
    (100-999), a 2-digit number (10-99) and a 4-digit number (1000-9999).
    """
    area = random.randint(100, 999)
    group = random.randint(10, 99)
    serial = random.randint(1000, 9999)
    return "-".join(map(str, (area, group, serial)))

def generate_phone():
    """Return a made-up phone number formatted as +1-NNN-NNN-NNNN.

    The two 3-digit parts are each drawn from 200-999 and the final part
    from 1000-9999, in that order.
    """
    area_code = random.randint(200, 999)
    exchange = random.randint(200, 999)
    line_number = random.randint(1000, 9999)
    return "+1-{}-{}-{}".format(area_code, exchange, line_number)

def generate_credit_card():
    """Return a made-up 16-digit card number as four space-separated groups.

    The first group is drawn from 4000-4999 (so the number starts with 4);
    the remaining three groups are each drawn from 1000-9999.
    """
    groups = [random.randint(4000, 4999)]
    groups.extend(random.randint(1000, 9999) for _ in range(3))
    return " ".join(str(group) for group in groups)

def generate_address():
    """Return a made-up US-style address: "<number> <street>, <city, ST> <zip>".

    The house number (100-9999), street, city/state pair and 5-digit zip
    (10000-99999) are each picked at random, in that order.
    """
    street_options = ["Oak St", "Maple Ave", "Pine Rd", "Cedar Ln", "Elm Dr", "Birch Way"]
    city_options = ["New York, NY", "Los Angeles, CA", "Chicago, IL", "Houston, TX", "Phoenix, AZ"]

    house_number = random.randint(100, 9999)
    street = random.choice(street_options)
    city = random.choice(city_options)
    zip_code = random.randint(10000, 99999)
    return f"{house_number} {street}, {city} {zip_code}"

# Subject lines, indexed by ``doc_num - 1``; each has a matching body scenario
# in ``_email_body_lines``.
_EMAIL_SUBJECTS = (
    "Employee Information Update Required",
    "Expense Reimbursement Request",
    "New Hire Documentation",
    "Benefits Enrollment Confirmation",
    "Travel Authorization Form",
    "Direct Deposit Update",
    "Background Check Results",
    "Performance Review Schedule",
    "Emergency Contact Information",
    "Payroll Correction Notice",
)


def _date_from_today(days):
    """Return today's date shifted by *days* (negative = past) as MM/DD/YYYY."""
    return (datetime.now() + timedelta(days=days)).strftime('%m/%d/%Y')


def _email_body_lines(doc_num, to_name, to_email):
    """Return the scenario-specific body text chunks for email *doc_num*.

    Each returned string becomes one run in the body paragraph. Random values
    are drawn in the order the chunks appear. Any ``doc_num`` other than 1-9
    falls through to the payroll-correction scenario.
    """
    first_name = to_name.split()[0]

    if doc_num == 1:
        return [
            f"Hi {first_name},\n\n",
            "We need to update your employee records. Please verify the following information:\n\n",
            f"Full Name: {to_name}\n",
            f"SSN: {generate_ssn()}\n",
            f"Date of Birth: {_date_from_today(-random.randint(8000, 18000))}\n",
            f"Phone: {generate_phone()}\n",
            f"Address: {generate_address()}\n\n",
            "Please confirm by end of week.\n\n",
        ]

    if doc_num == 2:
        return [
            f"Dear {first_name},\n\n",
            "Your expense reimbursement has been processed:\n\n",
            f"Employee ID: EMP-{random.randint(10000, 99999)}\n",
            f"Amount: ${random.randint(100, 5000)}.{random.randint(10, 99)}\n",
            "Payment Method: Direct Deposit\n",
            f"Bank Account (last 4): {random.randint(1000, 9999)}\n",
            f"Credit Card Used: {generate_credit_card()}\n\n",
            "Funds will be deposited within 3-5 business days.\n\n",
        ]

    if doc_num == 3:
        return [
            f"Hello {first_name},\n\n",
            "Welcome to the team! Here's your new hire information:\n\n",
            f"Full Name: {to_name}\n",
            f"SSN: {generate_ssn()}\n",
            f"Email: {to_email}\n",
            f"Phone: {generate_phone()}\n",
            f"Department: {random.choice(departments)}\n",
            f"Start Date: {_date_from_today(random.randint(1, 30))}\n",
            f"Salary: ${random.randint(50, 150)}K annually\n\n",
        ]

    if doc_num == 4:
        # Dependents share the employee's surname. (Previously the surname was
        # looked up by document number, which always produced "Brown".)
        surname = to_name.split()[-1]
        return [
            f"Hi {first_name},\n\n",
            "Your benefits enrollment is confirmed:\n\n",
            f"Employee: {to_name}\n",
            f"Member ID: {random.randint(100000000, 999999999)}\n",
            f"Date of Birth: {_date_from_today(-random.randint(8000, 18000))}\n",
            "Dependents:\n",
            f"  - {random.choice(first_names)} {surname} (Spouse) - DOB: {_date_from_today(-random.randint(8000, 18000))}\n",
            f"  - {random.choice(first_names)} {surname} (Child) - DOB: {_date_from_today(-random.randint(2000, 5000))}\n\n",
        ]

    if doc_num == 5:
        return [
            f"Dear {first_name},\n\n",
            "Your travel authorization has been approved:\n\n",
            f"Traveler: {to_name}\n",
            f"Passport Number: {random.randint(100000000, 999999999)}\n",
            f"Driver's License: {random.choice(['CA', 'NY', 'TX', 'FL'])}{random.randint(10000000, 99999999)}\n",
            f"Phone: {generate_phone()}\n",
            f"Emergency Contact: {random.choice(first_names)} {random.choice(last_names)} - {generate_phone()}\n\n",
        ]

    if doc_num == 6:
        return [
            f"Hello {first_name},\n\n",
            "Please update your direct deposit information:\n\n",
            f"Employee: {to_name}\n",
            f"SSN: {generate_ssn()}\n",
            f"Bank Name: {random.choice(['Chase Bank', 'Bank of America', 'Wells Fargo', 'Citibank'])}\n",
            f"Routing Number: {random.randint(100000000, 999999999)}\n",
            f"Account Number: {random.randint(1000000000, 9999999999)}\n\n",
        ]

    if doc_num == 7:
        return [
            f"Confidential - {first_name},\n\n",
            "Background check completed:\n\n",
            f"Candidate: {to_name}\n",
            f"SSN: {generate_ssn()}\n",
            f"DOB: {_date_from_today(-random.randint(8000, 18000))}\n",
            f"Address: {generate_address()}\n",
            "Criminal Record: None\n",
            f"Credit Score: {random.randint(650, 800)}\n\n",
        ]

    if doc_num == 8:
        return [
            f"Hi {first_name},\n\n",
            "Your performance review is scheduled:\n\n",
            f"Employee: {to_name}\n",
            f"Employee ID: EMP-{random.randint(10000, 99999)}\n",
            f"Email: {to_email}\n",
            f"Phone: {generate_phone()}\n",
            f"Manager: {random.choice(first_names)} {random.choice(last_names)}\n",
            f"Review Date: {_date_from_today(random.randint(1, 14))}\n\n",
        ]

    if doc_num == 9:
        return [
            f"Dear {first_name},\n\n",
            "Please verify your emergency contact information:\n\n",
            f"Employee: {to_name}\n",
            f"Primary Contact: {random.choice(first_names)} {random.choice(last_names)}\n",
            f"Relationship: {random.choice(['Spouse', 'Parent', 'Sibling', 'Partner'])}\n",
            f"Phone: {generate_phone()}\n",
            f"Secondary Contact: {random.choice(first_names)} {random.choice(last_names)}\n",
            f"Phone: {generate_phone()}\n",
            f"Medical Conditions: {random.choice(['None', 'Diabetes', 'Asthma', 'Allergies (Penicillin)'])}\n\n",
        ]

    # doc_num == 10 (and the fallback for anything else): payroll correction.
    return [
        f"Hi {first_name},\n\n",
        "Payroll correction processed:\n\n",
        f"Employee: {to_name}\n",
        f"SSN: {generate_ssn()}\n",
        f"Pay Period: {_date_from_today(-14)} - {_date_from_today(0)}\n",
        f"Gross Pay: ${random.randint(2000, 8000)}.{random.randint(10, 99)}\n",
        f"Net Pay: ${random.randint(1500, 6000)}.{random.randint(10, 99)}\n",
        f"Direct Deposit Account: ****{random.randint(1000, 9999)}\n\n",
    ]


def create_email_document(doc_num, volume_path='/Volumes/shm/default/raw_docx/'):
    """Create a Word document containing one fictitious, PII-laden email.

    Sender, recipient, date and all personal details are randomly generated.
    ``doc_num`` selects both the subject line and the matching body scenario.
    The document is first saved into a local temporary directory and then
    copied into ``volume_path``; the temporary directory is removed even if
    saving or copying fails.

    Args:
        doc_num: 1-based scenario number; 1-10 map to the ten subject lines.
        volume_path: Directory the finished ``.docx`` is copied into.
            Defaults to the Volume path this script was written for.

    Returns:
        The file name (not the full path) of the created document.

    Raises:
        IndexError: If ``doc_num - 1`` is not a valid index into the subject
            list.

    Errors raised while saving or copying the file are not caught here and
    propagate to the caller.
    """
    doc = Document()

    # Email metadata
    from_name = f"{random.choice(first_names)} {random.choice(last_names)}"
    to_name = f"{random.choice(first_names)} {random.choice(last_names)}"
    from_email = f"{from_name.lower().replace(' ', '.')}@{random.choice(domains)}"
    to_email = f"{to_name.lower().replace(' ', '.')}@{random.choice(domains)}"

    # Sent some time in the past year.
    date = datetime.now() - timedelta(days=random.randint(1, 365))

    subject = _EMAIL_SUBJECTS[doc_num - 1]

    # Email header
    doc.add_heading(f'Email #{doc_num}', 0)

    header = doc.add_paragraph()
    header.add_run('From: ').bold = True
    header.add_run(f'{from_name} <{from_email}>\n')
    header.add_run('To: ').bold = True
    header.add_run(f'{to_name} <{to_email}>\n')
    header.add_run('Date: ').bold = True
    header.add_run(f'{date.strftime("%B %d, %Y at %I:%M %p")}\n')
    header.add_run('Subject: ').bold = True
    header.add_run(f'{subject}\n\n')

    # Visual separator between header and body.
    doc.add_paragraph('─' * 80)

    # Email body with PII, one run per text chunk.
    body = doc.add_paragraph()
    for chunk in _email_body_lines(doc_num, to_name, to_email):
        body.add_run(chunk)

    # Signature block
    body.add_run(f"Best regards,\n{from_name}\n")
    body.add_run(f"{random.choice(departments)} Department\n")
    body.add_run(f"{from_email}\n")
    body.add_run(f"{generate_phone()}")

    # File name: zero-padded number plus the first 30 chars of the
    # lower-cased, underscore-separated subject.
    filename = f"email_{doc_num:02d}_{subject.lower().replace(' ', '_')[:30]}.docx"
    destination_path = os.path.join(volume_path, filename)

    # Save locally first, then copy to the Volume with shutil (works on
    # serverless). The context manager guarantees the temporary copy is
    # cleaned up on both success and failure.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, filename)
        doc.save(temp_path)
        shutil.copy(temp_path, destination_path)

    print(f"Created: {filename}")
    return filename

# Build documents 1 through 10, collecting the file name each call returns.
print("Generating 10 fictitious email documents with PII data...\n")
created_files = [create_email_document(doc_index) for doc_index in range(1, 11)]

# Summary: total count followed by one bullet line per created file.
print(f"\n✓ Successfully created {len(created_files)} documents!")
print("\nFiles created:")
print("\n".join(f"  - {name}" for name in created_files))