In [44]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
import os
import sys

# Resolve the parent of the kernel's starting working directory exactly once per
# kernel session. Without this guard, re-running the cell after the chdir below
# resolves ".." against the NEW working directory and climbs one more level on
# every execution (and inserts yet another entry into sys.path).
if "project_root" not in globals():
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    if project_root not in sys.path:
        # Make project-local modules importable and relative paths resolve from the root.
        sys.path.insert(0, project_root)
        os.chdir(project_root)

In [46]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): the saved output below is False, which python-dotenv returns when
# no variable was set — confirm the .env file is reachable from the working directory.
load_dotenv()

False

In [47]:
import torch

# Report whether PyTorch can see a CUDA device and, if it can, describe it.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Each entry pairs the printed label with the torch.cuda query that produces it.
    for _label, _query in (
        ("CUDA device count", torch.cuda.device_count),
        ("Current CUDA device", torch.cuda.current_device),
        ("CUDA device name", torch.cuda.get_device_name),
    ):
        print(f"{_label}: {_query()}")

CUDA available: True
CUDA device count: 1
Current CUDA device: 0
CUDA device name: NVIDIA GeForce RTX 3060 Laptop GPU


In [48]:
import random
from datetime import datetime, timedelta

# Product names sampled by the tech-support, shipping and sales seed generators.
PRODUCTS = [
    "EchoWave One",
    "EchoWave Mini",
    "SoundBar Flex",
    "BassHub Pro",
    "AuraPods",
    "StreamLink Hub",
    "PulseDock",
    "WavePanel Touch",
    "EchoWave One Plus",
    "AuraPods Gen 2",
    "BassHub Pro Limited",
]

# Subscription plan names sampled by billing_seed().
SUBSCRIPTIONS = [
    "Echo+ Unlimited",
    "SmartSound Premium",
    "HomeAudio Pro",
]

# Countries sampled as the shipping destination (shipping_seed) and as the
# requester's country (legal_seed).
COUNTRIES = ["Germany", "France", "USA", "UK", "Netherlands", "Sweden"]

# Company names sampled by sales_seed() as the inquiring business.
COMPANIES = [
    "AudioCore Distributors",
    "MegaTech Retail",
    "GlobalElectro",
    "Elite Sound Partners",
    "NextWav Studios",
]

# Writing tones; generate_email_async() picks one uniformly at random per email
# and passes it to the prompt as `tone_style`.
TONE_STYLES = [
    "Angry and all caps",
    "Overly polite and long-winded",
    "Rushed and fragmented",
    "Confused with multiple questions",
    "Sarcastic",
    "Passive-aggressive",
    "Short and rude",
    "Polite and unsure",
    "Demanding and direct",
    "Overly casual",
    "Helpful and satisfied",
    "Excited and grateful",
    "Neutral and professional",
    "Concerned but polite",
    "Frustrated but calm",
]

# How much concrete detail an email should contain. For each email one entry is
# drawn with random.choices() using "weight" as the relative probability;
# "name" is sent to the prompt as `vagueness_level` and "description" as
# `vagueness_instruction`.
VAGUENESS_LEVELS = [
    {
        "name": "clear",
        "description": "Include product name, order number, dates, and detailed context. Both subject and body are present and specific.",
        "weight": 0.3,
    },
    {
        "name": "medium",
        "description": "Include some key info, but leave out details (e.g. no order number). Subject/body may be vague or phrased as a question.",
        "weight": 0.5,
    },
    {
        "name": "vague",
        "description": "Message is emotional or unclear. Subject or body may be missing. Could be as short as 'It’s broken' or 'Need help'.",
        "weight": 0.2,
    },
]


def random_order_date(max_days=365, bias_power=2):
    """Return a random past date formatted as "<Month name> <day>" (no year).

    Args:
        max_days: Upper bound on how many days back the date may lie.
        bias_power: Exponent applied to the uniform draw. The offset is
            ``int((1 - r**bias_power) * max_days)``, so values above 1 skew the
            result toward larger offsets (older dates).

    Returns:
        The date as produced by ``strftime("%B %d")``.
    """
    draw = random.random()
    offset = timedelta(days=int((1 - draw**bias_power) * max_days))
    placed_on = datetime.today() - offset
    return placed_on.strftime("%B %d")


def tech_support_seed():
    """Build a random scenario for a Tech Support email.

    A single uniform draw decides the shape of the returned dict:
      * below 0.3      -> a general question only: {"issue_type"}
      * 0.3 up to 0.7  -> a malfunction on a named product: {"product", "issue_type"}
      * 0.7 and above  -> the same plus "firmware_version", a random "X.Y.Z"
                          string with X in 2..5 and Y, Z in 0..9

    Returns:
        dict with the keys described above.
    """
    general_questions = [
        "where’s the manual?",
        "how long does the battery last?",
        "is it waterproof?",
        "what does the app need?",
        "does it support multipoint?",
        "can it connect to older devices?",
        "what model should I buy?",
        "what features does it have?",
        "will this work with my setup?",
        "need product advice for a specific case",
        "is it compatible with my system?",
    ]
    malfunctions = [
        "won't connect",
        "keeps restarting",
        "firmware failed",
        "no sound",
        "blinking lights",
        "overheating",
        "charging issue",
        "won't turn on",
        "Bluetooth not pairing",
        "disconnects randomly",
        "voice assistant doesn’t respond",
        "buttons don’t work",
        "mic not working",
        "buzzing or static noise",
        "volume too low",
        "app can’t find device",
        "touch panel doesn’t respond",
        "firmware update broke it",
    ]

    roll = random.random()
    if roll < 0.3:
        # Question-only scenario: no product is attached.
        return {"issue_type": random.choice(general_questions)}

    scenario = {
        "product": random.choice(PRODUCTS),
        "issue_type": random.choice(malfunctions),
    }
    if roll >= 0.7:
        major = random.randint(2, 5)
        minor = random.randint(0, 9)
        patch = random.randint(0, 9)
        scenario["firmware_version"] = f"{major}.{minor}.{patch}"
    return scenario


def billing_seed():
    """Build a random scenario for a Billing email.

    Returns:
        dict with keys "subscription", "charge_amount", "charge_date"
        (a "<Month name> <day>" string 1 to 25 days before today), "problem"
        and "billing_method".
    """
    amounts = ["$14.99", "$29.99", "$7.99"]
    problems = [
        "unexpected charge",
        "charged after cancel",
        "double billing",
        "invoice mismatch",
    ]
    methods = ["credit card", "paypal", "bank account"]

    # Draws happen in the same order as the keys of the returned dict.
    subscription = random.choice(SUBSCRIPTIONS)
    amount = random.choice(amounts)
    charged_on = datetime.today() - timedelta(days=random.randint(1, 25))
    problem = random.choice(problems)
    method = random.choice(methods)

    return {
        "subscription": subscription,
        "charge_amount": amount,
        "charge_date": charged_on.strftime("%B %d"),
        "problem": problem,
        "billing_method": method,
    }


def shipping_seed():
    """Build a random scenario for a Shipping email.

    Returns:
        dict with keys "product", "order_number" (int in 10000..99999),
        "order_date" (from random_order_date()), "destination_country",
        "issue" and "extra_note".
    """
    issues = [
        "not delivered",
        "wrong address",
        "no tracking info",
        "stuck in transit",
        "return label missing",
        "return instructions unclear",
    ]
    notes = [
        "needs fast resolution",
        "already emailed once",
        "item needed for event",
        "wants refund after return",
    ]

    # Draws happen in the same order as the keys of the returned dict.
    product = random.choice(PRODUCTS)
    order_number = random.randint(10000, 99999)
    order_date = random_order_date()
    country = random.choice(COUNTRIES)
    issue = random.choice(issues)
    note = random.choice(notes)

    return {
        "product": product,
        "order_number": order_number,
        "order_date": order_date,
        "destination_country": country,
        "issue": issue,
        "extra_note": note,
    }


def legal_seed():
    """Build a random scenario for a Legal email.

    Returns:
        dict with keys "country", "request_type" and "urgency".
    """
    request_types = [
        "GDPR data deletion",
        "account removal",
        "privacy concern",
        "policy dispute",
    ]
    urgency_levels = [
        "casual inquiry",
        "demand immediate action",
        "considering legal steps",
    ]

    # Draws happen in the same order as the keys of the returned dict.
    country = random.choice(COUNTRIES)
    request_type = random.choice(request_types)
    urgency = random.choice(urgency_levels)

    return {
        "country": country,
        "request_type": request_type,
        "urgency": urgency,
    }


def sales_seed():
    """Build a random scenario for a Sales (B2B) email.

    Returns:
        dict with keys "company", "product", "quantity", "interest_type",
        "deadline" (may be None) and "pre_sale_question".
    """
    quantities = [10, 25, 50, 100, 250]
    interest_types = [
        "reseller deal",
        "bulk internal use",
        "corporate gifting",
        "influencer campaign",
    ]
    deadlines = ["2 weeks", "1 month", "urgent, this week", None]
    questions = [
        "asking about pricing",
        "asking about stock availability",
        "asking about shipping time",
        "asking about custom branding",
        "asking about lead time",
        "asking what product is best for their use case",
        "asking what’s available for their budget",
        "asking for general recommendations",
        "asking if they can test a sample unit",
    ]

    # Draws happen in the same order as the keys of the returned dict.
    company = random.choice(COMPANIES)
    product = random.choice(PRODUCTS)
    quantity = random.choice(quantities)
    interest = random.choice(interest_types)
    deadline = random.choice(deadlines)
    question = random.choice(questions)

    return {
        "company": company,
        "product": product,
        "quantity": quantity,
        "interest_type": interest,
        "deadline": deadline,
        "pre_sale_question": question,
    }


# Routing targets for the synthetic emails. Each entry provides:
#   "name"                - label; also the key looked up in the per-department
#                           count mapping by generate_all_emails_async()
#   "description"         - forwarded to the chain as `department_description`
#   "seed_info_generator" - zero-argument callable returning the scenario dict
departments = [
    {
        "name": "Tech Support",
        "description": "Handles product problems and questions: connectivity, audio issues, firmware bugs, app/device failures, or general usage questions.",
        "seed_info_generator": tech_support_seed,
    },
    {
        "name": "Billing",
        "description": "Handles charges, refunds, subscription problems, billing method issues, and cancellations.",
        "seed_info_generator": billing_seed,
    },
    {
        "name": "Shipping",
        "description": "Handles delivery delays, missing packages, incorrect shipments, returns, and tracking problems.",
        "seed_info_generator": shipping_seed,
    },
    {
        "name": "Legal",
        "description": "Handles GDPR, account deletion, privacy concerns, policy disputes, and legal threats.",
        "seed_info_generator": legal_seed,
    },
    {
        "name": "Sales",
        "description": "Handles bulk orders, reseller and B2B inquiries, influencer campaigns, and corporate gifting.",
        "seed_info_generator": sales_seed,
    },
]


In [49]:
import asyncio
import json

from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_deepseek import ChatDeepSeek
from tqdm.notebook import tqdm

# Chat model that writes the emails. temperature=1 is presumably chosen to
# diversify the generated text — TODO confirm.
llm = ChatDeepSeek(model="deepseek-chat", temperature=1)

# System message: fixes the persona, the JSON-only output contract and the
# per-level vagueness rules. The doubled braces {{ }} are escapes so the JSON
# example is emitted literally instead of being read as a template variable.
system_prompt = PromptTemplate.from_template("""
You are simulating a real customer writing a support email to a consumer audio tech company.

Instructions:
- ONLY return valid JSON: {{"subject": "...", "body": "...", "department": "..."}}
- Do NOT explain anything or write outside the JSON block.

Vagueness rules:
- If vagueness_level = "clear":
    - Include product name, order number, dates, and details.
    - Both subject and body should be present and specific.
- If vagueness_level = "medium":
    - Include some key info, but leave out details (e.g. no order number).
    - Subject/body may be vague, short, or phrased as a question.
- If vagueness_level = "vague":
    - Message should be emotional, minimal, or unclear.
    - Subject OR body may be completely empty.
    - OK to include messages like: "Need help." / "It’s broken." / "WTF is this charge???"
    - Avoid any polished structure.

Be realistic: messy grammar, emotional tone, or random format is fine.
""")

# Per-email human message. Every placeholder here is supplied by
# generate_email_async(). `department_description` is included because the caller
# already passes it; without a placeholder the value was silently dropped and
# the model never saw what the target department actually handles.
human_prompt = PromptTemplate.from_template("""
Customer tone: {tone_style}
Vagueness level: {vagueness_level}
Vagueness instruction: {vagueness_instruction}

Customer situation/context:
{seed_info_json}

Department: {department}
Department description: {department_description}
""")

# Combine both templates into one chat prompt (system message, then human message).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt.template),
        ("human", human_prompt.template),
    ]
)


# Parses the model's reply as JSON.
parser = JsonOutputParser()

# Pipeline: fill the prompt -> call the model -> parse the reply.
chain = prompt | llm | parser


async def generate_email_async(dept_config):
    """Generate one synthetic support email for a department.

    Args:
        dept_config: Mapping with "name", "description" and
            "seed_info_generator" (a zero-argument callable returning a
            JSON-serializable scenario dict).

    Returns:
        The parsed output of `chain`. The system prompt asks for a dict with
        "subject", "body" and "department"; that shape is not validated here.
    """
    # Random draws stay in this order: scenario, tone, vagueness level.
    scenario = dept_config["seed_info_generator"]()
    tone_style = random.choice(TONE_STYLES)
    level_weights = [level["weight"] for level in VAGUENESS_LEVELS]
    (vagueness,) = random.choices(VAGUENESS_LEVELS, weights=level_weights)

    payload = {
        "department": dept_config["name"],
        "department_description": dept_config["description"],
        "tone_style": tone_style,
        "vagueness_level": vagueness["name"],
        "vagueness_instruction": vagueness["description"],
        "seed_info_json": json.dumps(scenario, indent=2),
    }
    return await chain.ainvoke(payload)


async def generate_all_emails_async(
    departments, distribution, emails_total, max_concurrency=None
):
    """Generate every requested email concurrently and return them in order.

    Args:
        departments: Iterable of department configs accepted by
            generate_email_async(); each must have a "name" key.
        distribution: Mapping of department name -> number of emails to
            generate. Departments missing from the mapping get 0 emails.
        emails_total: Currently unused; the number of emails generated is
            determined by `distribution` alone. Kept so existing calls keep working.
        max_concurrency: Optional cap on how many generations are awaited at the
            same time. None (the default) leaves concurrency unbounded.

    Returns:
        List of generated emails, grouped by department in the order of
        `departments`.

    Raises:
        ValueError: If `max_concurrency` is given and is smaller than 1.
        Any exception raised by an individual generation propagates unchanged.
    """
    # Validate before creating coroutines so a bad argument leaves none un-awaited.
    if max_concurrency is not None and max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")
    limiter = None if max_concurrency is None else asyncio.Semaphore(max_concurrency)

    pending = [
        generate_email_async(dept)
        for dept in departments
        for _ in range(distribution.get(dept["name"], 0))
    ]

    # The context manager closes the progress bar even when a generation fails.
    with tqdm(total=len(pending), desc="Generating emails", unit="email") as pbar:

        async def tracked(coro):
            if limiter is None:
                result = await coro
            else:
                async with limiter:
                    result = await coro
            pbar.update(1)
            return result

        # gather() returns results in the order the awaitables were passed in,
        # so no manual index bookkeeping is needed.
        return await asyncio.gather(*(tracked(coro) for coro in pending))


In [50]:
# Smoke test: generate a single Tech Support email (departments[0]) before the full run.
await generate_email_async(departments[0])

{'subject': 'EchoWave keeps dropping',
 'body': 'Disconnects all the time. Fix it!',
 'department': 'Tech Support'}

In [51]:
# Number of emails the generated dataset should contain.
TOTAL_EMAILS = 2000

# Relative share of the dataset per department; keys match departments[*]["name"].
DISTRIBUTION_WEIGHTS = {
    "Tech Support": 0.3,
    "Billing": 0.2,
    "Shipping": 0.25,
    "Legal": 0.1,
    "Sales": 0.15,
}


def allocate_email_counts(weights, total):
    """Split `total` into integer counts proportional to `weights`.

    Rounding each share independently does not guarantee that the counts add up
    to `total` (a total of 2 with the weights above would yield a single
    email). This uses the largest-remainder method instead: every key gets the
    floor of its exact share, and the leftover units go to the keys with the
    largest fractional parts (ties keep the order of `weights`).

    Args:
        weights: Mapping of key -> non-negative relative weight. Weights are
            normalized by their sum.
        total: Non-negative integer number of items to distribute.

    Returns:
        dict with the same keys, in the same order, whose values sum to `total`.
        An empty `weights` mapping yields an empty dict.

    Raises:
        ValueError: If `total` or any weight is negative, or all weights are zero.
    """
    from fractions import Fraction

    if total < 0:
        raise ValueError("total must be non-negative")
    if not weights:
        return {}

    # str() round-trips the decimal literal (0.3 -> "0.3"), so the arithmetic
    # below is exact and free of binary floating-point error.
    exact = {key: Fraction(str(weight)) for key, weight in weights.items()}
    if any(weight < 0 for weight in exact.values()):
        raise ValueError("weights must be non-negative")
    weight_sum = sum(exact.values())
    if weight_sum == 0:
        raise ValueError("at least one weight must be positive")

    shares = {key: weight * total / weight_sum for key, weight in exact.items()}
    counts = {key: int(share) for key, share in shares.items()}  # floor: shares are >= 0
    leftover = total - sum(counts.values())

    # sorted() is stable, so equal remainders keep their original order.
    by_remainder = sorted(shares, key=lambda key: shares[key] - counts[key], reverse=True)
    for key in by_remainder[:leftover]:
        counts[key] += 1
    return counts


# Emails to generate per department; always sums to TOTAL_EMAILS.
department_distribution = allocate_email_counts(DISTRIBUTION_WEIGHTS, TOTAL_EMAILS)


In [52]:
# Generate the whole dataset: one LLM call per requested email, awaited concurrently.
# NOTE(review): the saved progress output below shows a total of 100, which does not
# match TOTAL_EMAILS = 2000 — presumably stale output from an earlier run; confirm.
all_emails = await generate_all_emails_async(
    departments, department_distribution, TOTAL_EMAILS
)

Generating emails:   0%|          | 0/100 [00:00<?, ?email/s]

In [56]:
# Dump every generated email as plain text, separated by a 50-dash rule.
divider = "-" * 50
for i, email in enumerate(all_emails):
    print(f"Email {i + 1}:")
    for label, key in (
        ("Subject", "subject"),
        ("Body", "body"),
        ("Department", "department"),
    ):
        print(f"{label}: {email[key]}")
    print(divider)


Email 1:
Subject: Is the SoundBuds Pro truly waterproof? Need clarification ASAP.
Body: I recently purchased the SoundBuds Pro (Order #SB12345 on 05/15/2024) and the product description claims it's waterproof. However, I can't find any details about the IP rating or how long it can be submerged. I need a clear answer before I take it swimming. Don't give me vague marketing speak—I want technical specifications and warranty coverage for water damage.
Department: Tech Support
--------------------------------------------------
Email 2:
Subject: Could you kindly clarify the app requirements?
Body: Hello there, I hope this message finds you well. I was wondering if you might be able to assist me with understanding what exactly the app needs to function properly? I've been trying to set it up, but I'm not entirely sure about the necessary specifications or permissions it requires. Any guidance you could provide would be greatly appreciated. Thank you so much for your time and assistance!
Dep

In [54]:
import pandas as pd

# One row per generated email; the columns come from the keys of the returned dicts.
df = pd.DataFrame(all_emails)

df

Unnamed: 0,subject,body,department
0,Is the SoundBuds Pro truly waterproof? Need cl...,I recently purchased the SoundBuds Pro (Order ...,Tech Support
1,Could you kindly clarify the app requirements?,"Hello there, I hope this message finds you wel...",Tech Support
2,Help choosing,Which one should I get???,Tech Support
3,WavePanel Touch won't pair?,Having trouble with Bluetooth pairing. Firmwar...,Tech Support
4,"Yo, why's my SoundBar Flex blinking like crazy?","Hey, so my SoundBar Flex keeps blinking these ...",Tech Support
...,...,...,...
95,Inquiry Regarding Shipping Time for BassHub Pr...,"Dear MegaTech Retail Team, I hope this message...",Sales
96,Request for Sample Unit of WavePanel Touch for...,I am writing to inquire about the possibility ...,Sales
97,Custom branding??,Need info ASAP. EchoWave Mini.,Sales
98,Need help ASAP,"I need to know what to buy, and I don't have t...",Sales


In [55]:
from paths import DATA_DIR

# Persist the dataset. index=True also writes the row index as an unnamed first
# column, which pandas exposes as "Unnamed: 0" when the file is read back
# without index_col.
df.to_csv(DATA_DIR / "customer_support_emails_dataset.csv", index=True)