In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each
# cell execution so edits to project modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys

# Resolve the project root exactly once per kernel session.
# The root is derived from the *current* working directory, and this cell changes
# that directory, so recomputing it on a re-run would climb one level higher each
# time. Guarding on the already-defined name keeps the cell idempotent.
# NOTE(review): assumes the notebook is started from a folder directly below the
# project root - confirm.
if "project_root" not in globals():
    project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
    # Work from the project root so relative paths resolve from there.
    os.chdir(project_root)

# Make project-local modules importable (inserted first so they take precedence).
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [4]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): presumably supplies the API credentials used by the LLM client and
# the Hub upload further down - confirm.
load_dotenv()

True

In [53]:
import torch

# Report whether a CUDA device is visible to PyTorch, and describe it when one is.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # One print call, one line per fact; the output text matches three separate prints.
    print(
        f"CUDA device count: {torch.cuda.device_count()}",
        f"Current CUDA device: {torch.cuda.current_device()}",
        f"CUDA device name: {torch.cuda.get_device_name()}",
        sep="\n",
    )

CUDA available: True
CUDA device count: 1
Current CUDA device: 0
CUDA device name: NVIDIA GeForce RTX 3060 Laptop GPU


In [54]:
import random
from datetime import datetime, timedelta

# Product names sampled by the tech-support, shipping and sales seed generators.
PRODUCTS = [
    "Bose QuietComfort Ultra",
    "JBL Charge 5",
    "Sonos Move 2",
    "Apple AirPods Pro (2nd Gen)",
    "Samsung Galaxy Buds2 Pro",
    "Sennheiser Momentum 4",
    "Anker Soundcore Motion X600",
    "Marshall Emberton II",
    "Sony SRS-XG300",
]

# Paid add-on services sampled by the billing seed generator.
PAID_SERVICES = [
    "extended warranty",
    "device protection plan",
    "music streaming bundle",
    "setup assistance service",
    "premium support plan",
]

# Countries sampled as shipping destinations and as the origin of legal requests.
COUNTRIES = ["Germany", "France", "USA", "UK", "Netherlands", "Sweden"]

# Business customers sampled by the sales seed generator.
COMPANIES = [
    "TechDepot Europe",
    "SmartAudio Direct",
    "GadgetZone Retail",
    "HiFi Distribution Group",
    "Elite Audio Buyers",
]

# Writing tones; one is picked uniformly at random per generated email.
TONE_STYLES = [
    "Angry and all caps",
    "Overly polite and long-winded",
    "Rushed and fragmented",
    "Confused with multiple questions",
    "Sarcastic",
    "Passive-aggressive",
    "Short and rude",
    "Polite and unsure",
    "Demanding and direct",
    "Overly casual",
    "Helpful and satisfied",
    "Excited and grateful",
    "Neutral and professional",
    "Concerned but polite",
    "Frustrated but calm",
]

# How much concrete detail an email contains. "weight" is the relative sampling
# weight passed to random.choices; "description" is inserted into the prompt as the
# vagueness instruction.
VAGUENESS_LEVELS = [
    {
        "name": "clear",
        "description": "Include product name, order number, dates, and detailed context. Both subject and body are present and specific.",
        "weight": 0.3,
    },
    {
        "name": "medium",
        "description": "Include some key info, but leave out details (e.g. no order number). Subject/body may be vague or phrased as a question.",
        "weight": 0.5,
    },
    {
        "name": "vague",
        "description": "Message is emotional or unclear. Subject or body may be missing. Could be as short as 'It’s broken' or 'Need help'.",
        "weight": 0.2,
    },
]

# Spelling/grammar quality of an email. Weights are relative (1 : 2 : 1), so
# "Average" is sampled twice as often as either other level.
LANGUAGE_QUALITY_LEVELS = [
    {
        "name": "Good",
        "description": "Spelling and grammar mostly correct.",
        "weight": 1,
    },
    {
        "name": "Average",
        "description": "Contains some spelling or grammar mistakes.",
        "weight": 2,
    },
    {
        "name": "Poor",
        "description": "Frequent grammar or spelling issues, slang or typos.",
        "weight": 1,
    },
]

# Off-topic remarks; one is attached to a seed as "distraction_intro" so the
# generated email carries some irrelevant personal context.
DISTRACTION_TOPICS = [
    "I had a really stressful week and honestly didn't want to deal with this.",
    "My kid accidentally messed with the speaker settings, and I can't even explain what happened.",
    "I tried calling your support line but was on hold for over 40 minutes.",
    "I’ve been a loyal customer for years and never had this kind of issue before.",
    "I was chatting with your bot assistant, but it just looped the same answer.",
    "This may be my fault but I just need someone to walk me through it slowly.",
    "The issue started during a party and it really ruined the mood.",
    "I’m not super tech-savvy, so forgive me if this sounds dumb.",
]


def wrap_seed_with_distraction(seed_info):
    """Return a copy of ``seed_info`` carrying a random ``distraction_intro``.

    Args:
        seed_info: Mapping describing the customer's situation. It is not mutated.

    Returns:
        dict: Shallow copy of ``seed_info`` whose ``"distraction_intro"`` key holds
        one entry of ``DISTRACTION_TOPICS``; an existing value for that key is
        replaced.
    """
    enriched = dict(seed_info)
    enriched.update(distraction_intro=random.choice(DISTRACTION_TOPICS))
    return enriched


def random_order_date(max_days=365, bias_power=2):
    """Pick a random past date and format it as ``"<Month name> <day>"``.

    A uniform draw ``r`` in ``[0, 1)`` is mapped to
    ``int((1 - r**bias_power) * max_days)`` days before today, so for
    ``bias_power > 1`` offsets close to ``max_days`` are the most likely.

    Args:
        max_days: Upper bound (in days) of how far back the date may lie.
        bias_power: Exponent applied to the uniform draw; 1 gives a uniform offset.

    Returns:
        str: The date rendered with ``"%B %d"`` (no year).
    """
    draw = random.random()
    offset = timedelta(days=int((1 - draw**bias_power) * max_days))
    order_day = datetime.today() - offset
    return order_day.strftime("%B %d")


def tech_support_seed():
    """Build a random scenario for a Tech Support email.

    One uniform draw selects the shape of the seed:

    * below 0.3 - a general question with no product attached;
    * from 0.3 up to 0.7 - a product plus a malfunction;
    * 0.7 and above - product, malfunction and a firmware version ``X.Y.Z`` with
      ``X`` in 2..5 and ``Y``, ``Z`` in 0..9.

    Returns:
        dict: Always contains ``"issue_type"``; may also contain ``"product"`` and
        ``"firmware_version"``.
    """
    general_questions = [
        "where’s the manual?",
        "how long does the battery last?",
        "is it waterproof?",
        "what does the app need?",
        "does it support multipoint?",
        "can it connect to older devices?",
        "what model should I buy?",
        "what features does it have?",
        "will this work with my setup?",
        "need product advice for a specific case",
        "is it compatible with my system?",
    ]
    malfunctions = [
        "won't connect",
        "keeps restarting",
        "firmware failed",
        "no sound",
        "blinking lights",
        "overheating",
        "charging issue",
        "won't turn on",
        "Bluetooth not pairing",
        "disconnects randomly",
        "voice assistant doesn’t respond",
        "buttons don’t work",
        "mic not working",
        "buzzing or static noise",
        "volume too low",
        "app can’t find device",
        "touch panel doesn’t respond",
        "firmware update broke it",
    ]

    roll = random.random()
    if roll < 0.3:
        return {"issue_type": random.choice(general_questions)}

    # Both remaining shapes start with a product and one of its malfunctions.
    seed = {
        "product": random.choice(PRODUCTS),
        "issue_type": random.choice(malfunctions),
    }
    if roll >= 0.7:
        major = random.randint(2, 5)
        minor = random.randint(0, 9)
        patch = random.randint(0, 9)
        seed["firmware_version"] = f"{major}.{minor}.{patch}"
    return seed


def billing_seed():
    """Build a random scenario for a Billing email.

    Returns:
        dict: Keys ``"service"``, ``"charge_amount"``, ``"charge_date"`` (a day 1-25
        days before today, formatted ``"%B %d"``), ``"problem"`` and
        ``"payment_method"``.
    """
    amounts = ["$9.99", "$19.99", "$49.99"]
    problems = [
        "unauthorized charge",
        "charged after canceling",
        "duplicate payment",
        "wrong amount billed",
    ]
    methods = ["credit card", "PayPal", "Apple Pay"]

    seed = {}
    seed["service"] = random.choice(PAID_SERVICES)
    seed["charge_amount"] = random.choice(amounts)
    charged_on = datetime.today() - timedelta(days=random.randint(1, 25))
    seed["charge_date"] = charged_on.strftime("%B %d")
    seed["problem"] = random.choice(problems)
    seed["payment_method"] = random.choice(methods)
    return seed


def shipping_seed():
    """Build a random scenario for a Shipping email.

    Returns:
        dict: Keys ``"product"``, ``"order_number"`` (five-digit int),
        ``"order_date"`` (from ``random_order_date``), ``"destination_country"``,
        ``"issue"`` and ``"extra_note"``.
    """
    delivery_issues = [
        "not delivered",
        "wrong address",
        "no tracking info",
        "stuck in transit",
        "return label missing",
        "return instructions unclear",
    ]
    customer_notes = [
        "needs fast resolution",
        "already emailed once",
        "item needed for event",
        "wants refund after return",
    ]

    seed = {}
    seed["product"] = random.choice(PRODUCTS)
    seed["order_number"] = random.randint(10000, 99999)
    seed["order_date"] = random_order_date()
    seed["destination_country"] = random.choice(COUNTRIES)
    seed["issue"] = random.choice(delivery_issues)
    seed["extra_note"] = random.choice(customer_notes)
    return seed


def legal_seed():
    """Build a random scenario for a Legal email.

    Returns:
        dict: Keys ``"country"``, ``"request_type"`` and ``"urgency"``.
    """
    request_types = [
        "GDPR data deletion",
        "account removal",
        "privacy concern",
        "policy dispute",
    ]
    urgency_levels = [
        "casual inquiry",
        "demand immediate action",
        "considering legal steps",
    ]

    country = random.choice(COUNTRIES)
    request_type = random.choice(request_types)
    urgency = random.choice(urgency_levels)
    return {"country": country, "request_type": request_type, "urgency": urgency}


def sales_seed():
    """Build a random scenario for a Sales (business purchase) email.

    Returns:
        dict: Keys ``"company"``, ``"product"``, ``"quantity"``, ``"interest_type"``,
        ``"deadline"`` (may be ``None`` when no deadline applies) and
        ``"pre_sale_question"``.
    """
    quantities = [10, 25, 50, 100, 250]
    interest_types = [
        "reseller deal",
        "bulk internal use",
        "corporate gifting",
        "influencer campaign",
    ]
    deadlines = ["2 weeks", "1 month", "urgent, this week", None]
    pre_sale_questions = [
        "asking about pricing",
        "asking about stock availability",
        "asking about shipping time",
        "asking about custom branding",
        "asking about lead time",
        "asking what product is best for their use case",
        "asking what’s available for their budget",
        "asking for general recommendations",
        "asking if they can test a sample unit",
    ]

    seed = {}
    seed["company"] = random.choice(COMPANIES)
    seed["product"] = random.choice(PRODUCTS)
    seed["quantity"] = random.choice(quantities)
    seed["interest_type"] = random.choice(interest_types)
    seed["deadline"] = random.choice(deadlines)
    seed["pre_sale_question"] = random.choice(pre_sale_questions)
    return seed


def care_seed():
    """Build a random scenario for a general Customer Care email.

    Returns:
        dict: Keys ``"user_id"`` (``"user_"`` plus a four-digit number), ``"issue"``
        and ``"distraction_intro"`` (one entry of ``DISTRACTION_TOPICS``).
    """
    account_and_misc_issues = [
        "can't log in",
        "need to change email",
        "can’t access my past orders",
        "trouble reaching support",
        "want to leave feedback",
        "don’t understand how the return policy works",
        "I need to talk to a human",
        "website is broken",
        "I have multiple accounts and want to merge them",
        "need help but don’t know who to contact",
    ]

    user_number = random.randint(1000, 9999)
    issue = random.choice(account_and_misc_issues)
    distraction = random.choice(DISTRACTION_TOPICS)
    return {
        "user_id": f"user_{user_number}",
        "issue": issue,
        "distraction_intro": distraction,
    }


# Department registry: display name, one-line remit, and the function that produces
# a random scenario ("seed") for an email addressed to that department.
departments = [
    {"name": name, "description": description, "seed_info_generator": generator}
    for name, description, generator in (
        (
            "Tech Support",
            "Handles questions and problems related to audio products.",
            tech_support_seed,
        ),
        (
            "Billing",
            "Handles charges, refunds, and service payment issues.",
            billing_seed,
        ),
        (
            "Shipping",
            "Handles delivery, tracking, and return issues for purchases.",
            shipping_seed,
        ),
        (
            "Legal",
            "Handles privacy, data, and account-related legal requests.",
            legal_seed,
        ),
        (
            "Sales",
            "Handles bulk orders, reseller inquiries, and business purchases.",
            sales_seed,
        ),
        (
            "Customer Care",
            "Handles general or unclear issues, feedback, and misrouted messages.",
            care_seed,
        ),
    )
]


In [55]:
import asyncio
import json

from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_deepseek import ChatDeepSeek
from tqdm.notebook import tqdm

# Maximum number of LLM requests allowed in flight at once (see `semaphore` below).
SEMAPHORE_LIMIT = 100

llm = ChatDeepSeek(model="deepseek-chat", temperature=1)

# System message: fixes the role and the strict JSON-only output contract.
# Doubled braces are literal braces; single braces would be template variables.
system_prompt = PromptTemplate.from_template("""
You are simulating a real customer writing a support email to a consumer audio tech company.

Instructions:
- ONLY return valid JSON: {{"subject": "...", "body": "...", "department": "..."}}
- Do NOT explain anything or write outside the JSON block.
- Be realistic: include messy grammar, emotional tone, vague questions, or random formatting.
""")

# Human message: the per-email parameters. `department_description` is supplied by
# generate_email_async; it is referenced here so the model sees what the department
# handles rather than only its name (previously the value was silently dropped).
human_prompt = PromptTemplate.from_template("""
Customer tone: {tone_style}
Vagueness level: {vagueness_level}
Vagueness instruction: {vagueness_instruction}
Language level: {language_level}
Language instruction: {language_instruction}

Customer situation/context:
{seed_info_json}

Department: {department}
Department description: {department_description}
""")

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt.template),
        ("human", human_prompt.template),
    ]
)

# prompt -> model -> JSON parser; the chain output is the parsed JSON object.
parser = JsonOutputParser()
chain = prompt | llm | parser


# Shared by every generate_email_async call to cap concurrent requests.
semaphore = asyncio.Semaphore(SEMAPHORE_LIMIT)


async def generate_email_async(dept_config):
    """Generate one synthetic support email for the given department.

    A scenario is drawn from the department's seed generator and given a random
    distraction; a tone, a vagueness level and a language-quality level are sampled
    (the latter two by their configured weights). The chain is then invoked while
    holding the shared semaphore.

    Args:
        dept_config: Department entry with ``"name"``, ``"description"`` and
            ``"seed_info_generator"`` keys.

    Returns:
        Whatever ``chain.ainvoke`` yields for the assembled prompt variables.
    """
    make_seed = dept_config["seed_info_generator"]
    seed_info = wrap_seed_with_distraction(make_seed())

    tone = random.choice(TONE_STYLES)

    vagueness_weights = [level["weight"] for level in VAGUENESS_LEVELS]
    (vagueness,) = random.choices(VAGUENESS_LEVELS, weights=vagueness_weights, k=1)

    language_weights = [level["weight"] for level in LANGUAGE_QUALITY_LEVELS]
    (language,) = random.choices(
        LANGUAGE_QUALITY_LEVELS, weights=language_weights, k=1
    )

    prompt_variables = {
        "department": dept_config["name"],
        "department_description": dept_config["description"],
        "tone_style": tone,
        "vagueness_level": vagueness["name"],
        "vagueness_instruction": vagueness["description"],
        "seed_info_json": json.dumps(seed_info, indent=2),
        "language_level": language["name"],
        "language_instruction": language["description"],
    }

    # Only the network call is rate-limited; the sampling above needs no slot.
    async with semaphore:
        return await chain.ainvoke(prompt_variables)


async def generate_all_emails_async(departments, distribution, emails_total):
    """Generate every requested email concurrently, with a progress bar.

    Args:
        departments: Department entries (each with at least a ``"name"`` key) in the
            order their emails should appear in the result.
        distribution: Mapping of department name to the number of emails to generate;
            departments missing from the mapping get none.
        emails_total: Currently unused - the number of emails is determined solely
            by ``distribution``. Kept so existing callers keep working.

    Returns:
        list: One result per requested email, grouped by department in the order of
        ``departments`` (not in completion order).

    Raises:
        Any exception raised by an individual generation propagates from
        ``asyncio.gather``; the progress bar is closed in that case too.
    """
    pending = [
        generate_email_async(dept)
        for dept in departments
        for _ in range(distribution.get(dept["name"], 0))
    ]
    results = [None] * len(pending)

    # The context manager guarantees the bar is closed on success and on failure.
    with tqdm(total=len(pending), desc="Generating emails", unit="email") as pbar:

        async def wrapped(idx, coro):
            # Store by index so output order does not depend on completion order.
            results[idx] = await coro
            pbar.update(1)

        await asyncio.gather(
            *(wrapped(idx, coro) for idx, coro in enumerate(pending))
        )

    return results


In [56]:
# Smoke test: generate a single email for the first department (Tech Support)
# before launching the full batch.
await generate_email_async(departments[0])

{'subject': 'Beats Studio Pro not working after update?',
 'body': "Hey there,\n\nSo I updated my Beats Studio Pro to firmware 5.5.8 and now they're acting all weird. Like, the sound cuts out sometimes and the buttons don't respond right. I had a really stressful week and honestly didn't want to deal with this. Can you help me fix it or do I need to roll back the update?\n\nThanks,",
 'department': 'Tech Support'}

In [57]:
TOTAL_EMAILS = 1000

# Relative share of the dataset per department. The values are normalised by their
# sum below, so they do not have to add up to exactly 1.
DISTRIBUTION_WEIGHTS = {
    "Tech Support": 0.3,
    "Billing": 0.2,
    "Shipping": 0.2,
    "Legal": 0.1,
    "Sales": 0.15,
    "Customer Care": 0.05,
}

# Alternative: a uniform split across departments.
# DISTRIBUTION_WEIGHTS = {
#     "Tech Support": 1 / 6,
#     "Billing": 1 / 6,
#     "Shipping": 1 / 6,
#     "Legal": 1 / 6,
#     "Sales": 1 / 6,
#     "Customer Care": 1 / 6,
# }


def allocate_email_counts(weights, total):
    """Split ``total`` into integer counts proportional to ``weights``.

    Uses the largest-remainder method: every entry first receives the floor of its
    exact share, then the units still missing go to the entries with the largest
    fractional parts (ties resolved in insertion order). Unlike rounding each share
    independently, the counts always sum to exactly ``total``.

    Args:
        weights: Mapping of name to a non-negative relative weight.
        total: Non-negative number of items to distribute.

    Returns:
        dict: Name to integer count, in the same key order as ``weights``.

    Raises:
        ValueError: If a weight is negative or the weights do not sum to a positive
            number.
    """
    if any(weight < 0 for weight in weights.values()):
        raise ValueError("weights must be non-negative")
    weight_sum = sum(weights.values())
    if weight_sum <= 0:
        raise ValueError("weights must sum to a positive number")

    exact = {name: weight / weight_sum * total for name, weight in weights.items()}
    counts = {name: int(share) for name, share in exact.items()}

    # Hand out what flooring left over, largest fractional part first.
    leftover = max(total - sum(counts.values()), 0)
    by_remainder = sorted(
        exact, key=lambda name: exact[name] - counts[name], reverse=True
    )
    for name in by_remainder[:leftover]:
        counts[name] += 1
    return counts


department_distribution = allocate_email_counts(DISTRIBUTION_WEIGHTS, TOTAL_EMAILS)


In [58]:
# Generate the full batch. Results are grouped by department in the order of
# `departments`, with per-department counts taken from `department_distribution`.
all_emails = await generate_all_emails_async(
    departments, department_distribution, TOTAL_EMAILS
)

Generating emails:   0%|          | 0/1000 [00:00<?, ?email/s]

In [59]:
from helpers.print import print_emails

# Preview only the first few emails: printing the whole batch floods the cell
# output and bloats the saved notebook.
print_emails(all_emails[:5])

Email 1:
Subject: Blinking lights on Emberton II
Body: Hey, so my Marshall Emberton II is doing this weird blinking light thing. I might have messed something up, but I don't know. Just tell me how to fix it, step by step, because I'm not techy. Thanks, I guess.
Department: Tech Support
--------------------------------------------------
Email 2:
Subject: My Sony speaker isn't working properly?
Body: Dear Support Team,

I hope this email finds you well! I'm writing because I'm having a bit of trouble with my Sony SRS-XG300 speaker. It just won't turn on, no matter what I try. I was chatting with your bot assistant earlier, but it kept giving me the same answer over and over, which wasn't very helpful, so I thought I'd reach out to a real person.

I'm currently on firmware version 5.0.9, if that helps. I've tried charging it overnight and pressing the power button in different ways, but nothing seems to work. It's quite frustrating because I really love this speaker and use it all the ti

In [60]:
import pandas as pd

# One row per generated email; columns come from the keys of each result dict.
# NOTE(review): no cell in this notebook writes this frame to disk, yet a later
# cell reads "customer_support_emails_dataset.csv" - confirm where it is saved.
df = pd.DataFrame(all_emails)

df

Unnamed: 0,subject,body,department
0,Blinking lights on Emberton II,"Hey, so my Marshall Emberton II is doing this ...",Tech Support
1,My Sony speaker isn't working properly?,"Dear Support Team,\n\nI hope this email finds ...",Tech Support
2,Speaker keeps disconnecting,"Hi, I have a Sony SRS-XG300 and it keeps disco...",Tech Support
3,Hello there! Quick question about waterproofin...,"Dear wonderful support team,\n\nI hope this em...",Tech Support
4,AirPods Pro not working after update?,"Hi there, I have the AirPods Pro 2nd gen and a...",Tech Support
...,...,...,...
995,Feedback on Recent Experience with SoundBlast Pro,"Dear Customer Care,\n\nI’ve been a loyal custo...",Customer Care
996,Feedback about my recent experience,"Hi there,\n\nI wanted to share some feedback a...",Customer Care
997,Having trouble getting help?,"Hey there,\n\nI'm not super tech-savvy, so for...",Customer Care
998,Cant see my past orders - order #4626 from 10/...,Hey so like i was tryin to check my old orders...,Customer Care


In [5]:
import pandas as pd

from paths import DATA_DIR

# Reload the persisted dataset; the first CSV column is used as the index.
# Note that this rebinds `df`, replacing the freshly generated frame if it exists.
df = pd.read_csv(DATA_DIR / "customer_support_emails_dataset.csv", index_col=0)

df.head()

Unnamed: 0,subject,body,department
0,Blinking lights on Emberton II,"Hey, so my Marshall Emberton II is doing this ...",Tech Support
1,My Sony speaker isn't working properly?,"Dear Support Team,\n\nI hope this email finds ...",Tech Support
2,Speaker keeps disconnecting,"Hi, I have a Sony SRS-XG300 and it keeps disco...",Tech Support
3,Hello there! Quick question about waterproofin...,"Dear wonderful support team,\n\nI hope this em...",Tech Support
4,AirPods Pro not working after update?,"Hi there, I have the AirPods Pro 2nd gen and a...",Tech Support


In [7]:
from datasets import Dataset

from helpers.const import HF_HUB_DATASET_ID

# Convert to a Hugging Face Dataset without carrying the pandas index over as a
# column, then upload it to the Hub repository named by HF_HUB_DATASET_ID.
dataset = Dataset.from_pandas(df, preserve_index=False)

dataset.push_to_hub(HF_HUB_DATASET_ID)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/h3en1x/audio-retailer-customer-support-tickets/commit/e749d058b4edba025f56c93a4f46a298cf3fe647', commit_message='Upload dataset', commit_description='', oid='e749d058b4edba025f56c93a4f46a298cf3fe647', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/h3en1x/audio-retailer-customer-support-tickets', endpoint='https://huggingface.co', repo_type='dataset', repo_id='h3en1x/audio-retailer-customer-support-tickets'), pr_revision=None, pr_num=None)