In [1]:
import os
import json
import pandas as pd
from collections import Counter
from datasets import load_dataset
from IPython.display import display
from typing import Tuple
from pathlib import Path
from pydantic import BaseModel

# Notebook lives one level below the project root; data files sit in <root>/Data.
project_dir = Path.cwd().parent
data_dir = project_dir.joinpath('Data')

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Fetch the Bitext retail e-commerce chatbot dataset from the Hugging Face Hub.
ecomerce_data = load_dataset(
    path='bitext/Bitext-retail-ecommerce-llm-chatbot-training-dataset',
)

Generating train split: 100%|██████████| 44884/44884 [00:00<00:00, 146630.39 examples/s]



In [17]:
# Show the DatasetDict summary (available splits, feature names, row counts).
ecomerce_data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'intent', 'category', 'tags', 'response'],
        num_rows: 44884
    })
})

In [25]:
# Lowercasing helpers for the e-commerce dataset (string columns and lists of strings).
# Intended for: instruction, intent, category, tags, response

# Column names of the train split, captured once at cell execution time.
cols = ecomerce_data['train'].column_names

# Safe lowercase for nested structures
def _lower_value(v):
    """Lowercase a string, or every string inside a (possibly nested) list.

    Any other value (numbers, None, ...) is returned unchanged.
    """
    if isinstance(v, str):
        return v.lower()
    if isinstance(v, list):
        return [_lower_value(x) for x in v]
    return v


def to_lower_batch(batch):
    """Lowercase every column of a batched ``Dataset.map`` input.

    Args:
        batch: Mapping of column name -> list of row values (the shape passed
            to ``map(..., batched=True)``).

    Returns:
        dict with the same column names, where each value has been passed
        through ``_lower_value``. Columns whose value is ``None`` are omitted.

    The columns are taken from the batch itself rather than from a
    notebook-level variable, so the function does not depend on which cells
    ran before it and works for splits with different column sets.
    """
    return {
        column: [_lower_value(value) for value in values]
        for column, values in batch.items()
        if values is not None
    }


In [26]:
# Apply lowercasing to all splits (train-only in this dataset)
ecomerce_data = ecomerce_data.map(to_lower_batch, batched=True)

# Quick preview of the first few rows after lowercasing
print("Columns:", ecomerce_data['train'].column_names)
preview_rows = 3
# Slice the Dataset itself: this returns a {column: [values]} dict for just the
# first rows, instead of reading each full column and then trimming it.
preview_dict = ecomerce_data['train'][:preview_rows]
display(pd.DataFrame(preview_dict))


Map: 100%|██████████| 44884/44884 [00:00<00:00, 234686.31 examples/s]
Map: 100%|██████████| 44884/44884 [00:00<00:00, 234686.31 examples/s]


Columns: ['instruction', 'intent', 'category', 'tags', 'response']


Unnamed: 0,instruction,intent,category,tags,response
0,i got to add an item to the cart,add_product,cart,bl,i'll get right on it! i'm here to assist you i...
1,wanna add fucking products to the basket can h...,add_product,cart,bcimqwz,i sincerely apologize if you've encountered an...
2,i have to add products to the basket i ned help,add_product,cart,bcmqz,you bet! i'm here to assist you in adding prod...


In [27]:
def compute_label_counts(ds, intent_col: str = "intent") -> pd.DataFrame:
    """
    Compute counts of rows per unique intent across all splits in a DatasetDict.

    Args:
        ds: Hugging Face DatasetDict (e.g., {"train": Dataset, ...}).
        intent_col: Name of the intent/text label column (default: "intent").

    Returns:
        pandas.DataFrame with columns: intent, <split>_count for each split, and total.
    """
    # Use all available splits in the DatasetDict
    splits: Tuple[str, ...] = tuple(ds.keys())
    assert len(splits) > 0, "DatasetDict appears to be empty"

    # Collect the set of all intents across splits (strings)
    all_intents = set()
    for split in splits:
        assert split in ds, f"Split '{split}' not found in dataset"
        col_values = ds[split][intent_col]
        all_intents.update(col_values)

    intents_sorted = sorted(all_intents)
    df = pd.DataFrame({"intent": intents_sorted})

    # Add counts for each split
    count_cols = []
    for split in splits:
        counts = Counter(ds[split][intent_col])
        col_name = f"{split}_count"
        count_cols.append(col_name)
        df[col_name] = [counts.get(intent, 0) for intent in intents_sorted]

    # Total across splits
    df["total"] = df[count_cols].sum(axis=1) if count_cols else 0

    # Sort by intent for readability
    return df.sort_values("intent").reset_index(drop=True)


In [28]:
# Per-intent row counts for the e-commerce dataset, one count column per split.
label_counts_df = compute_label_counts(ds=ecomerce_data, intent_col="intent")
display(label_counts_df)


Unnamed: 0,intent,train_count,total
0,add_product,957,957
1,availability,972,972
2,availability_in_store,756,756
3,availability_online,993,993
4,cancel_order,996,996
5,change_account,987,987
6,change_order,961,961
7,close_account,995,995
8,customer_service,992,992
9,damaged_delivery,992,992


In [32]:
# set(label_counts_df['intent'].unique())

In [33]:
# Load the intent-name -> description mapping used for augmentation.
# The encoding is given explicitly: the file contains non-ASCII characters
# (e.g. typographic apostrophes), and the platform default is not always UTF-8.
with open(data_dir / 'new_intents_recommended.json', 'r', encoding='utf-8') as f:
    new_intents_to_augument = json.load(f)
print(f'Number of new intents to augment: {len(new_intents_to_augument)}')

Number of new intents to augment: 51


In [34]:
# Inspect the loaded mapping of new intent names to their descriptions.
new_intents_to_augument

{'apply_discount_code': 'Apply a promotional or discount code to the order or cart.',
 'remove_discount_code': 'Remove or deactivate a previously applied discount/promo code from the order.',
 'check_cart_items': 'Verify the list of items currently in the shopping cart and their quantities.',
 'check_wishlist': 'Retrieve or view items saved in the user’s wishlist.',
 'add_to_wishlist': 'Add a product to the user’s wishlist or favourites list.',
 'remove_from_wishlist': 'Remove a product from the user’s wishlist or favourites list.',
 'share_wishlist': 'Share the wishlist with another user or via link/email.',
 'gift_wrap_option': 'Request or inquire about the gift-wrapping option for an order.',
 'schedule_delivery': 'Schedule or change the delivery date/time for an order.',
 'change_delivery_address': 'Update the delivery/shipping address for an existing order.',
 'split_order_items': 'Split order items into multiple shipments or delivery dates.',
 'combine_orders': 'Combine multiple 

In [35]:
# System prompt for a strings-only output format (the model is asked for a JSON
# array of plain strings).
# NOTE(review): in the cells visible here SyntheticDataGenerator is constructed
# without this prompt, and its built-in default asks for a JSON array of
# objects instead — confirm whether this variable is still needed.
system_prompt = """
You generate realistic, privacy-safe e-commerce customer utterances for training an intent classifier.

REQUIREMENTS
- Stay strictly on the target intent; one intent per line.
- No PII: no real names, emails, phone numbers, addresses, order IDs, tracking numbers, or payment card numbers.
- No real brand or store names; use generic references like “the website”, “the app”, “my order”, or “the store”.
- Language must sound natural and conversational, as if written by real customers on live chat, Reddit, Twitter, or a chatbot.
- Utterances should be short or medium length (4–25 words typical; up to ~35 words acceptable).
- Vary tone and style: polite, neutral, frustrated, curious, excited, or apologetic.
- Vary surface form: questions, statements, short fragments, or exclamations.
- Cover realistic user channels: website, app, phone chat, in-store kiosk, or email support.
- Include occasional mild natural noise (typos, slang, emoji, or casual abbreviations) in a few lines (≤10%) but keep meaning clear.
- Use generic placeholders only (e.g., “$20”, “yesterday”, “my last order”, “a jacket”, “the delivery”, “my coupon”).
- English only.

OUTPUT FORMAT
Return ONLY a valid JSON array of strings (each string = one e-commerce customer utterance). 
Do NOT include markdown, explanations, or comments.
"""

In [36]:
# User-prompt template with {INTENT_NAME}, {INTENT_DESCRIPTION} and {N}
# placeholders (presumably meant to be filled with str.format — it is not
# referenced by the cells visible here; confirm before relying on it).
user_prompt = """Target intent: {INTENT_NAME}
Intent description: {INTENT_DESCRIPTION}

Generate {N} DISTINCT e-commerce customer utterances that clearly express ONLY this intent.
Make sure the utterances sound natural and reflect realistic shopping or support conversations from online stores or delivery apps.
Return EXACTLY {N} items as a JSON array of strings.
"""

In [37]:
from __future__ import annotations
from typing import List, Dict, Any
import json
import math

from pydantic import BaseModel, Field, ValidationError

from mlx_lm import load, generate

In [44]:
class BaseLLMHandler:
    """
    Thin wrapper around an MLX language model.

    - loads the model and tokenizer in the constructor
    - ``format(user_prompt)`` builds the chat-template prompt
    - ``invoke(user_prompt)`` runs generation and returns the raw text
    """

    def __init__(
        self,
        system_prompt: str,
        model_name: str = "mlx-community/Llama-3.2-8B-Instruct-4bit",
        max_tokens: int = 512,
        temperature: float = 0.7,
    ) -> None:
        """
        Args:
            system_prompt: high-level instructions sent as the system message
            model_name: HF / MLX community model repo id
            max_tokens: maximum number of new tokens to generate per call
            temperature: sampling temperature applied in ``invoke``
        """
        self.system_prompt = system_prompt
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.temperature = temperature

        # NOTE(review): the default repo id above could not be verified from
        # this notebook (Llama 3.2 is not known to ship in an 8B size) —
        # confirm it resolves, or always pass model_name explicitly.
        self.model, self.tokenizer = load(self.model_name)

    def format(self, user_prompt: str) -> str:
        """
        Build the chat prompt from the system prompt and the user message,
        using the tokenizer's chat template with a generation prompt appended.

        NOTE(review): ``apply_chat_template`` is called with its default
        arguments; with Hugging Face tokenizers that typically yields token
        ids rather than text, so the ``str`` annotation may be inaccurate —
        confirm. The result is passed straight to ``generate`` either way.
        """
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages, add_generation_prompt=True
        )
        return prompt

    def invoke(self, user_prompt: str) -> str:
        """
        Format the prompt, run the MLX model, and return the generated text.

        Sampling uses ``self.temperature``. Previously the temperature was
        stored but never handed to the generator, so it had no effect on
        decoding. Output is therefore stochastic unless temperature is 0.
        """
        # Imported locally so this class stays self-contained within the cell.
        from mlx_lm.sample_utils import make_sampler

        prompt = self.format(user_prompt)
        out_text = generate(
            self.model,
            self.tokenizer,
            prompt=prompt,
            max_tokens=self.max_tokens,
            sampler=make_sampler(temp=self.temperature),
            verbose=False,
        )
        return out_text

In [48]:
class SyntheticDataGenerator(BaseLLMHandler):
    """
    Generates JSON-shaped synthetic e-commerce utterances through
    BaseLLMHandler and validates each item with Pydantic.
    """

    class SyntheticUtterance(BaseModel):
        # One validated utterance; channel and style fall back to defaults
        # when the model omits them.
        intent: str = Field(..., description="The e-commerce intent name.")
        text: str = Field(..., description="Customer utterance related to this e-commerce intent.")
        channel: str = Field(
            "app",
            description="Where the customer is contacting from, e.g. app, web, phone, store, chat.",
        )
        style: str = Field(
            "neutral",
            description="Tone of the message, e.g. neutral, polite, frustrated, urgent, casual.",
        )

    def __init__(
        self,
        system_prompt: str = (
            "You generate realistic, privacy-safe e-commerce customer utterances for training an intent classifier.\n"
            "REQUIREMENTS\n"
            "- Stay strictly on the target intent; one intent per line.\n"
            "- No PII: no real names, emails, phone numbers, addresses, order IDs, tracking numbers, or card numbers.\n"
            "- No real brand/store names; use generic references like “the website”, “the app”, “my order”, “the store”.\n"
            "- Natural, conversational e-commerce style (live chat, support, shopping apps, delivery apps).\n"
            "- Short/medium length (4–25 words typical; up to ~35 ok).\n"
            "- Vary tone (polite, neutral, frustrated, curious) and surface form (questions, statements, fragments).\n"
            "- Allow mild natural noise (typos/slang) in a few lines but keep meaning clear.\n"
            "- Use generic placeholders only (e.g., “$20”, “yesterday”, “my last order”, “a jacket”, “the coupon”).\n"
            "- English only.\n"
            "OUTPUT FORMAT\n"
            "Return ONLY a valid JSON array of objects I can parse. No markdown, no comments."
        ),
        model_name: str = "mlx-community/Llama-3.2-8B-Instruct-4bit",
        max_tokens: int = 768,
        temperature: float = 0.85,
    ) -> None:
        """
        Args:
            system_prompt: system message; the default asks for a JSON array of objects
            model_name: HF / MLX community model repo id
            max_tokens: maximum number of new tokens per generation call
            temperature: sampling temperature forwarded to the base handler
        """
        super().__init__(
            system_prompt=system_prompt,
            model_name=model_name,
            max_tokens=max_tokens,
            temperature=temperature,
        )

    @staticmethod
    def _build_user_prompt(intent_name: str, intent_description: str, n: int) -> str:
        """Build the user message asking for ``n`` utterances of one intent,
        including the expected JSON object shape."""
        schema_str = json.dumps(
            {
                "intent": "string (must be the EXACT intent name)",
                "text": "string (customer asks about that e-commerce intent only)",
                "channel": "string (one of: app, web, phone, store, chat)",
                "style": "string (one of: neutral, polite, frustrated, urgent, casual)"
            },
            indent=2,
        )
        return (
            f"Target intent name: {intent_name}\n"
            f"Intent description: {intent_description}\n"
            f"Generate {n} DISTINCT synthetic e-commerce customer utterances that clearly match this intent description.\n"
            f"- Use the description to guide what the customer would actually ask.\n"
            f"- Do NOT drift into other intents.\n"
            f"Each item MUST be a JSON object following this shape:\n"
            f"{schema_str}\n\n"
            f"Return EXACTLY {n} items as a JSON array. No explanations, no markdown, no extra text."
        )

    @staticmethod
    def _parse_json_array(raw: str) -> List[Dict[str, Any]]:
        """
        Extract and parse the JSON array contained in the model output.

        - unwraps a leading ```json ... ``` or ``` ... ``` code fence
        - then parses the span from the first '[' to the last ']'

        Raises:
            ValueError: if no '[' ... ']' span is present (json.JSONDecodeError,
                a ValueError subclass, if the span is not valid JSON).
        """
        text = raw.strip()

        # 1) unwrap code fences if present
        if text.startswith("```"):
            lines = text.splitlines()
            # drop the opening fence line (``` or ```json)
            if len(lines) >= 2 and lines[0].startswith("```"):
                lines = lines[1:]
            # drop the closing fence line if present
            if len(lines) >= 1 and lines[-1].startswith("```"):
                lines = lines[:-1]
            text = "\n".join(lines).strip()

        # 2) narrow to the JSON array
        start = text.find("[")
        end = text.rfind("]")
        if start == -1 or end == -1 or end < start:
            raise ValueError("No JSON array found in model output")
        array_str = text[start : end + 1]

        # 3) load
        return json.loads(array_str)

    def generate(
        self,
        intent_name: str,
        intent_description: str,
        n_samples: int = 50,
        per_batch: int = 40,
    ) -> List["SyntheticDataGenerator.SyntheticUtterance"]:
        """
        Generate up to ``n_samples`` validated utterances for one intent.

        Args:
            intent_name: intent label written onto every returned item
            intent_description: description given to the model as guidance
            n_samples: number of utterances requested in total
            per_batch: maximum number of utterances requested per model call;
                lower it if the JSON gets cut off by ``max_tokens``

        Returns:
            List of SyntheticUtterance objects, at most ``n_samples`` long.
            It can be shorter: invalid rows are skipped and batches that
            under-deliver are not retried.

        Raises:
            ValueError: if ``per_batch`` is smaller than 1.
            RuntimeError: if a model response cannot be parsed as a JSON array
                (the raw response is included in the message).
        """
        if per_batch < 1:
            raise ValueError("per_batch must be >= 1")
        if n_samples <= 0:
            return []

        n_batches = math.ceil(n_samples / per_batch)
        final_items: List[SyntheticDataGenerator.SyntheticUtterance] = []

        for _ in range(n_batches):
            needed = min(per_batch, n_samples - len(final_items))
            if needed <= 0:
                # An earlier batch returned more than requested; do not prompt
                # the model for zero or a negative number of items.
                break

            user_prompt = self._build_user_prompt(intent_name, intent_description, needed)
            raw = self.invoke(user_prompt)

            try:
                arr = self._parse_json_array(raw)
            except Exception as e:
                # keep the raw output for debugging and preserve the cause
                raise RuntimeError(
                    f"Failed to parse model JSON for intent '{intent_name}': {e}\nRAW:\n{raw}"
                ) from e

            for item in arr:
                if not isinstance(item, dict):
                    # e.g. the model returned plain strings instead of objects
                    continue
                try:
                    # enforce the exact intent without mutating the parsed item
                    obj = self.SyntheticUtterance(**{**item, "intent": intent_name})
                except ValidationError:
                    # bad row -> skip
                    continue
                final_items.append(obj)

        return final_items[:n_samples]

In [None]:
# Build the generator with an explicit model choice.
# Alternative tried earlier: "mlx-community/gemma-3n-E4B-it-lm-4bit"
gen = SyntheticDataGenerator(
    model_name="mlx-community/Qwen3-4B-8bit",
    max_tokens=768,
)

Fetching 9 files:  33%|███▎      | 3/9 [03:40<07:21, 73.56s/it]



In [50]:
# Smoke test: request a small batch for a single new intent.
utterances = gen.generate(
    "apply_discount_code",
    "Apply a promotional or discount code to the order or cart.",
    n_samples=10,
)

In [51]:
# Inspect the validated SyntheticUtterance objects returned by the generator.
utterances

[SyntheticUtterance(intent='apply_discount_code', text='How do I use a promo code?', channel='chat', style='polite'),
 SyntheticUtterance(intent='apply_discount_code', text='I have a coupon code, where do I enter it?', channel='app', style='casual'),
 SyntheticUtterance(intent='apply_discount_code', text='Can I apply this discount to my order?', channel='web', style='neutral'),
 SyntheticUtterance(intent='apply_discount_code', text="I tried to use a code, but it didn't work.", channel='chat', style='frustrated'),
 SyntheticUtterance(intent='apply_discount_code', text='Is there a way to apply a discount?', channel='phone', style='polite'),
 SyntheticUtterance(intent='apply_discount_code', text='I found a coupon online, can I use it?', channel='app', style='casual'),
 SyntheticUtterance(intent='apply_discount_code', text="I'm having trouble with a discount code. It says invalid.", channel='chat', style='frustrated'),
 SyntheticUtterance(intent='apply_discount_code', text='Where can I ent