In [1]:
# !pip install groq
# !pip install python-dotenv

In [63]:
import os
import math
import json
import pandas as pd
from groq import Groq
from pathlib import Path
from dotenv import load_dotenv
from collections import Counter
from datasets import load_dataset
from tqdm import tqdm
import time
from IPython.display import display
from typing import List, Tuple, Dict, Any
from pydantic import BaseModel, Field, ValidationError

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# The project root is taken to be the parent of the current working directory;
# all input/output files of this notebook live in <project root>/Data.
project_dir = Path(os.getcwd()).parent
data_dir = project_dir / 'Data'
# Accept both spellings of the variable: the lowercase name this notebook has
# always used and the upper-case GROQ_API_KEY that BaseLLMHandler falls back to.
# Environment variable names are case-sensitive on POSIX systems.
groq_api_key = os.getenv("groq_api_key") or os.getenv("GROQ_API_KEY")

In [3]:
# Load the Bitext retail e-commerce chatbot dataset by its repository id.
# The result is a DatasetDict (see the cell output below: a single 'train' split).
ecomerce_data = load_dataset('bitext/Bitext-retail-ecommerce-llm-chatbot-training-dataset')

In [4]:
ecomerce_data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'intent', 'category', 'tags', 'response'],
        num_rows: 44884
    })
})

In [5]:
# Helpers that lowercase every column of the e-commerce dataset.
# Values may be plain strings or (possibly nested) lists of strings.
# Columns in this dataset: instruction, intent, category, tags, response

# Column names of the train split; read as a module-level global by
# to_lower_batch(), so this line must run before the dataset is mapped.
cols = ecomerce_data['train'].column_names

# Safe lowercase for nested structures
def _lower_value(v):
    if isinstance(v, str):
        return v.lower()
    if isinstance(v, list):
        return [_lower_value(x) for x in v]
    return v


def to_lower_batch(batch):
    """Lowercase one batch of rows, column by column.

    Args:
        batch: Mapping of column name -> list of values for the batch.

    Returns:
        A dict with one entry per name in the module-level ``cols`` that is
        present (not ``None``) in ``batch``; each value is the column's list
        with ``_lower_value`` applied to every element.
    """
    return {
        name: [_lower_value(entry) for entry in column]
        for name in cols
        # skip columns the batch does not provide
        if (column := batch.get(name)) is not None
    }


In [6]:
# Apply lowercasing to all splits (train-only in this dataset)
ecomerce_data = ecomerce_data.map(to_lower_batch, batched=True)

# Quick preview of the first few rows after lowercasing
train_split = ecomerce_data['train']
print("Columns:", train_split.column_names)
preview_rows = 3
# Slice by ROW: this returns a {column: [values]} dict for just the first
# `preview_rows` rows, instead of materialising every full column and then
# throwing away all but three values of each.
preview_dict = train_split[:preview_rows]
display(pd.DataFrame(preview_dict))


Columns: ['instruction', 'intent', 'category', 'tags', 'response']


Unnamed: 0,instruction,intent,category,tags,response
0,i got to add an item to the cart,add_product,cart,bl,i'll get right on it! i'm here to assist you i...
1,wanna add fucking products to the basket can h...,add_product,cart,bcimqwz,i sincerely apologize if you've encountered an...
2,i have to add products to the basket i ned help,add_product,cart,bcmqz,you bet! i'm here to assist you in adding prod...


In [7]:
def compute_label_counts(ds, intent_col: str = "intent") -> pd.DataFrame:
    """
    Compute counts of rows per unique intent across all splits in a DatasetDict.

    Each split's label column is read exactly once.

    Args:
        ds: Mapping of split name -> dataset (e.g. a Hugging Face DatasetDict);
            ``ds[split][intent_col]`` must yield the labels of that split.
        intent_col: Name of the intent/text label column (default: "intent").

    Returns:
        pandas.DataFrame with columns: intent, <split>_count for each split, and total.
        Rows are sorted by intent and carry a fresh 0..n-1 index.

    Raises:
        AssertionError: if ``ds`` contains no splits.
    """
    # Use all available splits in the DatasetDict
    splits: Tuple[str, ...] = tuple(ds.keys())
    assert len(splits) > 0, "DatasetDict appears to be empty"

    # One pass per split: the Counter gives both the set of labels and their counts
    counts_by_split = {split: Counter(ds[split][intent_col]) for split in splits}

    # Union of the labels seen in any split, in sorted order
    intents_sorted = sorted(set().union(*counts_by_split.values()))
    df = pd.DataFrame({"intent": intents_sorted})

    # Add counts for each split (0 where a split never contains the intent)
    count_cols = []
    for split, counts in counts_by_split.items():
        col_name = f"{split}_count"
        count_cols.append(col_name)
        df[col_name] = [counts.get(intent, 0) for intent in intents_sorted]

    # Total across splits
    df["total"] = df[count_cols].sum(axis=1)

    # Rows were built from the sorted intent list, so no further sorting is needed
    return df


In [8]:
# Count rows per intent in every split of the e-commerce dataset
# and show the resulting table (one row per intent).
label_counts_df = compute_label_counts(ecomerce_data, intent_col="intent")
display(label_counts_df)


Unnamed: 0,intent,train_count,total
0,add_product,957,957
1,availability,972,972
2,availability_in_store,756,756
3,availability_online,993,993
4,cancel_order,996,996
5,change_account,987,987
6,change_order,961,961
7,close_account,995,995
8,customer_service,992,992
9,damaged_delivery,992,992


In [9]:
# Read the {intent name: description} mapping of intents to synthesise.
# The encoding is stated explicitly: the descriptions contain non-ASCII
# characters (typographic apostrophes), and open() otherwise falls back to a
# platform-dependent default encoding.
with open(data_dir / 'new_intents_recommended.json', 'r', encoding='utf-8') as f:
    new_intents_to_augument = json.load(f)
print(f'Number of new intents to augment: {len(new_intents_to_augument)}')

Number of new intents to augment: 51


In [10]:
new_intents_to_augument

{'apply_discount_code': 'Apply a promotional or discount code to the order or cart.',
 'remove_discount_code': 'Remove or deactivate a previously applied discount/promo code from the order.',
 'check_cart_items': 'Verify the list of items currently in the shopping cart and their quantities.',
 'check_wishlist': 'Retrieve or view items saved in the user’s wishlist.',
 'add_to_wishlist': 'Add a product to the user’s wishlist or favourites list.',
 'remove_from_wishlist': 'Remove a product from the user’s wishlist or favourites list.',
 'share_wishlist': 'Share the wishlist with another user or via link/email.',
 'gift_wrap_option': 'Request or inquire about the gift-wrapping option for an order.',
 'schedule_delivery': 'Schedule or change the delivery date/time for an order.',
 'change_delivery_address': 'Update the delivery/shipping address for an existing order.',
 'split_order_items': 'Split order items into multiple shipments or delivery dates.',
 'combine_orders': 'Combine multiple 

In [36]:
# System prompt sent with every generation request. It fixes the content rules
# (stay on the target intent, no PII, no real brand names, length and tone
# variety) and the output contract: a bare JSON array of objects with the keys
# "intent", "text", "channel" and "style". The per-request details (target
# intent, description, number of items) are added by
# SyntheticDataGenerator._build_user_prompt.
system_prompt = """
You generate realistic, privacy-safe e-commerce customer utterances for training an intent classifier.

[REQUIREMENTS]
- Stay strictly on the target intent; one intent per line.
- No PII: no real names, emails, phone numbers, addresses, order IDs, tracking numbers, or payment card numbers.
- No real brand or store names; use generic references like “the website”, “the app”, “my order”, or “the store”.
- Utterances should be short or medium length (4–25 words typical; up to ~35 words acceptable).
- Vary tone and style: polite, neutral, frustrated, curious, excited, or apologetic.
- Output must be VALID JSON.

[OUTPUT FORMAT]
Return ONLY a JSON array of objects; each object has:
- "intent": string
- "text": string
- "channel": string
- "style": string
No markdown, no comments, no extra keys.
"""

In [39]:
class BaseLLMHandler:
    """
    Base wrapper around a Groq LLM.
    - loads model + tokenizer in constructor
    - provides .format(user_prompt) to build chat-style prompt
    - provides .invoke(user_prompt) to actually generate text
    """

    def __init__(
        self,
        system_prompt: str,
        model_name: str,
        max_tokens: int = 512,
        temperature: float = 0.7,
        api_key: str | None = None,
    ) -> None:
        self.system_prompt = system_prompt
        self.model_name = model_name
        self.max_tokens = max_tokens
        self.temperature = temperature
        
        self.client = Groq(api_key=api_key or os.getenv("GROQ_API_KEY"))

    def format(self, user_prompt: str) -> List[Dict[str, str]]:
        """
        Build a chat prompt for Groq (system + user).
        """
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        return messages

    def invoke(self, user_prompt: str) -> str:
        """
        Call Groq chat completions and return raw text.
        """
        messages = self.format(user_prompt)
        resp = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
            stream=False,
        )
        return resp.choices[0].message.content

In [46]:
class SyntheticDataGenerator(BaseLLMHandler):
    """
    Uses BaseLLMHandler to generate JSON-shaped synthetic e-commerce utterances
    and validates them with Pydantic.
    """

    class SyntheticUtterance(BaseModel):
        # One validated utterance; `channel` and `style` fall back to defaults
        # when the model omits them.
        intent: str = Field(..., description="The e-commerce intent name.")
        text: str = Field(..., description="Customer utterance related to this e-commerce intent.")
        channel: str = Field(
            "app",
            description="Where the customer is contacting from, e.g. app, web, phone, store, chat.",
        )
        style: str = Field(
            "neutral",
            description="Tone of the message, e.g. neutral, polite, frustrated, urgent, casual.",
        )

    def __init__(
        self,
        system_prompt: str,
        model_name: str,
        max_tokens: int = 768,
        temperature: float = 0.85,
        api_key: str | None = None,
    ) -> None:
        """Same arguments as BaseLLMHandler, with different defaults for
        ``max_tokens`` (768) and ``temperature`` (0.85)."""
        super().__init__(
            system_prompt=system_prompt,
            model_name=model_name,
            max_tokens=max_tokens,
            temperature=temperature,
            api_key=api_key,
        )

    @staticmethod
    def _build_user_prompt(intent_name: str, intent_description: str, n: int) -> str:
        """Build the user message asking for ``n`` utterances of one intent."""
        # we tell the model the exact JSON object we want
        schema_str = json.dumps(
            {
                "intent": "string (must be the EXACT intent name)",
                "text": "string (customer asks about that e-commerce intent only)",
                "channel": "string (one of: app, web, phone, store, chat)",
                "style": "string (one of: neutral, polite, frustrated, urgent, casual)"
            },
            indent=2,
        )
        return (
            f"Target intent name: {intent_name}\n"
            f"Intent description: {intent_description}\n"
            f"Generate {n} DISTINCT synthetic e-commerce customer utterances that clearly match this intent description.\n"
            f"- Use the description to guide what the customer would actually ask.\n"
            f"- Do NOT drift into other intents.\n"
            f"Each item MUST be a JSON object following this shape:\n"
            f"{schema_str}\n\n"
            f"Return EXACTLY {n} items as a JSON array. No explanations, no markdown, no extra text."
        )

    @staticmethod
    def _parse_json_array(raw: str) -> List[Dict[str, Any]]:
        """
        Robust JSON array parser:
        - handle ```json ... ``` or ``` ... ```
        - then slice from first '[' to last ']'

        Raises:
            ValueError: if no '[' ... ']' span is found (including empty or
                missing model output) or the span is not valid JSON
                (json.JSONDecodeError is a ValueError subclass).
        """
        # A missing reply is treated like an empty one, so it ends in the
        # "No JSON array found" error below rather than an AttributeError.
        text = (raw or "").strip()

        # unwrap common fences
        if text.startswith("```"):
            lines = text.splitlines()
            if lines and lines[0].startswith("```"):
                lines = lines[1:]
            if lines and lines[-1].startswith("```"):
                lines = lines[:-1]
            text = "\n".join(lines).strip()

        start = text.find("[")
        end = text.rfind("]")
        if start == -1 or end == -1 or end < start:
            raise ValueError("No JSON array found in model output")
        array_str = text[start : end + 1]
        return json.loads(array_str)

    def generate(
        self,
        intent_name: str,
        intent_description: str,
        n_samples: int = 50,
    ) -> List["SyntheticDataGenerator.SyntheticUtterance"]:
        """
        Generate up to n_samples utterances in batches and validate with Pydantic.
        Includes retry logic (3 attempts) per batch to handle transient JSON issues.

        A batch that still fails after its retries is skipped, and items that
        are not JSON objects or fail validation are dropped, so fewer than
        n_samples utterances may be returned. Never returns more than n_samples.

        Args:
            intent_name: Intent label; written into every returned utterance.
            intent_description: Description of the intent, used in the prompt.
            n_samples: Number of utterances requested.

        Returns:
            List of validated SyntheticUtterance objects (at most n_samples).
        """
        per_batch = 20     # utterances requested per API call
        max_attempts = 3   # tries per batch before the batch is skipped
        n_batches = math.ceil(n_samples / per_batch)

        final_items: List[SyntheticDataGenerator.SyntheticUtterance] = []

        for batch_idx in range(n_batches):
            # The model may return more items than asked for; once the quota is
            # met, stop instead of requesting 0 (or a negative number of) items.
            remaining = n_samples - len(final_items)
            if remaining <= 0:
                break
            needed = min(per_batch, remaining)

            # ---------------- retry mechanism ----------------
            for attempt in range(max_attempts):
                try:
                    user_prompt = self._build_user_prompt(intent_name, intent_description, needed)
                    raw = self.invoke(user_prompt)

                    arr = self._parse_json_array(raw)  # attempt JSON parse
                    break  # exit retry loop if success
                except Exception as e:
                    # Deliberately broad: API errors and malformed output are
                    # both treated as a failed attempt and retried.
                    print(
                        f"[WARN] Batch {batch_idx+1}/{n_batches}, attempt {attempt+1}/{max_attempts} failed for "
                        f"intent '{intent_name}': {e.__class__.__name__}: {e}"
                    )
                    # small sleep between retries could be added here if desired
            else:
                # no attempt succeeded (loop ended without `break`): skip this batch
                print(
                    f"[ERROR] Skipping batch {batch_idx+1}/{n_batches} for intent '{intent_name}' "
                    f"after {max_attempts} failed attempts."
                )
                continue
            # -------------------------------------------------

            # validate items with Pydantic
            for item in arr:
                if len(final_items) >= n_samples:
                    break  # quota reached; ignore surplus items
                if not isinstance(item, dict):
                    # e.g. the model returned bare strings; drop the element
                    # rather than letting a TypeError abort the whole run
                    continue
                try:
                    # enforce exact intent name (without mutating the parsed item)
                    obj = self.SyntheticUtterance(**{**item, "intent": intent_name})
                except ValidationError:
                    continue
                final_items.append(obj)

        print(
            f"[INFO] Completed generation for intent '{intent_name}'. "
            f"Collected {len(final_items)} utterances out of requested {n_samples}."
        )

        return final_items

In [61]:
# Shared generator instance; also the default `generator` argument of
# generate_utterances_for_intents, so this cell must run before that function
# is defined. Temperature is left at the class default.
# NOTE(review): the run log further down shows frequent "No JSON array found"
# retries; possibly some replies are cut off or empty at max_tokens=2048 —
# confirm by inspecting a failing raw reply.
gen = SyntheticDataGenerator(
    system_prompt=system_prompt,
    model_name="openai/gpt-oss-20b",
    max_tokens=2048,
    api_key=groq_api_key,
)

In [64]:
def generate_utterances_for_intents(
    data_dir: Path,
    intent_desp_json_filename: str,
    num_samples_per_intent: int = 10,
    generator: SyntheticDataGenerator = gen,
    ) -> None:
    """
    Generate synthetic utterances for every intent in a JSON file and save them.

    The JSON file maps intent name -> intent description. For each intent the
    generator is called once and the resulting utterances are written, one JSON
    object per line, to ``data_dir / "synthetic_utterances_<intent>.jsonl"``
    (an existing file of that name is overwritten).

    Args:
        data_dir: Directory holding the intents file; output files go here too.
        intent_desp_json_filename: File name of the intents JSON inside data_dir.
        num_samples_per_intent: Number of utterances requested per intent.
        generator: Generator to use. The default is the module-level ``gen`` as
            it existed when this function was defined; re-run this cell after
            rebuilding ``gen``, or pass the new instance explicitly.
    """
    intent_desp_json_filepath = data_dir / intent_desp_json_filename
    # Read as UTF-8 explicitly: the descriptions contain non-ASCII characters and
    # the output below is written as UTF-8 as well.
    with open(intent_desp_json_filepath, 'r', encoding='utf-8') as f:
        new_intents_to_augument = json.load(f)
    print(f'Number of new intents to augment: {len(new_intents_to_augument)}')

    for intent, description in tqdm(new_intents_to_augument.items()):
        print(f"Generating samples for intent: {intent}")
        utterances = generator.generate(
            intent_name=intent,
            intent_description=description,
            n_samples=num_samples_per_intent,
            )
        # brief pause between intents
        time.sleep(0.1)
        output_file = data_dir / f"synthetic_utterances_{intent}.jsonl"
        with open(output_file, "w", encoding="utf-8") as f:
            for u in utterances:
                f.write(json.dumps(u.model_dump(), ensure_ascii=False) + "\n")
        print(f"Saved {len(utterances)} samples to {output_file}")

In [65]:
# Full generation run: request 1000 utterances for each intent listed in
# new_intents_recommended.json, using the default generator. One JSONL file
# per intent is written into data_dir. The log below shows roughly 80-100 s
# per intent, so this cell is long-running.
generate_utterances_for_intents(
    data_dir=data_dir,
    intent_desp_json_filename='new_intents_recommended.json',
    num_samples_per_intent=1000
)

Number of new intents to augment: 51


  0%|          | 0/51 [00:00<?, ?it/s]

Generating samples for intent: apply_discount_code
[WARN] Batch 4/50, attempt 1/3 failed for intent 'apply_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 4/50, attempt 1/3 failed for intent 'apply_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 7/50, attempt 1/3 failed for intent 'apply_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 7/50, attempt 1/3 failed for intent 'apply_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'apply_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'apply_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 2/3 failed for intent 'apply_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 2/3 failed for intent 'apply_discount_code': ValueError: No JSON array found i

  2%|▏         | 1/51 [01:27<1:12:55, 87.51s/it]

[INFO] Completed generation for intent 'apply_discount_code'. Collected 993 utterances out of requested 1000.
Saved 993 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_apply_discount_code.jsonl
Generating samples for intent: remove_discount_code
[WARN] Batch 31/50, attempt 1/3 failed for intent 'remove_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 31/50, attempt 1/3 failed for intent 'remove_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 44/50, attempt 1/3 failed for intent 'remove_discount_code': ValueError: No JSON array found in model output
[WARN] Batch 44/50, attempt 1/3 failed for intent 'remove_discount_code': ValueError: No JSON array found in model output


  4%|▍         | 2/51 [02:48<1:08:14, 83.56s/it]

[INFO] Completed generation for intent 'remove_discount_code'. Collected 992 utterances out of requested 1000.
Saved 992 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_remove_discount_code.jsonl
Generating samples for intent: check_cart_items
[WARN] Batch 20/50, attempt 1/3 failed for intent 'check_cart_items': ValueError: No JSON array found in model output
[WARN] Batch 20/50, attempt 1/3 failed for intent 'check_cart_items': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'check_cart_items': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'check_cart_items': ValueError: No JSON array found in model output
[WARN] Batch 41/50, attempt 1/3 failed for intent 'check_cart_items': ValueError: No JSON array found in model output
[WARN] Batch 41/50, attempt 1/3 failed for intent 'check_cart_items': ValueError: No JSON array found in mo

  6%|▌         | 3/51 [04:15<1:08:09, 85.20s/it]

[INFO] Completed generation for intent 'check_cart_items'. Collected 991 utterances out of requested 1000.
Saved 991 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_check_cart_items.jsonl
Generating samples for intent: check_wishlist


  8%|▊         | 4/51 [05:33<1:04:33, 82.42s/it]

[INFO] Completed generation for intent 'check_wishlist'. Collected 993 utterances out of requested 1000.
Saved 993 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_check_wishlist.jsonl
Generating samples for intent: add_to_wishlist
[WARN] Batch 8/50, attempt 1/3 failed for intent 'add_to_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 8/50, attempt 1/3 failed for intent 'add_to_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'add_to_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'add_to_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 31/50, attempt 1/3 failed for intent 'add_to_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 31/50, attempt 1/3 failed for intent 'add_to_wishlist': ValueError: No JSON array found in model output
[WARN] Bat

 10%|▉         | 5/51 [06:58<1:04:00, 83.48s/it]

[INFO] Completed generation for intent 'add_to_wishlist'. Collected 989 utterances out of requested 1000.
Saved 989 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_add_to_wishlist.jsonl
Generating samples for intent: remove_from_wishlist
[WARN] Batch 19/50, attempt 1/3 failed for intent 'remove_from_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 19/50, attempt 1/3 failed for intent 'remove_from_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 32/50, attempt 1/3 failed for intent 'remove_from_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 32/50, attempt 1/3 failed for intent 'remove_from_wishlist': ValueError: No JSON array found in model output


 12%|█▏        | 6/51 [08:20<1:02:12, 82.95s/it]

[INFO] Completed generation for intent 'remove_from_wishlist'. Collected 994 utterances out of requested 1000.
Saved 994 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_remove_from_wishlist.jsonl
Generating samples for intent: share_wishlist
[WARN] Batch 11/50, attempt 1/3 failed for intent 'share_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'share_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'share_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'share_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 39/50, attempt 1/3 failed for intent 'share_wishlist': ValueError: No JSON array found in model output
[WARN] Batch 39/50, attempt 1/3 failed for intent 'share_wishlist': ValueError: No JSON array found in model output
[WA

 14%|█▎        | 7/51 [09:46<1:01:33, 83.95s/it]

[INFO] Completed generation for intent 'share_wishlist'. Collected 986 utterances out of requested 1000.
Saved 986 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_share_wishlist.jsonl
Generating samples for intent: gift_wrap_option
[WARN] Batch 50/50, attempt 1/3 failed for intent 'gift_wrap_option': ValueError: No JSON array found in model output
[WARN] Batch 50/50, attempt 1/3 failed for intent 'gift_wrap_option': ValueError: No JSON array found in model output


 16%|█▌        | 8/51 [11:07<59:23, 82.88s/it]  

[INFO] Completed generation for intent 'gift_wrap_option'. Collected 991 utterances out of requested 1000.
Saved 991 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_gift_wrap_option.jsonl
Generating samples for intent: schedule_delivery


 18%|█▊        | 9/51 [12:26<57:15, 81.80s/it]

[INFO] Completed generation for intent 'schedule_delivery'. Collected 994 utterances out of requested 1000.
Saved 994 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_schedule_delivery.jsonl
Generating samples for intent: change_delivery_address
[WARN] Batch 6/50, attempt 1/3 failed for intent 'change_delivery_address': ValueError: No JSON array found in model output
[WARN] Batch 6/50, attempt 1/3 failed for intent 'change_delivery_address': ValueError: No JSON array found in model output
[WARN] Batch 49/50, attempt 1/3 failed for intent 'change_delivery_address': ValueError: No JSON array found in model output
[WARN] Batch 49/50, attempt 1/3 failed for intent 'change_delivery_address': ValueError: No JSON array found in model output


 20%|█▉        | 10/51 [13:56<57:34, 84.25s/it]

[INFO] Completed generation for intent 'change_delivery_address'. Collected 993 utterances out of requested 1000.
Saved 993 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_change_delivery_address.jsonl
Generating samples for intent: split_order_items
[WARN] Batch 13/50, attempt 1/3 failed for intent 'split_order_items': ValueError: No JSON array found in model output
[WARN] Batch 13/50, attempt 1/3 failed for intent 'split_order_items': ValueError: No JSON array found in model output
[WARN] Batch 17/50, attempt 1/3 failed for intent 'split_order_items': ValueError: No JSON array found in model output
[WARN] Batch 17/50, attempt 1/3 failed for intent 'split_order_items': ValueError: No JSON array found in model output
[WARN] Batch 22/50, attempt 1/3 failed for intent 'split_order_items': ValueError: No JSON array found in model output
[WARN] Batch 22/50, attempt 1/3 failed for intent 'split_order_items': ValueError: No JSON arra

 22%|██▏       | 11/51 [15:29<57:51, 86.80s/it]

[INFO] Completed generation for intent 'split_order_items'. Collected 992 utterances out of requested 1000.
Saved 992 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_split_order_items.jsonl
Generating samples for intent: combine_orders
[WARN] Batch 11/50, attempt 1/3 failed for intent 'combine_orders': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'combine_orders': ValueError: No JSON array found in model output
[WARN] Batch 46/50, attempt 1/3 failed for intent 'combine_orders': ValueError: No JSON array found in model output
[WARN] Batch 46/50, attempt 1/3 failed for intent 'combine_orders': ValueError: No JSON array found in model output


 24%|██▎       | 12/51 [16:51<55:31, 85.42s/it]

[INFO] Completed generation for intent 'combine_orders'. Collected 988 utterances out of requested 1000.
Saved 988 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_combine_orders.jsonl
Generating samples for intent: checkout_guest
[WARN] Batch 17/50, attempt 1/3 failed for intent 'checkout_guest': ValueError: No JSON array found in model output
[WARN] Batch 17/50, attempt 1/3 failed for intent 'checkout_guest': ValueError: No JSON array found in model output
[WARN] Batch 21/50, attempt 1/3 failed for intent 'checkout_guest': ValueError: No JSON array found in model output
[WARN] Batch 21/50, attempt 1/3 failed for intent 'checkout_guest': ValueError: No JSON array found in model output
[WARN] Batch 37/50, attempt 1/3 failed for intent 'checkout_guest': ValueError: No JSON array found in model output
[WARN] Batch 37/50, attempt 1/3 failed for intent 'checkout_guest': ValueError: No JSON array found in model output
[WARN] Batch 39

 25%|██▌       | 13/51 [18:19<54:37, 86.24s/it]

[INFO] Completed generation for intent 'checkout_guest'. Collected 990 utterances out of requested 1000.
Saved 990 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_checkout_guest.jsonl
Generating samples for intent: checkout_saved_address
[WARN] Batch 1/50, attempt 1/3 failed for intent 'checkout_saved_address': ValueError: No JSON array found in model output
[WARN] Batch 1/50, attempt 1/3 failed for intent 'checkout_saved_address': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 'checkout_saved_address': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 'checkout_saved_address': ValueError: No JSON array found in model output
[WARN] Batch 22/50, attempt 1/3 failed for intent 'checkout_saved_address': ValueError: No JSON array found in model output
[WARN] Batch 22/50, attempt 1/3 failed for intent 'checkout_saved_address': ValueErro

 27%|██▋       | 14/51 [19:57<55:23, 89.83s/it]

[INFO] Completed generation for intent 'checkout_saved_address'. Collected 992 utterances out of requested 1000.
Saved 992 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_checkout_saved_address.jsonl
Generating samples for intent: select_payment_plan
[WARN] Batch 5/50, attempt 1/3 failed for intent 'select_payment_plan': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 1/3 failed for intent 'select_payment_plan': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 2/3 failed for intent 'select_payment_plan': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 2/3 failed for intent 'select_payment_plan': ValueError: No JSON array found in model output


 29%|██▉       | 15/51 [21:28<54:06, 90.19s/it]

[INFO] Completed generation for intent 'select_payment_plan'. Collected 990 utterances out of requested 1000.
Saved 990 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_select_payment_plan.jsonl
Generating samples for intent: apply_store_credit
[WARN] Batch 1/50, attempt 1/3 failed for intent 'apply_store_credit': ValueError: No JSON array found in model output
[WARN] Batch 1/50, attempt 1/3 failed for intent 'apply_store_credit': ValueError: No JSON array found in model output
[WARN] Batch 2/50, attempt 1/3 failed for intent 'apply_store_credit': ValueError: No JSON array found in model output
[WARN] Batch 2/50, attempt 1/3 failed for intent 'apply_store_credit': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'apply_store_credit': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'apply_store_credit': ValueError: No JSON array fou

 31%|███▏      | 16/51 [23:02<53:10, 91.15s/it]

[INFO] Completed generation for intent 'apply_store_credit'. Collected 988 utterances out of requested 1000.
Saved 988 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_apply_store_credit.jsonl
Generating samples for intent: inquire_gift_card_balance


 33%|███▎      | 17/51 [24:25<50:18, 88.78s/it]

[INFO] Completed generation for intent 'inquire_gift_card_balance'. Collected 986 utterances out of requested 1000.
Saved 986 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_inquire_gift_card_balance.jsonl
Generating samples for intent: redeem_gift_card
[WARN] Batch 37/50, attempt 1/3 failed for intent 'redeem_gift_card': ValueError: No JSON array found in model output
[WARN] Batch 37/50, attempt 1/3 failed for intent 'redeem_gift_card': ValueError: No JSON array found in model output


 35%|███▌      | 18/51 [25:56<49:11, 89.44s/it]

[INFO] Completed generation for intent 'redeem_gift_card'. Collected 995 utterances out of requested 1000.
Saved 995 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_redeem_gift_card.jsonl
Generating samples for intent: report_missing_coupon
[WARN] Batch 6/50, attempt 1/3 failed for intent 'report_missing_coupon': ValueError: No JSON array found in model output
[WARN] Batch 6/50, attempt 1/3 failed for intent 'report_missing_coupon': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'report_missing_coupon': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'report_missing_coupon': ValueError: No JSON array found in model output
[WARN] Batch 24/50, attempt 1/3 failed for intent 'report_missing_coupon': ValueError: No JSON array found in model output
[WARN] Batch 24/50, attempt 1/3 failed for intent 'report_missing_coupon': ValueError: 

 37%|███▋      | 19/51 [27:27<47:55, 89.87s/it]

[INFO] Completed generation for intent 'report_missing_coupon'. Collected 987 utterances out of requested 1000.
Saved 987 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_report_missing_coupon.jsonl
Generating samples for intent: product_recommendation_request
[WARN] Batch 7/50, attempt 1/3 failed for intent 'product_recommendation_request': ValueError: No JSON array found in model output
[WARN] Batch 7/50, attempt 1/3 failed for intent 'product_recommendation_request': ValueError: No JSON array found in model output
[WARN] Batch 20/50, attempt 1/3 failed for intent 'product_recommendation_request': ValueError: No JSON array found in model output
[WARN] Batch 20/50, attempt 1/3 failed for intent 'product_recommendation_request': ValueError: No JSON array found in model output
[WARN] Batch 23/50, attempt 1/3 failed for intent 'product_recommendation_request': ValueError: No JSON array found in model output
[WARN] Batch 23/50, att

 39%|███▉      | 20/51 [29:07<48:03, 93.02s/it]

[INFO] Completed generation for intent 'product_recommendation_request'. Collected 988 utterances out of requested 1000.
Saved 988 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_product_recommendation_request.jsonl
Generating samples for intent: compare_products
[WARN] Batch 3/50, attempt 1/3 failed for intent 'compare_products': ValueError: No JSON array found in model output
[WARN] Batch 3/50, attempt 1/3 failed for intent 'compare_products': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 1/3 failed for intent 'compare_products': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 1/3 failed for intent 'compare_products': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 'compare_products': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 'compare_products': ValueError: No JSON a

 41%|████      | 21/51 [31:06<50:19, 100.66s/it]

[INFO] Completed generation for intent 'compare_products'. Collected 972 utterances out of requested 1000.
Saved 972 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_compare_products.jsonl
Generating samples for intent: check_product_reviews
[WARN] Batch 4/50, attempt 1/3 failed for intent 'check_product_reviews': ValueError: No JSON array found in model output
[WARN] Batch 4/50, attempt 1/3 failed for intent 'check_product_reviews': ValueError: No JSON array found in model output
[WARN] Batch 42/50, attempt 1/3 failed for intent 'check_product_reviews': ValueError: No JSON array found in model output
[WARN] Batch 42/50, attempt 1/3 failed for intent 'check_product_reviews': ValueError: No JSON array found in model output


 43%|████▎     | 22/51 [32:32<46:36, 96.42s/it] 

[INFO] Completed generation for intent 'check_product_reviews'. Collected 989 utterances out of requested 1000.
Saved 989 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_check_product_reviews.jsonl
Generating samples for intent: submit_product_review
[WARN] Batch 2/50, attempt 1/3 failed for intent 'submit_product_review': ValueError: No JSON array found in model output
[WARN] Batch 2/50, attempt 1/3 failed for intent 'submit_product_review': ValueError: No JSON array found in model output
[WARN] Batch 4/50, attempt 1/3 failed for intent 'submit_product_review': ValueError: No JSON array found in model output
[WARN] Batch 4/50, attempt 1/3 failed for intent 'submit_product_review': ValueError: No JSON array found in model output
[WARN] Batch 14/50, attempt 1/3 failed for intent 'submit_product_review': ValueError: No JSON array found in model output
[WARN] Batch 14/50, attempt 1/3 failed for intent 'submit_product_review': Valu

 45%|████▌     | 23/51 [34:15<45:53, 98.34s/it]

[INFO] Completed generation for intent 'submit_product_review'. Collected 994 utterances out of requested 1000.
Saved 994 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_submit_product_review.jsonl
Generating samples for intent: request_size_chart
[WARN] Batch 10/50, attempt 1/3 failed for intent 'request_size_chart': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 'request_size_chart': ValueError: No JSON array found in model output
[WARN] Batch 23/50, attempt 1/3 failed for intent 'request_size_chart': ValueError: No JSON array found in model output
[WARN] Batch 23/50, attempt 1/3 failed for intent 'request_size_chart': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'request_size_chart': ValueError: No JSON array found in model output
[WARN] Batch 26/50, attempt 1/3 failed for intent 'request_size_chart': ValueError: No JSON a

 47%|████▋     | 24/51 [35:43<42:53, 95.32s/it]

[INFO] Completed generation for intent 'request_size_chart'. Collected 990 utterances out of requested 1000.
Saved 990 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_request_size_chart.jsonl
Generating samples for intent: check_product_material_info


 49%|████▉     | 25/51 [37:04<39:28, 91.09s/it]

[INFO] Completed generation for intent 'check_product_material_info'. Collected 992 utterances out of requested 1000.
Saved 992 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_check_product_material_info.jsonl
Generating samples for intent: check_backorder_status
[WARN] Batch 5/50, attempt 1/3 failed for intent 'check_backorder_status': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 1/3 failed for intent 'check_backorder_status': ValueError: No JSON array found in model output
[WARN] Batch 15/50, attempt 1/3 failed for intent 'check_backorder_status': ValueError: No JSON array found in model output
[WARN] Batch 15/50, attempt 1/3 failed for intent 'check_backorder_status': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'check_backorder_status': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'check_ba

 51%|█████     | 26/51 [38:53<40:07, 96.29s/it]

[INFO] Completed generation for intent 'check_backorder_status'. Collected 968 utterances out of requested 1000.
Saved 968 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_check_backorder_status.jsonl
Generating samples for intent: notify_when_available
[WARN] Batch 9/50, attempt 1/3 failed for intent 'notify_when_available': ValueError: No JSON array found in model output
[WARN] Batch 9/50, attempt 1/3 failed for intent 'notify_when_available': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 'notify_when_available': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 'notify_when_available': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 2/3 failed for intent 'notify_when_available': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 2/3 failed for intent 'notify_when_available': 

 53%|█████▎    | 27/51 [40:30<38:37, 96.56s/it]

[INFO] Completed generation for intent 'notify_when_available'. Collected 991 utterances out of requested 1000.
Saved 991 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_notify_when_available.jsonl
Generating samples for intent: request_bulk_pricing
[WARN] Batch 14/50, attempt 1/3 failed for intent 'request_bulk_pricing': ValueError: No JSON array found in model output
[WARN] Batch 14/50, attempt 1/3 failed for intent 'request_bulk_pricing': ValueError: No JSON array found in model output
[WARN] Batch 20/50, attempt 1/3 failed for intent 'request_bulk_pricing': ValueError: No JSON array found in model output
[WARN] Batch 20/50, attempt 1/3 failed for intent 'request_bulk_pricing': ValueError: No JSON array found in model output
[WARN] Batch 25/50, attempt 1/3 failed for intent 'request_bulk_pricing': ValueError: No JSON array found in model output
[WARN] Batch 25/50, attempt 1/3 failed for intent 'request_bulk_pricing': ValueEr

 55%|█████▍    | 28/51 [42:06<36:53, 96.22s/it]

[INFO] Completed generation for intent 'request_bulk_pricing'. Collected 988 utterances out of requested 1000.
Saved 988 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_request_bulk_pricing.jsonl
Generating samples for intent: request_product_bundle
[WARN] Batch 1/50, attempt 1/3 failed for intent 'request_product_bundle': ValueError: No JSON array found in model output
[WARN] Batch 1/50, attempt 1/3 failed for intent 'request_product_bundle': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'request_product_bundle': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'request_product_bundle': ValueError: No JSON array found in model output
[WARN] Batch 13/50, attempt 1/3 failed for intent 'request_product_bundle': ValueError: No JSON array found in model output
[WARN] Batch 13/50, attempt 1/3 failed for intent 'request_product_bundle

 57%|█████▋    | 29/51 [43:47<35:48, 97.68s/it]

[INFO] Completed generation for intent 'request_product_bundle'. Collected 984 utterances out of requested 1000.
Saved 984 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_request_product_bundle.jsonl
Generating samples for intent: gift_message_request
[WARN] Batch 16/50, attempt 1/3 failed for intent 'gift_message_request': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'gift_message_request': ValueError: No JSON array found in model output
[WARN] Batch 32/50, attempt 1/3 failed for intent 'gift_message_request': ValueError: No JSON array found in model output
[WARN] Batch 32/50, attempt 1/3 failed for intent 'gift_message_request': ValueError: No JSON array found in model output
[WARN] Batch 42/50, attempt 1/3 failed for intent 'gift_message_request': ValueError: No JSON array found in model output
[WARN] Batch 42/50, attempt 1/3 failed for intent 'gift_message_request': Value

 59%|█████▉    | 30/51 [45:23<34:02, 97.28s/it]

[INFO] Completed generation for intent 'gift_message_request'. Collected 989 utterances out of requested 1000.
Saved 989 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_gift_message_request.jsonl
Generating samples for intent: track_return_status
[WARN] Batch 15/50, attempt 1/3 failed for intent 'track_return_status': ValueError: No JSON array found in model output
[WARN] Batch 15/50, attempt 1/3 failed for intent 'track_return_status': ValueError: No JSON array found in model output
[WARN] Batch 28/50, attempt 1/3 failed for intent 'track_return_status': ValueError: No JSON array found in model output
[WARN] Batch 28/50, attempt 1/3 failed for intent 'track_return_status': ValueError: No JSON array found in model output
[WARN] Batch 35/50, attempt 1/3 failed for intent 'track_return_status': ValueError: No JSON array found in model output
[WARN] Batch 35/50, attempt 1/3 failed for intent 'track_return_status': ValueError: No J

 61%|██████    | 31/51 [46:50<31:23, 94.18s/it]

[INFO] Completed generation for intent 'track_return_status'. Collected 992 utterances out of requested 1000.
Saved 992 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_track_return_status.jsonl
Generating samples for intent: cancel_return_request
[WARN] Batch 11/50, attempt 1/3 failed for intent 'cancel_return_request': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'cancel_return_request': ValueError: No JSON array found in model output
[WARN] Batch 15/50, attempt 1/3 failed for intent 'cancel_return_request': ValueError: No JSON array found in model output
[WARN] Batch 15/50, attempt 1/3 failed for intent 'cancel_return_request': ValueError: No JSON array found in model output
[WARN] Batch 19/50, attempt 1/3 failed for intent 'cancel_return_request': ValueError: No JSON array found in model output
[WARN] Batch 19/50, attempt 1/3 failed for intent 'cancel_return_request': Valu

 63%|██████▎   | 32/51 [48:23<29:43, 93.89s/it]

[INFO] Completed generation for intent 'cancel_return_request'. Collected 992 utterances out of requested 1000.
Saved 992 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_cancel_return_request.jsonl
Generating samples for intent: exchange_out_of_store
[WARN] Batch 4/50, attempt 1/3 failed for intent 'exchange_out_of_store': ValueError: No JSON array found in model output
[WARN] Batch 4/50, attempt 1/3 failed for intent 'exchange_out_of_store': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 1/3 failed for intent 'exchange_out_of_store': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 1/3 failed for intent 'exchange_out_of_store': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'exchange_out_of_store': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'exchange_out_of_store': Valu

 65%|██████▍   | 33/51 [50:38<31:49, 106.08s/it]

[INFO] Completed generation for intent 'exchange_out_of_store'. Collected 993 utterances out of requested 1000.
Saved 993 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_exchange_out_of_store.jsonl
Generating samples for intent: request_restock_date
[WARN] Batch 37/50, attempt 1/3 failed for intent 'request_restock_date': ValueError: No JSON array found in model output
[WARN] Batch 37/50, attempt 1/3 failed for intent 'request_restock_date': ValueError: No JSON array found in model output


 67%|██████▋   | 34/51 [51:57<27:48, 98.16s/it] 

[INFO] Completed generation for intent 'request_restock_date'. Collected 984 utterances out of requested 1000.
Saved 984 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_request_restock_date.jsonl
Generating samples for intent: preorder_product
[WARN] Batch 9/50, attempt 1/3 failed for intent 'preorder_product': ValueError: No JSON array found in model output
[WARN] Batch 9/50, attempt 1/3 failed for intent 'preorder_product': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'preorder_product': ValueError: No JSON array found in model output
[WARN] Batch 11/50, attempt 1/3 failed for intent 'preorder_product': ValueError: No JSON array found in model output
[WARN] Batch 25/50, attempt 1/3 failed for intent 'preorder_product': ValueError: No JSON array found in model output
[WARN] Batch 25/50, attempt 1/3 failed for intent 'preorder_product': ValueError: No JSON array found in mode

 69%|██████▊   | 35/51 [53:33<26:00, 97.54s/it]

[INFO] Completed generation for intent 'preorder_product'. Collected 990 utterances out of requested 1000.
Saved 990 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_preorder_product.jsonl
Generating samples for intent: cancel_preorder
[WARN] Batch 23/50, attempt 1/3 failed for intent 'cancel_preorder': ValueError: No JSON array found in model output
[WARN] Batch 23/50, attempt 1/3 failed for intent 'cancel_preorder': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 1/3 failed for intent 'cancel_preorder': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 1/3 failed for intent 'cancel_preorder': ValueError: No JSON array found in model output
[WARN] Batch 28/50, attempt 1/3 failed for intent 'cancel_preorder': ValueError: No JSON array found in model output
[WARN] Batch 28/50, attempt 1/3 failed for intent 'cancel_preorder': ValueError: No JSON array found in model output
[WAR

 71%|███████   | 36/51 [55:09<24:16, 97.08s/it]

[INFO] Completed generation for intent 'cancel_preorder'. Collected 987 utterances out of requested 1000.
Saved 987 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_cancel_preorder.jsonl
Generating samples for intent: price_drop_alert_setup
[WARN] Batch 18/50, attempt 1/3 failed for intent 'price_drop_alert_setup': ValueError: No JSON array found in model output
[WARN] Batch 18/50, attempt 1/3 failed for intent 'price_drop_alert_setup': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 1/3 failed for intent 'price_drop_alert_setup': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 1/3 failed for intent 'price_drop_alert_setup': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 2/3 failed for intent 'price_drop_alert_setup': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 2/3 failed for intent 'price_drop_alert_setup': Value

 73%|███████▎  | 37/51 [56:51<22:59, 98.53s/it]

[INFO] Completed generation for intent 'price_drop_alert_setup'. Collected 988 utterances out of requested 1000.
Saved 988 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_price_drop_alert_setup.jsonl
Generating samples for intent: price_match_request
[WARN] Batch 1/50, attempt 1/3 failed for intent 'price_match_request': ValueError: No JSON array found in model output
[WARN] Batch 1/50, attempt 1/3 failed for intent 'price_match_request': ValueError: No JSON array found in model output
[WARN] Batch 6/50, attempt 1/3 failed for intent 'price_match_request': ValueError: No JSON array found in model output
[WARN] Batch 6/50, attempt 1/3 failed for intent 'price_match_request': ValueError: No JSON array found in model output
[WARN] Batch 9/50, attempt 1/3 failed for intent 'price_match_request': ValueError: No JSON array found in model output
[WARN] Batch 9/50, attempt 1/3 failed for intent 'price_match_request': ValueError: No JSO

 75%|███████▍  | 38/51 [58:40<22:00, 101.56s/it]

[INFO] Completed generation for intent 'price_match_request'. Collected 987 utterances out of requested 1000.
Saved 987 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_price_match_request.jsonl
Generating samples for intent: loyalty_points_balance_query


 76%|███████▋  | 39/51 [59:57<18:50, 94.23s/it] 

[INFO] Completed generation for intent 'loyalty_points_balance_query'. Collected 989 utterances out of requested 1000.
Saved 989 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_loyalty_points_balance_query.jsonl
Generating samples for intent: redeem_loyalty_points
[WARN] Batch 23/50, attempt 1/3 failed for intent 'redeem_loyalty_points': ValueError: No JSON array found in model output
[WARN] Batch 23/50, attempt 1/3 failed for intent 'redeem_loyalty_points': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 1/3 failed for intent 'redeem_loyalty_points': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 1/3 failed for intent 'redeem_loyalty_points': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 2/3 failed for intent 'redeem_loyalty_points': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 2/3 failed for intent 'redeem_loy

 78%|███████▊  | 40/51 [1:01:32<17:19, 94.46s/it]

[INFO] Completed generation for intent 'redeem_loyalty_points'. Collected 993 utterances out of requested 1000.
Saved 993 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_redeem_loyalty_points.jsonl
Generating samples for intent: tier_membership_upgrade
[WARN] Batch 14/50, attempt 1/3 failed for intent 'tier_membership_upgrade': ValueError: No JSON array found in model output
[WARN] Batch 14/50, attempt 1/3 failed for intent 'tier_membership_upgrade': ValueError: No JSON array found in model output
[WARN] Batch 33/50, attempt 1/3 failed for intent 'tier_membership_upgrade': ValueError: No JSON array found in model output
[WARN] Batch 33/50, attempt 1/3 failed for intent 'tier_membership_upgrade': ValueError: No JSON array found in model output
[WARN] Batch 37/50, attempt 1/3 failed for intent 'tier_membership_upgrade': ValueError: No JSON array found in model output
[WARN] Batch 37/50, attempt 1/3 failed for intent 'tier_members

 80%|████████  | 41/51 [1:03:00<15:26, 92.64s/it]

[INFO] Completed generation for intent 'tier_membership_upgrade'. Collected 995 utterances out of requested 1000.
Saved 995 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_tier_membership_upgrade.jsonl
Generating samples for intent: subscription_cancel
[WARN] Batch 13/50, attempt 1/3 failed for intent 'subscription_cancel': ValueError: No JSON array found in model output
[WARN] Batch 13/50, attempt 1/3 failed for intent 'subscription_cancel': ValueError: No JSON array found in model output
[WARN] Batch 19/50, attempt 1/3 failed for intent 'subscription_cancel': ValueError: No JSON array found in model output
[WARN] Batch 19/50, attempt 1/3 failed for intent 'subscription_cancel': ValueError: No JSON array found in model output
[WARN] Batch 20/50, attempt 1/3 failed for intent 'subscription_cancel': ValueError: No JSON array found in model output
[WARN] Batch 20/50, attempt 1/3 failed for intent 'subscription_cancel': ValueError

 82%|████████▏ | 42/51 [1:04:29<13:42, 91.41s/it]

[INFO] Completed generation for intent 'subscription_cancel'. Collected 995 utterances out of requested 1000.
Saved 995 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_subscription_cancel.jsonl
Generating samples for intent: subscription_pause
[WARN] Batch 25/50, attempt 1/3 failed for intent 'subscription_pause': ValueError: No JSON array found in model output
[WARN] Batch 25/50, attempt 1/3 failed for intent 'subscription_pause': ValueError: No JSON array found in model output


 84%|████████▍ | 43/51 [1:05:45<11:35, 86.89s/it]

[INFO] Completed generation for intent 'subscription_pause'. Collected 989 utterances out of requested 1000.
Saved 989 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_subscription_pause.jsonl
Generating samples for intent: subscription_resume
[WARN] Batch 32/50, attempt 1/3 failed for intent 'subscription_resume': ValueError: No JSON array found in model output
[WARN] Batch 32/50, attempt 1/3 failed for intent 'subscription_resume': ValueError: No JSON array found in model output


 86%|████████▋ | 44/51 [1:07:04<09:50, 84.31s/it]

[INFO] Completed generation for intent 'subscription_resume'. Collected 988 utterances out of requested 1000.
Saved 988 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_subscription_resume.jsonl
Generating samples for intent: subscribe_to_newsletter
[WARN] Batch 6/50, attempt 1/3 failed for intent 'subscribe_to_newsletter': ValueError: No JSON array found in model output
[WARN] Batch 6/50, attempt 1/3 failed for intent 'subscribe_to_newsletter': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'subscribe_to_newsletter': ValueError: No JSON array found in model output
[WARN] Batch 16/50, attempt 1/3 failed for intent 'subscribe_to_newsletter': ValueError: No JSON array found in model output
[WARN] Batch 28/50, attempt 1/3 failed for intent 'subscribe_to_newsletter': ValueError: No JSON array found in model output
[WARN] Batch 28/50, attempt 1/3 failed for intent 'subscribe_to_newsl

 88%|████████▊ | 45/51 [1:08:36<08:40, 86.83s/it]

[INFO] Completed generation for intent 'subscribe_to_newsletter'. Collected 986 utterances out of requested 1000.
Saved 986 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_subscribe_to_newsletter.jsonl
Generating samples for intent: unsubscribe_from_newsletter
[WARN] Batch 19/50, attempt 1/3 failed for intent 'unsubscribe_from_newsletter': ValueError: No JSON array found in model output
[WARN] Batch 19/50, attempt 1/3 failed for intent 'unsubscribe_from_newsletter': ValueError: No JSON array found in model output
[WARN] Batch 34/50, attempt 1/3 failed for intent 'unsubscribe_from_newsletter': ValueError: No JSON array found in model output
[WARN] Batch 34/50, attempt 1/3 failed for intent 'unsubscribe_from_newsletter': ValueError: No JSON array found in model output


 90%|█████████ | 46/51 [1:09:56<07:03, 84.67s/it]

[INFO] Completed generation for intent 'unsubscribe_from_newsletter'. Collected 987 utterances out of requested 1000.
Saved 987 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_unsubscribe_from_newsletter.jsonl
Generating samples for intent: opt_in_sms_alerts
[WARN] Batch 14/50, attempt 1/3 failed for intent 'opt_in_sms_alerts': ValueError: No JSON array found in model output
[WARN] Batch 14/50, attempt 1/3 failed for intent 'opt_in_sms_alerts': ValueError: No JSON array found in model output
[WARN] Batch 46/50, attempt 1/3 failed for intent 'opt_in_sms_alerts': ValueError: No JSON array found in model output
[WARN] Batch 46/50, attempt 1/3 failed for intent 'opt_in_sms_alerts': ValueError: No JSON array found in model output


 92%|█████████▏| 47/51 [1:11:23<05:41, 85.26s/it]

[INFO] Completed generation for intent 'opt_in_sms_alerts'. Collected 994 utterances out of requested 1000.
Saved 994 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_opt_in_sms_alerts.jsonl
Generating samples for intent: opt_out_sms_alerts
[WARN] Batch 23/50, attempt 1/3 failed for intent 'opt_out_sms_alerts': ValueError: No JSON array found in model output
[WARN] Batch 23/50, attempt 1/3 failed for intent 'opt_out_sms_alerts': ValueError: No JSON array found in model output
[WARN] Batch 39/50, attempt 1/3 failed for intent 'opt_out_sms_alerts': ValueError: No JSON array found in model output
[WARN] Batch 39/50, attempt 1/3 failed for intent 'opt_out_sms_alerts': ValueError: No JSON array found in model output


 94%|█████████▍| 48/51 [1:12:49<04:17, 85.73s/it]

[INFO] Completed generation for intent 'opt_out_sms_alerts'. Collected 996 utterances out of requested 1000.
Saved 996 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_opt_out_sms_alerts.jsonl
Generating samples for intent: request_order_invoice_pdf
[WARN] Batch 4/50, attempt 1/3 failed for intent 'request_order_invoice_pdf': ValueError: No JSON array found in model output
[WARN] Batch 4/50, attempt 1/3 failed for intent 'request_order_invoice_pdf': ValueError: No JSON array found in model output
[WARN] Batch 35/50, attempt 1/3 failed for intent 'request_order_invoice_pdf': ValueError: No JSON array found in model output
[WARN] Batch 35/50, attempt 1/3 failed for intent 'request_order_invoice_pdf': ValueError: No JSON array found in model output


 96%|█████████▌| 49/51 [1:14:15<02:51, 85.60s/it]

[INFO] Completed generation for intent 'request_order_invoice_pdf'. Collected 982 utterances out of requested 1000.
Saved 982 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_request_order_invoice_pdf.jsonl
Generating samples for intent: request_custom_engraving
[WARN] Batch 5/50, attempt 1/3 failed for intent 'request_custom_engraving': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 1/3 failed for intent 'request_custom_engraving': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 2/3 failed for intent 'request_custom_engraving': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 2/3 failed for intent 'request_custom_engraving': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 'request_custom_engraving': ValueError: No JSON array found in model output
[WARN] Batch 10/50, attempt 1/3 failed for intent 're

 98%|█████████▊| 50/51 [1:15:59<01:31, 91.10s/it]

[INFO] Completed generation for intent 'request_custom_engraving'. Collected 996 utterances out of requested 1000.
Saved 996 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_request_custom_engraving.jsonl
Generating samples for intent: check_warranty_coverage
[WARN] Batch 5/50, attempt 1/3 failed for intent 'check_warranty_coverage': ValueError: No JSON array found in model output
[WARN] Batch 5/50, attempt 1/3 failed for intent 'check_warranty_coverage': ValueError: No JSON array found in model output
[WARN] Batch 23/50, attempt 1/3 failed for intent 'check_warranty_coverage': ValueError: No JSON array found in model output
[WARN] Batch 23/50, attempt 1/3 failed for intent 'check_warranty_coverage': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 1/3 failed for intent 'check_warranty_coverage': ValueError: No JSON array found in model output
[WARN] Batch 27/50, attempt 1/3 failed for intent 'check_wa

100%|██████████| 51/51 [1:17:41<00:00, 91.40s/it]

[INFO] Completed generation for intent 'check_warranty_coverage'. Collected 989 utterances out of requested 1000.
Saved 989 samples to /Users/rahulnenavath/Documents/Personal-Projects/IntentClassification/Data/synthetic_utterances_check_warranty_coverage.jsonl





In [66]:
def build_synthetic_dataset(data_dir: Path) -> pd.DataFrame:
    """Load all per-intent synthetic utterance files into a single DataFrame.

    Every file in ``data_dir`` matching ``synthetic_utterances_*.jsonl`` is read
    line by line. Each line is expected to hold one JSON object; the object is
    tagged with the intent name (taken from the file name) and the name of the
    file it came from.

    Args:
        data_dir: Directory containing the ``synthetic_utterances_<intent>.jsonl`` files.

    Returns:
        A DataFrame with one row per valid JSON object. Its columns are the keys
        found in the objects plus ``intent`` and ``source_file``. The frame is
        empty (no columns) when no valid record is found.

    Notes:
        Blank lines are skipped silently. Lines that are not valid JSON, or that
        parse to something other than a JSON object, are skipped with a warning
        instead of aborting the whole load.
    """
    prefix = "synthetic_utterances_"
    rows = []

    # sorted() keeps the row order deterministic across runs and platforms.
    for file in sorted(data_dir.glob(f"{prefix}*.jsonl")):
        # Strip only the leading prefix; str.replace would also remove the
        # same text if it happened to occur inside the intent name.
        intent_name = file.stem[len(prefix):]

        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Trailing/blank lines are not data and not worth a warning.
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    print(f"[WARN] Skipped malformed line in {file.name}")
                    continue
                if not isinstance(record, dict):
                    # Valid JSON but not an object (e.g. a bare string or list):
                    # it cannot carry the intent/source_file keys.
                    print(f"[WARN] Skipped non-object line in {file.name}")
                    continue
                # The file name is the source of truth for the intent label.
                record["intent"] = intent_name
                record["source_file"] = file.name
                rows.append(record)

    df = pd.DataFrame(rows)
    print("Loaded:", len(df), "synthetic samples.")
    return df

In [67]:
# Read every synthetic_utterances_*.jsonl file under data_dir into one DataFrame.
synt_df = build_synthetic_dataset(data_dir)

Loaded: 50461 synthetic samples.


In [71]:
# Preview the first rows of the loaded synthetic utterances.
synt_df.head()

Unnamed: 0,intent,text,channel,style,source_file
0,add_to_wishlist,I want to add this jacket to my wishlist.,app,neutral,synthetic_utterances_add_to_wishlist.jsonl
1,add_to_wishlist,Could you please add the red sneakers to my fa...,web,polite,synthetic_utterances_add_to_wishlist.jsonl
2,add_to_wishlist,"Add the smartwatch to my wishlist, please.",phone,casual,synthetic_utterances_add_to_wishlist.jsonl
3,add_to_wishlist,I need this book added to my wish list right now.,chat,urgent,synthetic_utterances_add_to_wishlist.jsonl
4,add_to_wishlist,Please add the blue hoodie to my favorites list.,store,polite,synthetic_utterances_add_to_wishlist.jsonl


In [68]:
# synt_df[synt_df['intent'] == 'unsubscribe_from_newsletter'].to_dict(orient='records')

In [73]:
# Persist the synthetic utterances as CSV. index=False keeps the positional row
# index out of the file, so reading it back does not yield an 'Unnamed: 0' column.
synt_df.to_csv(data_dir / 'synthetic_generated_utterances.csv', sep=',', index=False)

In [81]:
# Convert the 'train' split of the e-commerce dataset into a pandas DataFrame.
ecom_df = ecomerce_data['train'].to_pandas()

In [83]:
# Align the column name with the synthetic data: utterances live in 'text'.
column_renames = {'instruction': 'text'}
ecom_df = ecom_df.rename(columns=column_renames)

In [88]:
# Stack the original e-commerce rows and the synthetic rows, keeping only the
# shared text/intent columns. ignore_index=True assigns a fresh 0..n-1 index
# directly, so the old per-frame row labels are not turned into an 'index' column.
combined_df = pd.concat(
    [
        ecom_df[['text', 'intent']],
        synt_df[['text', 'intent']],
    ],
    ignore_index=True,
)

In [98]:
# Clean-up in a single chain:
#   1. remove the 'index' column when present (left behind by reset_index()),
#   2. drop rows that are exact duplicates across all remaining columns, keeping the first,
#   3. renumber the rows from 0.
combined_df = (
    combined_df
    .drop(columns=['index'], errors='ignore')
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)

print(f"Rows after deduplication: {len(combined_df)}")

Rows after deduplication: 91088


In [99]:
# Report how many distinct intent labels the merged dataset contains.
intent_count = combined_df["intent"].nunique()
print(f'Total Intents after synthetically generating intents: {intent_count}')

Total Intents after synthetically generating intents: 97


In [100]:
# Persist the deduplicated text/intent pairs as a tab-separated file, without the row index.
combined_tsv_path = data_dir / 'combined_utt_int_dataset.tsv'
combined_df.to_csv(combined_tsv_path, sep='\t', index=False)