# asd

In [25]:
from typing import List, Type, Dict
from pydantic import BaseModel, ValidationError
from openai import OpenAI
import time
import json
import os
from dotenv import load_dotenv

load_dotenv()

True

In [49]:
class SyntheticDataGenerator:
    def __init__(
            self,
            schema: Type[BaseModel],
            prompt_template: str,
            output_file: str = "synthetic_data.jsonl",
            n_samples: int = 100,
            model: str = "gpt-3.5-turbo-1106",  # Updated to a model that supports JSON mode
            max_retries: int = 3,
            delay: float = 1.0,
            temperature: float = 0.0
    ):
        self.schema = schema
        self.prompt_template = prompt_template
        self.output_file = output_file
        self.n_samples = n_samples
        self.model = model
        self.max_retries = max_retries
        self.delay = delay
        self.temperature = temperature
        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    def _call_openai(self, prompt: str) -> str:
        for attempt in range(self.max_retries):
            try:
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant that generates support ticket data in JSON format."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=self.temperature,
                    response_format={"type": "json_object"}  # Force JSON output
                )
                content = response.choices[0].message.content
                print(f"Raw API response: {content}")  # Debug print
                return content
            except Exception as e:
                print(f"OpenAI API error (attempt {attempt + 1}/{self.max_retries}): {e}")
                if attempt < self.max_retries - 1:
                    time.sleep(self.delay)
        return ""
    
    def _generate_sample(self) -> Dict:
        raw_output = self._call_openai(self.prompt_template)
        try:
            data = json.loads(raw_output)
            sample = self.schema(**data)
            return sample.dict()
        except (json.JSONDecodeError, ValidationError) as e:
            print(f"Parsing error: {e}")
            return {}

    def _write_to_file(self, data: List[Dict]) -> None:
        with open(self.output_file, "w") as f:
            for item in data:
                if item:
                    f.write(json.dumps(item) + "\n")

    def run(self) -> None:
        print(f"Generating {self.n_samples} samples...")
        data = []
        for _ in range(self.n_samples):
            sample = self._generate_sample()
            if sample:
                data.append(sample)
            time.sleep(self.delay)
        self._write_to_file(data)
        print(f"Saved {len(data)} samples to {self.output_file}")

In [50]:
class SupportTicket(BaseModel):
    customer_name: str
    issue_summary: str
    issue_detail: str
    priority: str  # low, medium, high

support_ticket_prompt = """
Generate a synthetic customer support ticket in JSON format with the following fields:
- customer_name
- issue_summary
- issue_detail
- priority (low, medium, high)

Example:
{
  "customer_name": "Alice Johnson",
  "issue_summary": "App crash on launch",
  "issue_detail": "The mobile app crashes immediately after tapping the icon on Android 14.",
  "priority": "high"
}

The response must be a valid JSON object with exactly these fields and no additional content.
Now generate one new ticket:
"""

In [51]:
generator = SyntheticDataGenerator(
    schema=SupportTicket,
    prompt_template=support_ticket_prompt,
    output_file="support_tickets.jsonl",
    n_samples=50
)

generator.run()

Generating 50 samples...
Raw API response: {
  "customer_name": "Bob Smith",
  "issue_summary": "Unable to login",
  "issue_detail": "I am unable to login to my account using my username and password.",
  "priority": "medium"
}


C:\Users\RaviB\AppData\Local\Temp\ipykernel_25836\3676110474.py:49: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  return sample.dict()


Raw API response: {
  "customer_name": "Bob Smith",
  "issue_summary": "Unable to login",
  "issue_detail": "I am unable to login to my account using my username and password. I have tried resetting my password but still cannot access my account.",
  "priority": "medium"
}
Raw API response: {
  "customer_name": "Bob Smith",
  "issue_summary": "Unable to login",
  "issue_detail": "I am unable to login to my account using my username and password.",
  "priority": "medium"
}
Raw API response: {
  "customer_name": "Bob Smith",
  "issue_summary": "Unable to login",
  "issue_detail": "I am unable to login to my account using my username and password.",
  "priority": "medium"
}
Raw API response: {
  "customer_name": "Bob Smith",
  "issue_summary": "Unable to access account",
  "issue_detail": "I am unable to log in to my account using my usual credentials. I have tried resetting my password but still cannot access my account.",
  "priority": "medium"
}
Raw API response: {
  "customer_name": "