In [2]:
from openai import OpenAI
import yaml
import os

# Location of the YAML file that holds the API credentials.
# Set the OPENAI_CONFIG_PATH environment variable to point at a different
# file; when it is unset, the original hard-coded location is used so that
# existing setups keep working.
CONFIG_PATH = os.environ.get(
    "OPENAI_CONFIG_PATH",
    "C:\\Users\\HOME\\github\\config.yaml",
)

# Read the file as UTF-8 explicitly instead of relying on the platform's
# default text encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Both keys are required; a missing key raises KeyError on these lines.
api_key = config["api_key"]
base_url = config["base_url"]

# Shared client used by the cells below. The credentials come from the
# config file, so nothing secret is written into the notebook itself.
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)
print("Config loaded")


Config loaded


In [1]:
# Build the conversation first, then send it: the system message asks for a
# JSON description of the columns, the user message asks for a five-column
# transaction schema.
schema_request_messages = [
    {"role": "system", "content": "Give json output like column name, data type and min and max length"},
    {"role": "user", "content": "Generate a dataset schema with 5 columns for transaction data. Take the best transaction dataset from kaggle"},
]

# Requires `client` from the config cell to exist in the kernel.
response = client.chat.completions.create(
    model="deepseek-chat",
    messages=schema_request_messages,
)
print(response.choices[0].message.content)

NameError: name 'client' is not defined

In [None]:
from openai import OpenAI
from pydantic import ValidationError
import pandas as pd
import json

class SchemaAgent:
    """Build ``SchemaObject`` instances from an LLM prompt or from a CSV file.

    ``generate_from_prompt`` asks a chat-completions model for a JSON schema
    and validates the reply; ``generate_from_csv`` infers a schema from the
    columns of a CSV file. ``SchemaPrompt``, ``SchemaObject`` and
    ``ColumnSchema`` are defined outside this class and must be in scope
    when the methods are called.
    """

    # Date format reported for (and used to detect) datetime columns.
    _DATE_FORMAT = "%Y-%m-%d"

    def __init__(self, llm_client: OpenAI, model: str = "deepseek-chat"):
        """Store the LLM client and the model name to query.

        Args:
            llm_client: Client exposing ``chat.completions.create``.
            model: Model name sent with every completion request. Defaults
                to ``"deepseek-chat"``, the value that was previously
                hard-coded in ``generate_from_prompt``.
        """
        self.llm = llm_client
        self.model = model

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Return ``text`` without a surrounding Markdown code fence, if any."""
        stripped = text.strip()
        if stripped.startswith("```"):
            # Drop the opening fence line (it may carry a language tag such
            # as "json"); if everything is on one line, drop just the ticks.
            newline = stripped.find("\n")
            stripped = stripped[newline + 1:] if newline != -1 else stripped[3:]
            stripped = stripped.rstrip()
            if stripped.endswith("```"):
                stripped = stripped[:-3]
        return stripped.strip()

    @staticmethod
    def _all_match_date_format(values, fmt: str) -> bool:
        """Return True when ``values`` is non-empty and every item parses with ``fmt``."""
        from datetime import datetime

        matched_any = False
        try:
            for value in values:
                # Non-string items raise TypeError, malformed strings raise
                # ValueError; either way the column is not a date column.
                datetime.strptime(value, fmt)
                matched_any = True
        except (TypeError, ValueError):
            return False
        return matched_any

    @classmethod
    def _infer_column_type(cls, series) -> str:
        """Map a pandas Series to one of the schema type names.

        Returns ``"int"``, ``"float"``, ``"datetime"``, ``"categorical"`` or
        ``"string"``; the checks run in that order, as in the original code.
        """
        dtype = series.dtype
        if pd.api.types.is_integer_dtype(dtype):
            return "int"
        if pd.api.types.is_float_dtype(dtype):
            return "float"
        # ``read_csv`` does not parse dates unless asked to, so a dtype check
        # alone never fires for CSV input; also accept text columns whose
        # values all match the date format reported in the schema.
        if pd.api.types.is_datetime64_any_dtype(dtype) or cls._all_match_date_format(
            series.dropna(), cls._DATE_FORMAT
        ):
            return "datetime"
        # ``isinstance`` replaces the deprecated ``is_categorical_dtype``.
        if isinstance(dtype, pd.CategoricalDtype) or series.nunique() < 10:
            return "categorical"
        return "string"

    def generate_from_prompt(self, schema_prompt: SchemaPrompt) -> SchemaObject:
        """Ask the LLM for a schema and validate it into a ``SchemaObject``.

        Args:
            schema_prompt: Object providing ``prompt`` (required) and
                ``use_case``.

        Returns:
            A ``SchemaObject`` built from the model's JSON reply, with
            ``use_case`` taken from ``schema_prompt``.

        Raises:
            AssertionError: If ``schema_prompt.prompt`` is empty.
            ValueError: If the reply is empty, is not valid JSON, is not a
                JSON object, or fails ``SchemaObject`` validation.
        """
        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if not schema_prompt.prompt:
            raise AssertionError("Prompt required for LLM generation")

        system = (
            "You are a data schema generator. "
            "Generate a JSON schema object from the user prompt with field names, types, and constraints "
            "in this format: { 'columns': [ { 'name': ..., 'type': ..., 'min': ..., 'max': ..., ... } ] }."
        )

        user_msg = f"Use-case: {schema_prompt.use_case}\nPrompt: {schema_prompt.prompt}"

        response = self.llm.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user_msg}
            ]
        )

        content = response.choices[0].message.content
        if not content:
            # ``json.loads(None)`` would raise an uncaught TypeError.
            raise ValueError("LLM returned invalid schema: empty response")

        try:
            # Models often wrap JSON in a Markdown fence; remove it first.
            parsed = json.loads(self._strip_code_fence(content))
        except json.JSONDecodeError as e:
            raise ValueError(f"LLM returned invalid schema: {e}") from e

        if not isinstance(parsed, dict):
            raise ValueError(
                "LLM returned invalid schema: expected a JSON object, "
                f"got {type(parsed).__name__}"
            )

        # The caller's use case wins; a duplicate key in the reply would
        # otherwise raise "got multiple values for keyword argument".
        parsed.pop("use_case", None)

        try:
            return SchemaObject(use_case=schema_prompt.use_case, **parsed)
        except ValidationError as e:
            raise ValueError(f"LLM returned invalid schema: {e}") from e

    def generate_from_csv(self, schema_prompt: SchemaPrompt) -> SchemaObject:
        """Infer a ``SchemaObject`` from the columns of a CSV file.

        Args:
            schema_prompt: Object providing ``csv_path`` (required) and
                ``use_case``.

        Returns:
            A ``SchemaObject`` with one ``ColumnSchema`` per CSV column.
            Numeric columns carry ``min``/``max``, categorical columns carry
            their distinct non-null ``values``, and datetime columns carry
            the ``"%Y-%m-%d"`` format.

        Raises:
            AssertionError: If ``schema_prompt.csv_path`` is empty.
        """
        if not schema_prompt.csv_path:
            raise AssertionError("CSV path required for CSV parsing")

        df = pd.read_csv(schema_prompt.csv_path)
        cols = []

        for col in df.columns:
            series = df[col]
            non_null = series.dropna()
            col_type = self._infer_column_type(series)

            # A numeric column with no non-null values has no meaningful
            # bounds; report None instead of NaN.
            has_bounds = col_type in ("int", "float") and not non_null.empty

            col_schema = ColumnSchema(
                name=col,
                type=col_type,
                min=float(non_null.min()) if has_bounds else None,
                max=float(non_null.max()) if has_bounds else None,
                # ``tolist`` yields plain Python scalars rather than NumPy
                # ones; order of first appearance is preserved.
                values=non_null.drop_duplicates().tolist() if col_type == "categorical" else None,
                format=self._DATE_FORMAT if col_type == "datetime" else None
            )
            cols.append(col_schema)

        return SchemaObject(use_case=schema_prompt.use_case, columns=cols)


In [5]:
from openai import OpenAI
from schema_agent import SchemaAgent
from schema_models import SchemaPrompt
# Describe the schema we want the model to produce.
prompt = SchemaPrompt(
    prompt="Generate a transaction dataset schema similar to top fraud datasets on Kaggle. Include fields like amount, merchant, user_id, timestamp, is_fraud",
    use_case="Transaction fraud detection",
)

# Wrap the shared client (created in the config cell) in the agent.
schema_agent = SchemaAgent(client)

# Run the LLM round-trip and keep the validated schema.
schema = schema_agent.generate_from_prompt(prompt)

# Show the schema as pretty-printed JSON.
print(schema.model_dump_json(indent=2))

{
  "use_case": "Transaction fraud detection",
  "columns": [
    {
      "name": "amount",
      "type": "float",
      "min": 0.01,
      "max": 10000.0,
      "format": null,
      "values": null
    },
    {
      "name": "merchant",
      "type": "string",
      "min": null,
      "max": null,
      "format": null,
      "values": null
    },
    {
      "name": "user_id",
      "type": "string",
      "min": null,
      "max": null,
      "format": null,
      "values": null
    },
    {
      "name": "timestamp",
      "type": "datetime",
      "min": null,
      "max": null,
      "format": "%Y-%m-%d %H:%M:%S",
      "values": null
    },
    {
      "name": "is_fraud",
      "type": "categorical",
      "min": null,
      "max": null,
      "format": null,
      "values": [
        "0",
        "1"
      ]
    },
    {
      "name": "transaction_type",
      "type": "categorical",
      "min": null,
      "max": null,
      "format": null,
      "values": [
        "online",
 