In [17]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Read values
api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_BASE_URL")

# Initialize OpenAI-compatible client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

print("Config loaded from .env")

Config loaded from .env


In [18]:
import os
print("Working directory:", os.getcwd())
print("Files here:", os.listdir())

Working directory: /app
Files here: ['.dockerignore', '.env', '.git', '.ipynb_checkpoints', 'config.yaml', 'docker-compose.yaml', 'Dockerfile', 'python', 'README.md', 'requirements.txt']


In [19]:
import importlib.util

spec = importlib.util.spec_from_file_location("schema_models", "/app/python/schema_models.py")
schema_models = importlib.util.module_from_spec(spec)
spec.loader.exec_module(schema_models)

SchemaPrompt = schema_models.SchemaPrompt
SchemaObject = schema_models.SchemaObject
ColumnSchema = schema_models.ColumnSchema

In [None]:
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
from pydantic import ValidationError
from schema_models import SchemaObject, SchemaPrompt, ColumnSchema
import os

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL")
)

def test_llm(client):
    test = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": "Say hello"}]
    )
    print("✅ Test LLM response:", test.choices[0].message.content)


class SchemaAgent:
    def __init__(self, llm_client: OpenAI):
        self.llm = llm_client

    def generate_from_prompt(self, schema_prompt: SchemaPrompt) -> SchemaObject:
        assert schema_prompt.prompt, "Prompt is required"
        system_msg = (
            "You are a strict schema generator. Return ONLY a JSON object like:\n"
            "{\n"
            "  \"columns\": [\n"
            "    {\"name\": \"age\", \"type\": \"int\", \"min\": 0, \"max\": 120},\n"
            "    {\"name\": \"gender\", \"type\": \"categorical\", \"values\": [\"M\", \"F\"]},\n"
            "    {\"name\": \"admission_date\", \"type\": \"datetime\", \"format\": \"%Y-%m-%d\"}\n"
            "  ]\n"
            "}"
        )
        user_msg = f"Use-case: {schema_prompt.use_case}\nPrompt: {schema_prompt.prompt}"

        response = self.llm.chat.completions.create(
            model="deepseek-chat",
            messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg}
            ]
        )

        text = response.choices[0].message.content
        if not text or not text.strip():
            raise ValueError("❌ Empty response from LLM. Check API key, base URL, or network.")
        print("✅ LLM Output:\n", text)
        if not text or not text.strip():
            raise ValueError("LLM response is empty or invalid. Check API status or quota.")

        try:
            # Try to extract valid JSON from potentially messy output
            json_start = text.find('{')
            json_end = text.rfind('}') + 1
            parsed = json.loads(text[json_start:json_end])
            return SchemaObject(use_case=schema_prompt.use_case, **parsed)
        except (json.JSONDecodeError, ValidationError) as e:
            print(f"❌ LLM output invalid: {e}")
            raise ValueError(f"LLM returned malformed or invalid schema.\nRaw output:\n{text}")

    def generate_from_csv(self, schema_prompt: SchemaPrompt) -> SchemaObject:
        assert schema_prompt.csv_path, "CSV path is required"
        df = pd.read_csv(schema_prompt.csv_path)
        cols = []

        for col in df.columns:
            dtype = df[col].dtype
            col_type = "string"
            if pd.api.types.is_integer_dtype(dtype):
                col_type = "int"
            elif pd.api.types.is_float_dtype(dtype):
                col_type = "float"
            elif pd.api.types.is_datetime64_any_dtype(dtype):
                col_type = "datetime"
            elif pd.api.types.is_categorical_dtype(dtype) or df[col].nunique() < 10:
                col_type = "categorical"

            col_schema = ColumnSchema(
                name=col,
                type=col_type,
                min=float(df[col].min()) if col_type in ["int", "float"] else None,
                max=float(df[col].max()) if col_type in ["int", "float"] else None,
                values=list(map(str, df[col].dropna().unique())) if col_type == "categorical" else None,
                format="%Y-%m-%d" if col_type == "datetime" else None
            )
            cols.append(col_schema)

        return SchemaObject(use_case=schema_prompt.use_case, columns=cols)


In [21]:
schema_prompt = SchemaPrompt(
    use_case="Employee record generation",
    prompt="Generate a schema for employee records including age (18-65), gender (M/F), role, and join date"
)

schema_agent = SchemaAgent(llm_client=client)  # `client` = your DeepSeek OpenAI-compatible instance

schema = schema_agent.generate_from_prompt(schema_prompt)

print(schema)

ValueError: LLM returned invalid schema: Expecting value: line 1 column 1 (char 0)

In [15]:
test_prompt = {
    "model": "deepseek-chat",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello."}
    ]
}

test_response = client.chat.completions.create(**test_prompt)

print("Test response raw content:", test_response.choices[0].message.content)

Test response raw content: Hello! 😊 How can I assist you today?
