In [2]:
from openai import OpenAI
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Read values
api_key = os.getenv("OPENAI_API_KEY")
base_url = os.getenv("OPENAI_BASE_URL")

# Initialize OpenAI-compatible client
client = OpenAI(
    api_key=api_key,
    base_url=base_url
)

print("Config loaded from .env")

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [None]:
import os
print("Working directory:", os.getcwd())
print("Files here:", os.listdir())

Working directory: /app
Files here: ['.dockerignore', '.env', '.git', '.gitignore', '.ipynb_checkpoints', 'config.yaml', 'docker-compose.yaml', 'Dockerfile', 'python', 'README.md', 'requirements.txt']


In [None]:
import importlib.util

spec = importlib.util.spec_from_file_location("schema_models", "/app/python/schema_models.py")
schema_models = importlib.util.module_from_spec(spec)
spec.loader.exec_module(schema_models)

SchemaPrompt = schema_models.SchemaPrompt
SchemaObject = schema_models.SchemaObject
ColumnSchema = schema_models.ColumnSchema

In [None]:
import json
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv()
from pydantic import ValidationError
import os

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_BASE_URL")
)

def test_llm(client):
    test = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": "Say hello"}]
    )
    print("✅ Test LLM response:", test.choices[0].message.content)


class SchemaAgent:
    def __init__(self, llm_client: OpenAI):
        self.llm = llm_client

    def generate_from_prompt(self, schema_prompt: SchemaPrompt) -> SchemaObject:
        assert schema_prompt.prompt, "Prompt is required"
        system_msg = (
            "You are a strict schema generator. Return ONLY a JSON object like:\n"
            "{\n"
            "  \"columns\": [\n"
            "    {\"name\": \"age\", \"type\": \"int\", \"min\": 0, \"max\": 120},\n"
            "    {\"name\": \"gender\", \"type\": \"categorical\", \"values\": [\"M\", \"F\"]},\n"
            "    {\"name\": \"admission_date\", \"type\": \"datetime\", \"format\": \"%Y-%m-%d\"}\n"
            "  ]\n"
            "}"
        )
        user_msg = f"Use-case: {schema_prompt.use_case}\nPrompt: {schema_prompt.prompt}"

        response = self.llm.chat.completions.create(
            model="deepseek-chat",
            messages=[
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg}
            ]
        )

        text = response.choices[0].message.content
        if not text or not text.strip():
            raise ValueError("❌ Empty response from LLM. Check API key, base URL, or network.")
        print("✅ LLM Output:\n", text)
        if not text or not text.strip():
            raise ValueError("LLM response is empty or invalid. Check API status or quota.")

        try:
            # Try to extract valid JSON from potentially messy output
            json_start = text.find('{')
            json_end = text.rfind('}') + 1
            parsed = json.loads(text[json_start:json_end])
            return SchemaObject(use_case=schema_prompt.use_case, **parsed)
        except (json.JSONDecodeError, ValidationError) as e:
            print(f"❌ LLM output invalid: {e}")
            raise ValueError(f"LLM returned malformed or invalid schema.\nRaw output:\n{text}")

    def generate_from_csv(self, schema_prompt: SchemaPrompt) -> SchemaObject:
        assert schema_prompt.csv_path, "CSV path is required"
        df = pd.read_csv(schema_prompt.csv_path)
        cols = []

        for col in df.columns:
            dtype = df[col].dtype
            col_type = "string"
            if pd.api.types.is_integer_dtype(dtype):
                col_type = "int"
            elif pd.api.types.is_float_dtype(dtype):
                col_type = "float"
            elif pd.api.types.is_datetime64_any_dtype(dtype):
                col_type = "datetime"
            elif pd.api.types.is_categorical_dtype(dtype) or df[col].nunique() < 10:
                col_type = "categorical"

            col_schema = ColumnSchema(
                name=col,
                type=col_type,
                min=float(df[col].min()) if col_type in ["int", "float"] else None,
                max=float(df[col].max()) if col_type in ["int", "float"] else None,
                values=list(map(str, df[col].dropna().unique())) if col_type == "categorical" else None,
                format="%Y-%m-%d" if col_type == "datetime" else None
            )
            cols.append(col_schema)

        return SchemaObject(use_case=schema_prompt.use_case, columns=cols)


In [None]:
schema_prompt = SchemaPrompt(
    use_case="Employee record generation",
    prompt="Generate a schema for employee records including age (18-65), gender (M/F), role, and join date"
)

schema_agent = SchemaAgent(llm_client=client)  # `client` = your DeepSeek OpenAI-compatible instance

schema = schema_agent.generate_from_prompt(schema_prompt)

print(schema)

✅ LLM Output:
 ```json
{
  "columns": [
    {"name": "age", "type": "int", "min": 18, "max": 65},
    {"name": "gender", "type": "categorical", "values": ["M", "F"]},
    {"name": "role", "type": "string"},
    {"name": "join_date", "type": "datetime", "format": "%Y-%m-%d"}
  ]
}
```
use_case='Employee record generation' columns=[ColumnSchema(name='age', type='int', min=18.0, max=65.0, format=None, values=None), ColumnSchema(name='gender', type='categorical', min=None, max=None, format=None, values=['M', 'F']), ColumnSchema(name='role', type='string', min=None, max=None, format=None, values=None), ColumnSchema(name='join_date', type='datetime', min=None, max=None, format='%Y-%m-%d', values=None)]


In [None]:
!pip install ctgan


Collecting ctgan
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting torch>=1.13.0 (from ctgan)
  Downloading torch-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting rdt>=1.14.0 (from ctgan)
  Downloading rdt-1.17.0-py3-none-any.whl.metadata (10 kB)
Collecting scipy>=1.9.2 (from rdt>=1.14.0->ctgan)
  Downloading scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scikit-learn>=1.1.0 (from rdt>=1.14.0->ctgan)
  Downloading scikit_learn-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting joblib>=1.2.0 (from scikit-learn>=1.1.0->rdt>=1.14.0->ctgan)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn>=1.1.0->rdt>=1.14.0->ctgan)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from torch>=1.13.0->ctgan)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9

In [None]:
import pandas as pd
from ctgan import CTGAN
from typing import List
from faker import Faker
import random

class CTGANGeneratorAgent:
    def __init__(self):
        self.faker = Faker()
        self.model = CTGAN(epochs=100)
        self.generated_categories = {}

    def get_dynamic_category(self, col_name: str, generator_fn, max_unique=200):
        if col_name not in self.generated_categories:
            self.generated_categories[col_name] = list({generator_fn() for _ in range(max_unique)})
        return random.choice(self.generated_categories[col_name])
    
    
    def generate_fake_data_from_schema(self, schema: SchemaObject, n=100) -> pd.DataFrame:
        rows = []
        for _ in range(n):
            row = {}
            for col in schema.columns:
                if col.type == "int":
                    row[col.name] = random.randint(int(col.min or 0), int(col.max or 100))
                elif col.type == "float":
                    row[col.name] = round(random.uniform(col.min or 0.0, col.max or 100.0), 2)
                elif col.type == "categorical":
                    row[col.name] = random.choice(col.values or ["Unknown"])
                elif col.type == "datetime":
                    fmt = col.format or "%Y-%m-%d"
                    row[col.name] = self.faker.date_between(start_date='-5y', end_date='today').strftime(fmt)
                elif col.type == "string":
                    row[col.name] = None
                else:
                    row[col.name] = None
            rows.append(row)
        return pd.DataFrame(rows)

    def fit_ctgan_on_fake_data(self, fake_df: pd.DataFrame, schema: SchemaObject):
        cat_cols = [col.name for col in schema.columns if col.type == "categorical"]
        self.model.fit(fake_df, discrete_columns=cat_cols)

    def sample(self, n=100) -> pd.DataFrame:
        return self.model.sample(n)

    def generate_from_schema(self, schema: SchemaObject, n=100) -> pd.DataFrame:
        fake_df = self.generate_fake_data_from_schema(schema, n=100)

        valid_types = ["int", "float", "categorical"]
        safe_cols = [col.name for col in schema.columns if col.type in valid_types]
        fake_df = fake_df[safe_cols]

        self.fit_ctgan_on_fake_data(fake_df, schema)
        sampled = self.sample(n)
        return self.enforce_bounds(sampled, schema)
    
    def enforce_bounds(self, df: pd.DataFrame, schema: SchemaObject) -> pd.DataFrame:
        for col in schema.columns:
            if col.type == "int":
                df[col.name] = df[col.name].clip(lower=col.min or 0, upper=col.max or 100).astype(int)
            elif col.type == "float":
                df[col.name] = df[col.name].clip(lower=col.min or 0.0, upper=col.max or 100.0).astype(float)
        return df


In [None]:


schema_prompt = SchemaPrompt(
    use_case="Retail customer data",
    prompt="Create schema with name,place ,age (18-60), gender (M/F), city, salary (100k-200k), join_date"
)

schema = SchemaAgent(llm_client=client).generate_from_prompt(schema_prompt)

gen = CTGANGeneratorAgent()
df = gen.generate_from_schema(schema, n=5)

print(df)


✅ LLM Output:
 ```json
{
  "columns": [
    {"name": "name", "type": "string"},
    {"name": "place", "type": "string"},
    {"name": "age", "type": "int", "min": 18, "max": 60},
    {"name": "gender", "type": "categorical", "values": ["M", "F"]},
    {"name": "city", "type": "string"},
    {"name": "salary", "type": "int", "min": 100000, "max": 200000},
    {"name": "join_date", "type": "datetime", "format": "%Y-%m-%d"}
  ]
}
```
   age gender  salary
0   36      F  116898
1   30      M  100000
2   36      F  155347
3   25      F  150042
4   18      F  118637


In [6]:
import os
print("Current working directory:", os.getcwd())

Current working directory: /app


In [8]:
from pathlib import Path
import json
fn_txt_path = Path("/app/dataset/first_names.txt")

with open(fn_txt_path, "r", encoding="utf-8") as f:
    first_names = [line.strip() for line in f if line.strip()]

print("✅ Loaded", len(first_names), "names")

json_path = "/app/dataset/first_names.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(first_names, f, indent=2)

✅ Loaded 163896 names


In [10]:
from pathlib import Path
import json
ln_txt_path = Path("/app/dataset/last_names.txt")

with open(ln_txt_path, "r", encoding="utf-8") as f:
    last_names = [line.strip() for line in f if line.strip()]

print("✅ Loaded", len(last_names), "names")

json_path = "/app/dataset/last_names.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(last_names, f, indent=2)

✅ Loaded 98343 names


In [11]:
chars = sorted(set("".join(names)))
char2idx = {ch: i for i, ch in enumerate(chars)}
idx2char = {i: ch for ch, i in char2idx.items()}
VOCAB_SIZE = len(char2idx)

NameError: name 'names' is not defined

In [None]:
tokens = set()
for name in names:
    tokens.update(name.split())

word2idx = {word: i for i, word in enumerate(sorted(tokens))}
idx2word = {i: word for word, i in word2idx.items()}
VOCAB_SIZE = len(word2idx)


In [None]:
sequences = [[char2idx[ch] for ch in name] for name in names]
sequences

[[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [8, 16, 26, 15, 8],
 [8, 16, 26, 15, 8, 15],
 [8, 17, 8, 32],
 [8, 8, 0, 16, 26, 15, 8],
 [8, 8, 0, 16, 26, 15, 8, 15],
 [8, 8, 9, 8, 21],
 [8, 8, 9, 8, 26],
 [8, 8, 9, 15, 8],
 [8, 8, 9, 16, 8],
 [8, 8, 9, 16, 11],
 [8, 8, 9, 16, 11, 8, 15],
 [8, 8, 9, 16, 11, 8, 20, 20, 8, 25],
 [8, 8, 9, 16, 25],
 [8, 8, 9, 25, 16, 12, 19, 19, 8],
 [8, 8, 10, 15, 8, 19],
 [8, 8, 11],
 [8, 8, 11, 8],
 [8, 8, 11, 8, 20],
 [8, 8, 11, 8, 21],
 [8, 8, 11, 8, 25, 26, 15],
 [8, 8, 11, 8, 32, 8],
 [8, 8, 11, 12, 21],
 [8, 8, 11, 12, 26, 15],
 [8, 8, 11, 15, 8, 20],
 [8, 8, 11, 15, 8, 21],
 [8, 8, 11, 15, 8, 29],
 [8, 8, 11, 15, 8, 29, 8, 21],
 [8, 8, 11, 15, 16],
 [8, 8, 11, 15, 16, 25, 8],
 [8, 8, 11, 15, 16, 25, 8, 21],
 [8, 8, 11, 15, 16, 26, 15],
 [8, 8, 11, 15, 16, 27, 15],
 [8, 8, 11, 15, 16, 27, 15, 32, 8],
 [8, 8, 11, 15, 25, 16, 27, 15],
 [8, 8, 11, 15, 29, 16, 18],
 [8, 8, 11, 15, 29, 16, 18, 8],
 [8, 8, 11, 15, 32, 8],
 [8, 8, 11, 15

In [12]:
import random
import json


with open("/app/dataset/first_names.json","r",encoding='utf8') as f:
    first_names = json.load(f)

with open("/app/dataset/last_names.json","r",encoding='utf8') as f:
    last_names = json.load(f)


def generate_names(first_names,last_names,count=10000):
    return [f"{f} {l}" for f,l in zip(
    random.choices(first_names,k=count),
    random.choices(last_names,k=count)
    )]
    

In [14]:
full_names = generate_names(first_names,last_names,count=10000)
len(full_names)

10000

In [65]:
def build_vocab(sequences):
    vocab = sorted(set("".join(sequences)))
    char2idx = {ch: idx + 2 for idx, ch in enumerate(vocab)}  # Start from 2
    char2idx["<PAD>"] = 0
    char2idx["<START>"] = 1
    idx2char = {idx: ch for ch, idx in char2idx.items()}
    return char2idx, idx2char

In [66]:
cha,idx = build_vocab(full_names)

In [67]:
idx

{2: ' ',
 3: "'",
 4: '-',
 5: '/',
 6: 'a',
 7: 'b',
 8: 'c',
 9: 'd',
 10: 'e',
 11: 'f',
 12: 'g',
 13: 'h',
 14: 'i',
 15: 'j',
 16: 'k',
 17: 'l',
 18: 'm',
 19: 'n',
 20: 'o',
 21: 'p',
 22: 'q',
 23: 'r',
 24: 's',
 25: 't',
 26: 'u',
 27: 'v',
 28: 'w',
 29: 'x',
 30: 'y',
 31: 'z',
 32: '\x9a',
 33: 'á',
 34: 'â',
 35: 'ã',
 36: 'ä',
 37: 'å',
 38: 'æ',
 39: 'ç',
 40: 'è',
 41: 'é',
 42: 'ê',
 43: 'ë',
 44: 'ì',
 45: 'í',
 46: 'ï',
 47: 'ð',
 48: 'ñ',
 49: 'ó',
 50: 'ô',
 51: 'õ',
 52: 'ö',
 53: 'ø',
 54: 'ú',
 55: 'ü',
 56: 'ý',
 57: 'þ',
 58: 'ā',
 59: 'ă',
 60: 'ć',
 61: 'č',
 62: 'đ',
 63: 'ė',
 64: 'ğ',
 65: 'ī',
 66: 'ı',
 67: 'ļ',
 68: 'ł',
 69: 'ř',
 70: 'ş',
 71: 'ţ',
 72: 'ž',
 0: '<PAD>',
 1: '<START>'}

In [None]:
!pip install torch


Collecting torch
  Downloading torch-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.6.77 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.6.80 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.6 kB)
Collec

In [74]:
import torch
from torch.nn.utils.rnn import pad_sequence

def prepare_dataset(sequences, char2idx):
    X, Y = [], []
    for seq in sequences:
        ids = [char2idx["<START>"]] + [char2idx[c] for c in seq if c in char2idx]
        if len(ids) < 2:
            continue
        X.append(torch.tensor(ids[:-1]))
        Y.append(torch.tensor(ids[1:]))
    return pad_sequence(X, batch_first=True), pad_sequence(Y, batch_first=True)

    

In [75]:
X, Y = prepare_dataset(full_names, cha)

In [70]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader


In [71]:
#parameters
embedding_dim = 1024
hidden_dim = 512
batch_size = 64
num_epochs = 20
learning_rate = 0.01
vocab_size = len(cha)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [80]:


class StringGEN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=512, hidden_dim=512):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embed(x)
        output, hidden = self.gru(x, hidden)
        logits = self.fc(output)
        return logits, hidden

In [81]:
X_tensor,Y_tensor = prepare_dataset(full_names, cha)
dataset = TensorDataset(X_tensor, Y_tensor)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [82]:
model = StringGEN(vocab_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss(ignore_index=cha["<PAD>"])



In [None]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for bat_x, bat_y in dataloader:
        bat_x, bat_y = bat_x.to(device), bat_y.to(device)
        optimizer.zero_grad()
        logits, _ = model(bat_x)
        loss = loss_fn(logits.view(-1, vocab_size), bat_y.view(-1))
        loss.backward()
        optimizer.step()
        #scheduler.step()  # Optional if you're using a learning rate scheduler
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

Epoch 1/20, Loss: 0.0000
Epoch 2/20, Loss: 0.0000
Epoch 3/20, Loss: 0.0000
Epoch 4/20, Loss: 0.0000


In [None]:
def generate_name(model, char2idx, idx2char, max_len=20):
    model.eval()
    start_id = torch.tensor([[char2idx["<START>"]]], dtype=torch.long).to(device)
    hidden = None
    input_ids = start_id
    output_str = ""

    for _ in range(max_len):
        logits, hidden = model(input_ids, hidden)  # input_ids: [1,1]
        probs = torch.softmax(logits[:, -1, :], dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # shape: [1,1]
        char = idx2char.get(next_id.item(), "")
        if char == "<PAD>":
            break
        output_str += char
        input_ids = next_id  # keep it as [1,1] for next GRU input

    return output_str

In [79]:
print(generate_name(model, cha, idx))
print(generate_name(model, cha, idx))

ValueError: GRU: Expected input to be 2D or 3D, got 4D instead