# Synthetic Data Generation with DSPy and Pydantic
This notebook demonstrates generating synthetic sales data with **DSPy** using column-level signatures and **Pydantic v2** for validation.


 It also shows how to configure **DSPy** to work with a locally
 hosted **Ollama** LLM (Llama 3.2). DSPy (https://dspy.ai/) is a declarative framework for building AI workflows with LLMs.

In [2]:
#  Imports & DSPy/Ollama setup
import csv
import io
import pandas as pd
import dspy
import litellm

# Ask LiteLLM to drop request parameters the target provider does not support
# rather than raising on them.
litellm.drop_params = True

# Connection details for the locally hosted Ollama server.
OLLAMA_MODEL = "ollama_chat/llama3.2"
OLLAMA_API_BASE = "http://localhost:11434"

# The api_key is passed as an empty string, exactly as the local setup expects.
lm = dspy.LM(OLLAMA_MODEL, api_base=OLLAMA_API_BASE, api_key="")

# Register `lm` as the language model used by every DSPy predictor below.
dspy.settings.configure(lm=lm)


## Step 1: Load Original Dataset

In [3]:
import pandas as pd
# Load the original dataset that seeds the synthetic generation.
input_csv_path = "data/sales_data.csv"
try:
    df_input = pd.read_csv(input_csv_path)
except FileNotFoundError as err:
    # Fail fast: later cells require `df_input`, so merely printing a notice
    # here would only resurface as a confusing NameError further down.
    raise FileNotFoundError(
        f"No '{input_csv_path}' found; place the input CSV there and re-run this cell."
    ) from err

print("\nOriginal Dataset:\n", df_input.head())


Original Dataset:
   Customer_ID  Salary  Expenses Region
0        C001   55000     20000  North
1        C002  120000     80000  South
2        C003   75000     40000   East
3        C004   25000     10000   West
4        C005  180000    120000  North


## Step 2: Column-Level Signatures with DSPy
These signatures define how synthetic data is generated per column with chain-of-thought reasoning.

In [4]:
import dspy
from pydantic import BaseModel, field_validator, model_validator
from typing import ClassVar

# Salary generator: DSPy signature that maps an original salary to a synthetic one.
# NOTE: documented with comments on purpose -- DSPy reads a Signature's class
# docstring as prompt instructions, so adding a docstring would alter the prompt.
class SalaryGen(dspy.Signature):
    # --- inputs ---
    seed = dspy.InputField()  # caller-supplied seed value; no description is attached
    Salary_in = dspy.InputField(desc="Original salary from input row")

    # --- outputs (reasoning first, then the final value) ---
    thought = dspy.OutputField(
        desc="Reason about generating salary in range 10000-200000"
    )
    Salary = dspy.OutputField(
        desc="Final integer salary between 10000 and 200000. Output only the number."
    )

# Expenses generator: DSPy signature that produces synthetic expenses for a row.
# NOTE: documented with comments on purpose -- DSPy reads a Signature's class
# docstring as prompt instructions, so adding a docstring would alter the prompt.
class ExpensesGen(dspy.Signature):
    # --- inputs ---
    seed = dspy.InputField()  # caller-supplied seed value; no description is attached
    Salary_in = dspy.InputField(desc="Original salary from input row")
    Expenses_in = dspy.InputField(desc="Original expenses from input row")

    # --- outputs (reasoning first, then the final value) ---
    thought = dspy.OutputField(
        desc="Reason about generating expenses < salary"
    )
    Expenses = dspy.OutputField(
        desc="Final integer expenses, positive and < Salary_in. Output only the number."
    )

# Region generator: DSPy signature that picks a region for a synthetic row.
# NOTE: documented with comments on purpose -- DSPy reads a Signature's class
# docstring as prompt instructions, so adding a docstring would alter the prompt.
class RegionGen(dspy.Signature):
    # --- inputs ---
    seed = dspy.InputField()  # caller-supplied seed value; no description is attached
    Region_in = dspy.InputField(desc="Original region from input row")

    # --- outputs (reasoning first, then the final value) ---
    thought = dspy.OutputField(
        desc="Reason about picking exactly one region"
    )
    Region = dspy.OutputField(
        desc="Final region: North, South, East, or West"
    )

## Step 3: Pydantic v2 Model
Define a model to validate the synthetic data with constraints and auto-increment Customer_IDs.

In [5]:
class SyntheticRowModel(BaseModel):
    """Validated synthetic sales row.

    Constraints enforced when a row is constructed:

    * ``Salary`` lies in the inclusive range 10,000..200,000.
    * ``Region`` is one of ``North``, ``South``, ``East``, ``West``.
    * ``Expenses`` is non-negative and strictly less than ``Salary``.
    """

    Customer_ID: str
    Salary: int
    Expenses: int
    Region: str

    # Class-level counter behind `generate_id`; it is shared by all rows and is
    # never reset here, so IDs keep increasing for the life of the kernel.
    _id_counter: ClassVar[int] = 0

    @classmethod
    def generate_id(cls) -> str:
        """Advance the shared counter and return the next ID (``C001``, ``C002``, ...).

        The number is zero-padded to at least three digits.
        """
        cls._id_counter += 1
        return f"C{cls._id_counter:03d}"

    @field_validator("Salary")
    @classmethod
    def validate_salary(cls, v: int):
        """Reject salaries outside the inclusive range 10,000..200,000."""
        if not (10_000 <= v <= 200_000):
            raise ValueError("Salary must be between 10,000 and 200,000")
        return v

    @field_validator("Region")
    @classmethod
    def validate_region(cls, v: str):
        """Reject any region that is not exactly North, South, East or West."""
        allowed = {"North", "South", "East", "West"}
        if v not in allowed:
            # Sort before formatting: a set's repr order can differ between
            # interpreter runs, which made this message non-deterministic.
            raise ValueError(f"Region must be one of {sorted(allowed)}")
        return v

    @model_validator(mode="after")
    def validate_expenses_vs_salary(self):
        """Cross-field check: ``0 <= Expenses < Salary``.

        Raises:
            ValueError: if ``Expenses`` is negative, or not strictly below ``Salary``.
        """
        # Zero is accepted by this check, so the message says "non-negative"
        # (the previous wording, "positive", contradicted the condition).
        if self.Expenses < 0:
            raise ValueError("Expenses must be non-negative")
        if self.Expenses >= self.Salary:
            raise ValueError("Expenses must be less than Salary")
        return self

## Step 4: Agentic Row Generator
This module uses DSPy predictors to generate synthetic rows.

In [6]:
class SyntheticRowAgent(dspy.Module):
    """Generate one synthetic row by running a separate DSPy predictor per column."""

    def __init__(self):
        super().__init__()
        # Salary and expenses are predicted at temperature 0; region uses 0.5.
        self.salary_agent = dspy.Predict(SalaryGen, temperature=0)
        self.expenses_agent = dspy.Predict(ExpensesGen, temperature=0)
        self.region_agent = dspy.Predict(RegionGen, temperature=0.5)

    def forward(self, seed: str, row):
        """Produce synthetic ``Salary``, ``Expenses`` and ``Region`` for one input row.

        Args:
            seed: Seed string forwarded to every predictor.
            row: Mapping-like input row exposing ``'Salary'``, ``'Expenses'``
                and ``'Region'``.

        Returns:
            dict with keys ``"Salary"``, ``"Expenses"`` and ``"Region"`` holding
            the raw predictor outputs.
        """
        salary = self.salary_agent(seed=seed, Salary_in=row['Salary'])

        # Bound expenses by the *synthetic* salary, not the original one.
        # SyntheticRowModel validates Expenses < Salary against the generated
        # salary, so constraining against row['Salary'] could yield a row that
        # satisfies the prompt yet fails validation.
        expenses = self.expenses_agent(
            seed=seed,
            Salary_in=salary.Salary,
            Expenses_in=row['Expenses'],
        )
        region = self.region_agent(seed=seed, Region_in=row['Region'])

        return {
            "Salary": salary.Salary,
            "Expenses": expenses.Expenses,
            "Region": region.Region,
        }

## Step 5: Helper Function to Generate a Single Validated Row

In [7]:
def generate_unique_row(agent, seed, row):
    """Generate one synthetic row, give it a fresh Customer_ID and validate it.

    Args:
        agent: Callable invoked as ``agent(seed=..., row=...)`` that returns a
            dict of generated column values.
        seed: Seed value forwarded to the agent.
        row: Original input row forwarded to the agent.

    Returns:
        The validated row as a plain dict (``SyntheticRowModel.model_dump()``).
        Any validation error raised by ``SyntheticRowModel`` propagates.
    """
    candidate = agent(seed=seed, row=row)
    # The ID is written into the agent's dict before validation runs.
    candidate["Customer_ID"] = SyntheticRowModel.generate_id()
    return SyntheticRowModel(**candidate).model_dump()

## Step 6: Generate Full Synthetic Dataset

In [8]:
def generate_synthetic_data_unique(df_input, n_samples_per_row=2):
    """Build a synthetic DataFrame with several generated rows per input row.

    Args:
        df_input: Source DataFrame; each of its rows is passed to the agent.
        n_samples_per_row: Number of synthetic rows generated per input row.

    Returns:
        A DataFrame of validated synthetic rows, ordered by input row and then
        by sample number.
    """
    row_agent = SyntheticRowAgent()

    # The seed "<row index>-<sample number>" is distinct for every generated row.
    records = [
        generate_unique_row(row_agent, f"{idx}-{i}", row)
        for idx, row in df_input.iterrows()
        for i in range(n_samples_per_row)
    ]
    return pd.DataFrame(records)

## Step 7: Run Synthetic Generation

In [9]:
# Generate two synthetic rows per original row and persist them as CSV.
synthetic_csv_path = "data/synthetic_sales_data.csv"
df_synthetic = generate_synthetic_data_unique(df_input, n_samples_per_row=2)
df_synthetic.to_csv(synthetic_csv_path, index=False)

# Last expression of the cell: preview the first ten generated rows.
df_synthetic.head(10)

Unnamed: 0,Customer_ID,Salary,Expenses,Region
0,C001,150000,20000,North
1,C002,150000,15000,East
2,C003,150000,80000,North
3,C004,150000,80000,North
4,C005,150000,40000,East
5,C006,150000,15000,East
6,C007,15000,10000,North
7,C008,150000,8000,North
8,C009,150000,60000,East
9,C010,150000,60000,West
