### Demonstrates how to configure **DSPy** to work with a locally
### hosted **Ollama** LLM (Llama 3.2). DSPy (https://dspy.ai/) is a declarative framework for building AI workflows with LLMs.

In [40]:
#  Imports & DSPy/Ollama setup
import csv
import io
import pandas as pd
import dspy
import litellm

# Ask LiteLLM to drop request parameters the target provider does not accept
# (see the LiteLLM documentation for `drop_params`).
litellm.drop_params = True

# Llama 3.2 behind a local Ollama endpoint; an empty API key is passed.
lm = dspy.LM(
    model="ollama_chat/llama3.2",
    api_key="",
    api_base="http://localhost:11434",
)
dspy.settings.configure(lm=lm)


In [41]:
# Load original dataset 
import pandas as pd

# Load the original dataset. When the CSV is missing, fall back to a tiny
# built-in sample so that `df_input` is always defined for the cells below
# (previously a missing file left the name unbound and later cells failed
# with a NameError).
try:
    df_input = pd.read_csv("data/sales_data.csv")
    print("\n Original Dataset:\n", df_input.head())
except FileNotFoundError:
    print(" No 'sales_data.csv' found.")
    print(" Using a small built-in sample instead.")
    df_input = pd.DataFrame([
        {"Customer_ID": "C000", "Salary": 150000, "Expenses": 20000, "Region": "East"},
        {"Customer_ID": "C001", "Salary": 180000, "Expenses": 50000, "Region": "South"},
    ])




 Original Dataset:
   Customer_ID  Salary  Expenses Region
0        C001   55000     20000  North
1        C002  120000     80000  South
2        C003   75000     40000   East
3        C004   25000     10000   West
4        C005  180000    120000  North


### Column-level Signatures with Thoughts

These DSPy Signatures generate synthetic dataset columns:
- **Inputs**: `seed` / original values (`*_in`)  
- **Thought**: explicit reasoning step (chain-of-thought)  
- **Output**: final synthetic value (constrained and validated)  

Captures reasoning for transparency while producing realistic synthetic data.


In [42]:
# Column-level Signatures with thoughts 

# DSPy signature for the synthetic Customer_ID column.
#   input  seed        : caller-supplied seed value (declared without a description)
#   output thought     : free-text reasoning about generating a unique customer ID
#   output Customer_ID : the final ID; the description asks for the pattern C###
# NOTE(review): documented with comments rather than a class docstring because a
# docstring on a DSPy signature is presumably used as prompt instructions - confirm.
# NOTE(review): the run output recorded in this file shows "C123" for almost every
# row, so the field description alone does not enforce uniqueness.
class CustomerIDGen(dspy.Signature):
    seed = dspy.InputField()
    thought = dspy.OutputField(desc="Reason about generating a unique customer ID")
    Customer_ID = dspy.OutputField(desc="Final unique ID like C### example C123,C011,C234")


# DSPy signature for the synthetic Salary column.
#   input  seed      : caller-supplied seed value (declared without a description)
#   input  Salary_in : the salary taken from the original input row
#   output thought   : free-text reasoning about a salary in the range 10000-200000
#   output Salary    : the final value; the description asks for a bare integer
# NOTE(review): `thought` is declared between the two inputs; presumably DSPy groups
# input and output fields separately, so the order is left untouched - confirm.
# NOTE(review): no type annotation is given, so the value presumably arrives as text.
class SalaryGen(dspy.Signature):
    seed = dspy.InputField()
    thought = dspy.OutputField(desc="Reason about generating salary in range 10000-200000")
    Salary_in = dspy.InputField(desc="Original salary from input row")
    Salary = dspy.OutputField(
        desc="Final integer salary between 10000 and 200000. Output only the number, no commas or text."
    )


# DSPy signature for the synthetic Expenses column.
#   input  seed        : caller-supplied seed value (declared without a description)
#   input  Salary_in   : the salary taken from the original input row
#   input  Expenses_in : the expenses taken from the original input row
#   output thought     : free-text reasoning about keeping expenses below salary
#   output Expenses    : the final value; the description asks for a positive
#                        integer smaller than Salary_in
# NOTE(review): the upper bound named in the description is Salary_in (the value
# passed in), not the separately generated synthetic salary.
class ExpensesGen(dspy.Signature):
    seed = dspy.InputField()
    thought = dspy.OutputField(desc="Reason about generating expenses < salary")
    Salary_in = dspy.InputField(desc="Original salary from input row")
    Expenses_in = dspy.InputField(desc="Original expenses from input row")
    Expenses = dspy.OutputField(
        desc="Final integer expenses, positive and < Salary_in. Output only the number."
    )


# DSPy signature for the synthetic Region column.
#   input  seed      : caller-supplied seed value (declared without a description)
#   input  Region_in : the region taken from the original input row
#   output thought   : free-text reasoning about picking exactly one region
#   output Region    : the final value; the description lists North, South, East, West
class RegionGen(dspy.Signature):
    seed = dspy.InputField()
    thought = dspy.OutputField(desc="Reason about picking exactly one region")
    Region_in = dspy.InputField(desc="Original region from input row")
    Region = dspy.OutputField(desc="Final region: North, South, East, or West")


In [53]:
#  Agentic Row Generator

class SyntheticRowAgent(dspy.Module):
    """Build one synthetic row by running a separate DSPy predictor per column."""

    def __init__(self):
        super().__init__()
        # One predictor per column signature; the ID and region predictors are
        # created with temperature 0.5, the two numeric ones with temperature 0.
        self.customer_agent = dspy.Predict(CustomerIDGen, temperature=0.5)
        self.salary_agent = dspy.Predict(SalaryGen, temperature=0)
        self.expenses_agent = dspy.Predict(ExpensesGen, temperature=0)
        self.region_agent = dspy.Predict(RegionGen, temperature=0.5)

    def forward(self, seed: str, row):
        """Return the generated column values plus the reasoning behind each.

        Args:
            seed: Seed string forwarded to every predictor.
            row: Original record; its 'Salary', 'Expenses' and 'Region'
                entries are passed to the matching predictors.

        Returns:
            A dict with the keys Customer_ID, Salary, Expenses, Region and a
            nested "Thoughts" dict holding each predictor's `thought` output.
        """
        id_pred = self.customer_agent(seed=seed)
        salary_pred = self.salary_agent(seed=seed, Salary_in=row['Salary'])
        expenses_pred = self.expenses_agent(
            seed=seed,
            Salary_in=row['Salary'],
            Expenses_in=row['Expenses'],
        )
        region_pred = self.region_agent(seed=seed, Region_in=row['Region'])

        reasoning = {
            "Customer": id_pred.thought,
            "Salary": salary_pred.thought,
            "Expenses": expenses_pred.thought,
            "Region": region_pred.thought,
        }
        return {
            "Customer_ID": id_pred.Customer_ID,
            "Salary": salary_pred.Salary,
            "Expenses": expenses_pred.Expenses,
            "Region": region_pred.Region,
            "Thoughts": reasoning,
        }


In [54]:
# Cell 5 - Generate synthetic dataset based on input

def generate_synthetic_data(df_input, n_samples_per_row=2):
    """Create `n_samples_per_row` synthetic rows for every row of `df_input`.

    A single SyntheticRowAgent is reused for all calls. Each call receives the
    seed "<row index>-<sample number>" together with the original row.

    Returns:
        A DataFrame built from the dicts returned by the agent, in generation order.
    """
    row_agent = SyntheticRowAgent()
    generated = [
        row_agent(seed=f"{row_index}-{sample_no}", row=source_row)
        for row_index, source_row in df_input.iterrows()
        for sample_no in range(n_samples_per_row)
    ]
    return pd.DataFrame(generated)




In [55]:
# Generate synthetic dataset: two synthetic rows are requested per input row.

df_synthetic = generate_synthetic_data(df_input, n_samples_per_row=2)

In [56]:
# Display only the four data columns; the "Thoughts" column returned by the agent is left out.
df_synthetic[["Customer_ID", "Salary", "Expenses", "Region"]]

Unnamed: 0,Customer_ID,Salary,Expenses,Region
0,C123,150000,20000,North
1,C123,15000,15000,East
2,C123,150000,80000,North
3,C123,150000,80000,North
4,C123,150000,40000,East
5,C123,150000,15000,East
6,C123,15000,10000,North
7,C123,15000,8000,North
8,C1234,180000,60000,East
9,C123,180000,60000,West


In [16]:


# Persist the synthetic rows as CSV; index=False keeps the DataFrame index out of the file.
df_synthetic.to_csv("data/synthetic_sales_data_v3.csv", index=False)

In [4]:
# ======================================================
#  Imports & DSPy/Ollama setup
# ======================================================
import pandas as pd
import dspy
import litellm
from pydantic import BaseModel, field_validator, model_validator
from typing import ClassVar

# Ask LiteLLM to drop request parameters the target provider does not accept
# (see the LiteLLM documentation for `drop_params`).
litellm.drop_params = True

# Llama 3.2 behind a local Ollama endpoint; an empty API key is passed.
lm = dspy.LM(
    model="ollama_chat/llama3.2",
    api_key="",
    api_base="http://localhost:11434",
)
dspy.settings.configure(lm=lm)


# ======================================================
#  Load original dataset
# ======================================================
# Read the source CSV; if it is absent, warn and substitute a two-row sample.
try:
    df_input = pd.read_csv("data/sales_data.csv")
except FileNotFoundError:
    print("⚠️ No 'sales_data.csv' found.")
    df_input = pd.DataFrame(
        {
            "Customer_ID": ["C000", "C001"],
            "Salary": [150000, 180000],
            "Expenses": [20000, 50000],
            "Region": ["East", "South"],
        }
    )
else:
    # Preview is printed only when the file was actually loaded.
    print("\nOriginal Dataset:\n", df_input.head())


# ======================================================
#  Column-level Signatures
# ======================================================
# DSPy signature for the synthetic Salary column.
#   input  seed      : caller-supplied seed value (declared without a description)
#   input  Salary_in : the salary taken from the original input row
#   output thought   : free-text reasoning about a salary in the range 10000-200000
#   output Salary    : the final value; the description asks for a bare integer
# NOTE(review): documented with comments rather than a class docstring because a
# docstring on a DSPy signature is presumably used as prompt instructions - confirm.
class SalaryGen(dspy.Signature):
    seed = dspy.InputField()
    thought = dspy.OutputField(desc="Reason about generating salary in range 10000-200000")
    Salary_in = dspy.InputField(desc="Original salary from input row")
    Salary = dspy.OutputField(
        desc="Final integer salary between 10000 and 200000. Output only the number."
    )


# DSPy signature for the synthetic Expenses column.
#   input  seed        : caller-supplied seed value (declared without a description)
#   input  Salary_in   : the salary value the expenses are bounded by
#   input  Expenses_in : the expenses taken from the original input row
#   output thought     : free-text reasoning about keeping expenses below salary
#   output Expenses    : the final value; the description asks for a positive
#                        integer smaller than Salary_in
class ExpensesGen(dspy.Signature):
    seed = dspy.InputField()
    thought = dspy.OutputField(desc="Reason about generating expenses < salary")
    Salary_in = dspy.InputField(desc="Original salary from input row")
    Expenses_in = dspy.InputField(desc="Original expenses from input row")
    Expenses = dspy.OutputField(
        desc="Final integer expenses, positive and < Salary_in. Output only the number."
    )


# DSPy signature for the synthetic Region column.
#   input  seed      : caller-supplied seed value (declared without a description)
#   input  Region_in : the region taken from the original input row
#   output thought   : free-text reasoning about picking exactly one region
#   output Region    : the final value; the description lists North, South, East, West
class RegionGen(dspy.Signature):
    seed = dspy.InputField()
    thought = dspy.OutputField(desc="Reason about picking exactly one region")
    Region_in = dspy.InputField(desc="Original region from input row")
    Region = dspy.OutputField(desc="Final region: North, South, East, or West")


# ======================================================
#  Pydantic v2 Model
# ======================================================
class SyntheticRowModel(BaseModel):
    """Validated synthetic sales row.

    Constraints enforced on construction:
      * Salary is between 10,000 and 200,000 (inclusive).
      * Region is one of North, South, East, West (exact match).
      * Expenses is strictly positive and strictly less than Salary.

    Customer IDs are not validated for uniqueness here; callers obtain
    sequential IDs from `generate_id()`.
    """

    Customer_ID: str
    Salary: int
    Expenses: int
    Region: str

    # Auto-increment counter for unique IDs. It lives on the class and is never
    # reset, so numbering continues across successive generation runs for as
    # long as this class object exists.
    _id_counter: ClassVar[int] = 0

    @classmethod
    def generate_id(cls) -> str:
        """Return the next sequential ID ("C001", "C002", ...), zero-padded to 3 digits."""
        cls._id_counter += 1
        return f"C{cls._id_counter:03d}"

    @field_validator("Salary")
    @classmethod
    def validate_salary(cls, v: int):
        """Reject salaries outside the inclusive range 10,000-200,000."""
        if not (10_000 <= v <= 200_000):
            raise ValueError("Salary must be between 10,000 and 200,000")
        return v

    @field_validator("Region")
    @classmethod
    def validate_region(cls, v: str):
        """Reject any region that is not exactly one of the four allowed names."""
        allowed = {"North", "South", "East", "West"}
        if v not in allowed:
            raise ValueError(f"Region must be one of {allowed}")
        return v

    @model_validator(mode="after")
    def validate_expenses_vs_salary(self):
        """Check the cross-field rule: 0 < Expenses < Salary."""
        if self.Expenses >= self.Salary:
            raise ValueError("Expenses must be less than Salary")
        # Zero is not positive: the previous `< 0` test let Expenses == 0 through
        # even though the message (and the ExpensesGen description) say "positive".
        if self.Expenses <= 0:
            raise ValueError("Expenses must be positive")
        return self


# ======================================================
#  Agentic Row Generator
# ======================================================
class SyntheticRowAgent(dspy.Module):
    """Generate the Salary, Expenses and Region values of one synthetic row.

    Customer_ID is deliberately not produced here; the caller assigns it.
    """

    def __init__(self):
        super().__init__()
        # Numeric columns use temperature 0, the region predictor 0.5.
        self.salary_agent = dspy.Predict(SalaryGen, temperature=0)
        self.expenses_agent = dspy.Predict(ExpensesGen, temperature=0)
        self.region_agent = dspy.Predict(RegionGen, temperature=0.5)

    def forward(self, seed: str, row):
        """Run the three column predictors for one original row.

        Args:
            seed: Seed string forwarded to every predictor.
            row: Original record providing 'Salary', 'Expenses' and 'Region'.

        Returns:
            A dict with the keys "Salary", "Expenses" and "Region" holding the
            predictors' outputs.
        """
        salary = self.salary_agent(seed=seed, Salary_in=row['Salary'])
        # Bound the expenses by the salary that will actually appear in the
        # synthetic row. Passing the original salary here allowed the model to
        # return expenses >= the generated salary (e.g. 15000 / 15000 in the
        # recorded output), which the row validator rejects.
        expenses = self.expenses_agent(
            seed=seed,
            Salary_in=salary.Salary,
            Expenses_in=row['Expenses'],
        )
        region = self.region_agent(seed=seed, Region_in=row['Region'])

        return {
            "Salary": salary.Salary,
            "Expenses": expenses.Expenses,
            "Region": region.Region,
        }


# ======================================================
#  Helper: Generate a single validated row
# ======================================================
def generate_unique_row(agent, seed, row):
    """Generate one row with `agent`, stamp it with a fresh ID, and validate it.

    The dict returned by the agent is updated in place with a "Customer_ID"
    taken from SyntheticRowModel.generate_id() and then passed through
    SyntheticRowModel. Any validation error raised by the model propagates to
    the caller.

    Returns:
        The validated row as a plain dict (the model's `model_dump()`).
    """
    candidate = agent(seed=seed, row=row)
    # The ID comes from the model's counter, not from the LLM.
    candidate["Customer_ID"] = SyntheticRowModel.generate_id()
    return SyntheticRowModel(**candidate).model_dump()


# ======================================================
#  Generate synthetic dataset
# ======================================================
def generate_synthetic_data_unique(df_input, n_samples_per_row=2):
    """Create `n_samples_per_row` validated synthetic rows per row of `df_input`.

    A single SyntheticRowAgent is reused. Each row is produced by
    generate_unique_row with the seed "<row index>-<sample number>". An
    exception raised there is not caught, so it stops the whole generation.

    Returns:
        A DataFrame of the validated rows, in generation order.
    """
    row_agent = SyntheticRowAgent()
    validated_rows = [
        generate_unique_row(row_agent, f"{row_index}-{sample_no}", source_row)
        for row_index, source_row in df_input.iterrows()
        for sample_no in range(n_samples_per_row)
    ]
    return pd.DataFrame(validated_rows)






Original Dataset:
   Customer_ID  Salary  Expenses Region
0        C001   55000     20000  North
1        C002  120000     80000  South
2        C003   75000     40000   East
3        C004   25000     10000   West
4        C005  180000    120000  North


In [58]:
# ======================================================
#  Run synthetic generation
# ======================================================
# Two synthetic rows are requested per input row; the first ten rows are printed.
df_synthetic = generate_synthetic_data_unique(df_input, n_samples_per_row=2)
print("\n✅ Synthetic Dataset:\n", df_synthetic.head(10))


✅ Synthetic Dataset:
   Customer_ID  Salary  Expenses Region
0        C001  150000     20000  North
1        C002  150000     15000   East
2        C003  150000     80000  North
3        C004  150000     80000  North
4        C005  150000     40000   East
5        C006  150000     15000   East
6        C007   15000     10000  North
7        C008  150000      8000  North
8        C009  150000     60000   East
9        C010  150000     60000   West


In [5]:
import json

# In-memory notebook document (nbformat 4, minor 5) made of alternating
# markdown and code cells. Each cell's "source" is a list of text lines that
# mirror the pipeline defined earlier in this file: dataset load, DSPy
# signatures, the pydantic row model, the row agent, and the generation helpers.
# NOTE(review): no cell configures a DSPy LM (no `dspy.LM` /
# `dspy.settings.configure` call appears in any "source"), and the load cell
# has no fallback for a missing CSV, so the generated notebook presumably
# needs that setup added before it can run end to end - confirm.
# NOTE(review): the code cells carry only cell_type/metadata/source.
notebook_content = {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Synthetic Data Generation with DSPy and Pydantic\n",
    "This notebook demonstrates generating synthetic sales data with **DSPy** using column-level signatures and **Pydantic v2** for validation."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1️⃣ Load Original Dataset"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import pandas as pd\n",
    "try:\n",
    "    df_input = pd.read_csv(\"data/sales_data.csv\")\n",
    "    print(\"\\nOriginal Dataset:\\n\", df_input.head())\n",
    "except FileNotFoundError:\n",
    "    print(\"⚠️ No 'sales_data.csv' found.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2️⃣ Column-Level Signatures with DSPy\n",
    "These signatures define how synthetic data is generated per column with chain-of-thought reasoning."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "import dspy\n",
    "from pydantic import BaseModel, field_validator, model_validator\n",
    "from typing import ClassVar\n",
    "\n",
    "# Salary Generator\n",
    "class SalaryGen(dspy.Signature):\n",
    "    seed = dspy.InputField()\n",
    "    thought = dspy.OutputField(desc=\"Reason about generating salary in range 10000-200000\")\n",
    "    Salary_in = dspy.InputField(desc=\"Original salary from input row\")\n",
    "    Salary = dspy.OutputField(desc=\"Final integer salary between 10000 and 200000. Output only the number.\")\n",
    "\n",
    "# Expenses Generator\n",
    "class ExpensesGen(dspy.Signature):\n",
    "    seed = dspy.InputField()\n",
    "    thought = dspy.OutputField(desc=\"Reason about generating expenses < salary\")\n",
    "    Salary_in = dspy.InputField(desc=\"Original salary from input row\")\n",
    "    Expenses_in = dspy.InputField(desc=\"Original expenses from input row\")\n",
    "    Expenses = dspy.OutputField(desc=\"Final integer expenses, positive and < Salary_in. Output only the number.\")\n",
    "\n",
    "# Region Generator\n",
    "class RegionGen(dspy.Signature):\n",
    "    seed = dspy.InputField()\n",
    "    thought = dspy.OutputField(desc=\"Reason about picking exactly one region\")\n",
    "    Region_in = dspy.InputField(desc=\"Original region from input row\")\n",
    "    Region = dspy.OutputField(desc=\"Final region: North, South, East, or West\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3️⃣ Pydantic v2 Model\n",
    "Define a model to validate the synthetic data with constraints and auto-increment Customer_IDs."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "class SyntheticRowModel(BaseModel):\n",
    "    Customer_ID: str\n",
    "    Salary: int\n",
    "    Expenses: int\n",
    "    Region: str\n",
    "\n",
    "    _id_counter: ClassVar[int] = 0\n",
    "\n",
    "    @classmethod\n",
    "    def generate_id(cls) -> str:\n",
    "        cls._id_counter += 1\n",
    "        return f\"C{cls._id_counter:03d}\"\n",
    "\n",
    "    @field_validator(\"Salary\")\n",
    "    @classmethod\n",
    "    def validate_salary(cls, v: int):\n",
    "        if not (10_000 <= v <= 200_000):\n",
    "            raise ValueError(\"Salary must be between 10,000 and 200,000\")\n",
    "        return v\n",
    "\n",
    "    @field_validator(\"Region\")\n",
    "    @classmethod\n",
    "    def validate_region(cls, v: str):\n",
    "        allowed = {\"North\", \"South\", \"East\", \"West\"}\n",
    "        if v not in allowed:\n",
    "            raise ValueError(f\"Region must be one of {allowed}\")\n",
    "        return v\n",
    "\n",
    "    @model_validator(mode=\"after\")\n",
    "    def validate_expenses_vs_salary(self):\n",
    "        if self.Expenses >= self.Salary:\n",
    "            raise ValueError(\"Expenses must be less than Salary\")\n",
    "        if self.Expenses < 0:\n",
    "            raise ValueError(\"Expenses must be positive\")\n",
    "        return self"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4️⃣ Agentic Row Generator\n",
    "This module uses DSPy predictors to generate synthetic rows."
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "class SyntheticRowAgent(dspy.Module):\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        self.salary_agent = dspy.Predict(SalaryGen, temperature=0)\n",
    "        self.expenses_agent = dspy.Predict(ExpensesGen, temperature=0)\n",
    "        self.region_agent = dspy.Predict(RegionGen, temperature=0.5)\n",
    "\n",
    "    def forward(self, seed: str, row):\n",
    "        salary = self.salary_agent(seed=seed, Salary_in=row['Salary'])\n",
    "        expenses = self.expenses_agent(seed=seed, Salary_in=row['Salary'], Expenses_in=row['Expenses'])\n",
    "        region = self.region_agent(seed=seed, Region_in=row['Region'])\n",
    "\n",
    "        return {\n",
    "            \"Salary\": salary.Salary,\n",
    "            \"Expenses\": expenses.Expenses,\n",
    "            \"Region\": region.Region,\n",
    "        }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5️⃣ Helper Function: Generate Single Validated Row"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def generate_unique_row(agent, seed, row):\n",
    "    synthetic_row = agent(seed=seed, row=row)\n",
    "    synthetic_row[\"Customer_ID\"] = SyntheticRowModel.generate_id()\n",
    "    validated_row = SyntheticRowModel(**synthetic_row)\n",
    "    return validated_row.model_dump()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6️⃣ Generate Full Synthetic Dataset"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "def generate_synthetic_data_unique(df_input, n_samples_per_row=2):\n",
    "    agent = SyntheticRowAgent()\n",
    "    synthetic_records = []\n",
    "\n",
    "    for idx, row in df_input.iterrows():\n",
    "        for i in range(n_samples_per_row):\n",
    "            seed = f\"{idx}-{i}\"\n",
    "            unique_row = generate_unique_row(agent, seed, row)\n",
    "            synthetic_records.append(unique_row)\n",
    "\n",
    "    df_synthetic = pd.DataFrame(synthetic_records)\n",
    "    return df_synthetic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7️⃣ Run Synthetic Generation"
   ]
  },
  {
   "cell_type": "code",
   "metadata": {},
   "source": [
    "df_synthetic = generate_synthetic_data_unique(df_input, n_samples_per_row=2)\n",
    "df_synthetic.head(10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

# Save the notebook.
# The nbformat 4 schema requires every code cell to carry `outputs` and
# `execution_count`, and minor version 5 additionally expects an `id` on each
# cell. Fill in any that are missing (existing values are left alone) so the
# written file validates as a notebook.
for cell_number, cell in enumerate(notebook_content["cells"], start=1):
    cell.setdefault("id", f"cell-{cell_number:02d}")
    if cell["cell_type"] == "code":
        cell.setdefault("execution_count", None)
        cell.setdefault("outputs", [])

# Explicit encoding so the result does not depend on the platform default.
with open("synthetic_data_notebook.ipynb", "w", encoding="utf-8") as f:
    json.dump(notebook_content, f, indent=2)

print("✅ Notebook 'synthetic_data_notebook.ipynb' created successfully!")


✅ Notebook 'synthetic_data_notebook.ipynb' created successfully!
