### Demonstrates how to configure **DSPy** to work with a locally
### hosted **Ollama** LLM (Llama 3.2). DSPy (https://dspy.ai/) is a declarative framework for building AI workflows with LLMs.

In [43]:
import csv
import io
import random

import dspy
import litellm
import pandas as pd

# LiteLLM setting: drop request parameters that Ollama does not support
# instead of forwarding them.
litellm.drop_params = True

# LM handle for the llama3.2 model, reached through the Ollama chat route on localhost.
lm = dspy.LM("ollama_chat/llama3.2", api_base="http://localhost:11434", api_key="")

# Register the handle in DSPy's settings (presumably making it the default LM
# for the dspy modules built later in this notebook — confirm).
dspy.settings.configure(lm=lm)


In [None]:
# Load the original dataset and preview its first rows.
sales_csv_path = "data/sales_data.csv"

try:
    # Only the read itself is guarded, so unrelated errors are not hidden.
    df = pd.read_csv(sales_csv_path)
except FileNotFoundError:
    # Report the path that was actually tried, including its directory.
    print(f" No '{sales_csv_path}' found.")
else:
    print("\n Original Dataset:\n", df.head())




 Original Dataset:
   Customer_ID  Salary  Expenses Region
0        C001   55000     20000  North
1        C002  120000     80000  South
2        C003   75000     40000   East
3        C004   25000     10000   West
4        C005  180000    120000  North


### DSPy Signature  
A **Signature** in DSPy defines the schema of inputs and outputs for an LLM call, acting like a typed contract that guides data generation and reasoning.  


In [3]:
# Define signature for synthetic sales record generation
class SalesRecordGen(dspy.Signature):
    """Generate synthetic sales dataset rows with Customer_ID, Salary, Expenses, Region."""
    # NOTE(review): DSPy presumably sends the docstring above and each `desc`
    # below to the LM as prompt text, so treat them as behavior, not
    # documentation — confirm against the DSPy signature docs before rewording.

    # Input: a string supplied by the caller to vary the generations.
    seed = dspy.InputField(desc="random seed string for variation")

    # Outputs: one field per column of the CSV assembled by generate_sales_data.
    # No field declares a type; a run recorded in this notebook returned the
    # salary as '$18392', which the validator rejects as non-numeric.
    Customer_ID = dspy.OutputField(desc="unique customer ID like C123")
    # The prompt asks for 20000-200000, while evaluate_sales_structural_realism
    # accepts any salary from 10000 to 200000.
    Salary = dspy.OutputField(desc="monthly salary in range 20000-200000")
    Expenses = dspy.OutputField(desc="monthly expenses (positive number)")
    Region = dspy.OutputField(desc="Region, one of North, South, East, West")


# Function to generate synthetic sales dataset
def generate_sales_data(sample_size: int = 5) -> str:
    """Ask the LM for `sample_size` synthetic sales rows and return them as CSV text.

    One SalesRecordGen prediction is requested per row. Each request passes a
    random integer between 1 and 1000 (as a string) as the `seed` input.

    Args:
        sample_size: Number of data rows to generate.

    Returns:
        CSV text whose header is Customer_ID, Salary, Expenses, Region,
        followed by one line per prediction.
    """
    record_predictor = dspy.Predict(SalesRecordGen, n=1, max_tokens=150, temperature=0.7)

    # Collect every prediction before serialising anything.
    predictions = [
        record_predictor(seed=str(random.randint(1, 1000)))
        for _ in range(sample_size)
    ]

    # Build the CSV in memory; the buffer is closed when the block exits.
    with io.StringIO() as buffer:
        csv_writer = csv.writer(buffer, quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(["Customer_ID", "Salary", "Expenses", "Region"])
        csv_writer.writerows(
            [p.Customer_ID, p.Salary, p.Expenses, p.Region] for p in predictions
        )
        return buffer.getvalue()


Let's begin

Generate synthetic data

In [None]:
# Generate five synthetic rows with the LM and print the resulting CSV text.
synthetic_structural_csv = generate_sales_data(sample_size=5)
print("\n Synthetic Dataset:\n", synthetic_structural_csv)


🤖 Synthetic Dataset:
 Customer_ID,Salary,Expenses,Region
C123,150000,5000,East
C123,150000,5000,North
C123,150000,5000,South
C1234,$18392,500,North
C989,165000,5000,East



Structural evaluation (schema, data types, value ranges) of the generated synthetic data

In [37]:
def evaluate_sales_structural_realism(csv_content: str) -> bool:
    """
    Check generated sales CSV text for structural realism.

    Checks performed:
    1. Schema consistency: the header must be exactly
       Customer_ID, Salary, Expenses, Region, in that order.
    2. Data type correctness: Salary and Expenses must parse as numbers.
    3. Value ranges: 10000 <= Salary <= 200000, Expenses >= 0, and Region
       is one of North, South, East, West.

    Every problem found is reported with print(). A schema mismatch stops
    the check immediately; row-level problems are all reported before
    returning.

    Args:
        csv_content: CSV text including the header row.

    Returns:
        True when the header matches and every row passes all checks,
        False otherwise.
    """
    reader = csv.DictReader(io.StringIO(csv_content))
    required_columns = ["Customer_ID", "Salary", "Expenses", "Region"]

    # Empty input gives fieldnames of None, which is also reported here.
    if reader.fieldnames != required_columns:
        print(" Schema mismatch! Expected:", required_columns)
        return False

    realism_pass = True

    for row in reader:
        try:
            salary = float(row["Salary"])
            expenses = float(row["Expenses"])
        except (TypeError, ValueError):
            # ValueError: the text is not numeric (e.g. "$18392").
            # TypeError: DictReader fills cells missing from a short row with
            # None, and float(None) raises TypeError rather than ValueError.
            print(f" Invalid numeric values in row: {row}")
            realism_pass = False
            # The remaining checks need parsed numbers, so skip this row.
            continue

        if not (10000 <= salary <= 200000):
            print(f" Salary out of range: {salary} in row {row}")
            realism_pass = False

        if expenses < 0:
            print(f" Expenses negative: {expenses} in row {row}")
            realism_pass = False

        if row["Region"] not in ["North", "South", "East", "West"]:
            print(f" Invalid Region: {row['Region']} in row {row}")
            realism_pass = False

    if realism_pass:
        print(" Structural Realism: PASSED (schema + types + ranges ok)")
    else:
        print(" Structural Realism: ISSUES FOUND")

    return realism_pass


Evaluate synthetic data

In [8]:
# Validate the generated CSV; the returned bool is displayed as the cell's output.
evaluate_sales_structural_realism(synthetic_structural_csv)

 Invalid numeric values in row: {'Customer_ID': 'C1234', 'Salary': '$18392', 'Expenses': '500', 'Region': 'North'}
 Structural Realism: ISSUES FOUND


False

Let's modify the signature to take care of this issue by asking explicitly for integer values within the validated range.

In [35]:
# Define signature for synthetic sales record generation
class SalesRecordGen(dspy.Signature):
    """Generate synthetic sales dataset rows with Customer_ID, Salary, Expenses, Region."""
    # NOTE(review): DSPy presumably sends the docstring above and each `desc`
    # below to the LM as prompt text, so treat them as behavior, not
    # documentation — confirm against the DSPy signature docs before rewording.

    # Input: a string supplied by the caller to vary the generations.
    seed = dspy.InputField(desc="random seed string for variation")

    # Outputs: one field per column of the CSV assembled by generate_sales_data.
    Customer_ID = dspy.OutputField(desc="unique customer ID like C123")
    # Revised wording: asks for a plain integer, with bounds that line up with
    # the 10000-200000 range checked by evaluate_sales_structural_realism.
    Salary = dspy.OutputField(desc="integer salary strictly between 10000 and 200000")
    # Revised wording: asks for a positive integer smaller than Salary.
    Expenses = dspy.OutputField(desc="integer expenses strictly positive and less than Salary")
    Region = dspy.OutputField(desc="Region, one of North, South, East, West")


# Function to generate synthetic sales dataset
def generate_sales_data(sample_size: int = 5) -> str:
    """Ask the LM for `sample_size` synthetic sales rows and return them as CSV text.

    One SalesRecordGen prediction is requested per row, at temperature 0.9.
    Each request passes a random integer between 1 and 1000 (as a string) as
    the `seed` input.

    Args:
        sample_size: Number of data rows to generate.

    Returns:
        CSV text whose header is Customer_ID, Salary, Expenses, Region,
        followed by one line per prediction.
    """
    record_predictor = dspy.Predict(SalesRecordGen, n=1, max_tokens=150, temperature=0.9)

    # Collect every prediction before serialising anything.
    predictions = [
        record_predictor(seed=str(random.randint(1, 1000)))
        for _ in range(sample_size)
    ]

    # Build the CSV in memory; the buffer is closed when the block exits.
    with io.StringIO() as buffer:
        csv_writer = csv.writer(buffer, quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(["Customer_ID", "Salary", "Expenses", "Region"])
        csv_writer.writerows(
            [p.Customer_ID, p.Salary, p.Expenses, p.Region] for p in predictions
        )
        return buffer.getvalue()


In [36]:
# Regenerate five rows using the revised signature and generator defined above.
synthetic_structural_csv = generate_sales_data(sample_size=5)

In [38]:
# Show the regenerated CSV text.
print(synthetic_structural_csv)

Customer_ID,Salary,Expenses,Region
C123,150000,30000,East
C708,175000,35000,East
C123,185916,17445,North
C593,141112,11411,East
C123,175119,14251,West



In [39]:
# Re-run the structural checks on the regenerated data.
evaluate_sales_structural_realism(synthetic_structural_csv)

 Structural Realism: PASSED (schema + types + ranges ok)


True