Notebook for optimizing the trend analyzer code.
To run the notebook, run the command "jupyter notebook --ip=0.0.0.0 --port=5024 --allow-root --no-browser". Then open the webpage that Replit shows in a new browser tab, and enter the server token, which you can find with the command "jupyter server list". If you enter the token in the Replit preview pane instead, you will get a 403 error.

In [1]:
#import necessary modules
from modules import *
import os
import mlflow

DSPy version: 3.0.0b2


In [8]:
#setup mlflow
# Tracking data is written to a local directory given as a relative path.
mlflow_tracking_uri = "../../mlflow/experiments"
mlflow.set_tracking_uri(mlflow_tracking_uri)
# Select the experiment that runs started in this notebook are recorded under.
mlflow.set_experiment("TrendFinderOptimizer")
# Enable MLflow's automatic logging for DSPy.
mlflow.dspy.autolog()

2025/07/23 22:28:49 INFO mlflow.tracking.fluent: Experiment with name 'TrendFinderOptimizer' does not exist. Creating a new experiment.


In [3]:
#setup dspy
# The API key is read from the environment variable named 'paul2'; a missing
# variable raises KeyError on this line.
api_key = os.environ['paul2']
# NOTE(review): `dspy` is only imported explicitly in a later cell; this cell
# presumably relies on `from modules import *` exporting it — confirm.
lm = dspy.LM('gemini/gemini-2.5-flash', api_key=api_key, max_tokens=8000)
# Register this LM as the one DSPy modules use by default.
dspy.configure(lm=lm)

In [4]:
# Instantiate the baseline trend analyzer. `trend_analyzer` is not defined in
# this notebook; presumably it comes from `from modules import *` — confirm.
analyze_trends = trend_analyzer()

## DSPy Optimization for doc_analyzer

Since we don't have labeled training data, we'll hand-write a few synthetic examples and use DSPy's `BootstrapFewShot` optimizer, scored by a heuristic metric, to bootstrap few-shot demonstrations that improve the performance of the `doc_analyzer` signature.

In [5]:
# Import optimization modules
import dspy
from dspy.teleprompt import BootstrapFewShot
from dspy.evaluate import Evaluate
import json
import pandas as pd

ImportError: cannot import name 'SignatureOptimizer' from 'dspy.teleprompt' (/home/runner/workspace/.pythonlibs/lib/python3.11/site-packages/dspy/teleprompt/__init__.py)

In [9]:
# Create synthetic examples for different document types and categories
# This helps DSPy understand the expected input/output patterns

def create_synthetic_examples():
    """Build the hand-written training examples for the doc_analyzer signature.

    Three examples are returned, in this order:

    1. an inspection report with three violations and an empty ``in_csv``;
    2. a financial statement where two requested categories are absent from
       the document, so the expected CSV marks them ``N/A``;
    3. a clean inspection report whose ``in_csv`` already holds the row from
       example 1, so the expected CSV appends a second row.

    Every example declares ``document``, ``categories``, ``in_csv`` and
    ``last_context`` as its inputs; the remaining fields (``next_context``
    and ``out_csv``) are the labels.

    Returns:
        list: ``dspy.Example`` objects with their input keys set.
    """
    # DSPy needs to know which fields are inputs: optimizers and evaluators
    # call `example.inputs()`, which requires the keys to have been declared
    # with `with_inputs()`. These are the same keyword arguments the notebook
    # passes when it calls the doc_analyzer module directly.
    input_keys = ("document", "categories", "in_csv", "last_context")

    examples = []

    # Example 1: Realistic inspection report with lengthy content
    inspection_report_1 = """
    INSPECTION REPORT - Document ID: 32037 ALR10C6BZ 097
    Date: 07-10-2025
    Inspector: AEPACS
    Location: Industrial Site NA

    SUMMARY OF INSPECTION:
    This inspection was conducted in accordance with safety regulations and compliance standards.
    The facility was evaluated for adherence to safety protocols, equipment maintenance, and regulatory compliance.

    VIOLATIONS IDENTIFIED:
    1. Missing safety equipment in Zone A - Safety harnesses not available at designated stations
    2. Improper chemical storage in Building 3 - Hazardous materials not properly labeled or contained
    3. Emergency exit blocked by equipment in Warehouse B

    RECOMMENDATIONS:
    - Immediate procurement and installation of safety equipment
    - Proper labeling and containment of all hazardous materials
    - Clear all emergency exits of obstructions

    COMPLIANCE STATUS: Non-compliant - 3 violations found
    Next inspection scheduled for 08-15-2025
    """

    examples.append(dspy.Example(
        document=inspection_report_1,
        categories=["document", "number of violations", "list and details of violations"],
        in_csv="",
        last_context="Analyzing safety inspection reports for compliance trends",
        next_context="Safety equipment and storage violations are recurring issues across sites",
        out_csv="document,number of violations,list and details of violations\n32037_ALR10C6BZ_097.pdf,3,Missing safety equipment in Zone A; Improper chemical storage in Building 3; Emergency exit blocked in Warehouse B"
    ).with_inputs(*input_keys))

    # Example 2: Financial document with some missing categories
    financial_report = """
    QUARTERLY FINANCIAL STATEMENT - Q3 2025
    Company: TechCorp Industries

    REVENUE BREAKDOWN:
    Product Sales: $850,000
    Service Revenue: $320,000
    Total Revenue: $1,170,000

    OPERATIONAL EXPENSES:
    Salaries and Benefits: $450,000
    Equipment and Maintenance: $120,000
    Marketing: $85,000
    Total Expenses: $655,000

    NET PROFIT: $515,000

    Note: Employee count data not available in this quarterly report.
    Tax information will be provided in annual filing.
    """

    examples.append(dspy.Example(
        document=financial_report,
        categories=["revenue", "expenses", "profit", "employee_count", "tax_rate"],
        in_csv="",
        last_context="Processing quarterly financial reports for trend analysis",
        next_context="Q3 shows strong profitability, employee data consistently missing from quarterly reports",
        out_csv="document,revenue,expenses,profit,employee_count,tax_rate\nQ3_2025_financial.pdf,$1170000,$655000,$515000,N/A,N/A"
    ).with_inputs(*input_keys))

    # Example 3: Document with no violations (testing N/A handling)
    clean_inspection = """
    INSPECTION REPORT - Document ID: 45892 CLN001
    Date: 07-12-2025
    Inspector: SAFETY_TEAM_A
    Location: Corporate Headquarters

    INSPECTION SUMMARY:
    Comprehensive safety and compliance inspection conducted across all floors and departments.
    All safety protocols, equipment, and procedures were found to be in full compliance.

    FINDINGS:
    - All safety equipment properly maintained and accessible
    - Emergency exits clear and properly marked
    - Chemical storage in full compliance with regulations
    - Fire safety systems operational and up to date

    VIOLATIONS: None identified
    COMPLIANCE STATUS: Fully compliant
    Commendation for excellent safety standards maintained.
    """

    # The incoming CSV already holds the row produced for example 1, so the
    # expected output demonstrates appending to an existing table.
    examples.append(dspy.Example(
        document=clean_inspection,
        categories=["document", "number of violations", "list and details of violations"],
        in_csv="document,number of violations,list and details of violations\n32037_ALR10C6BZ_097.pdf,3,Missing safety equipment in Zone A; Improper chemical storage in Building 3; Emergency exit blocked in Warehouse B",
        last_context="Safety equipment and storage violations are recurring issues across sites",
        next_context="Some sites maintain excellent compliance standards with zero violations",
        out_csv="document,number of violations,list and details of violations\n32037_ALR10C6BZ_097.pdf,3,Missing safety equipment in Zone A; Improper chemical storage in Building 3; Emergency exit blocked in Warehouse B\n45892_CLN001.pdf,0,N/A"
    ).with_inputs(*input_keys))

    return examples

# Build the training set once when the cell runs and report how many examples
# it contains.
synthetic_examples = create_synthetic_examples()
print(f"Created {len(synthetic_examples)} synthetic examples")

Created 3 synthetic examples


In [10]:
# Define evaluation metrics for the doc_analyzer
def _is_missing_cell(value):
    """Return True when a CSV cell holds no data (NaN, blank, 'N/A' or 'nan')."""
    if pd.isna(value):
        return True
    return str(value).strip().upper() in {"N/A", "", "NAN"}


def evaluate_doc_analyzer_output(example, pred, trace=None):
    """Score a doc_analyzer prediction on a 0.0-1.0 scale.

    Five checks are applied, each worth up to one point; the sum is divided
    by 5:

    1. ``pred.out_csv`` parses as CSV.
    2. Every entry of ``example.categories`` appears among the CSV headers
       (headers are compared with surrounding whitespace removed).
    3. ``pred.next_context`` differs from ``example.last_context`` and has a
       reasonable length: 5-50 words scores 1.0, more than 50 scores 0.5,
       1-4 words scores 0.3.
    4. N/A usage in the last CSV row: no missing cells scores 1.0, some
       missing scores 0.8, exactly as many missing cells as there are
       categories scores 0.3.
    5. The last row holds at least one non-missing value longer than one
       character outside the ``document`` column (1.0), otherwise 0.2.

    Checks 4 and 5 are skipped when the CSV has no data rows.

    Args:
        example: Object exposing ``categories`` (iterable of str) and
            ``last_context`` (str).
        pred: Object exposing ``out_csv`` and ``next_context``. Missing or
            ``None`` values simply earn no points for the affected checks.
        trace: Accepted so the function matches the metric call signature;
            not used.

    Returns:
        float: The normalised score in ``[0.0, 1.0]``.
    """
    import io  # local import: the notebook has no top-level `import io`

    max_score = 5.0
    score = 0.0
    categories = list(example.categories or ())

    # Parse the CSV once. pandas converts tokens such as "N/A" to NaN while
    # reading, so missing cells must be detected via NaN rather than by
    # comparing against the literal text. dtype=str keeps every other cell
    # exactly as written, so a value is judged independently of other rows.
    try:
        csv_data = pd.read_csv(
            io.StringIO(getattr(pred, "out_csv", None)), dtype=str
        )
    except (ValueError, TypeError):
        # pandas' EmptyDataError and ParserError are ValueError subclasses;
        # TypeError covers a non-string out_csv.
        csv_data = None

    if csv_data is not None:
        # 1. Valid CSV format
        score += 1.0

        # 2. All categories are present as headers
        headers = {str(col).strip() for col in csv_data.columns}
        if set(categories).issubset(headers):
            score += 1.0

    # 3. Context management - reward an updated context of sensible length
    next_context = getattr(pred, "next_context", None) or ""
    if next_context != example.last_context:
        word_count = len(str(next_context).split())
        if 5 <= word_count <= 50:  # Reasonable context length
            score += 1.0
        elif word_count > 50:  # Too long - partial credit
            score += 0.5
        elif word_count > 0:  # Very short but present
            score += 0.3

    if csv_data is not None and len(csv_data) > 0:
        last_row = csv_data.iloc[-1]  # The newly added row

        # 4. Appropriate use of N/A in the new row
        na_count = sum(1 for cell in last_row if _is_missing_cell(cell))
        if na_count == 0:  # Everything filled in
            score += 1.0
        elif na_count < len(categories):  # Partial data extracted
            score += 0.8
        elif na_count == len(categories):  # Nothing extracted at all
            score += 0.3

        # 5. Data extraction quality - something beyond the file name
        has_meaningful_data = any(
            str(col).strip() != 'document'
            and not _is_missing_cell(cell)
            and len(str(cell).strip()) > 1
            for col, cell in last_row.items()
        )
        # Some credit for proper format even without data
        score += 1.0 if has_meaningful_data else 0.2

    return score / max_score

# Confirmation message so the cell shows visible output once the metric exists.
print("Evaluation metric defined")

Evaluation metric defined


In [11]:
# Optimize the doc_analyzer signature
# We'll use BootstrapFewShot to improve the prompt and reasoning

# Create an instance of the doc_analyzer module
# (`doc_analyzer` is not defined in this notebook; presumably it is the
# signature exported by `from modules import *` — confirm.)
doc_analyzer_module = dspy.ChainOfThought(doc_analyzer)

# Set up the optimizer using BootstrapFewShot
# NOTE(review): the metric returns a float in [0, 1] and no threshold is
# passed here, so any non-zero score presumably counts as a successful
# bootstrap trace — confirm against the BootstrapFewShot documentation and
# consider supplying a threshold.
optimizer = BootstrapFewShot(
    metric=evaluate_doc_analyzer_output,
    max_bootstrapped_demos=4,  # Number of examples to bootstrap
    max_labeled_demos=2,       # Number of labeled examples to use
    max_rounds=2,              # Number of optimization rounds
    max_errors=3               # Maximum errors allowed during bootstrapping
)

# Nothing is compiled in this cell; the optimizer is only configured here.
print("Starting signature optimization with BootstrapFewShot...")
print("This may take several minutes as it tests different prompt variations.")

NameError: name 'SignatureOptimizer' is not defined

In [12]:
# Run the optimization with MLflow tracking
# Everything inside the `with` block is recorded under a single MLflow run
# named "doc_analyzer_optimization".
with mlflow.start_run(run_name="doc_analyzer_optimization") as run:
    # Log parameters
    # NOTE(review): the values 4 and 2 below are literals that repeat the
    # arguments given when the optimizer was constructed; they must be kept
    # in sync by hand.
    mlflow.log_param("optimizer_type", "BootstrapFewShot")
    mlflow.log_param("num_examples", len(synthetic_examples))
    mlflow.log_param("max_bootstrapped_demos", 4)
    mlflow.log_param("max_rounds", 2)
    
    # Optimize the signature
    optimized_module = optimizer.compile(
        doc_analyzer_module,
        trainset=synthetic_examples  # BootstrapFewShot uses all examples for training
    )
    
    # Log the optimized signature
    # NOTE(review): this assumes the compiled ChainOfThought module exposes a
    # `.signature` attribute — confirm for the installed DSPy version.
    mlflow.log_text(str(optimized_module.signature), "optimized_signature.txt")
    
    print("Optimization completed!")
    print(f"Run ID: {run.info.run_id}")

NameError: name 'optimizer' is not defined

In [13]:
# Test the optimized module with a sample
# The first synthetic example is reused here, so this is a sanity check on
# training data rather than a held-out evaluation.
test_example = synthetic_examples[0]

# Baseline: the un-optimized module, called with the example's four input
# fields and scored with the same metric used for optimization.
print("Testing Original Module:")
print("=" * 40)
original_result = doc_analyzer_module(
    document=test_example.document,
    categories=test_example.categories,
    in_csv=test_example.in_csv,
    last_context=test_example.last_context
)
print(f"Context: {original_result.next_context}")
print(f"CSV: {original_result.out_csv}")
print(f"Score: {evaluate_doc_analyzer_output(test_example, original_result)}")

# Same call and scoring against the compiled module; requires that the
# optimization cell has been run so `optimized_module` exists.
print("\nTesting Optimized Module:")
print("=" * 40)
optimized_result = optimized_module(
    document=test_example.document,
    categories=test_example.categories,
    in_csv=test_example.in_csv,
    last_context=test_example.last_context
)
print(f"Context: {optimized_result.next_context}")
print(f"CSV: {optimized_result.out_csv}")
print(f"Score: {evaluate_doc_analyzer_output(test_example, optimized_result)}")

Testing Original Module:
Context: Analyzing safety inspection reports for compliance trends. This document indicates non-compliance with 3 violations.
CSV: document,number of violations,list and details of violations
32037 ALR10C6BZ 097,3,"1. Missing safety equipment in Zone A - Safety harnesses not available at designated stations; 2. Improper chemical storage in Building 3 - Hazardous materials not properly labeled or contained; 3. Emergency exit blocked by equipment in Warehouse B"
Score: 1.0

Testing Optimized Module:


NameError: name 'optimized_module' is not defined

In [None]:
# Save the optimized module for use in the main application
# (`os` is already imported in the first cell; the repeated import is harmless.)
import os

# Create optimized modules directory if it doesn't exist
# The path is relative, so the directory is created under the current
# working directory.
os.makedirs("optimized_modules", exist_ok=True)

# Save the optimized module using DSPy's official save method
optimized_module.save("optimized_modules/optimized_doc_analyzer.json")

# Also save the signature as text for inspection
# NOTE(review): assumes the compiled module exposes `.signature` — confirm
# for the installed DSPy version.
with open("optimized_modules/optimized_signature.txt", "w") as f:
    f.write(str(optimized_module.signature))

# Print copy-paste instructions for loading the saved state elsewhere.
print("Optimized module saved using DSPy's official .save() method!")
print("You can load it in your main application using:")
print("import dspy")
print("from modules import doc_analyzer")
print("optimized_module = dspy.ChainOfThought(doc_analyzer)")
print("optimized_module.load('optimized_modules/optimized_doc_analyzer.json')")
print("")
# NOTE(review): this last message ends with a colon but nothing is printed
# after it in this cell.
print("Or integrate it into your trend_analyzer class by modifying the __init__ method:")

In [None]:
# Try the optimized module on a real document from the training data
print("Testing with real documents from your training data...")

# Inputs mirror the setup used in test.py: one PDF attachment and the three
# inspection-report categories.
documents = [
    Attachments(
        "TrainingData/32037 ALR10C6BZ 097 07-10-2025 INSPR AEPACS NA.pdf"
    ),
]
categories = [
    "document",
    "number of violations",
    "list and details of violations",
]

# Create an optimized trend analyzer
class optimized_trend_analyzer(dspy.Module):
    """Run a supplied doc-analyzer module over documents, accumulating a CSV."""

    def __init__(self, optimized_doc_analyzer):
        """Store the analyzer module that `forward` calls for each document."""
        super().__init__()
        self.doc_analyzer_sql = optimized_doc_analyzer

    def forward(self, documents: list[Attachments], categories: list[str], context: str):
        """Analyze the documents in order, threading CSV and context through.

        Each call receives the CSV and context produced by the previous call;
        the first call receives an empty CSV and the caller's `context`.

        Returns:
            A `(csv, context)` tuple holding the outputs of the last call, or
            `("", context)` unchanged when `documents` is empty.
        """
        running_csv = ""
        for doc in documents:
            step = self.doc_analyzer_sql(
                document=doc,
                categories=categories,
                in_csv=running_csv,
                last_context=context,
            )
            # Carry both outputs forward into the next iteration.
            context, running_csv = step.next_context, step.out_csv
        return running_csv, context

# Test with optimized module
# Wrap the compiled module and run it with an empty starting context; the
# call returns the accumulated CSV and the final context as a tuple.
optimized_analyzer = optimized_trend_analyzer(optimized_module)
result, context = optimized_analyzer(
    documents=documents[:1],  # Test with first document
    categories=categories,
    context=""
)

print("Optimized Result:")
print(f"CSV Output: {result}")
print(f"Context: {context}")

## Loading the Optimized Module

Here's how to integrate the optimized module into your existing code:

```python
# Option 1: Load and use directly
import dspy
from modules import doc_analyzer

# Create and load optimized module
optimized_doc_analyzer = dspy.ChainOfThought(doc_analyzer)
optimized_doc_analyzer.load('optimized_modules/optimized_doc_analyzer.json')

# Option 2: Modify your trend_analyzer class
class optimized_trend_analyzer(dspy.Module):
    """Trend analyzer that can load saved optimizer state for its doc analyzer."""

    def __init__(self, use_optimized=True):
        """Build the ChainOfThought analyzer and optionally load saved state.

        When `use_optimized` is true, state is loaded from
        'optimized_modules/optimized_doc_analyzer.json'; if that file does
        not exist the freshly built module is used as-is.
        """
        super().__init__()
        self.doc_analyzer_sql = dspy.ChainOfThought(doc_analyzer)
        if use_optimized:
            try:
                self.doc_analyzer_sql.load('optimized_modules/optimized_doc_analyzer.json')
                print("Loaded optimized doc_analyzer module")
            except FileNotFoundError:
                # Only a missing file falls back to the default; any other
                # load error propagates.
                print("Optimized module not found, using default")

    def forward(self, documents: list[Attachments], categories: list[str], context: str):
        """Analyze documents in order and return `(csv, context)`."""
        # Each call receives the CSV and context produced by the previous one.
        doc_summary = ""
        for document in documents:
            result = self.doc_analyzer_sql(
                document=document,
                categories=categories,
                in_csv=doc_summary,
                last_context=context
            )
            context = result.next_context
            doc_summary = result.out_csv
        return doc_summary, context
```