In [None]:
from flowcept import Flowcept
from workflow import Workflow
from qa_chain import QAChain

In [None]:
# Run the workflow; the value it returns is kept as workflow_id.
workflow_id = Workflow.run()

In [None]:
# NOTE(review): this overwrites the workflow_id returned by Workflow.run() in
# the previous cell with the hard-coded value 'x' -- presumably a placeholder
# for the id of a specific run; confirm before relying on it.
workflow_id = 'x'
# Build the QA chain for that workflow id; qa is the instance that the later
# cells patch (query_df, ask, update_accuracy, export_queries) and query.
qa = QAChain().build(workflow_id)

In [None]:
# Multiple Runs Code
import time
import pandas as pd
import re

def cot_prompt(query):
    """Wrap *query* in a Chain-of-Thought instruction.

    The query is placed between a "think step by step" lead-in and a request
    to explain the reasoning, separated by single spaces, and the combined
    prompt string is returned.
    """
    lead_in = "Let's think step by step."
    closing = "Please explain your reasoning process."
    return f"{lead_in} {query} {closing}"

def benchmark_query_with_tracking(qa_chain, base_query_id, query, runs=5, use_cot=False, context=None):
    """
    Ask the same query several times and summarise the tracked measurements.

    Every run calls ``qa_chain.ask`` with its own query ID and then reads
    ``Response_Time`` and ``Response_Chars`` from the last row of
    ``qa_chain.query_df`` whose ``Query_ID`` equals that ID.

    Args:
        qa_chain: Object exposing ``ask(prompt, query_id=..., context=...)``
            and a ``query_df`` DataFrame with ``Query_ID``, ``Response_Time``
            and ``Response_Chars`` columns.
        base_query_id: Base ID for the runs. A trailing ``-R<n>`` (for example
            "DF-DL-Q01-NS-NC-R3") is stripped and ``n`` becomes the first run
            number. A run marker that is not at the very end (for example
            "DF-DL-Q01-NS-NC-R3-Swallow") is left untouched and numbering
            starts at 1, so the IDs become "...-R3-Swallow-R01", and so on.
        query: The actual query text.
        runs: Number of times to run the query.
        use_cot: Whether to use Chain of Thought prompting. When True, every
            "NS" in ``base_query_id`` is replaced by "CoT".
        context: Custom context. When falsy, ``ask`` is called without a
            ``context`` argument.

    Returns:
        dict with the keys ``base_query_id`` (after the optional CoT rename,
        run suffix still included), ``query``, ``prompt_used``, ``use_cot``,
        ``runs`` (one dict per run with ``query_id``, ``run``,
        ``response_time_sec``, ``response`` and ``char_count``),
        ``average_response_time_sec`` and ``average_char_count``. Both
        averages are 0.0 when no run was executed (``runs`` <= 0).
    """
    results = []
    prompt = cot_prompt(query) if use_cot else query

    # Reflect the prompting strategy in the ID: "NS" becomes "CoT".
    if use_cot and "NS" in base_query_id:
        base_query_id = base_query_id.replace("NS", "CoT")

    # A run marker only counts when it is the last component of the ID.
    run_match = re.search(r'-R(\d+)$', base_query_id)
    if run_match:
        start_run = int(run_match.group(1))
        base_id_clean = re.sub(r'-R\d+$', '', base_query_id)
    else:
        start_run = 1
        base_id_clean = base_query_id

    for i in range(runs):
        current_run = start_run + i
        query_id = f"{base_id_clean}-R{current_run:02d}"

        # Only forward a context when one was supplied.
        ask_kwargs = {"query_id": query_id}
        if context:
            ask_kwargs["context"] = context
        result = qa_chain.ask(prompt, **ask_kwargs)

        # Timing and size come from the row that ask() recorded for this ID.
        response = result["result"]
        query_row = qa_chain.query_df[qa_chain.query_df['Query_ID'] == query_id].iloc[-1]

        results.append({
            "query_id": query_id,
            "run": current_run,
            "response_time_sec": query_row['Response_Time'],
            "response": response,
            "char_count": query_row['Response_Chars']
        })

    # Divide by the number of runs that actually happened so that runs <= 0
    # yields 0.0 instead of a ZeroDivisionError (or a -0.0 average).
    completed = len(results)
    if completed:
        avg_time = sum(r["response_time_sec"] for r in results) / completed
        avg_char_count = sum(r["char_count"] for r in results) / completed
    else:
        avg_time = 0.0
        avg_char_count = 0.0

    return {
        "base_query_id": base_query_id,
        "query": query,
        "prompt_used": prompt,
        "use_cot": use_cot,
        "runs": results,
        "average_response_time_sec": avg_time,
        "average_char_count": avg_char_count
    }

def run_query_suite(qa_chain, query_suite):
    """
    Benchmark every configuration of a suite and collect the summaries.

    For each entry a banner with its base ID and query is printed, the entry
    is passed to ``benchmark_query_with_tracking``, and the average response
    time and character count of that benchmark are printed.

    Args:
        qa_chain: Your QAChain instance
        query_suite: Iterable of dictionaries, one per query configuration.
                    Required keys: base_id, query.
                    Optional keys: runs (default 5), use_cot (default False),
                    context (default None).

    Returns:
        List with one benchmark summary per entry, in suite order.

    Example:
        suite = [
            {
                "base_id": "DF-DL-Q01-NS-NC-R1-Swallow",
                "query": "What is the data lineage?",
                "runs": 3,
                "use_cot": False,
                "context": None
            },
            {
                "base_id": "CF-EO-Q02-CoT-FC-R5-Swallow",
                "query": "What is the execution order?",
                "runs": 3,
                "use_cot": True,
                "context": "X"
            }
        ]
    """
    separator = "=" * 60
    summaries = []

    for entry in query_suite:
        # Banner announcing the configuration about to run.
        print(f"\n{separator}")
        print(f"Running: {entry['base_id']}")
        print(f"Query: {entry['query']}")
        print(separator)

        summary = benchmark_query_with_tracking(
            qa_chain=qa_chain,
            base_query_id=entry["base_id"],
            query=entry["query"],
            runs=entry.get("runs", 5),
            use_cot=entry.get("use_cot", False),
            context=entry.get("context"),
        )
        summaries.append(summary)

        mean_time = summary["average_response_time_sec"]
        mean_chars = summary["average_char_count"]
        print(f"Average Response Time: {mean_time:.2f}s")
        print(f"Average Character Count: {mean_chars:.0f}")

    return summaries

def generate_query_ids_for_batch(base_id, runs, start_run=1):
    """
    Build the list of per-run query IDs used for batch accuracy updates.

    A trailing "-R<digits>" on ``base_id`` is dropped first; each ID is then
    the remaining stem followed by "-R" and the run number zero-padded to at
    least two digits.

    Args:
        base_id: Base query ID (a run suffix at the very end is ignored)
        runs: Number of IDs to generate
        start_run: Run number of the first ID

    Returns:
        List of query IDs for runs start_run .. start_run + runs - 1
    """
    stem = re.sub(r'-R\d+$', '', base_id)
    return [f"{stem}-R{start_run + offset:02d}" for offset in range(runs)]


In [None]:
# Attach an empty tracking DataFrame to the QAChain instance. The results are
# kept in a DataFrame for now and written to CSV afterwards (see
# export_queries below); writing straight to a CSV file would also work.
qa.query_df = pd.DataFrame(columns=[
    'Query_ID', 'Query_Text', 'Query_Chars', 'Response_Text', 
    'Response_Chars', 'Response_Time', 'Accuracy'
])

# METHODS
def update_accuracy(self, query_id, accuracy_score):
    """Set the Accuracy of every tracked row whose Query_ID equals query_id.

    Prints a confirmation after updating, or a "not found" message when no
    row of ``self.query_df`` carries that ID. Returns None in both cases.
    """
    matches = self.query_df['Query_ID'] == query_id
    if not matches.any():
        print(f"Query ID {query_id} not found")
        return

    self.query_df.loc[matches, 'Accuracy'] = accuracy_score
    print(f"Updated accuracy for {query_id}: {accuracy_score}")

def export_queries(self, filename="query_results.csv"):
    """Write ``self.query_df`` to *filename* as CSV (without the index column).

    A confirmation naming the target file is printed afterwards.
    """
    tracked_queries = self.query_df
    tracked_queries.to_csv(filename, index=False)
    print(f"Query results exported to {filename}")

# Bind the two helper functions above to this QAChain instance as methods.
# types.MethodType fixes qa as their self argument; the attributes are set on
# this instance only, not on the class.
import types
qa.update_accuracy = types.MethodType(update_accuracy, qa)
qa.export_queries = types.MethodType(export_queries, qa)

# Sanity check: display the tracking DataFrame created at the top of this cell.
print(qa.query_df)

In [None]:
# Replacement for qa.ask that also records each query and its response in qa.query_df
import types

def ask(self, query, query_id=None, context=None):
    """
    Send a query to the QA chain, time it, and record it in ``self.query_df``.

    The text sent to ``self.qa_chain`` is ``"<context>. <query>"``. After the
    call one row is appended to ``self.query_df`` and the question, the
    response text and the elapsed time are printed.

    Args:
        query: Question text. Only this text (not the context prefix) is
            stored in ``Query_Text`` and counted in ``Query_Chars``.
        query_id: Value for the ``Query_ID`` column. When None, an ID of the
            form ``Q_<current row count + 1>`` is generated.
        context: Text placed in front of the query. When None, a default
            description of the task documents and their timing fields is used.

    Returns:
        The object returned by ``self.qa_chain``; its ``"result"`` entry is
        the response text.
    """
    # Function-scope imports keep this notebook cell runnable on its own.
    from time import perf_counter
    import pandas as pd

    # If no query_id provided, auto-generate one
    if query_id is None:
        query_id = f"Q_{len(self.query_df) + 1}"

    if context is None:
        context = "Each document represents a task. All tasks belong to a same workflow execution trace. "
        context += "The time the task started is stored in the started_at. The time the task ended is stored in the ended_at. The task duration is ended_at - started_at for each task "

    # Prepare full query text
    full_query = f"{context}. {query}"

    # Time the query with a monotonic, high-resolution clock: unlike the
    # wall clock, it cannot jump if the system time is adjusted mid-call.
    started = perf_counter()
    result = self.qa_chain({"query": full_query})
    response_time = perf_counter() - started

    # Extract response text
    response_text = result["result"]

    # Add to tracking DataFrame
    new_row = {
        'Query_ID': query_id,
        'Query_Text': query,
        'Query_Chars': len(query),
        'Response_Text': response_text,
        'Response_Chars': len(response_text),
        'Response_Time': response_time,
        'Accuracy': None  # To be filled manually
    }

    self.query_df = pd.concat([self.query_df, pd.DataFrame([new_row])], ignore_index=True)

    print(f"Q: {query}")
    print(response_text)
    print(f"---------------- I took {response_time:.1f} s to answer this.")
    print("\n\n")

    return result

# Bind the tracking ask() defined above to the existing qa instance, so that
# qa.ask(...) now records every query in qa.query_df.
qa.ask = types.MethodType(ask, qa)

# Queries

In [None]:
# Suite of eight questions. Every entry is asked 3 times, without
# Chain-of-Thought prompting and with the default context (context=None).
# NOTE(review): the "-R1" marker sits before "-Swallow", and
# benchmark_query_with_tracking only parses a trailing "-R<n>", so the tracked
# IDs become "<base_id>-R01", "<base_id>-R02", ... -- confirm this is the
# intended ID format.
query_suite = [
    {
        "base_id": base_id,
        "query": question,
        "runs": 3,
        "use_cot": False,
        "context": None,
    }
    for base_id, question in [
        ("DF-INOP-Q01-NS-LC-R1-Swallow",
         "What tasks take the value 'H' as input?"),
        ("DF-INOP-Q02-NS-LC-R1-Swallow",
         "Was the output of I_TO_H used directly by multiple downstream tasks? If so what tasks?"),
        ("DF-INOP-Q03-NS-LC-R1-Swallow",
         "Which tasks produce intermediate values that are consumed by other tasks?"),
        ("DF-INOP-Q04-NS-LC-R1-Swallow",
         "How many values are combined to produce the final output 'A'?"),
        ("DF-INOP-Q05-NS-LC-R1-Swallow",
         "Did any task produce more than one output?"),
        ("DF-INOP-Q06-NS-LC-R1-Swallow",
         "What is the data type of the output produced by 'I_TO_H'?"),
        ("DF-INOP-Q07-NS-LC-R1-Swallow",
         "What are the output values of 'E_TO_D', 'F_TO_C', and 'G_TO_B'?"),
        ("DF-INOP-Q08-NS-LC-R1-Swallow",
         "How can 'H_TO_G's output change given the input?"),
    ]
]

# Run all queries in the suite; one summary per entry ends up in all_results.
all_results = run_query_suite(qa, query_suite)