# In [0]:
import dataiku
import pandas as pd
import time
import json

def _first_present_column(frame, candidates):
    """Return the first column of *frame* whose name is in *candidates*, else None."""
    for name in candidates:
        if name in frame.columns:
            return frame[name]
    return None


def _numeric_or_zero(frame, candidates):
    """Return the first matching column as numbers, with zeros for bad/missing values.

    When none of *candidates* exists, a zero Series aligned on ``frame.index``
    is returned. A bare scalar fallback would break here: ``pd.to_numeric``
    gives back a scalar for scalar input, and scalars have no ``.fillna``.
    """
    column = _first_present_column(frame, candidates)
    if column is None:
        return pd.Series(0.0, index=frame.index)
    return pd.to_numeric(column, errors='coerce').fillna(0)


def _label_or_default(frame, candidates, null_label, absent_label):
    """Return the first matching column with nulls replaced by *null_label*.

    When none of *candidates* exists, every row gets *absent_label* instead.
    """
    column = _first_present_column(frame, candidates)
    if column is None:
        return pd.Series(absent_label, index=frame.index)
    return column.fillna(null_label)


def _accumulate_llm_usage(chunk, stats):
    """Add the LLM usage found in one dataframe *chunk* to *stats* (in place).

    *stats* maps ``(agent, model)`` to a dict with ``cost``, ``tokens`` and
    ``calls`` running totals. Chunks without a ``topic`` column, or without
    any row whose topic matches ``llm|external-model``, leave it untouched.
    """
    if 'topic' not in chunk.columns:
        return

    # Keep only rows whose topic mentions an LLM / external model.
    is_llm = chunk['topic'].astype(str).str.contains('llm|external-model', case=False, na=False)
    llm_rows = chunk[is_llm]
    if llm_rows.empty:
        return

    # Original author's note: DSS flattens nested JSON, so e.g.
    # 'data.usage.estimatedCost' is expected to be a literal column name.
    usage = pd.DataFrame({
        'agent': _label_or_default(
            llm_rows,
            ['data.context.agentName', 'context.agentName',
             'data.details.agentName', 'details.agentName',
             'data.context.agentId'],
            null_label="Direct/Unknown",
            absent_label="Unknown",
        ),
        'model': _label_or_default(
            llm_rows,
            ['data.details.llmId', 'details.llmId',
             'data.target.llmId', 'target.llmId'],
            null_label="N/A",
            absent_label="N/A",
        ),
        'cost': _numeric_or_zero(
            llm_rows, ['data.usage.estimatedCost', 'usage.estimatedCost']),
        'tokens': _numeric_or_zero(
            llm_rows, ['data.usage.totalTokens', 'usage.totalTokens']),
        # One call per matching row; summing this column yields the call count.
        'calls': 1,
    })

    per_key = usage.groupby(['agent', 'model']).sum()
    for (agent, model), totals in per_key.iterrows():
        entry = stats.setdefault((agent, model), {'cost': 0.0, 'tokens': 0, 'calls': 0})
        entry['cost'] += totals['cost']
        entry['tokens'] += totals['tokens']
        entry['calls'] += totals['calls']


def _build_cost_report(stats):
    """Turn the accumulated *stats* into the final report DataFrame."""
    if not stats:
        # Kept as-is for callers: an empty frame with a single "Message" column.
        return pd.DataFrame(columns=["Message"])

    rows = [
        {
            "Agent Name": agent,
            "LLM Model": model,
            "Total Cost ($)": round(metrics['cost'], 4),
            "Total Tokens": int(metrics['tokens']),
            "Call Count": int(metrics['calls']),
        }
        for (agent, model), metrics in stats.items()
    ]
    return pd.DataFrame(rows).sort_values("Total Cost ($)", ascending=False)


def analyze_agent_costs_via_connection(audit_path="/data/dataiku/dss_data/run/audit"):
    """Summarise LLM cost, token and call counts per agent/model from audit logs.

    Creates a temporary Filesystem connection rooted at *audit_path* and a
    temporary JSON dataset on top of it in the default project, streams the
    dataset in chunks, aggregates rows whose ``topic`` matches
    ``llm|external-model``, and finally deletes both temporary objects.

    Args:
        audit_path: Directory used as the root of the temporary connection.

    Returns:
        A DataFrame with the columns "Agent Name", "LLM Model",
        "Total Cost ($)", "Total Tokens" and "Call Count", sorted by cost in
        descending order; an empty DataFrame with a single "Message" column
        when no matching rows were found; or ``None`` when the connection
        could not be created or any later step failed (the error is printed).
    """
    client = dataiku.api_client()
    project = client.get_default_project()

    # Timestamp suffix keeps the temporary object names unique per run.
    ts = int(time.time())
    conn_name = f"tmp_audit_conn_{ts}"
    ds_name = f"tmp_audit_logs_{ts}"

    dataset = None
    conn = None

    print(f"1. Creating temporary connection to: {audit_path}")
    try:
        # Create a Filesystem connection rooted at the audit log path
        conn = client.create_connection(conn_name, "Filesystem", {
            "root": audit_path
        })
    except Exception as e:
        print(f"Error creating connection: {e}")
        return None

    try:
        print(f"2. Creating temporary dataset: {ds_name}")
        dataset = project.create_dataset(ds_name, "Filesystem", params={
            "connection": conn_name,
            "path": "/"
        }, formatType="json")

        # 2.1 Configure format (one JSON object per line)
        settings = dataset.get_settings()
        settings.get_raw()["formatParams"] = {
            "style": "no_array",
            "charset": "utf8",
            "ignoreBadRecords": True
        }
        settings.save()

        # 2.5 Force schema autodetection so the dataset exposes columns
        # before it is read.
        print("   - Detecting schema from log files...")
        settings = dataset.autodetect_settings()
        settings.save()  # Save the detected schema
        print(f"   - Schema detected: {[c['name'] for c in settings.get_raw().get('schema', {}).get('columns', [])]}")

        print("3. Reading and aggregating data...")

        stats = {}
        dku_ds = dataiku.Dataset(ds_name)

        # Stream in chunks so the whole log never has to fit in memory.
        for df in dku_ds.iter_dataframes(chunksize=10000):
            _accumulate_llm_usage(df, stats)

        return _build_cost_report(stats)

    except Exception as e:
        # Top-level boundary: callers rely on None (not an exception) on failure.
        print(f"\nCRITICAL ERROR: {e}")
        return None

    finally:
        print("4. Cleaning up temporary artifacts...")
        # Best-effort cleanup: a failure is reported but must not mask the
        # result or the original error. The dataset goes first because it
        # depends on the connection.
        if dataset is not None:
            try:
                # NOTE(review): relies on delete() leaving the underlying
                # audit files in place by default - confirm against the API.
                dataset.delete()
            except Exception as e:
                print(f"   - Warning: could not delete temporary dataset {ds_name}: {e}")
        if conn is not None:
            try:
                conn.delete()
            except Exception as e:
                print(f"   - Warning: could not delete temporary connection {conn_name}: {e}")

# --- Execution ---
# Run the analysis against the audit directory and print either the
# resulting table or a fallback notice.
LOG_PATH = "/data/dataiku/dss_data/run/audit"
df_report = analyze_agent_costs_via_connection(LOG_PATH)

# A None result (failure) and an empty frame (nothing matched) are both
# reported with the same notice.
if df_report is None or df_report.empty:
    print("\nNo utilization data found.")
else:
    print("\n--- Agent Cost & Utilization Report ---")
    print(df_report.to_string(index=False))