In [18]:
SCORER_ANALYSIS_PROMPT= """ A scorer named {scorer_name} is run on different runs of the same agent/llm/tool,
the scorer gives a score and a reasoning, you will be given 25 such samples. Now the scorer may be giving different
reasonings and all of them are in natural language. I want you to highlight the key reasons for your response.
These key reasons will further be used for analysis, and improvement of the agent. Do not suggest any fixes.
Just focus on not missing out on any of the information regarding why the agent is failing.
You have to focus on the low scores only, as we have to improve them.
Some rows might show - "Missing required fields" it is a code issue, on the developer's side, so do not include it in the reasoning.

Just to clarify, your job is not to analyze the scorers, but to analyze the agent. You are basically the representative of the scorers.

Give the reasoning, and start with the scorer name.

In the format - 
Scorer Name: Task Progression
Reasoning: 

"""

In [19]:
AGENTWISE_SUMMARY_PROMPT= """
Different scorers are run on different runs of the same agent/llm/tool, you will be given the reasoning
for each scorer, as to why it gave poor scores. You job is to summarize the information from different
scorers into a single analysis. All the scores are of one specific part of the entire agentic workflow,
so please remove the redundancies that you get. Do not try to suggest fixes, only focus on removing
the redundant information, and keeping the important information.

Just to clarify, your job is not to analyze the summaries, but to analyze the agent. You are basically the representative of the scorers.

In the format -
Agent Name: query_generation
Reasoning: 

"""

In [20]:
FINAL_ANALYSIS_PROMPT= """
An agent is run, and then different scorers are run on specific parts of the agentic workflow.
So if an agent has 5 different parts (llm/tool/agent), and there are 3 scorers, then there will be a total of 15 scores, and respective reasonings. I have condensed these reasonings, in a part wise manner.
You will be given the part wise analysis, and you will also be given the entire agentic workflow, explaining how the agent is set up.

You have to figure out why the agent is failing, you are given a bird's eye view, as in agents, a failure at step 1, may surface at step 3 in the analysis, so you will have to be aware of that.

You have to suggest fixes to the developer in bullet points, in the format ->

Suggested Fixes:
 - fix_1: 
 - fix_2: 
 
"""

In [21]:
import os
import pandas as pd
import google.generativeai as genai
from dotenv import load_dotenv
from datetime import datetime
import json
from pathlib import Path

# Load environment variables
load_dotenv()

# Configure Gemini API
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
model = genai.GenerativeModel('gemini-2.5-pro')

print("Setup complete!")


Setup complete!


In [22]:
# Read the reddit_agent.md file
with open('reddit_agent.md', 'r', encoding='utf-8') as f:
    reddit_agent_doc = f.read()

print(f"Reddit agent document loaded: {len(reddit_agent_doc)} characters")

# Create log directory
log_dir = Path('log')
log_dir.mkdir(exist_ok=True)

# Generate timestamp for log file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = log_dir / f"analysis_log_{timestamp}.txt"

print(f"Log file will be saved to: {log_file}")


Reddit agent document loaded: 8492 characters
Log file will be saved to: log/analysis_log_20250925_004622.txt


In [23]:
# Function to log responses
def log_response(response, description):
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(f"\n{'='*50}\n")
        f.write(f"TIMESTAMP: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"DESCRIPTION: {description}\n")
        f.write(f"{'='*50}\n")
        f.write(f"{response}\n")
        f.write(f"{'='*50}\n\n")

def process_csv_file(csv_path):
    """
    Process CSV file and extract scorer data for first 25 rows
    Returns dict with scorer data strings
    """
    df = pd.read_csv(csv_path)
    
    # Get first 25 rows (or all if less than 25)
    rows_to_process = min(25, len(df))
    df_subset = df.head(rows_to_process)
    
    # Get column names
    columns = df.columns.tolist()
    
    # Skip first 4 columns (IDs)
    # The remaining columns are split into score columns and reasoning columns
    remaining_columns = columns[4:]
    n_scorers = len(remaining_columns) // 2  # Each scorer has score + reasoning column
    
    score_columns = remaining_columns[:n_scorers]  # First half are score columns
    reasoning_columns = remaining_columns[n_scorers:]  # Second half are reasoning columns
    
    scorer_data = {}
    
    # For each scorer
    for i, scorer_name in enumerate(score_columns):
        scorer_strings = []
        reasoning_col = reasoning_columns[i]
        
        for _, row in df_subset.iterrows():
            score = row[scorer_name]
            reasoning = row[reasoning_col]
            scorer_string = f"{scorer_name} score = {score} reasoning = {reasoning}"
            scorer_strings.append(scorer_string)
        
        scorer_data[scorer_name] = "\n".join(scorer_strings)
    
    return scorer_data
print("Helper functions defined!")


Helper functions defined!


In [24]:
# Main processing loop
demo_results_dir = Path('demo_results')
all_summaries = []

# Get all dataset directories
dataset_dirs = [d for d in demo_results_dir.iterdir() if d.is_dir()]

print(f"Found {len(dataset_dirs)} dataset directories to process:")
for d in dataset_dirs:
    print(f"  - {d.name}")

# Process each dataset directory
for dataset_dir in dataset_dirs:
    print(f"\nProcessing {dataset_dir.name}...")
    
    # Find CSV file in the directory
    csv_files = list(dataset_dir.glob('*.csv'))
    if not csv_files:
        print(f"  No CSV files found in {dataset_dir.name}, skipping...")
        continue
    
    csv_file = csv_files[0]  # Take the first CSV file
    print(f"  Processing CSV: {csv_file.name}")
    
    # Process the CSV file
    scorer_data = process_csv_file(csv_file)
    
    # Store responses for this dataset
    dataset_responses = []
    
    # For each scorer, make a Gemini API call
    for scorer_name, scorer_string in scorer_data.items():
        print(f"    Making Gemini call for scorer: {scorer_name}")
        
        # TODO: Replace with actual prompt template
        prompt = f"""{SCORER_ANALYSIS_PROMPT}
        
        Scorer Data: 
        Scorer Name: {scorer_name}
        {scorer_string}
        """
        
        try:
            response = model.generate_content(prompt)
            response_text = response.text
            response_text = "Analysis for scorer: " + scorer_name + "\n" + response_text
            # Log the response
            log_response(response_text, f"{dataset_dir.name} - {scorer_name} Analysis")
            dataset_responses.append(f"Scorer: {scorer_name}\n{response_text}")
            
        except Exception as e:
            error_msg = f"Error processing {scorer_name}: {str(e)}"
            print(f"    {error_msg}")
            log_response(error_msg, f"{dataset_dir.name} - {scorer_name} Error")
    
    # Make summary call for this dataset
    print(f"    Making summary call for {dataset_dir.name}")
    
    combined_responses = "\n\n".join(dataset_responses)
    
    # TODO: Replace with actual summary prompt template
    summary_prompt = f"""{AGENTWISE_SUMMARY_PROMPT}
    
    Agent Name: {dataset_dir.name}
    Scorer Analyses:
    {combined_responses}
    """
    
    try:
        summary_response = model.generate_content(summary_prompt)
        summary_text = summary_response.text
        
        # Log the summary
        log_response(summary_text, f"{dataset_dir.name} - Dataset Summary")
        all_summaries.append(f"Dataset: {dataset_dir.name}\n{summary_text}")
        
    except Exception as e:
        error_msg = f"Error creating summary for {dataset_dir.name}: {str(e)}"
        print(f"    {error_msg}")
        log_response(error_msg, f"{dataset_dir.name} - Summary Error")

print(f"\nCompleted processing {len(dataset_dirs)} datasets.")


Found 5 dataset directories to process:
  - email_gen_send_dataset
  - agent_comment_gen_dataset
  - post_validation_dataset
  - agent_query_gen_dataset
  - tavily_search_results_dataset

Processing email_gen_send_dataset...
  Processing CSV: agent_evaluation_results.csv
    Making Gemini call for scorer: task_progression
    Making Gemini call for scorer: context_relevancy
    Making Gemini call for scorer: role_adherence
    Making Gemini call for scorer: tool_relevancy
    Making Gemini call for scorer: parameter_correctness
    Making summary call for email_gen_send_dataset

Processing agent_comment_gen_dataset...
  Processing CSV: agent_evaluation_results.csv
    Making Gemini call for scorer: task_progression
    Making Gemini call for scorer: context_relevancy
    Making Gemini call for scorer: role_adherence
    Making Gemini call for scorer: tool_relevancy
    Making Gemini call for scorer: parameter_correctness
    Making summary call for agent_comment_gen_dataset

Processing

In [25]:
# Final comprehensive analysis
print("\nMaking final comprehensive analysis call...")

# Combine all dataset summaries
combined_summaries = "\n\n".join(all_summaries)

# TODO: Replace with actual final analysis prompt template
final_prompt = f"""{FINAL_ANALYSIS_PROMPT}

Reddit Agent Documentation:
{reddit_agent_doc}

Dataset Summaries:
{combined_summaries}
"""

try:
    final_response = model.generate_content(final_prompt)
    final_text = final_response.text
    
    # Log the final analysis
    log_response(final_text, "Final Comprehensive Analysis")
    
    print("Final analysis completed and logged!")
    print(f"All responses have been logged to: {log_file}")
    
except Exception as e:
    error_msg = f"Error creating final analysis: {str(e)}"
    print(error_msg)
    log_response(error_msg, "Final Analysis Error")

print("\n" + "="*50)
print("ANALYSIS COMPLETE!")
print("="*50)
print(f"Log file location: {log_file}")
print(f"Total datasets processed: {len(dataset_dirs)}")
print(f"Total summaries generated: {len(all_summaries)}")
print("="*50)



Making final comprehensive analysis call...
Final analysis completed and logged!
All responses have been logged to: log/analysis_log_20250925_004622.txt

ANALYSIS COMPLETE!
Log file location: log/analysis_log_20250925_004622.txt
Total datasets processed: 5
Total summaries generated: 5
