In [10]:
# Multi-Agent EDA Framework with AutoGen
# Author: Implementation for Sourav Banerjee
# Date: 2025-06-01

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from typing import Dict, List, Any
import json
from datetime import datetime
import autogen
from autogen import AssistantAgent, UserProxyAgent, GroupChat, GroupChatManager

In [11]:

# config_list_gpt4o = autogen.config_list_from_json(
#     "OAI_CONFIG_LIST",
#     filter_dict={
#         "model": ['gpt-4o']
#     },
# )
# llm_config = {
#     "cache_seed": 42,  # change the cache_seed for different trials
#     "temperature": 0,
#     "config_list": config_list_gpt4o,
#     "timeout": 120,
# }

In [12]:


# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')

# Configuration for AutoGen
config_list_gpt4o = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model": ['gpt-4o']
    },
)
llm_config = {
    "cache_seed": 42,  # change the cache_seed for different trials
    "temperature": 0,
    "config_list": config_list_gpt4o,
    "timeout": 120,
}

print("🚀 Multi-Agent EDA Framework Initialization")
print("=" * 60)

# Global variables to store analysis results
analysis_results = {}
dataset_info = {}
visualizations_created = []

# Sample dataset creation function
def create_sample_dataset():
    """
    Creates a comprehensive sample dataset for EDA demonstration.
    This dataset simulates e-commerce customer data with various data types and patterns.
    """
    np.random.seed(42)
    n_samples = 1000
    
    # Generate synthetic e-commerce customer data
    data = {
        'customer_id': range(1, n_samples + 1),
        'age': np.random.normal(35, 12, n_samples).astype(int),
        'gender': np.random.choice(['Male', 'Female', 'Other'], n_samples, p=[0.45, 0.50, 0.05]),
        'income': np.random.lognormal(10.5, 0.5, n_samples),
        'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], 
                                    n_samples, p=[0.3, 0.4, 0.25, 0.05]),
        'city': np.random.choice(['Mumbai', 'Delhi', 'Bangalore', 'Chennai', 'Kolkata'], 
                                n_samples, p=[0.25, 0.20, 0.20, 0.20, 0.15]),
        'purchase_amount': np.random.exponential(500, n_samples),
        'num_purchases': np.random.poisson(5, n_samples),
        'satisfaction_score': np.random.normal(7.5, 1.5, n_samples),
        'is_premium': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
    }
    
    # Add some missing values to make it realistic
    missing_indices = np.random.choice(n_samples, size=int(0.05 * n_samples), replace=False)
    for idx in missing_indices[:len(missing_indices)//3]:
        data['income'][idx] = np.nan
    for idx in missing_indices[len(missing_indices)//3:2*len(missing_indices)//3]:
        data['satisfaction_score'][idx] = np.nan
    for idx in missing_indices[2*len(missing_indices)//3:]:
        data['education'][idx] = None
    
    # Create DataFrame and save
    df = pd.DataFrame(data)
    df['age'] = np.clip(df['age'], 18, 80)  # Ensure realistic age range
    df['satisfaction_score'] = np.clip(df['satisfaction_score'], 1, 10)  # Score range 1-10
    
    df.to_csv('sample_ecommerce_data.csv', index=False)
    print(f"✅ Sample dataset created: sample_ecommerce_data.csv ({len(df)} rows, {len(df.columns)} columns)")
    return df

# Create sample dataset
sample_df = create_sample_dataset()

# Agent Definitions with Specific Roles and Capabilities

# 1. Data Preparation Agent
data_prep_agent = AssistantAgent(
    name="DataPrepAgent",
    system_message="""You are a Data Preparation Agent specializing in data cleaning and preprocessing.
    
    Your responsibilities include:
    - Loading and inspecting datasets
    - Identifying data quality issues (missing values, duplicates, outliers)
    - Performing data cleaning operations
    - Data type conversions and standardization
    - Creating data quality reports
    
    Always provide detailed explanations of your preprocessing steps and rationale.
    Use Python code to demonstrate your analysis and cleaning procedures.
    Store results in global variables for other agents to access.
    """,
    llm_config=llm_config,
    code_execution_config={"work_dir": "eda_workspace", "use_docker": False}
)

# 2. EDA Agent  
eda_agent = AssistantAgent(
    name="EDAAgent",
    system_message="""You are an EDA Agent specializing in statistical analysis and data visualization.
    
    Your responsibilities include:
    - Conducting comprehensive statistical summarization
    - Generating descriptive statistics
    - Creating meaningful visualizations (distributions, correlations, trends)
    - Identifying patterns, outliers, and relationships in data
    - Providing statistical insights and interpretations
    
    Create high-quality visualizations using matplotlib and seaborn.
    Provide detailed statistical interpretations of your findings.
    Store visualization paths and insights in global variables.
    """,
    llm_config=llm_config,
    code_execution_config={"work_dir": "eda_workspace", "use_docker": False}
)

# 3. Report Generator Agent
report_agent = AssistantAgent(
    name="ReportAgent", 
    system_message="""You are a Report Generator Agent specializing in creating comprehensive EDA reports.
    
    Your responsibilities include:
    - Synthesizing findings from data preparation and EDA phases
    - Creating well-structured, professional reports
    - Organizing insights into logical sections
    - Providing actionable recommendations
    - Ensuring report clarity and readability
    
    Generate reports in markdown format with proper sections and formatting.
    Include executive summaries, detailed findings, and recommendations.
    Reference specific statistics and visualizations in your reports.
    """,
    llm_config=llm_config,
    code_execution_config={"work_dir": "eda_workspace", "use_docker": False}
)

# 4. Critic Agent
critic_agent = AssistantAgent(
    name="CriticAgent",
    system_message="""You are a Critic Agent responsible for quality assurance and feedback.
    
    Your responsibilities include:
    - Reviewing outputs from other agents for accuracy and completeness
    - Providing constructive feedback on analysis quality
    - Identifying gaps or areas for improvement
    - Ensuring statistical validity and interpretation accuracy
    - Validating visualization effectiveness and clarity
    
    Provide specific, actionable feedback with clear reasoning.
    Focus on statistical accuracy, completeness, and clarity of communication.
    Suggest improvements and validate that recommendations are followed.
    """,
    llm_config=llm_config,
    code_execution_config={"work_dir": "eda_workspace", "use_docker": False}
)

# 5. Executor Agent (User Proxy)
executor_agent = UserProxyAgent(
    name="ExecutorAgent",
    system_message="""You are an Executor Agent responsible for code validation and execution oversight.
    
    Your responsibilities include:
    - Executing and validating code from other agents
    - Ensuring code runs without errors
    - Verifying result accuracy
    - Managing file operations and data persistence
    - Coordinating technical execution aspects
    
    Execute code carefully and report any issues or errors.
    Validate that results match expectations and are technically sound.
    """,
    human_input_mode="NEVER",
    code_execution_config={"work_dir": "eda_workspace", "use_docker": False},
    max_consecutive_auto_reply=3
)

# 6. Admin Agent (Group Chat Manager)
admin_agent = AssistantAgent(
    name="AdminAgent",
    system_message="""You are an Admin Agent overseeing the entire EDA workflow.
    
    Your responsibilities include:
    - Coordinating tasks between all agents
    - Ensuring workflow alignment with project goals  
    - Managing the overall EDA process timeline
    - Facilitating communication between agents
    - Making final decisions on workflow direction
    
    Guide the conversation flow logically through EDA phases:
    1. Data Preparation and Quality Assessment
    2. Statistical Analysis and Visualization  
    3. Report Generation
    4. Quality Review and Feedback Integration
    
    Ensure each phase is completed thoroughly before moving to the next.
    """,
    llm_config=llm_config,
    code_execution_config={"work_dir": "eda_workspace", "use_docker": False}
)

# Create workspace directory
os.makedirs("eda_workspace", exist_ok=True)

print("\n🤖 Agents Initialized Successfully")
print("Agents created:")
print("- DataPrepAgent: Data cleaning and preprocessing")
print("- EDAAgent: Statistical analysis and visualization") 
print("- ReportAgent: Comprehensive report generation")
print("- CriticAgent: Quality assurance and feedback")
print("- ExecutorAgent: Code validation and execution")
print("- AdminAgent: Workflow coordination and management")

# Setup Group Chat
agents_list = [admin_agent, data_prep_agent, eda_agent, report_agent, critic_agent, executor_agent]

groupchat = GroupChat(
    agents=agents_list,
    messages=[],
    max_round=50,
    speaker_selection_method="round_robin"
)

manager = GroupChatManager(groupchat=groupchat, llm_config=llm_config)

print("\n🔄 Group Chat Configured")
print(f"Agents in workflow: {len(agents_list)}")
print("Speaker selection: Round-robin")
print("Maximum rounds: 50")

# Workflow Execution Function
def execute_eda_workflow(dataset_path: str):
    """
    Executes the complete multi-agent EDA workflow on the specified dataset.
    
    Args:
        dataset_path (str): Path to the CSV dataset file
        
    Returns:
        Dict: Comprehensive results from the EDA process
    """
    
    print(f"\n🎯 Starting EDA Workflow for: {dataset_path}")
    print("=" * 60)
    
    # Initial workflow message
    initial_message = f"""
    Welcome to the Multi-Agent EDA Framework! 
    
    We need to perform a comprehensive Exploratory Data Analysis on the dataset: {dataset_path}
    
    Our workflow will proceed through these phases:
    
    Phase 1: Data Preparation and Quality Assessment
    - Load and inspect the dataset
    - Identify and handle data quality issues
    - Perform necessary preprocessing
    
    Phase 2: Statistical Analysis and Visualization
    - Generate descriptive statistics
    - Create meaningful visualizations
    - Identify patterns and relationships
    
    Phase 3: Report Generation
    - Synthesize findings into a comprehensive report
    - Provide actionable insights and recommendations
    
    Phase 4: Quality Review and Feedback Integration  
    - Review all outputs for accuracy and completeness
    - Incorporate feedback and improvements
    
    Let's begin with Phase 1. DataPrepAgent, please start by loading and inspecting the dataset.
    """
    
    # Execute the group chat
    result = executor_agent.initiate_chat(
        manager,
        message=initial_message,
        summary_method="reflection_with_llm"
    )
    
    return result

# Execute the EDA workflow
print("\n🚀 Launching Multi-Agent EDA Workflow")
print("Dataset: sample_ecommerce_data.csv")
print("Timestamp:", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Run the workflow
workflow_results = execute_eda_workflow("sample_ecommerce_data.csv")

print("\n✅ EDA Workflow Completed Successfully!")
print("=" * 60)

# Additional utility functions for post-processing

def generate_workflow_summary():
    """
    Generates a summary of the completed EDA workflow.
    """
    summary = {
        "workflow_completion_time": datetime.now().isoformat(),
        "dataset_analyzed": "sample_ecommerce_data.csv",
        "agents_involved": len(agents_list),
        "phases_completed": [
            "Data Preparation and Quality Assessment",
            "Statistical Analysis and Visualization", 
            "Report Generation",
            "Quality Review and Feedback Integration"
        ],
        "key_deliverables": [
            "Data quality assessment report",
            "Statistical summary and insights",
            "Comprehensive visualizations",
            "Final EDA report with recommendations"
        ]
    }
    
    # Save summary to file
    with open("eda_workspace/workflow_summary.json", "w") as f:
        json.dump(summary, f, indent=2)
    
    print("\n📊 Workflow Summary Generated")
    print(f"Summary saved to: eda_workspace/workflow_summary.json")
    
    return summary

# Generate final summary
final_summary = generate_workflow_summary()

print("\n🎉 Multi-Agent EDA Framework Execution Complete!")
print("=" * 60)
print("Key Achievements:")
print("✓ Dataset loaded and preprocessed successfully")
print("✓ Comprehensive statistical analysis performed") 
print("✓ High-quality visualizations generated")
print("✓ Professional EDA report created")
print("✓ Quality assurance and feedback integrated")
print("✓ All deliverables saved to eda_workspace/")

print(f"\n📁 Output Location: {os.path.abspath('eda_workspace')}")
print("Files generated:")
print("- Data quality reports")
print("- Statistical analysis results") 
print("- Visualization files")
print("- Comprehensive EDA report")
print("- Workflow execution summary")

# Code for manual execution and testing
if __name__ == "__main__":
    print("\n" + "="*80)
    print("MULTI-AGENT EDA FRAMEWORK - EXECUTION COMPLETE")
    print("="*80)
    print(f"Framework developed for: Sourav Banerjee")
    print(f"Execution date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Dataset processed: sample_ecommerce_data.csv")
    print(f"Total agents deployed: {len(agents_list)}")
    print("="*80)


🚀 Multi-Agent EDA Framework Initialization
✅ Sample dataset created: sample_ecommerce_data.csv (1000 rows, 10 columns)

🤖 Agents Initialized Successfully
Agents created:
- DataPrepAgent: Data cleaning and preprocessing
- EDAAgent: Statistical analysis and visualization
- ReportAgent: Comprehensive report generation
- CriticAgent: Quality assurance and feedback
- ExecutorAgent: Code validation and execution
- AdminAgent: Workflow coordination and management

🔄 Group Chat Configured
Agents in workflow: 6
Speaker selection: Round-robin
Maximum rounds: 50

🚀 Launching Multi-Agent EDA Workflow
Dataset: sample_ecommerce_data.csv
Timestamp: 2025-06-01 21:42:25

🎯 Starting EDA Workflow for: sample_ecommerce_data.csv
[33mExecutorAgent[0m (to chat_manager):


    Welcome to the Multi-Agent EDA Framework! 

    We need to perform a comprehensive Exploratory Data Analysis on the dataset: sample_ecommerce_data.csv

    Our workflow will proceed through these phases:

    Phase 1: Data Preparation

In [13]:
workflow_results

ChatResult(chat_id=None, chat_history=[{'content': "\n    Welcome to the Multi-Agent EDA Framework! \n\n    We need to perform a comprehensive Exploratory Data Analysis on the dataset: sample_ecommerce_data.csv\n\n    Our workflow will proceed through these phases:\n\n    Phase 1: Data Preparation and Quality Assessment\n    - Load and inspect the dataset\n    - Identify and handle data quality issues\n    - Perform necessary preprocessing\n\n    Phase 2: Statistical Analysis and Visualization\n    - Generate descriptive statistics\n    - Create meaningful visualizations\n    - Identify patterns and relationships\n\n    Phase 3: Report Generation\n    - Synthesize findings into a comprehensive report\n    - Provide actionable insights and recommendations\n\n    Phase 4: Quality Review and Feedback Integration  \n    - Review all outputs for accuracy and completeness\n    - Incorporate feedback and improvements\n\n    Let's begin with Phase 1. DataPrepAgent, please start by loading and 

In [17]:
print(workflow_results.chat_history)

[{'content': "\n    Welcome to the Multi-Agent EDA Framework! \n\n    We need to perform a comprehensive Exploratory Data Analysis on the dataset: sample_ecommerce_data.csv\n\n    Our workflow will proceed through these phases:\n\n    Phase 1: Data Preparation and Quality Assessment\n    - Load and inspect the dataset\n    - Identify and handle data quality issues\n    - Perform necessary preprocessing\n\n    Phase 2: Statistical Analysis and Visualization\n    - Generate descriptive statistics\n    - Create meaningful visualizations\n    - Identify patterns and relationships\n\n    Phase 3: Report Generation\n    - Synthesize findings into a comprehensive report\n    - Provide actionable insights and recommendations\n\n    Phase 4: Quality Review and Feedback Integration  \n    - Review all outputs for accuracy and completeness\n    - Incorporate feedback and improvements\n\n    Let's begin with Phase 1. DataPrepAgent, please start by loading and inspecting the dataset.\n    ", 'role'