In [9]:
import autogen
from autogen import ConversableAgent, GroupChat, GroupChatManager
import os
import pandas as pd
import numpy as np

# Configuration for GPT-4o model
config_list_gpt4o = autogen.config_list_from_json(
    "OAI_CONFIG_LIST",
    filter_dict={
        "model": ['gpt-4o']
    },
)

gpt4o_config = {
    "cache_seed": 42,
    "temperature": 0.1,
    "config_list": config_list_gpt4o,
    "timeout": 300,
}

# Admin Agent - Project Oversight
admin_agent = autogen.UserProxyAgent(
    name="Admin",
    system_message="""You are the Admin agent overseeing the EDA process. Your responsibilities:
    - Initiate the EDA process with user requirements
    - Ensure the analysis meets the specified objectives
    - Approve the final comprehensive EDA report
    - Coordinate between agents when needed
    
    When a user requests EDA insights, you'll work with the team to:
    1. Identify and acquire the dataset
    2. Conduct comprehensive exploratory data analysis
    3. Generate visualizations and statistical insights
    4. Produce a professional EDA report with data overview, key insights, visualizations, and findings summary
    
    Always ensure the final deliverable is comprehensive and actionable.""",
    code_execution_config=False,
    human_input_mode="TERMINATE",
    max_consecutive_auto_reply=1
)

# Dataset Acquisition Agent - Smart Dataset Handling
dataset_agent = autogen.AssistantAgent(
    name="Dataset_Specialist",
    system_message="""You are the Dataset Specialist responsible for intelligent dataset acquisition and initial assessment.
    Your responsibilities include:
    - Identifying dataset sources (local files, URLs, sample datasets, or generating synthetic data)
    - Loading datasets using appropriate methods (CSV, Excel, JSON, API calls, etc.)
    - Performing initial dataset assessment and profiling
    - Handling various data formats and sources intelligently
    - Providing dataset metadata and basic information
    
    When given a dataset request, you should:
    1. First attempt to load from common sample datasets (iris, titanic, boston housing, etc.)
    2. If a specific path/URL is provided, load from that source
    3. If no specific dataset is mentioned, suggest and use an appropriate sample dataset
    4. Generate synthetic data if requested or if no other source is available
    
    Always provide a comprehensive overview of the acquired dataset including:
    - Dataset dimensions and structure
    - Column names and data types
    - Missing value summary
    - Basic statistical overview
    
    Write complete, executable Python code using pandas, numpy, seaborn datasets, sklearn datasets, etc.""",
    llm_config=gpt4o_config
)

# Data Preparation Agent - Advanced Data Cleaning
data_preparation_agent = autogen.AssistantAgent(
    name="Data_Preparer",
    system_message="""You are the Data Preparation specialist responsible for comprehensive data preprocessing.
    Your responsibilities include:
    - Advanced data cleaning and preprocessing
    - Handling missing values with appropriate strategies
    - Outlier detection and treatment
    - Data type optimization and feature engineering
    - Data validation and quality assessment
    - Creating analysis-ready datasets
    
    Your preprocessing should include:
    1. Missing value analysis and treatment
    2. Outlier detection using statistical methods
    3. Data type conversions and optimizations
    4. Feature engineering when beneficial
    5. Data quality scoring and validation
    6. Creation of cleaned dataset for analysis
    
    Always document your preprocessing decisions and provide before/after comparisons.
    Write complete, executable Python code using pandas, numpy, scipy, and sklearn.""",
    llm_config=gpt4o_config
)

# Statistical Analysis Agent - Comprehensive Analytics
statistical_analyst = autogen.AssistantAgent(
    name="Statistical_Analyst",
    system_message="""You are the Statistical Analyst responsible for comprehensive statistical analysis and insight generation.
    Your responsibilities include:
    - Descriptive statistics and data profiling
    - Correlation analysis and feature relationships
    - Distribution analysis and normality testing
    - Statistical hypothesis testing when appropriate
    - Pattern recognition and trend analysis
    - Advanced statistical insights and interpretations
    
    Your analysis should include:
    1. Comprehensive descriptive statistics
    2. Correlation matrices and relationship analysis
    3. Distribution analysis for numerical variables
    4. Categorical variable analysis and frequency distributions
    5. Statistical significance testing where relevant
    6. Key statistical insights and business interpretations
    
    Focus on extracting actionable insights and explaining statistical findings in business terms.
    Write complete Python code using pandas, numpy, scipy, statsmodels, and scikit-learn.""",
    llm_config=gpt4o_config
)

# Visualization Expert - Advanced Data Visualization
visualization_expert = autogen.AssistantAgent(
    name="Visualization_Expert",
    system_message="""You are the Visualization Expert responsible for creating comprehensive and insightful data visualizations.
    Your responsibilities include:
    - Creating appropriate visualizations for different data types and analysis objectives
    - Designing publication-quality charts and graphs
    - Building comprehensive visualization dashboards
    - Ensuring visual clarity and professional presentation
    - Supporting statistical findings with compelling visuals
    
    Your visualization suite should include:
    1. Distribution plots (histograms, box plots, violin plots)
    2. Relationship plots (scatter plots, correlation heatmaps)
    3. Categorical analysis (bar charts, count plots, pie charts)
    4. Time series plots (if temporal data exists)
    5. Advanced visualizations (pair plots, feature importance plots)
    6. Summary dashboard combining key visualizations
    
    Use matplotlib, seaborn, plotly for creating professional, well-labeled visualizations.
    Each plot should have clear titles, axis labels, legends, and appropriate styling.
    Write complete Python code that generates all visualizations.""",
    llm_config=gpt4o_config
)

# Insight Generator - Business Intelligence
insight_generator = autogen.AssistantAgent(
    name="Insight_Generator",
    system_message="""You are the Insight Generator responsible for extracting actionable business insights from the EDA.
    Your responsibilities include:
    - Synthesizing statistical findings into business insights
    - Identifying key patterns, trends, and anomalies
    - Generating actionable recommendations
    - Highlighting important relationships and correlations
    - Providing strategic implications of the findings
    
    Your insights should cover:
    1. Key data characteristics and quality assessment
    2. Most important patterns and relationships discovered
    3. Significant correlations and their business implications
    4. Outliers and anomalies that require attention
    5. Data-driven recommendations for next steps
    6. Potential areas for further investigation
    
    Focus on translating technical findings into clear, actionable business insights.
    Avoid technical jargon and present insights in an accessible manner.""",
    llm_config=gpt4o_config
)

# Report Compiler - Comprehensive EDA Report Generation
report_compiler = autogen.AssistantAgent(
    name="Report_Compiler",
    system_message="""You are the Report Compiler responsible for creating comprehensive, professional EDA reports.
    Your responsibilities include:
    - Synthesizing all analysis components into a cohesive report
    - Creating well-structured, professional documentation
    - Integrating statistical findings, visualizations, and insights
    - Ensuring report completeness and clarity
    - Following standard EDA reporting formats
    
    Your comprehensive EDA report must include:
    1. **Executive Summary** - Key findings and recommendations overview
    2. **Data Overview** - Dataset description, structure, and quality assessment
    3. **Statistical Analysis** - Descriptive statistics and key statistical findings
    4. **Visual Analysis** - Key visualizations with interpretations
    5. **Key Insights** - Most important patterns, relationships, and discoveries
    6. **Findings Summary** - Consolidated findings and their implications
    7. **Recommendations** - Data-driven suggestions for next steps
    8. **Technical Appendix** - Detailed methodology and technical notes
    
    Write in clear, professional language suitable for both technical and business audiences.
    Ensure the report is comprehensive, well-organized, and actionable.""",
    llm_config=gpt4o_config
)

# Code Execution Agent - Technical Validation
code_executor = autogen.UserProxyAgent(
    name="Code_Executor",
    system_message="""You are the Code Executor responsible for running all Python code and validating outputs.
    Your responsibilities include:
    - Executing all Python code written by analysis agents
    - Validating code execution and reporting results
    - Managing data files and outputs
    - Ensuring code runs successfully before proceeding
    - Providing immediate feedback on execution status
    
    Execute code blocks and report results clearly. If errors occur, provide specific error details.""",
    human_input_mode="NEVER",
    code_execution_config={
        "last_n_messages": 3,
        "work_dir": "eda_analysis",
        "use_docker": False,
        "timeout": 300,
    }
)

# Quality Assurance Agent - Comprehensive Review
quality_assurance = autogen.AssistantAgent(
    name="Quality_Assurance",
    system_message="""You are the Quality Assurance specialist responsible for comprehensive review and validation.
    Your responsibilities include:
    - Reviewing all analysis components for accuracy and completeness
    - Validating statistical methods and interpretations
    - Ensuring visualization quality and appropriateness
    - Checking report structure and clarity
    - Providing constructive feedback for improvements
    - Ensuring EDA best practices are followed
    
    Focus on:
    - Technical accuracy of statistical analysis
    - Appropriateness of visualization choices
    - Clarity and actionability of insights
    - Completeness of the EDA process
    - Professional quality of the final report
    
    Provide specific, actionable feedback to improve the analysis quality.""",
    llm_config=gpt4o_config
)

# Enhanced Group Chat Configuration
eda_groupchat = autogen.GroupChat(
    agents=[
        admin_agent,
        dataset_agent,
        data_preparation_agent,
        statistical_analyst,
        visualization_expert,
        insight_generator,
        report_compiler,
        code_executor,
        quality_assurance
    ],
    messages=[],
    max_round=150,
    speaker_selection_method="auto",
    allow_repeat_speaker=True
)

# Group Chat Manager with Enhanced Coordination
eda_manager = autogen.GroupChatManager(
    groupchat=eda_groupchat,
    llm_config=gpt4o_config,
    system_message="""You are the EDA Process Manager coordinating a comprehensive exploratory data analysis.
    Your role is to ensure smooth workflow between specialized agents to deliver a complete EDA report.
    
    Standard EDA Process Flow:
    1. Dataset_Specialist: Acquire and assess the dataset
    2. Data_Preparer: Clean and prepare the data
    3. Statistical_Analyst: Conduct comprehensive statistical analysis
    4. Visualization_Expert: Create insightful visualizations
    5. Insight_Generator: Extract actionable business insights
    6. Report_Compiler: Generate comprehensive EDA report
    7. Quality_Assurance: Review and validate the complete analysis
    
    Ensure each agent completes their work before proceeding to the next phase.
    Coordinate feedback and iterations to produce a high-quality EDA deliverable."""
)

# Main EDA Function - Simplified Interface
def conduct_comprehensive_eda(user_prompt="Provide EDA insights into the dataset and create a report. This report should include a data overview, key insights, visualizations, and a summary of findings"):
    """
    Conducts comprehensive EDA based on user prompt
    
    Args:
        user_prompt (str): User's EDA request
    
    Returns:
        Comprehensive EDA report with insights and visualizations
    """
    
    enhanced_prompt = f"""
    {user_prompt}
    
    Please conduct a comprehensive Exploratory Data Analysis following these requirements:
    
    1. **Dataset Acquisition**: Identify and load an appropriate dataset (use sample datasets if none specified)
    2. **Data Preparation**: Clean and prepare the data for analysis
    3. **Statistical Analysis**: Conduct thorough statistical analysis and profiling
    4. **Visualization**: Create comprehensive visualizations to support findings
    5. **Insight Generation**: Extract actionable business insights
    6. **Report Compilation**: Generate a professional EDA report including:
       - Data overview and quality assessment
       - Key statistical findings and insights
       - Comprehensive visualizations with interpretations
       - Summary of findings and recommendations
    
    The final deliverable should be a complete, professional EDA report that provides
    comprehensive insights into the dataset's characteristics, patterns, and business implications.
    
    Begin with dataset acquisition and proceed through the complete EDA workflow.
    """
    
    return admin_agent.initiate_chat(
        eda_manager,
        message=enhanced_prompt
    )

# Quick Start Function for Common Use Cases
def quick_eda_analysis(dataset_source=None, analysis_focus=None):
    """
    Quick start function for common EDA scenarios
    
    Args:
        dataset_source (str): Dataset path, URL, or name (optional)
        analysis_focus (str): Specific analysis focus (optional)
    """
    
    if dataset_source and analysis_focus:
        prompt = f"""
        Provide comprehensive EDA insights into the dataset: {dataset_source}
        
        Focus the analysis on: {analysis_focus}
        
        Create a complete report including data overview, key insights, visualizations, 
        and summary of findings with specific attention to the requested focus area.
        """
    elif dataset_source:
        prompt = f"""
        Provide comprehensive EDA insights into the dataset: {dataset_source}
        
        Create a complete report including data overview, key insights, visualizations, 
        and summary of findings.
        """
    elif analysis_focus:
        prompt = f"""
        Provide comprehensive EDA insights into an appropriate sample dataset.
        
        Focus the analysis on: {analysis_focus}
        
        Create a complete report including data overview, key insights, visualizations, 
        and summary of findings.
        """
    else:
        prompt = """
        Provide comprehensive EDA insights into an appropriate sample dataset.
        
        Create a complete report including data overview, key insights, visualizations, 
        and summary of findings.
        """
    
    return conduct_comprehensive_eda(prompt)

# Example Usage Functions
def demo_eda_analysis():
    """Demonstrates the EDA framework with default settings"""
    return conduct_comprehensive_eda()

def demo_focused_analysis():
    """Demonstrates focused EDA analysis"""
    return quick_eda_analysis(
        dataset_source="titanic dataset",
        analysis_focus="survival factors and passenger characteristics"
    )

def demo_custom_dataset():
    """Demonstrates EDA with custom dataset"""
    return quick_eda_analysis(
        dataset_source="path/to/your/dataset.csv",
        analysis_focus="business performance metrics and trends"
    )

# Utility function to display available sample datasets
def show_available_datasets():
    """Display information about available sample datasets"""
    datasets_info = """
    Available Sample Datasets for EDA:
    
    1. **Iris Dataset**: Classic flower classification dataset
    2. **Titanic Dataset**: Passenger survival analysis
    3. **Boston Housing**: Real estate price prediction data
    4. **Wine Quality**: Wine characteristics and ratings
    5. **Tips Dataset**: Restaurant tips and customer behavior
    6. **Flights Dataset**: Flight delays and performance
    7. **Car Crashes**: Traffic safety statistics
    8. **Diamonds**: Diamond characteristics and pricing
    
    Usage Examples:
    - conduct_comprehensive_eda() - Uses default sample dataset
    - quick_eda_analysis("titanic dataset") - Specific dataset
    - quick_eda_analysis("your_file.csv") - Custom dataset file
    """
    print(datasets_info)
    return datasets_info

# Main execution example
if __name__ == "__main__":
    # Example 1: Basic EDA with default prompt
    print("Starting Comprehensive EDA Analysis...")
    # result = conduct_comprehensive_eda()
    
    # Example 2: Focused analysis
    # result = quick_eda_analysis("titanic dataset", "survival analysis")
    
    # Example 3: Custom dataset
    result = quick_eda_analysis("../data/sales.csv")


Starting Comprehensive EDA Analysis...
[33mAdmin[0m (to chat_manager):


    
        Provide comprehensive EDA insights into the dataset: ../data/sales.csv

        Create a complete report including data overview, key insights, visualizations, 
        and summary of findings.
        

    Please conduct a comprehensive Exploratory Data Analysis following these requirements:

    1. **Dataset Acquisition**: Identify and load an appropriate dataset (use sample datasets if none specified)
    2. **Data Preparation**: Clean and prepare the data for analysis
    3. **Statistical Analysis**: Conduct thorough statistical analysis and profiling
    4. **Visualization**: Create comprehensive visualizations to support findings
    5. **Insight Generation**: Extract actionable business insights
    6. **Report Compilation**: Generate a professional EDA report including:
       - Data overview and quality assessment
       - Key statistical findings and insights
       - Comprehensive visual

In [7]:
print(result.chat_history[6]['content'])

# Comprehensive EDA Report on the Titanic Dataset

## 1. Executive Summary
This report presents a comprehensive Exploratory Data Analysis (EDA) of the Titanic dataset, focusing on understanding the key factors influencing passenger survival. The analysis reveals significant patterns related to passenger demographics, socio-economic status, and survival outcomes. Key findings indicate that passenger class and sex are strong predictors of survival, with first-class passengers and females having higher survival rates. These insights can inform strategic decisions in customer segmentation, safety protocols, and service offerings.

## 2. Data Overview
### Dataset Description
The Titanic dataset comprises 891 entries with 14 features, including both numerical and categorical data types. The dataset provides information on passenger demographics, socio-economic status, and survival outcomes.

### Data Structure and Quality Assessment
- **Dimensions**: 891 rows, 14 columns
- **Missing Values**

In [8]:
from IPython.display import Markdown

Markdown(result.chat_history[6]['content'])

# Comprehensive EDA Report on the Titanic Dataset

## 1. Executive Summary
This report presents a comprehensive Exploratory Data Analysis (EDA) of the Titanic dataset, focusing on understanding the key factors influencing passenger survival. The analysis reveals significant patterns related to passenger demographics, socio-economic status, and survival outcomes. Key findings indicate that passenger class and sex are strong predictors of survival, with first-class passengers and females having higher survival rates. These insights can inform strategic decisions in customer segmentation, safety protocols, and service offerings.

## 2. Data Overview
### Dataset Description
The Titanic dataset comprises 891 entries with 14 features, including both numerical and categorical data types. The dataset provides information on passenger demographics, socio-economic status, and survival outcomes.

### Data Structure and Quality Assessment
- **Dimensions**: 891 rows, 14 columns
- **Missing Values**: Initially present in 'age', 'embarked', and 'deck'. Missing values were addressed by imputing 'age' with the median, 'embarked' with the mode, and removing the 'deck' column due to excessive missing data.
- **Data Types**: A mix of numerical (e.g., age, fare) and categorical (e.g., sex, class) data.

## 3. Statistical Analysis
### Descriptive Statistics
- **Age**: Median age is approximately 28 years, with a right-skewed distribution.
- **Survival Rate**: Overall survival rate is approximately 38%.
- **Class and Sex**: Strong indicators of survival likelihood, with first-class passengers and females having higher survival rates.

### Categorical Insights
- **Passenger Class**: First-class passengers had a significantly higher survival rate compared to second and third class.
- **Sex**: Females had a higher survival rate than males, suggesting a prioritization policy during evacuation.

## 4. Visual Analysis
### Key Visualizations and Interpretations
1. **Age Distribution**: The histogram shows a right-skewed distribution, with most passengers being young adults.
2. **Survival Rate by Class**: Bar chart reveals higher survival rates for first-class passengers.
3. **Survival Rate by Sex**: Bar chart indicates higher survival rates for females.
4. **Pairplot**: Highlights relationships between features, with survival status emphasized.
5. **Correlation Heatmap**: Shows moderate negative correlation between 'fare' and 'class', indicating higher fares for higher classes.

## 5. Key Insights
### Patterns and Relationships
- **Socio-Economic Status**: Strong influence on survival, with first-class passengers having better survival outcomes.
- **Gender Influence**: Females prioritized during evacuation, leading to higher survival rates.
- **Age Factor**: While age influences survival, it is less significant compared to class and sex.

### Correlations and Anomalies
- **Class and Fare**: Higher fares associated with higher classes, guiding pricing strategies.
- **Outliers**: Age and fare outliers may represent unique customer segments for targeted services.

## 6. Findings Summary
The analysis confirms that passenger class and sex are strong predictors of survival, with age also playing a role. Visualizations support these findings, providing a clear picture of the relationships between features. The insights can guide strategic decisions in customer segmentation, safety protocols, and service offerings.

## 7. Recommendations
### Data-Driven Suggestions
- **Customer Segmentation**: Refine segmentation strategies based on class and sex insights.
- **Safety Protocols**: Enhance safety measures and prioritize services for high-value customer segments.
- **Pricing Strategy**: Optimize pricing strategies using fare and class correlations.

### Further Investigation
- **Embarkation Point Analysis**: Explore the impact of embarkation points on survival and customer behavior.
- **Predictive Modeling**: Develop machine learning models to forecast survival or customer preferences.

## 8. Technical Appendix
### Methodology and Technical Notes
- **Data Cleaning**: Addressed missing values through imputation and column removal.
- **Statistical Analysis**: Conducted using descriptive statistics and correlation analysis.
- **Visualization Tools**: Utilized seaborn and matplotlib for comprehensive visualizations.
- **Software and Libraries**: Analysis performed using Python, with libraries including pandas, seaborn, and matplotlib.

This comprehensive EDA report provides valuable insights into the Titanic dataset, offering strategic guidance for enhancing customer experience, optimizing pricing strategies, and improving safety protocols. Further exploration of additional features and advanced modeling techniques can unlock deeper insights and drive strategic growth.