In [2]:
# Add /usr/local/bin to PATH so that docker is found (required for agent code execution)
import os
import shutil


def ensure_on_path(directory):
    """Append ``directory`` to the process PATH unless it is already listed.

    Re-running the cell must not keep growing PATH with duplicate entries,
    and an unset PATH variable must not raise ``KeyError``.

    Args:
        directory: Directory that should be searched for executables.
    """
    current = os.environ.get("PATH", "")
    if directory in current.split(os.pathsep):
        return
    # Avoid a leading separator (an empty PATH entry) when PATH was unset/empty
    os.environ["PATH"] = f"{current}{os.pathsep}{directory}" if current else directory


ensure_on_path("/usr/local/bin")

# Prints the resolved docker executable path, or None if docker is still not found
print(shutil.which("docker"))

/usr/local/bin/docker


# The dataset to be used

In [None]:
# Pipeline inputs: the CSV file handed to the file-read tool and formatted into the task prompts
inputs = dict(original_dataset="Superannuation.csv")

# Imports, Environment Variables, Tooling

In [None]:
from crewai import Agent, Task, Process, Crew, LLM
from dotenv import load_dotenv
from crewai_tools import FileReadTool, FileWriterTool, DirectoryReadTool
import os
import warnings
# Suppress all Python warnings raised from this point on
warnings.filterwarnings(action="ignore")

# Pull settings from a local .env file into the environment (if one exists)
load_dotenv()

# Specify the model. Comment out to use default
os.environ["OPENAI_MODEL_NAME"] = "gpt-4.1-mini"

# Tools handed to the agents: a reader defaulting to the dataset, a writer, and a directory lister
csv_reader = FileReadTool(file_path=inputs["original_dataset"])
file_writer, directory_reader = FileWriterTool(), DirectoryReadTool()

## Creating Agents

In [None]:
# Agent 1 of 4 - data cleaning: ingests the dataset, reports every quality issue found,
# and proposes strategies to resolve them
data_cleaning_agent = Agent(
    role="Senior Data Engineer",
    backstory="Senior Data Engineer experienced in Python, SQL, advanced data cleaning techniques, and producing top-tier data cleaning reports.",
    goal="""
        You are a Senior Data Engineer specialising in data ingestion and cleaning. 
        Your primary goal is to ingest the raw data provided, identify all quality issues, and produce strategies for handling all identified quality issues.

        Responsibilities:
        1. Ingest the complete raw dataset
        2. Identify and quantify all data quality issues
        3. Develop strategies to resolve all data quality issues
        4. Adhere to data engineering best practices
        5. Produce a professional, high-quality, structured, and detailed markdown report on the quality issues and recommendations
        """,
    # File reader (defaults to the dataset), file writer and directory lister
    tools=[csv_reader, file_writer, directory_reader],
    allow_code_execution=True,
    memory=True,
    verbose=True,
    #llm=claude_llm
)

# The second agent will perform exploratory data analysis on the dataset to derive insights and suggest further areas of analysis
eda_agent = Agent(
    role="Senior Data Scientist",
    goal="""
        You are a Senior Data Scientist with deep expertise in exploratory data analysis and statistical analysis.
        
        Responsibilities
        1. Ingest the complete raw dataset to perform analysis
        2. Calculate statistics
        3. Perform correlation analysis
        4. Perform distribution analysis
        5. Investigate any patterns that may be present in the data
        6. Produce a professional, high-quality, structured, and detailed markdown report compiling the above findings
        """,
    backstory="A senior data scientist with extensive experience in exploratory data analysis and using python, pandas, and data manipulation, and with deep experience in creating top-tier EDA reports",
    allow_code_execution=True,
    memory=True,
    # Boolean flag, written as True to match data_cleaning_agent rather than relying on 1 being coerced
    verbose=True,
    # File reader (defaults to the dataset), file writer and directory lister
    tools=[csv_reader, file_writer, directory_reader],
    #llm=openai_llm
)

# The third agent will create a report on suggested data visualisations, and use python to visualise and save visualisations of the data
data_visualisation_agent = Agent(
    role="Senior Data Visualisation Engineer",
    goal="""
        You are a Senior Data Visualisation Engineer with expertise in creating impactful visualisations that provide clear insight into data.
        
        Responsibilities:
        1. Ingest the complete raw dataset
        2. Create a visualisation plan describing the visualisations to be made
        3. Save the visualisation plan in a detailed visualisation report in markdown format
        4. Create the visualisations and save them as .png files following data visualisation best practices
        """,
    backstory="A senior data visualisation engineer with extensive experience in creating high-quality visualitsations and reports to be shared with stakeholders",
    allow_code_execution=True,
    memory=True,
    # Boolean flag, written as True to match data_cleaning_agent rather than relying on 1 being coerced
    verbose=True,
    # csv_reader is included so this agent has the same file-read tool as the other
    # agents that are asked to ingest the dataset and read earlier reports
    tools=[csv_reader, file_writer, directory_reader],
    #llm=openai_llm
)

# The fourth and final agent will take the findings of preceding agents and compile all knowledge into a professional report on the dataset
reporting_agent = Agent(
    role="Senior Reporting Analyst",
    goal="""
        You are a Senior Reporting Analyst with deep expertise in creating highly impactful business reports that are key in developing business strategies.
        
        Responsibilities:
        1. Read all the markdown reports produced by the other agents
        2. Compile all their findings into a comprehensive, detailed business report to be shared with key stakeholders and business leaders
        3. Ensure all the important findings are included
        4. Critically evaluate the dataset and provide possibilities for further analysis.
        """,
    backstory="A senior reporting analyst with extensive experience in creating in-depth, technical business data reports",
    allow_code_execution=False,
    memory=True,
    # Boolean flag, written as True to match data_cleaning_agent rather than relying on 1 being coerced
    verbose=True,
    # Code execution is disabled for this agent, so it needs a file-read tool to open the
    # markdown reports it must compile; the directory reader only lists directory contents.
    tools=[FileReadTool(), file_writer, directory_reader],
    #llm=openai_llm
)

## Creating Tasks

### Data Cleaning

In [None]:
""" The data cleaning tasks will be performed in sequence by the data_cleaning_agent """

# Ordered data-cleaning tasks, all assigned to data_cleaning_agent.
# Every output_file lives under 'cleaning_reports/' so the saved task output ends up in
# the directory that the prompts describe and that the final summary task reads from.
data_cleaning_process = [
    # Task 1: Initial Data Assessment
    Task(
        description="""
            Conduct an initial assessment of the {original_dataset}. Ingest the first 500 rows.
            Specifically:
            1. Identify all columns and their data types
            2. Check for basic data quality issues
            3. Be as thorough and detailed as possible
            4. Compile all the findings into a report
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A detailed, structured report in markdown format in a folder called 'cleaning_reports/' containing:
                - Column summary: a table of the columns and their data types
                - Data quality metrics:
                    - Number of rows
                    - Number of columns
                    - Number of missing values
                    - Number of duplicates
                    - Data types
                    - A final section recommending next steps to be taken
                """,
        agent=data_cleaning_agent,
        output_file="cleaning_reports/initial_data_assessment.md",
    ),

    # Task 2: Missing Value Handling
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and determine a strategy to handle missing values present, if there are any. Do not implement the strategy.
            Specifically:
            1. Ingest the file using pandas
            2. Check for missing values in the dataset
            3. Determine an appropriate handling strategy
            4. Do not do any further data cleaning
            5. Compile all the findings into a report
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A detailed, structured report in markdown format in a folder called 'cleaning_reports/' containing:
                - Sections detailing the findings of the missing value analysis
                - A strategy for handling missing values (if there are any)
                - Tabulate findings where appropriate
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_reports/missing_value_handling.md",
    ),

    # Task 3: Format Standardisation
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and develop a strategy to standardise formats across the dataset, if formats are not standard. Do not implement the strategy.
            Specifically:
            1. Use the pandas library to read the file and perform data manipulation
            2. Identify inconsistent formats
            3. Develop a standardisation strategy (e.g., date formats, string formats, etc.)
            4. Do not do any further data cleaning
            5. Compile all the findings into a report
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A detailed, structured report in markdown format in a folder called 'cleaning_reports/' containing:
                - All identified data format issues
                - Tabulate findings where appropriate (e.g. a table with columns: column_name, data_format, required_format)
                - A summary of the recommended strategy for handling format standardisation in the dataset
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_reports/format_standardisation.md",
    ),

    # Task 4: Duplicate Handling
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and develop a strategy for handling duplicates in the dataset, if there are any. Do not implement the strategy.
            Specifically:
            1. Use the pandas library to read the file
            2. Identify duplicates
            3. Determine a strategy for handling duplicates.
            4. Do not do any further data cleaning
            5. Compile all the findings into a report
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A detailed, structured report in markdown format in a folder called 'cleaning_reports/' containing:
                - All duplicates found (if there are any)
                - Tabulate the columns and values of duplcates if ther are any
                - A handling strategy for duplicates in the dataset if there are any
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_reports/duplicate_handling.md",
    ),

    # Task 5: Outlier Handling
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and develop a strategy for handling outliers in the dataset, if there are any. Do not implement the strategy.
            Specifically:
            1. Use the pandas library to read the file
            2. Identify outliers using statistical methods (e.g., Z-score, IQR)
            3. Determine a strategy for handling outliers (e.g., removal, transformation, etc.)
            4. Do not do any further data cleaning
            5. Compile a report with the findings
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A detailed, structured report in markdown format in a folder called 'cleaning_reports/' containing:
                - Comprehensive outlier analysis with statistical reasoning
                - Strategies for handling any outliers identified
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_reports/outlier_handling.md",
    ),

    # Task 6: Final Summary
    # Only lists the reports that Tasks 1-5 actually produce.
    Task(
        description="""
            Read all of the markdown files in the 'cleaning_reports/' directory (ending with .md) and compile a final data cleaning report.
            Specifically:
            1. Read the initial_data_assessment.md, missing_value_handling.md, format_standardisation.md, duplicate_handling.md, and outlier_handling.md files
            2. Compile these results into a beautifully formatted report
            3. Ensure the report includes all business-critical information to be shared with senior management
            """,
        expected_output="""
            - A detailed, structured report in markdown format called 'final_cleaning_summary.md' containing:
                - Initial data assessment results
                - Columns with missing values and strategies for dealing with them
                - Summary of data types and strategies for standardising them
                - Columns with duplicates and the strategy for them
                - All outliers identified and the strategy for dealing with them
                - An overall summary of the cleaning process
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_reports/final_cleaning_summary.md",
    )
]

### EDA

In [None]:
""" The exploratory data analysis tasks will be performed in sequence by the eda_agent """

# Ordered exploratory-data-analysis tasks, all assigned to eda_agent.
# Every output_file lives under 'eda_reports/' so the saved task output ends up in the
# directory that the prompts describe and that the final summary task reads from.
eda_process = [
    # Task 1: Basic Statistics
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} file and create basic statistics for the dataset.
            Specifically:
            1. Calculate descriptive statistics for numerical columns (where it makes sense)
            2. Calculate descriptive statistics for categorical columns (where it makes sense) 
            3. Be as thorough and detailed as possible
            4. Infer any important insights from the statistics
            5. Compile a report with the findings
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A rich, detailed report called eda_statistics.md in markdown format with summary statistics and any insights in a directory called eda_reports/
            - The report should contain at least:
                - Summary statistics for numerical columns (tabulate where necessary)
                - Summary statistics for categorical columns (tabulate where necessary)
                - Any important insights that can be inferred
                - Recommendations for future statistical analysis
            """,
        agent=eda_agent,
        output_file="eda_reports/eda_statistics.md",
        #output_pydantic=BasicStatistics
    ),

    # Task 2: Correlation Analysis
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} file and analyse relationships between variables.
            Specifically:
            1. Calculate correlations
            2. Be as thorough and detailed as possible but do do not do any further analysis
            3. Infer any important insights from the correlations
            4. Compile a report with the findings
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A rich, detailed report called eda_correlation.md in markdown format with summary statistics and any insights in a directory called eda_reports/
            - The report should contain at least:
                - A tabulated correlation matrix
                - Any insights generated from the correlation analysis
                - Recommendations for further correlation analysis
            """,
        agent=eda_agent,
        output_file="eda_reports/eda_correlation.md",
        #output_pydantic=CorrelationAnalysis
    ),

    # Task 3: Distribution Analysis
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} file and analyse variable distributions.
            Specifically:
            1. Calculate distributions
            2. Identify outliers
            3. Be as thorough and detailed as possible but do do not do any further analysis
            4. Infer any important insights from the distributions
            5. Compile a report with the findings
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A rich, detailed report called eda_distribution.md in markdown format with summary statistics and any insights in a directory called eda_reports/
            - The report should contain at least:
                - Table(s) summarising the distributions in the dataset
                - Summary statistics for the distributions
                - Recommendations for further distribution analysis
            """,
        agent=eda_agent,
        output_file="eda_reports/eda_distribution.md",
        #output_pydantic=DistributionAnalysis
    ),

    # Task 4: Pattern Identification
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} file and identify patterns and trends.
            Specifically:
            1. Look for temporal patterns
            2. Identify clusters
            3. Be as thorough and detailed as possible but do do not do any further analysis
            4. Infer any important insights from the patterns
            5. Compile a report with the findings
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A rich, detailed report called eda_patterns.md in markdown format with summary statistics and any insights in a directory called eda_reports/
            - The report should contain at least:
                - An analysis of temporal patterns
                - An analysis of clusters
                - Recommendations for further pattern analysis
            """,
        agent=eda_agent,
        output_file="eda_reports/eda_patterns.md",
        #output_pydantic=PatternIdentification
    ),

    # Task 5: Final Summary
    Task(
        description="""
            Create comprehensive EDA report to be shared with senior business stakeholders,
            Specifically:
            1. Compile all findings from previous tasks. This will require reading the previous markdown files inside the 'eda_reports/' directory:
                - eda_statistics.md
                - eda_correlation.md
                - eda_distribution.md
                - eda_patterns.md
            2. Use a professional, highly analytical tone
            3. Ensure the report is structured well, with clear sections, insights, and recommendations
            """,
        expected_output="""
            - A rich, detailed report called eda_summary.md in markdown format in the eda_reports/ directory with all findings and insights.
            """,
        agent=eda_agent,
        output_file="eda_reports/eda_summary.md",
        #output_pydantic=FinalSummary
    )
]

### Data Visualisation

In [None]:
""" The data visualisation tasks will be completed in sequence by the data_visualisation_agent """

# Visualisation Tasks, all assigned to data_visualisation_agent.
# The prompts name the reports that earlier tasks are asked to produce
# (eda_summary.md, visualisation_planning.md) together with their directories.
visualisation_process = [
    # Task 1: Visualisation Planning
    Task(
        description="""
            Read eda_reports/eda_summary.md and plan visualisations to be created from the {original_dataset}. Do not create any visualisations yet.
            Specifically:
            1. Read the eda_summary.md file in the eda_reports/ directory to understand the findings
            2. Ingest the first 500 rows of {original_dataset} file using pandas
            3. Examine columns to determine suitable visualisation types
            4. For categorical columns, choose appropriate visualisation types such as bar charts, pie charts, etc.
            5. For numerical columns, choose appropriate visualisation types such as histograms, scatter plots, etc.
            6. Justify the choice of visualisation types
            7. Compile your analysis into a report
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A rich, detailed report called visualisation_planning.md in markdown format in a directory called visualisation_reports/.
            - The report should include at least:
                - A table containing columns and recommended visualisation types
                - Justification for the choice of visualisations
                - Recommendations for further visual analysis
            """,
        agent=data_visualisation_agent,
        # Saved inside the directory named in expected_output
        output_file="visualisation_reports/visualisation_planning.md",
        #output_pydantic=VisualisationPlanning
    ),

    # Task 2: Create Simple Charts
    Task(
        description="""
            Read visualisation_reports/visualisation_planning.md and the {original_dataset} files and use them to create visualisations.
            Specifically:
            1. Ingest the first 500 rows of {original_dataset} file using pandas
            2. Use matplotlib or seaborn for visualisation
            3. Create charts using the proposed visualisation types
            4. Include legends and titles
            5. Produce a python file that will save various visualisations when run
            """.format(original_dataset=inputs["original_dataset"]),  # key must match the `inputs` dict exactly
        expected_output="""
            - Numerous saved .png files in the visualisation_reports/ directory
            """,
        agent=data_visualisation_agent,
        #output_files=["visualisation_summary.md"],
        #output_pydantic=VisualisationCreation
    ),
]

### Reporting

In [None]:
# The final report will be compiled by the reporting_agent.
# The prompt names only report files that the earlier task lists are asked to produce, and
# the report itself is saved as final_report.md in the working directory (see output_file).

reporting_process = [
    Task(
        description="""
            Read the final_cleaning_summary.md, eda_summary.md, and visualisation_planning.md reports and create a final report.
            - final_cleaning_summary.md is located in the cleaning_reports/ directory
            - eda_summary.md is located in the eda_reports/ directory
            - visualisation_planning.md and the saved .png visualisations are located in the visualisation_reports/ directory
            Specifically:
            1. Compile all findings from previous reports
            2. Include links to visualisations using the paths to the .png files in the visualisation_reports/ directory
            3. Critically evaluate the findings and provide actionable recommendations and possibilities for further analysis or process improvements
            4. Save the report as final_report.md
            """,
        expected_output="""
            A rich, detailed report in markdown format called 'final_report.md' with all findings and insights.
            - The report should include:
                - Data cleaning summary
                - EDA analysis summary
                - Visualisation summary
                - Critical evaluation of findings
                - Actionable recommendations
                - Possibilities for further analysis or process improvements
            """,
        agent=reporting_agent,
        output_file="final_report.md"
    )
]

In [10]:
# Flatten the four per-stage task lists into one list, keeping stage order:
# cleaning -> EDA -> visualisation -> reporting
all_tasks = [
    *data_cleaning_process,
    *eda_process,
    *visualisation_process,
    *reporting_process,
]

## Create the Crew

In [11]:
from langchain_openai import ChatOpenAI

# Assemble the crew from the four agents and the combined task list.
# It is configured with the hierarchical process and a dedicated manager LLM.
data_analysis_crew = Crew(
    process=Process.hierarchical,
    manager_llm=ChatOpenAI(temperature=0.2, model="gpt-4o-mini"),
    agents=[data_cleaning_agent, eda_agent, data_visualisation_agent, reporting_agent],
    tasks=all_tasks,
    memory=True,
    verbose=False,
)

In [12]:
#data_analysis_crew.train(n_iterations=1, filename="crew_training.pkl")

## Kick off the crew

In [13]:
# Run the crew over all_tasks; `result` holds whatever kickoff() returns.
# NOTE(review): this makes live LLM API calls - the recorded run of this cell ended in a
# RateLimitError (quota exceeded), so check API quota before re-running.
result = data_analysis_crew.kickoff()

/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/si

[1m[95m# Agent:[00m [1m[92mSenior Data Engineer[00m
[95m## Task:[00m [92mCompile a final data cleaning report in markdown format called 'final_cleaning_summary.md' that includes the following sections: Initial data assessment results, columns with missing values and strategies for dealing with them, summary of data types and strategies for standardising them, columns with duplicates and the strategy for them, all outliers identified and the strategy for dealing with them, and an overall summary of the cleaning process.[00m


[1m[95m# Agent:[00m [1m[92mSenior Data Engineer[00m
[95m## Final Answer:[00m [92m
# Final Data Cleaning Summary Report

## Initial Data Assessment Results
The dataset contains 20 rows and 20 columns with various personal and financial information. Key findings include missing values in gender, salary, and contact_number columns, inconsistent date formatting in date_of_birth, and categorical inconsistencies in employment_status and gender.

## Mis

/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/si

[1m[95m# Agent:[00m [1m[92mSenior Data Scientist[00m
[95m## Task:[00m [92mCalculate the correlation matrix for Salary, Employer Contribution Rate, Employee Contribution Rate, and Super Balance from the Superannuation dataset and generate insights based on the results.[00m


[1m[95m# Agent:[00m [1m[92mSenior Data Scientist[00m
[95m## Thought:[00m [92mAction: Read a file's content[00m
[95m## Using tool:[00m [92mRead a file's content[00m
[95m## Tool Input:[00m [92m
"{\"file_path\": \"Finance_Data_with_errors.csv\", \"start_line\": 1, \"line_count\": 20}"[00m
[95m## Tool Output:[00m [92m
Error: File not found at path: Finance_Data_with_errors.csv[00m


[1m[95m# Agent:[00m [1m[92mSenior Data Scientist[00m
[95m## Thought:[00m [92mThought: The file "Finance_Data_with_errors.csv" mentioned in the context does not seem to exist or is not accessible. However, the task is to analyze correlation on the Superannuation dataset which is presumably a different 

/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/si

[1m[95m# Agent:[00m [1m[92mSenior Data Scientist[00m
[95m## Task:[00m [92mAnalyze variable distributions, calculate distributions, identify outliers, and infer insights from the Superannuation dataset.[00m
[91m 

I encountered an error while trying to use the tool. This was the error: string index out of range.
 Tool List files in directory accepts these inputs: Tool Name: List files in directory
Tool Arguments: {'directory': {'description': 'Mandatory directory to list content', 'type': 'str'}}
Tool Description: A tool that can be used to recursively list a directory's content.
[00m


[1m[95m# Agent:[00m [1m[92mSenior Data Scientist[00m
[95m## Thought:[00m [92mThought: To perform the analysis, I need to start by reading the first 500 rows of the Superannuation dataset. I do not yet know the exact filename or directory. I will first list the files in the root directory to find the dataset file.[00m
[95m## Using tool:[00m [92mList files in directory[00m
[95m##

/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/site-packages/chromadb/types.py:144: PydanticDeprecatedSince211: Accessing the 'model_fields' attribute on the instance is deprecated. Instead, you should access this attribute from the model class. Deprecated in Pydantic V2.11 to be removed in V3.0.
  return self.model_fields  # pydantic 2.x
/opt/homebrew/lib/python3.11/si

[1m[95m# Agent:[00m [1m[92mSenior Data Scientist[00m
[95m## Task:[00m [92mAnalyze the first 500 rows of the Superannuation dataset for patterns and trends. Specifically, look for temporal patterns, identify clusters, and compile a detailed report with insights. Do not perform any further analysis beyond this.[00m


[1m[95m# Agent:[00m [1m[92mSenior Data Scientist[00m
[95m## Thought:[00m [92mThought: I first need to read the first 500 rows of the Superannuation dataset file in order to perform the requested exploratory data analysis focused on temporal patterns and clusters.[00m
[95m## Using tool:[00m [92mRead a file's content[00m
[95m## Tool Input:[00m [92m
"{\"file_path\": \"Superannuation.csv\", \"start_line\": 1, \"line_count\": 500}"[00m
[95m## Tool Output:[00m [92m
member_id,first_name,last_name,date_of_birth,gender,employment_status,salary,employer_contribution_rate,employee_contribution_rate,super_balance,investment_option,insurance_coverage,benefi

2025-05-26 12:47:04,013 - 8449330304 - rag_storage.py-rag_storage:109 - ERROR: Error during short_term save: APIStatusError.__init__() missing 2 required keyword-only arguments: 'response' and 'body'



[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Feedback / Get Help: https://github.com/BerriAI/litellm/issues/new[0m
LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.


[1;31mGive Fee

2025-05-26 12:47:26,250 - 8449330304 - llm.py-llm:905 - ERROR: LiteLLM call failed: litellm.RateLimitError: RateLimitError: OpenAIException - You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.




LiteLLM.Info: If you need to debug this error, use `litellm._turn_on_debug()'.

[91m Error during LLM call: litellm.RateLimitError: RateLimitError: OpenAIException - You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.[00m
[91m An unknown error occurred. Please check the details below.[00m
[91m Error details: litellm.RateLimitError: RateLimitError: OpenAIException - You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.[00m


RateLimitError: litellm.RateLimitError: RateLimitError: OpenAIException - You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.

In [None]:
from pathlib import Path

from IPython.display import display, Markdown

# Render the final report saved by the reporting task (output_file="final_report.md").
filepath = "final_report.md"
if Path(filepath).is_file():
    display(Markdown(filename=filepath))
else:
    # The crew run can abort before the report is written (e.g. on an API error),
    # so explain the missing file instead of raising FileNotFoundError.
    print(f"{filepath} not found - run the crew to completion before rendering the report.")

The final report on the Superannuation dataset has been successfully compiled and saved as `final_report.md` in the `final_report/` directory. The report includes a comprehensive summary of data cleaning actions, exploratory data analysis insights, and actionable recommendations for further analysis.