In [1]:
# Make sure /usr/local/bin is on PATH so that docker is found
# (required for agent code execution).
import os
import shutil


def ensure_on_path(directory):
    """Append ``directory`` to the PATH environment variable if it is missing.

    Safe to call repeatedly (for example when this cell is re-run): the
    directory is appended only when it is not already one of the PATH
    entries, and an unset or empty PATH is handled instead of raising
    ``KeyError``.

    Args:
        directory: Directory that should be searchable through PATH.
    """
    current = os.environ.get("PATH", "")
    if not current:
        os.environ["PATH"] = directory
    elif directory not in current.split(os.pathsep):
        os.environ["PATH"] = current + os.pathsep + directory


ensure_on_path("/usr/local/bin")

# Show which docker executable will be used (prints None when none is found)
print(shutil.which("docker"))

/usr/local/bin/docker


# The dataset to be used

In [2]:
# Pipeline inputs: location of the CSV file that the tools and task prompts refer to
inputs = dict(original_dataset="../data/HR_Data.csv")

# Imports, Environment Variables, Tooling

In [3]:
# Standard library first: the warning filter has to be installed *before* the
# third-party packages are imported, otherwise warnings raised at import time
# are not covered by it.
import os
import warnings

warnings.filterwarnings("ignore")

from crewai import Agent, Task, Process, Crew
from crewai_tools import FileReadTool, FileWriterTool, DirectoryReadTool
from dotenv import load_dotenv

# Read environment variables from a local .env file
# (presumably the LLM provider API key lives there - confirm)
load_dotenv()

# Specify the model. Comment out to use default (gpt-4o)
os.environ["OPENAI_MODEL_NAME"] = 'gpt-4.1-mini'

# Tools handed to the agents: a reader bound to the dataset path from `inputs`,
# a file writer and a directory reader
csv_reader = FileReadTool(file_path=inputs["original_dataset"])
file_writer = FileWriterTool()
directory_reader = DirectoryReadTool()

  warn(


## Creating Agents

In [4]:
# Agent 1 - data cleaning: ingests the dataset and writes python code that
# resolves the data quality issues it finds
data_cleaning_agent = Agent(
    role="Senior Data Engineer",
    backstory="Senior Data Engineer experienced in Python, SQL, data cleaning techniques, and producing top-tier python code.",
    goal="""
        You are a Senior Data Engineer specialising in data ingestion and cleaning. 
        Your primary goal is to ingest the raw data provided, identify all quality issues, and produce python code to resolve all identified quality issues.

        Responsibilities:
        1. Ingest the complete raw dataset
        2. Develop python code to resolve all data quality issues
        3. Adhere to data engineering best practices
        """,
    # Can read the dataset, write files and list directories
    tools=[csv_reader, file_writer, directory_reader],
    allow_code_execution=True,
    memory=True,
    verbose=True,
    #llm=claude_llm  (model override, currently disabled)
)

# Agent 2 - exploratory data analysis: writes python code for statistics,
# correlation, distribution and pattern analysis of the dataset
eda_agent = Agent(
    role="Senior Data Scientist",
    goal="""
        You are a Senior Data Scientist with deep expertise in exploratory data analysis and statistical analysis.
        
        Responsibilities
        1. Ingest the complete raw dataset to perform analysis
        2. Develop python code to:
            - calculate various statistics
            - perform correlation analysis
            - perform distribution analysis
            - perform pattern analysis
        3. Adhere to data science best practices
        """,
    backstory="A senior data scientist with extensive experience in exploratory data analysis and using python, pandas, and data manipulation.",
    allow_code_execution=True,
    memory=True,
    # Boolean flag, written the same way as on data_cleaning_agent (was the integer 1)
    verbose=True,
    tools=[csv_reader, file_writer, directory_reader],
    #llm=openai_llm
)

# Agent 3 - data visualisation: writes python code that creates visualisations of the data
data_visualisation_agent = Agent(
    role="Senior Data Visualisation Engineer",
    goal="""
        You are a Senior Data Visualisation Engineer with expertise in creating impactful visualisations that provide clear insight into data.
        
        Responsibilities:
        1. Ingest the complete raw dataset
        2. Generate python code to create visualisations of the data following data visualisation best practices
        """,
    backstory="A senior data visualisation engineer with extensive experience in creating high-quality visualisations using python.",
    allow_code_execution=True,
    memory=True,
    # Boolean flag, written the same way as on data_cleaning_agent (was the integer 1)
    verbose=True,
    # NOTE(review): unlike the other two agents, this one is not given csv_reader
    # although its goal asks it to ingest the dataset - confirm this is intentional
    tools=[file_writer, directory_reader],
    #llm=openai_llm
)

## Creating Tasks

### Data Cleaning

In [5]:
""" The data cleaning tasks will be performed in sequence by the data_cleaning_agent """

# Tasks handled by data_cleaning_agent, listed in the intended order.
# Each output_file lives under 'cleaning_files/', the folder named in the
# task's expected_output, so the saved output lands where the prompt says it will.
data_cleaning_process = [
    # Task 1: Initial Data Assessment
    Task(
        description="""
            Develop python code to conduct an initial assessment of the {original_dataset}. Ingest the first 500 rows.
            Specifically:
            1. Identify all columns and their data types
            2. Check for basic data quality issues
            3. Be as thorough and detailed as possible
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A python code file in a folder called 'cleaning_files/' containing code to calculate:
                - Data quality metrics:
                    - Number of rows
                    - Number of columns
                    - Number of missing values
                    - Number of duplicates
                    - Data types
                """,
        agent=data_cleaning_agent,
        output_file="cleaning_files/initial_data_assessment.py",
    ),

    # Task 2: Missing Value Handling
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and create a python script to handle missing values present, if there are any.
            Specifically:
            1. Ingest the file using pandas
            2. Check for missing values in the dataset
            3. Determine an appropriate handling strategy and write code to implement it
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            A python code file in a folder called 'cleaning_files/' containing code to handle missing values present, if there are any.
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_files/missing_value_handling.py",
    ),

    # Task 3: Format Standardisation
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and develop a python script to standardise formats across the dataset, if formats are not standard.
            Specifically:
            1. Use the pandas library to read the file and perform data manipulation
            2. Identify inconsistent formats
            3. Develop a standardisation strategy (e.g., date formats, string formats, etc.)
            4. Write code to implement the strategy
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            A python code file in a folder called 'cleaning_files/' containing code to standardise formats across the dataset, if formats are not standard.
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_files/format_standardisation.py",
    ),

    # Task 4: Duplicate Handling
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and develop a python script to handle duplicates in the dataset, if there are any.
            Specifically:
            1. Use the pandas library to read the file
            2. Develop a strategy for handling duplicates.
            3. Write code to implement the strategy
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            A python code file in a folder called 'cleaning_files/' containing code to handle duplicates in the dataset, if there are any.
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_files/duplicate_handling.py",
    ),

    # Task 5: Outlier Handling
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and develop a python script to handle outliers in the dataset.
            Specifically:
            1. Use the pandas library to read the file
            2. Identify outliers using statistical methods (e.g., Z-score, IQR)
            3. Develop a strategy for handling outliers (e.g., removal, transformation, etc.)
            4. Write code to implement the strategy
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            A python code file in a folder called 'cleaning_files/' containing code to handle outliers in the dataset.
            """,
        agent=data_cleaning_agent,
        output_file="cleaning_files/outlier_handling.py",
    ),
]

### EDA

In [6]:
""" The exploratory data analysis tasks will be performed in sequence by the eda_agent """

# Tasks handled by eda_agent, listed in the intended order.
# Each output_file lives under 'eda_files/', the folder named in the task's
# expected_output, so the saved output lands where the prompt says it will.
eda_process = [
    # Task 1: Basic Statistics
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} file and develop a python script to calculate statistics for the dataset.
            Specifically the code should:
            1. Calculate descriptive statistics for numerical columns (where it makes sense)
            2. Calculate descriptive statistics for categorical columns (where it makes sense) 
            3. Be as thorough and detailed as possible
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A python code file in a folder called 'eda_files/' containing code to calculate basic statistics for the dataset.
            """,
        agent=eda_agent,
        output_file="eda_files/eda_statistics.py",
        #output_pydantic=BasicStatistics
    ),

    # Task 2: Correlation Analysis
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} file and develop a python script to calculate correlations for the dataset.
            Specifically the code should:
            1. Calculate correlations
            2. Be as thorough and detailed as possible but do not do any further analysis
            3. Infer any possible important insights from the correlations
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A python code file in a folder called 'eda_files/' containing code to calculate correlations for the dataset.
            """,
        agent=eda_agent,
        output_file="eda_files/eda_correlation.py",
        #output_pydantic=CorrelationAnalysis
    ),

    # Task 3: Distribution Analysis
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} file and develop a python script to analyse variable distributions.
            Specifically the code should:
            1. Calculate distributions
            2. Identify outliers
            3. Be as thorough and detailed as possible but do not do any further analysis
            4. Infer any possible important insights from the distributions
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A python code file in a folder called 'eda_files/' containing code to analyse variable distributions.
            """,
        agent=eda_agent,
        output_file="eda_files/eda_distribution.py",
        #output_pydantic=DistributionAnalysis
    ),

    # Task 4: Pattern Identification
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} file and develop a python script to identify patterns and trends.
            Specifically the code should:
            1. Look for temporal patterns
            2. Identify clusters
            3. Be as thorough and detailed as possible but do not do any further analysis
            4. Infer any possible important insights from the patterns
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            - A python code file in a folder called 'eda_files/' containing code to identify patterns and trends.
            """,
        agent=eda_agent,
        output_file="eda_files/eda_patterns.py",
        #output_pydantic=PatternIdentification
    ),
]

### Data Visualisation

In [7]:
""" The data visualisation tasks will be completed in sequence by the data_visualisation_agent """

# Visualisation Tasks
# Single task handled by data_visualisation_agent. Its output_file lives under
# 'visualisation_files/', the folder named in expected_output, so the saved
# output lands where the prompt says it will.
visualisation_process = [
    Task(
        description="""
            Ingest the first 500 rows of {original_dataset} and generate a python script that will generate various detailed visualisations.
            Specifically:
            1. Ingest the first 500 rows of {original_dataset} file using pandas
            2. Use seaborn for visualisation
            3. Be creative and thoughtful with what charts to create
            4. Be thorough and make an extensive list of visualisations
            5. Include legends and titles
            """.format(original_dataset=inputs["original_dataset"]),
        expected_output="""
            A python code file in a folder called visualisation_files/ containing the code to generate visualisations as per the task.
            """,
        agent=data_visualisation_agent,
        output_file="visualisation_files/visualisations.py",
        #output_pydantic=VisualisationCreation
    ),
]

In [8]:
# Full pipeline as one flat list: cleaning tasks, then EDA tasks, then visualisation tasks
all_tasks = [*data_cleaning_process, *eda_process, *visualisation_process]

## Create the Crew

In [9]:
from langchain_openai import ChatOpenAI

# Assemble the crew. Only the visualisation agent and its task list are active;
# the commented entries switch the other agents / the full task list back on.
data_analysis_crew = Crew(
    process=Process.hierarchical,
    # try to make the manager more deterministic
    manager_llm=ChatOpenAI(temperature=0.2, model="gpt-4o-mini"),
    agents=[
        #data_cleaning_agent,
        #eda_agent,
        data_visualisation_agent,
    ],
    #tasks=all_tasks,
    tasks=visualisation_process,
    memory=True,
    verbose=False,
)

In [10]:
# Use this to give feedback on task completions
#data_analysis_crew.train(n_iterations=1, filename="crew_training.pkl")

## Kickoff the crew

In [11]:
# Start the crew run and keep whatever kickoff() returns in `result`.
# No inputs are passed here: the task descriptions were already filled in
# with the dataset path via str.format when the tasks were defined.
result = data_analysis_crew.kickoff()

[93m Maximum iterations reached. Requesting final answer.[00m
