In [29]:
from langchain.agents import create_agent
from langchain_core.tools import tool
from typing import Annotated
import os 
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
import subprocess

load_dotenv()



True

In [30]:
@tool
def list_files_in_directory(directory="data"):
    """List all files in a directory
    Args:
        directory: The directory to list files from
    Returns:
        A string listing all files in the directory
    """

    data_folder = Path(directory)
    if not data_folder.exists():
        return []
    return f"found {[f.name for f in data_folder.iterdir() if f.is_file()]} files in {directory}"


@tool 
def read_csv_file(file_path:str):
    """Read a CSV file
    Args:
        file_path: The path to the CSV file
    Returns:
        A string containing information from the CSV file
    """
    if os.path.exists(file_path) is False:
        return f"File {file_path} does not exist"
    
    df = pd.read_csv(file_path)

    first_5_rows = df.head(5)

    describe = df.describe()

    info = df.info()

    return f"First 5 rows: {first_5_rows}\nDescription: {describe}\nInfo: {info}"
    
# repl = PythonREPL()

@tool
def python_execute_tool(
    code:Annotated[str, "The python code to execute"]
):
    
    """Use this to execute python code. If you want to see the output of a value,
    you should print it out with `print(...)`. This is visible to the user. Any charts should be displayed using `plt.show()`."""
    try:
        result = subprocess.run(
            ["python3", "-c", code],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if result.returncode != 0:
            return f"❌ Error: {result.stderr.strip()}"
        return result.stdout.strip() or "✅ Code executed successfully (no output)"
    except subprocess.TimeoutExpired:
        return "❌ Error: Code execution timed out"
    return f"Successfully executed the Python REPL tool.\n\nPython code executed:\n```python\n{code}\n```\n\nCode output:\n```\n{result}\n```"

In [31]:



research_agent = create_agent(
    "openai:gpt-4o-mini",
    tools=[list_files_in_directory, read_csv_file],
    system_prompt=(
        """
        You are a helpful assistant that can list files in a directory, and read CSV files.
        you are responsible for givine the user an overview of the dataset 
        """
    )
)


analysis_agent = create_agent(
    "openai:gpt-4o-mini",
    tools=[python_execute_tool],
    system_prompt=(
        """
        You are an Data Analyst.
        You are responsible for analyzing the dataset and providing the user with insights.
        You have access to a sandbox to execute python code.
        """
    )
)







In [32]:
# query= "Tell me about the data science student dataset and the data present in it"

# for step in agent.stream(
#     {"messages": [{"role": "user", "content": query}]}
# ):
#     for update in step.values():
#         for message in update.get("messages", []):
#             message.pretty_print()
    

In [33]:
@tool
def research(request:str) -> str:
    """
    Research Information about a file

    Use this when the user needs information about the avaliable files in the directory or 
    about a specific csv file present in the directory. Respond with the information in a concise manner.

    Input: Natural Language Request 
    Examples: 
    - What files are present in the directory?
    - What data is present in the data science student dataset?
    - What is the average age of the students in the data science student dataset?
    - What is the average salary of the students in the data science student dataset?
    - What is the average age of the students in the data science student dataset?
    """
    result = research_agent.invoke({
        "messages": [{"role": "user", "content": request}]
    })
    return result["messages"][-1].text

@tool
def analyze_data(request:str) -> str:
    """
    Analyze the data present in the dataset

    Use this when the user needs to analyze the data present in the dataset or to generate a chart that is not answered by the research tool.
    You have access to a sandbox to execute python code.

    Input: Natural Language Request 
    Examples: 
       - Plot a bar chart of the average age of the students in the data science student dataset.
       - Create a histogram of the age of the students in the data science student dataset.
       - Run a regression analysis on the data science student dataset.
       
    """
    result = analysis_agent.invoke({
        "messages": [{"role": "user", "content": request}]
    })
    return result["messages"][-1].text






In [34]:
supervisor_agent = create_agent(
    "openai:gpt-4o-mini",
    tools=[research, analyze_data],
    system_prompt=(
        """
        You are a super assistant that can research and analyze the data present. 
        You have access to a research tool and an analyze tool.
        You are responsible for giving the user the best possible answer to their question.
        When a request involves multiple actions, use multiple tools in sequence.
        Do not make assumptions, only use the tools provided to you.
        Summarize your response in a concise manner.

        """)
    )

In [35]:
query = (
    "Give me an overview of the datasets I have "
    "Visualize any interesting insights from the first dataset"
)

for step in supervisor_agent.stream(
    {"messages": [{"role": "user", "content": query}]}
):
    for update in step.values():
        for message in update.get("messages", []):
            message.pretty_print()

Tool Calls:
  research (call_hl2ncaoek2MFf0V1878uxfPR)
 Call ID: call_hl2ncaoek2MFf0V1878uxfPR
  Args:
    request: What files are present in the directory?
Name: research

The directory contains the following file: 

- `data_science_student_marks.csv` 

If you would like to know more about this file or its contents, please let me know!
Tool Calls:
  research (call_vR8cK3hV6rl6sDaX8ESoZx2z)
 Call ID: call_vR8cK3hV6rl6sDaX8ESoZx2z
  Args:
    request: What data is present in the data_science_student_marks.csv dataset?
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   student_id      497 non-null    int64 
 1   location        497 non-null    object
 2   age             497 non-null    int64 
 3   sql_marks       497 non-null    int64 
 4   excel_marks     497 non-null    int64 
 5   python_marks    497 non-null    int64 
 6   power_bi_marks  