In [1]:
!pip install --upgrade dspy openai



In [2]:
import pandas as pd
import numpy as np
import openai
import dspy

In [3]:
from google.colab import userdata
OPENAI_KEY = userdata.get('GPT_KEY')
openai.api_key = OPENAI_KEY

In [4]:
openai_model = dspy.OpenAI(model='gpt-3.5-turbo',api_key = OPENAI_KEY)
dspy.settings.configure(lm=openai_model)

In [5]:
class PlannerAgent(dspy.Signature):
    """ You are a data analytics planner agent. Your task is to create a strategic plan for achieving a user-defined goal using the user defined goal and agents. You have access to two key inputs:
          Data Agent Descriptions: A list of available data agents, each with a specific function or capability.
          User-Defined Goal: The end objective that the user wants to accomplish using the available datasets and agents.

        Task:
          1. Develop a comprehensive plan to achieve the user-defined goal. You will determine which agents and datasets to use in a sequence that best achieves the goal.
          2. If the goal appears infeasible or unclear, request further clarification or additional details from the user to refine the goal.

        output format:
          plan: Agent1 -> Agent2 -> Agent3 (e.g., Agent1 -> Agent3)
          plan_desc: Provide a detailed explanation of why each agent is selected in sequence. Describe how each agent contributes to achieving the goal.
          You are not required to use all the agents available in every response. Use only the relevant agents necessary for achieving the goal.

    """
    Agent_desc = dspy.InputField(desc= "The agents available in the system")
    goal = dspy.InputField(desc="The user defined goal ")
    plan = dspy.OutputField(desc="The plan that would achieve the user defined goal")
    plan_desc= dspy.OutputField(desc="The reasoning behind the chosen plan")

class preprocessing_agent(dspy.Signature):
    """ You are a Data Pre-processing Agent. Using Numpy and Pandas, create an EDA pipeline based on a user-defined goal and the available datasets. Clean the data (handle missing values, outliers, duplicates), transform it (scaling, encoding), and generate basic analysis (summary statistics, correlations, visualizations if needed). Output the Python code for the full pipeline, with brief comments explaining each step."""
    dataset = dspy.InputField(desc="Available datasets loaded in the system, use this df_name,columns  set df as copy of df_name")
    goal = dspy.InputField(desc="The user defined goal ")
    code = dspy.OutputField(desc ="The code that does the data preprocessing and introductory analysis")

class statistical_analytics_agent(dspy.Signature):
    """ You are a Statistical Analytics Agent. Using Statsmodels, analyze the dataset to achieve the user-defined goal. Choose the appropriate statistical method (e.g., regression, hypothesis testing), preprocess the data if needed, and generate Python code for model fitting and analysis (e.g., p-values, confidence intervals). Include comments explaining each step."""
    dataset = dspy.InputField(desc="Available datasets loaded in the system, use this df_name,columns  set df as copy of df_name")
    goal = dspy.InputField(desc="The user defined goal for the analysis to be performed")
    commentary = dspy.OutputField(desc="The comments about what analysis is being performed")
    code = dspy.OutputField(desc ="The code that does the statistical analysis using statsmodel")

class Data_Viz(dspy.Signature):
    """
    You are an AI Agent using Plotly to create visualizations based on a user-defined goal and dataset. Identify the relevant data, generate the appropriate visualizations (e.g., bar charts, scatter plots), and output the Python code with necessary customizations. If required data is missing, state:
    "The dataset does not contain the necessary columns or information to generate the requested visualization."
    """
    dataset = dspy.InputField(desc=" Provides information about the data in the data frame. Only use column names and dataframe_name as in this context")
    goal = dspy.InputField(desc="user defined goal which includes information about data and chart they want to plot")
    commentary = dspy.OutputField(desc="The comments about what analysis is being performed")
    code= dspy.OutputField(desc="Plotly code that visualizes what the user needs according to the query & dataframe_index & styling_context")

class code_combiner_agent(dspy.Signature):
    """ You are a Code Combine Agent. Combine Python code from multiple agents into a single, error-free script. Fix any syntax, logical, or compatibility issues, optimize for efficiency, and remove redundancy. Output the final, well-commented Python code. """
    agent_code_list =dspy.InputField(desc="A list of code given by each agent")
    code = dspy.OutputField(desc="Refined complete code base")


In [6]:
from httpx import Client

# Save the original initialization
original_client_init = Client.__init__

# Patch to remove `proxies` argument
def patched_client_init(self, *args, **kwargs):
    kwargs.pop("proxies", None)  # Remove proxies if passed
    original_client_init(self, *args, **kwargs)

Client.__init__ = patched_client_init


In [7]:
class Data_analyst(dspy.Module):
    def __init__(self, agents):
        # Initialize agents and planner
        self.agents = {}
        self.agent_inputs = {}
        self.agent_desc = []
        i = 0
        for a in agents:
            name = a.__pydantic_core_schema__['schema']['model_name']
            self.agents[name] = dspy.ChainOfThought(a)
            self.agent_inputs[name] = {x.strip() for x in str(agents[i].__pydantic_core_schema__['cls']).split('->')[0].split('(')[1].split(',')}
            self.agent_desc.append(str(a.__pydantic_core_schema__['cls']))
            i += 1
        self.planner = dspy.ChainOfThought(PlannerAgent)
        self.combine = dspy.ChainOfThought(code_combiner_agent)


    def forward(self, dataset, query):
      dict_ = {}

      dict_['Agent_desc'] = str(self.agent_desc)
      dict_['goal'] = query

      execution_plan = self.planner(
            Agent_desc=dict_['Agent_desc'],
            goal= query,
            agents=[agent for agent in self.agents.values()]
            )
      print(execution_plan)

      chunk_size = 100
      chunks = np.array_split(dataset, len(dataset) // chunk_size + 1)
      print(len(chunks))
      docs = []
      for chunk in chunks:
        chunk_str = chunk.to_string()
        dict_ = {
            'dataset': chunk_str,
            'goal': query,
            'Agent_desc': str(self.agent_desc)
        }
        # agent_list = execution_plan.plan.split('->')
        # code_combiner = ""
        # Agent_outputs = {}
        # for agent_name in agent_list:
        #       name = agent_name.strip()
        #       agent = self.agents[name]
        #       result = agent(dataset=dict_['dataset'], agent_desc = dict_['Agent_desc'], goal= dict_['goal'])
        #       Agent_outputs[name] = result
        #       code_combiner += result.code
        # combined_code = self.combine(agent_code_list = code_combiner)



      agent_list = execution_plan.plan.split('->')
      code_combiner = ""
      Agent_outputs = {}
      for agent_name in agent_list:
            name = agent_name.strip()
            agent = self.agents[name]
            result = agent(dataset=dict_['dataset'], agent_desc = dict_['Agent_desc'], goal= dict_['goal'])
            Agent_outputs[name] = result
            code_combiner += result.code


      combined_code = self.combine(agent_code_list = code_combiner)
      return combined_code, Agent_outputs



agents = [preprocessing_agent, statistical_analytics_agent, Data_Viz]

Data_analyst_system = Data_analyst(agents)
dataset = pd.read_csv("/content/sample_data/california_housing_test.csv")
query = "Visualizations on the dataset"

output,agent_outputs = Data_analyst_system.forward(dataset, query)
print(output)


 		You are using the client GPT3, which will be removed in DSPy 2.6.
 		Changing the client is straightforward and will let you use new features (Adapters) that improve the consistency of LM outputs, especially when using chat LMs. 

 		Learn more about the changes and how to migrate at
 		https://github.com/stanfordnlp/dspy/blob/main/examples/migration.ipynb
  return bound(*args, **kwds)


Prediction(
    rationale='produce the plan_desc. We need to start by preprocessing the dataset to clean and transform it, then move on to perform statistical analysis to identify patterns and relationships in the data. Finally, we can use data visualization to present the insights in a clear and visually appealing manner.',
    plan='preprocessing_agent -> statistical_analytics_agent -> Data_Viz',
    plan_desc='1. We will start by using the preprocessing_agent to clean the dataset, handle missing values, outliers, duplicates, and transform the data as needed.\n2. Next, we will utilize the statistical_analytics_agent to perform statistical analysis on the preprocessed dataset to identify patterns and relationships that can be visualized.\n3. Finally, we will use the Data_Viz agent to create visualizations'
)
31
Prediction(
    rationale='Combine the code snippets to load a dataset, handle missing values and duplicates, and create scatter plots using different visualization libraries.'

In [8]:
print(agent_outputs['preprocessing_agent'])

Prediction(
    rationale='create visualizations for the dataset. We will start by loading the dataset, handling missing values, outliers, duplicates, and then proceed to transform the data for visualization purposes.',
    code='```python\nimport numpy as np\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Load the dataset\ndf_name = pd.read_csv(\'your_dataset.csv\')\ndf = df_name.copy()\n\n# Check for missing values\nmissing_values = df.isnull().sum()\nprint("Missing values:\\n", missing_values)\n\n# Handle missing values\ndf = df.dropna()\n\n# Check for duplicates\nduplicates = df.duplicated().sum()\nprint("Number of duplicates:", duplicates)\n\n# Handle duplicates\ndf = df.drop_duplicates()'
)


In [9]:
agent_outputs['statistical_analytics_agent']

Prediction(
    rationale='create visualizations for the dataset. We will start by loading the dataset and then use matplotlib and seaborn libraries to create various plots such as scatter plots, histograms, and box plots to visualize the data distribution and relationships between variables.',
    commentary='We will use matplotlib and seaborn libraries to create visualizations for the dataset.',
    code="```python\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Load the dataset\ndf = pd.read_csv('dataset.csv')\n\n# Scatter plot of median_income vs median_house_value\nplt.figure(figsize=(10, 6))\nsns.scatterplot(x='median_income', y='median_house_value', data=df)\nplt.title('Scatter plot of Median Income vs Median House Value')\nplt"
)

In [10]:
agent_outputs['Data_Viz']

Prediction(
    rationale='produce the code. We will create multiple visualizations to explore different aspects of the dataset such as the distribution of median house values, the relationship between median income and median house value, and the distribution of housing median age.',
    commentary='The following code will generate multiple visualizations using Plotly to explore different aspects of the dataset.',
    code="```python\nimport plotly.express as px\nimport pandas as pd\n\n# Create a DataFrame from the provided dataset\ndata = {\n    'longitude': [-118.32, -121.30, -117.69, -118.34, -121.92, -122.11, -117.65, -121.80, -122.66, -122.39],"
)