# Data Analysis Agent

This agent is an AI-powered data analysis tool that interprets and answers questions about a dataset using natural language. It combines language models with data manipulation tools to enable intuitive data exploration.


**Install necessary libraries**

In [None]:
!pip install pandas
!pip install numpy
!pip install langchain
!pip install langchain-experimental
!pip install langchain_groq
!pip install python-dotenv

**Import necessary libraries**

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from langchain.agents import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os

**Generate Sample Data**

In [4]:
# Seed NumPy's global generator so the synthetic dataset is the same on every run
np.random.seed(42)

# Size of the synthetic dataset (one record per calendar day)
n_rows = 1000

# Consecutive daily dates, beginning on 1 January 2024
start_date = datetime(2024, 1, 1)
dates = [start_date + timedelta(days=day_offset) for day_offset in range(n_rows)]

# Vocabularies for the categorical columns
makes = [
    'Toyota', 'Honda', 'Ford', 'Chevrolet', 'Nissan',
    'BMW', 'Mercedes', 'Audi', 'Hyundai', 'Kia',
]
models = ['Sedan', 'SUV', 'Truck', 'Hatchback', 'Coupe', 'Van']
colors = ['Red', 'Blue', 'Black', 'White', 'Silver', 'Gray', 'Green']
engine_sizes = [1.6, 2.0, 2.5, 3.0, 3.5, 4.0]
sales_people = ['Alice', 'Bob', 'Charlie', 'David', 'Eva']

# Fill the columns one at a time. The draws share the seeded global generator,
# so they must stay in exactly this order to yield the same values.
data = {'Date': dates}
data['Make'] = np.random.choice(makes, n_rows)
data['Model'] = np.random.choice(models, n_rows)
data['Color'] = np.random.choice(colors, n_rows)
data['Year'] = np.random.randint(2015, 2023, n_rows)  # upper bound is exclusive
data['Price'] = np.random.uniform(20000, 80000, n_rows).round(2)
data['Mileage'] = np.random.uniform(0, 100000, n_rows).round(0)
data['EngineSize'] = np.random.choice(engine_sizes, n_rows)
data['FuelEfficiency'] = np.random.uniform(20, 40, n_rows).round(1)
data['SalesPerson'] = np.random.choice(sales_people, n_rows)

# Assemble the table and order it chronologically
df = pd.DataFrame(data)
df = df.sort_values(by='Date')

# Show a preview followed by the column summary
print("\nFirst ten rows of the generated data:")
print(df.head(10))

print("\nDataFrame info:")
df.info()


First ten rows of the generated data:
        Date       Make      Model  Color  Year     Price  Mileage  \
0 2024-01-01   Mercedes      Sedan  Green  2022  57952.65   5522.0   
1 2024-01-02  Chevrolet  Hatchback    Red  2021  58668.22  94238.0   
2 2024-01-03       Audi      Truck  White  2019  69187.87   7482.0   
3 2024-01-04     Nissan  Hatchback  Black  2016  40004.44  43846.0   
4 2024-01-05   Mercedes  Hatchback    Red  2016  63983.07  52988.0   
5 2024-01-06        Kia  Hatchback   Gray  2018  51714.69  98530.0   
6 2024-01-07       Ford  Hatchback  White  2022  67603.64  16602.0   
7 2024-01-08   Mercedes      Coupe    Red  2022  79432.76  11443.0   
8 2024-01-09       Audi      Coupe    Red  2020  32039.06    871.0   
9 2024-01-10     Nissan        SUV  Black  2015  79678.92  15557.0   

   EngineSize  FuelEfficiency SalesPerson  
0         2.0            24.7       Alice  
1         1.6            26.2         Bob  
2         2.0            28.0       David  
3         3.5 

**Create Data Analysis Agent**

In [5]:
# Read the .env file so its entries become available as environment variables
load_dotenv()
# Look up the key stored under API_KEY; the result is None when it is not set
api_key = os.environ.get("API_KEY")

In [14]:
# Chat model served by Groq, accessed through its LangChain wrapper
GROQ_MODEL_NAME = "llama3-70b-8192"
llm = ChatGroq(model_name=GROQ_MODEL_NAME, api_key=api_key)

# Build the pandas DataFrame agent around `df`.
# - allow_dangerous_code is an explicit opt-in; the agent answers questions by
#   running Python against the DataFrame (see the python_repl_ast steps in the
#   traces below).
# - verbose prints each reasoning/action step as the chain runs.
# - max_iterations caps the number of steps the agent may take per question.
agent = create_pandas_dataframe_agent(
    llm=llm,
    df=df,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    allow_dangerous_code=True,
    max_iterations=5,
    verbose=True,
)

print("Data Analysis Agent is ready. You can now ask questions about the data.")

Data Analysis Agent is ready. You can now ask questions about the data.


**Define Question-Asking Function**

In [None]:
def ask_agent(question: str) -> None:
    """Send a natural-language question to the agent and print its answer.

    Uses the module-level ``agent`` created earlier in the notebook.

    Args:
        question: The question to ask about the dataset.

    Returns:
        None. The question and the ``output`` field of the agent's response
        are printed between ``---`` separator lines.
    """
    # Only "input" is supplied. The agent executor regenerates
    # "agent_scratchpad" from its own intermediate steps on every iteration,
    # so a caller-provided scratchpad is overwritten and never reaches the model.
    response = agent.invoke({"input": question})

    print("---")
    print(f"Question: {question}")
    print(f"Answer: {response['output']}")
    print("---")


In [19]:
# Put a few example questions to the agent, one after another
demo_questions = (
    "What are the column names in this dataset?",
    "How many rows are in this dataset?",
    "Find the most expensive model of Nissan",
)
for demo_question in demo_questions:
    ask_agent(demo_question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To get the column names of the dataframe, I can use the `columns` attribute.

Action: python_repl_ast
Action Input: df.columns[0m[36;1m[1;3mIndex(['Date', 'Make', 'Model', 'Color', 'Year', 'Price', 'Mileage',
       'EngineSize', 'FuelEfficiency', 'SalesPerson'],
      dtype='object')[0m[32;1m[1;3mI now know the final answer.
Final Answer: The column names in this dataset are: 'Date', 'Make', 'Model', 'Color', 'Year', 'Price', 'Mileage', 'EngineSize', 'FuelEfficiency', 'SalesPerson'.[0m

[1m> Finished chain.[0m
---
Question: What are the column names in this dataset?
Answer: The column names in this dataset are: 'Date', 'Make', 'Model', 'Color', 'Year', 'Price', 'Mileage', 'EngineSize', 'FuelEfficiency', 'SalesPerson'.
---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To find the number of rows in the dataset, I need to use the shape attribute of the pandas dataframe.

Action: python_