# AI-Powered Data Analysis

## Download the datasets

In [1]:
# ! wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/N0CceRlquaf9q85PK759WQ/regression-dataset.csv
# ! wget https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/7J73m6Nsz-vmojwab91gMA/classification-dataset.csv

## Importing required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn
import sklearn
import langchain
import openai
import langchain_ollama

import glob
import os
from typing import List, Optional

from langchain_classic.agents import tool, create_openai_tools_agent, create_tool_calling_agent, AgentExecutor

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

2026-01-17 20:43:54.533163: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Langchain Tool

### List CSV files

In [3]:
from langchain_core.tools import tool


@tool
def list_csv_files() -> Optional[List[str]]:
    """List all CSV file names in the local directory.

    Returns:
        A list containing CSV file names.
        If no CSV files are found, returns None.
    """
    csv_files = glob.glob(os.path.join(os.getcwd(), "*.csv"))
    if not csv_files:
        return None
    return [os.path.basename(file) for file in csv_files]

In [4]:
print("Tool Name:", list_csv_files.name)
print("Tool Description:", list_csv_files.description)
print("Tool Arguments:", list_csv_files.args)

files = list_csv_files.invoke({})
files

Tool Name: list_csv_files
Tool Description: List all CSV file names in the local directory.

    Returns:
        A list containing CSV file names.
        If no CSV files are found, returns None.
Tool Arguments: {}


['classification-dataset.csv', 'regression-dataset.csv']

### Dataset caching tool

While loading datasets multiple times, we waste tokens and context window space. To fix this, create a **global** cache that stores DataFrames after theyâ€™re loaded the first time. This approach has several benefits:

1. It reduces token usage by letting you reference datasets by name instead of pasting their contents  
2. It improves performance by loading each dataset only once  
3. It keeps datasets available across different tool calls  

The following tool lets the agent preload datasets into this cache system.

In [5]:
DATAFRAME_CACHE = {}

@tool
def preload_datasets(paths: List[str]) -> str:
    """
    Loads CSV files into a global cache if not already loaded.
    
    This function helps to efficiently manage datasets by loading them once
    and storing them in memory for future use. Without caching, you would
    waste tokens describing dataset contents repeatedly in agent responses.
    
    Args:
        paths: A list of file paths to CSV files.

    Returns:
        A message summarizing which datasets were loaded or already cached.
    """
    loaded = []
    cached = []
    for path in paths:
        if path not in DATAFRAME_CACHE:
            DATAFRAME_CACHE[path] = pd.read_csv(path)
            loaded.append(path)
        else:
            cached.append(path)
    
    return (
        f"Loaded datasets: {loaded}\n"
        f"Already cached: {cached}"
    )

In [6]:
print("Tool Name:", preload_datasets.name)
print("Tool Description:", preload_datasets.description)
print("Tool Arguments:", preload_datasets.args)


preload_datasets.invoke({"paths":files})

len(DATAFRAME_CACHE)

Tool Name: preload_datasets
Tool Description: Loads CSV files into a global cache if not already loaded.

This function helps to efficiently manage datasets by loading them once
and storing them in memory for future use. Without caching, you would
waste tokens describing dataset contents repeatedly in agent responses.

Args:
    paths: A list of file paths to CSV files.

Returns:
    A message summarizing which datasets were loaded or already cached.
Tool Arguments: {'paths': {'items': {'type': 'string'}, 'title': 'Paths', 'type': 'array'}}


2

### Summarization tool 

In [7]:
from typing import List, Optional,Dict,Any

@tool
def get_dataset_summaries(dataset_paths: List[str]) -> List[Dict[str, Any]]:
    """
    Analyze multiple CSV files and return metadata summaries for each.

    Args:
        dataset_paths (List[str]): 
            A list of file paths to CSV datasets.

    Returns:
        List[Dict[str, Any]]: 
            A list of summaries, one per dataset, each containing:
            - "file_name": The path of the dataset file.
            - "column_names": A list of column names in the dataset.
            - "data_types": A dictionary mapping column names to their data types (as strings).
    """
    summaries = []

    for path in dataset_paths:
        # Load and cache the dataset if not already cached
        if path not in DATAFRAME_CACHE:
            DATAFRAME_CACHE[path] = pd.read_csv(path)
        
        df = DATAFRAME_CACHE[path]

        # Build summary
        summary = {
            "file_name": path,
            "column_names": df.columns.tolist(),
            "data_types": df.dtypes.astype(str).to_dict()
        }

        summaries.append(summary)

    return summaries

In [8]:
print("Tool Name:", get_dataset_summaries.name)
print("Tool Description:", get_dataset_summaries.description)
print("Tool Arguments:", get_dataset_summaries.args)


summaries =get_dataset_summaries.invoke({"dataset_paths":files})

summaries

Tool Name: get_dataset_summaries
Tool Description: Analyze multiple CSV files and return metadata summaries for each.

Args:
    dataset_paths (List[str]): 
        A list of file paths to CSV datasets.

Returns:
    List[Dict[str, Any]]: 
        A list of summaries, one per dataset, each containing:
        - "file_name": The path of the dataset file.
        - "column_names": A list of column names in the dataset.
        - "data_types": A dictionary mapping column names to their data types (as strings).
Tool Arguments: {'dataset_paths': {'items': {'type': 'string'}, 'title': 'Dataset Paths', 'type': 'array'}}


[{'file_name': 'classification-dataset.csv',
  'column_names': ['mean radius',
   'mean texture',
   'mean perimeter',
   'mean area',
   'mean smoothness',
   'mean compactness',
   'mean concavity',
   'mean concave points',
   'mean symmetry',
   'mean fractal dimension',
   'radius error',
   'texture error',
   'perimeter error',
   'area error',
   'smoothness error',
   'compactness error',
   'concavity error',
   'concave points error',
   'symmetry error',
   'fractal dimension error',
   'worst radius',
   'worst texture',
   'worst perimeter',
   'worst area',
   'worst smoothness',
   'worst compactness',
   'worst concavity',
   'worst concave points',
   'worst symmetry',
   'worst fractal dimension',
   'target'],
  'data_types': {'mean radius': 'float64',
   'mean texture': 'float64',
   'mean perimeter': 'float64',
   'mean area': 'float64',
   'mean smoothness': 'float64',
   'mean compactness': 'float64',
   'mean concavity': 'float64',
   'mean concave points': 'fl

### DataFrame method execution tool

In [9]:
@tool
def call_dataframe_method(file_name: str, method: str) -> str:
   """
   Execute a method on a DataFrame and return the result.
   This tool lets you run simple DataFrame methods like 'head', 'tail', or 'describe' 
   on a dataset that has already been loaded and cached using 'preload_datasets'.
   Args:
       file_name (str): The path or name of the dataset in the global cache.
       method (str): The name of the method to call on the DataFrame. Only no-argument 
                     methods are supported (e.g., 'head', 'describe', 'info').
   Returns:
       str: The output of the method as a formatted string, or an error message if 
            the dataset is not found or the method is invalid.
   Example:
       call_dataframe_method(file_name="data.csv", method="head")
   """
   # Try to get the DataFrame from cache, or load it if not already cached
   if file_name not in DATAFRAME_CACHE:
       try:
           DATAFRAME_CACHE[file_name] = pd.read_csv(file_name)
       except FileNotFoundError:
           return f"DataFrame '{file_name}' not found in cache or on disk."
       except Exception as e:
           return f"Error loading '{file_name}': {str(e)}"
   
   df = DATAFRAME_CACHE[file_name]
   func = getattr(df, method, None)
   if not callable(func):
       return f"'{method}' is not a valid method of DataFrame."
   try:
       result = func()
       return str(result)
   except Exception as e:
       return f"Error calling '{method}' on '{file_name}': {str(e)}"

In [10]:
print("Tool Name:", call_dataframe_method.name)
print("Tool Description:", call_dataframe_method.description)
print("Tool Arguments:", call_dataframe_method.args)


summaries = call_dataframe_method.invoke({
    "file_name": files[1],
    "method": "info"
})

print(summaries)


Tool Name: call_dataframe_method
Tool Description: Execute a method on a DataFrame and return the result.
This tool lets you run simple DataFrame methods like 'head', 'tail', or 'describe' 
on a dataset that has already been loaded and cached using 'preload_datasets'.
Args:
    file_name (str): The path or name of the dataset in the global cache.
    method (str): The name of the method to call on the DataFrame. Only no-argument 
                  methods are supported (e.g., 'head', 'describe', 'info').
Returns:
    str: The output of the method as a formatted string, or an error message if 
         the dataset is not found or the method is invalid.
Example:
    call_dataframe_method(file_name="data.csv", method="head")
Tool Arguments: {'file_name': {'title': 'File Name', 'type': 'string'}, 'method': {'title': 'Method', 'type': 'string'}}
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype

### Model evaluation tools

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error

# Assumes this global cache is shared
DATAFRAME_CACHE = {}

@tool
def evaluate_classification_dataset(file_name: str, target_column: str) -> Dict[str, float]:
    """
    Train and evaluate a classifier on a dataset using the specified target column.
    Args:
        file_name (str): The name or path of the dataset stored in DATAFRAME_CACHE.
        target_column (str): The name of the column to use as the classification target.
    Returns:
        Dict[str, float]: A dictionary with the model's accuracy score.
    """
    # Try to get the DataFrame from cache, or load it if not already cached
    if file_name not in DATAFRAME_CACHE:
        try:
            DATAFRAME_CACHE[file_name] = pd.read_csv(file_name)
        except FileNotFoundError:
            return {"error": f"DataFrame '{file_name}' not found in cache or on disk."}
        except Exception as e:
            return {"error": f"Error loading '{file_name}': {str(e)}"}
    
    df = DATAFRAME_CACHE[file_name]
    if target_column not in df.columns:
        return {"error": f"Target column '{target_column}' not found in '{file_name}'."}
    
    X = df.drop(columns=[target_column])
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return {"accuracy": acc}

@tool
def evaluate_regression_dataset(file_name: str, target_column: str) -> Dict[str, float]:
    """
    Train and evaluate a regression model on a dataset using the specified target column.
    Args:
        file_name (str): The name or path of the dataset stored in DATAFRAME_CACHE.
        target_column (str): The name of the column to use as the regression target.
    Returns:
        Dict[str, float]: A dictionary with RÂ² score and Mean Squared Error.
    """
    # Try to get the DataFrame from cache, or load it if not already cached
    if file_name not in DATAFRAME_CACHE:
        try:
            DATAFRAME_CACHE[file_name] = pd.read_csv(file_name)
        except FileNotFoundError:
            return {"error": f"DataFrame '{file_name}' not found in cache or on disk."}
        except Exception as e:
            return {"error": f"Error loading '{file_name}': {str(e)}"}
    
    df = DATAFRAME_CACHE[file_name]
    if target_column not in df.columns:
        return {"error": f"Target column '{target_column}' not found in '{file_name}'."}
    
    X = df.drop(columns=[target_column])
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    return {
        "r2_score": r2,
        "mean_squared_error": mse
    }

In [12]:
print("Tool Name:", evaluate_classification_dataset.name)
print("Tool Description:", evaluate_classification_dataset.description)
print("Tool Arguments:", evaluate_classification_dataset.args)

print("Tool Name:", evaluate_regression_dataset.name)
print("Tool Description:", evaluate_regression_dataset.description)
print("Tool Arguments:", evaluate_regression_dataset.args)

Tool Name: evaluate_classification_dataset
Tool Description: Train and evaluate a classifier on a dataset using the specified target column.
Args:
    file_name (str): The name or path of the dataset stored in DATAFRAME_CACHE.
    target_column (str): The name of the column to use as the classification target.
Returns:
    Dict[str, float]: A dictionary with the model's accuracy score.
Tool Arguments: {'file_name': {'title': 'File Name', 'type': 'string'}, 'target_column': {'title': 'Target Column', 'type': 'string'}}
Tool Name: evaluate_regression_dataset
Tool Description: Train and evaluate a regression model on a dataset using the specified target column.
Args:
    file_name (str): The name or path of the dataset stored in DATAFRAME_CACHE.
    target_column (str): The name of the column to use as the regression target.
Returns:
    Dict[str, float]: A dictionary with RÂ² score and Mean Squared Error.
Tool Arguments: {'file_name': {'title': 'File Name', 'type': 'string'}, 'target_col

In [13]:
result = evaluate_classification_dataset.invoke({
    "file_name": files[0],
    "target_column": "target"
})

print(result)

{'accuracy': 0.9649122807017544}


In [14]:
result = evaluate_regression_dataset.invoke({
    "file_name": files[0],
    "target_column": "target"
})

print(result)

{'r2_score': 0.8590216180805765, 'mean_squared_error': 0.033118421052631575}


## Agents

Agents in LangChain are advanced components that enable AI models to decide when and how to use tools dynamically. Instead of relying on predefined scripts, agents analyze user queries and choose the best tools to achieve a goal. The next step is defining your agent, which requires specifying how it should think and behave. You'll use `ChatPromptTemplate.from_messages()` to create a structured prompt with three essential components:

1. **System message**: This establishes the agent's identity and primary objective. You define it as a data science assistant whose task is to analyze CSV files and determine whether each dataset is suitable for classification or regression based on its structure. This gives the agent a clear purpose and scope.

2. **User input**: The `{input}` placeholder will be replaced with the user's actual query. This allows the agent to respond directly to what the user is asking about.

3. **Agent scratchpad**: The `{agent_scratchpad}` placeholder is crucial for tool-calling agents as it provides space for the agent to show its reasoning process and track intermediate steps. This enables the agent to build a chain of thought, call tools sequentially, and use the results from one tool to inform decisions about subsequent tool calls.

![agents copy.png](https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/TYkDvBmpmmSXx6TftNpJgw/agents%20copy.png)

[Reference article for image](https://medium.com/@Shamimw/understanding-langchain-tools-and-agents-a-guide-to-building-smart-ai-applications-e81d200b3c12)


In [15]:
prompt = ChatPromptTemplate.from_messages([
    ("system", 
     "You are a data science assistant. Use the available tools to analyze CSV files. "
     "Your job is to determine whether each dataset is for classification or regression, based on its structure."),

    ("placeholder", "{agent_scratchpad}"),  # Fixed: Use MessagesPlaceholder(variable_name="agent_scratchpad")
    ("user", "{input}"),  # Order doesn't matter much, but user last before scratchpad
])

In [16]:
prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a data science assistant. Use the available tools to analyze CSV files. "
     "Your job is to determine whether each dataset is for classification or regression."),
    ("user", "{input}"),
    MessagesPlaceholder(variable_name="agent_scratchpad"),
])


In [17]:
# from langchain.chat_models import init_chat_model

# llm = init_chat_model("llama3.2:1b", model_provider="ollama", streaming=False )

from langchain_ollama import ChatOllama 

llm = ChatOllama(
    model= "llama3.1:8b", 
    temperature=0  # Low temp for reliable tool calling
)


In [18]:
tools=[list_csv_files, preload_datasets, get_dataset_summaries, call_dataframe_method, evaluate_classification_dataset, evaluate_regression_dataset]

### Agent creation and limitations

Here, you'll create your agent using `create_openai_tools_agent()`, which combines the language model, tools, and prompt template into a functional agent. However, this raw agent has significant limitations when used directly. It only performs a single step of reasoning and tool usage per invocation, then returns its intermediate thought process rather than a final answer. This behavior occurs because the agent doesn't automatically manage the full loop of thinking, acting, observing results, and continuing to reason until reaching a complete solution.


In [19]:
# Construct the tool calling agent
#agent = create_openai_tools_agent(llm, tools, prompt)
agent = create_tool_calling_agent(llm,tools, prompt)

In [20]:
response = agent.invoke({
    "input": "List out the dataset .csv files in current directory",
    "intermediate_steps": []
})

In [21]:
# Get the first ToolAgentAction from the list
action = response[0]

# Print the key details
print("ðŸ§  Agent decided to call a tool:")
print("Tool Name:", action.tool)
print("Tool Input:", action.tool_input)
print("Log:\n", action.log.strip())

ðŸ§  Agent decided to call a tool:
Tool Name: list_csv_files
Tool Input: {}
Log:
 Invoking: `list_csv_files` with `{}`


### Agent executor ReAct

Managing this ReAct loop manually can be cumbersome, which is why you'll use the AgentExecutor. The AgentExecutor wraps the agent and the toolset, and handles the full tool-use loop behind the scenes. It automatically runs the agent, executes the selected tool, takes the result (observation), and feeds it back into the agent until a final answer is reached. Without the executor, you'd have to manually manage every step, including checking whether the agent returned a tool call or a final answer, running the tool, and tracking the intermediate stepsâ€”all of which the executor handles for you.


In [22]:
from langchain_classic.agents import AgentExecutor

#### Agent executor configuration

The `AgentExecutor` line creates a complete, autonomous agent system by wrapping your basic agent with additional functionality. This executor manages the full ReAct loop (reasoning and acting) that allows the agent to make multiple tool calls in sequence until reaching a final answer. Configure it with several important parameters:

1. **agent**: The agent to run for creating a plan and determining actions to take at each step of the execution loop.

2. **tools**: The valid tools the agent can call.
3. **verbose=True**: Enables detailed logging of each step in the agent's thinking and tool-calling process, which is invaluable for debugging and understanding how the agent arrives at its conclusions.

4. **handle_parsing_errors=True**: Rather than crashing, the executor will attempt to recover and continue the conversation.

The second line, `agent_executor.agent.stream_runnable = False`, disables streaming mode for the agent. 


In [23]:
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True,
    return_intermediate_steps=True   # REQUIRED
)

agent_executor.agent.stream_runnable = False

In [24]:
response = agent_executor.invoke({
    "input": "List out the dataset .csv files in current directory"
})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `list_csv_files` with `{}`


[0m[36;1m[1;3m['classification-dataset.csv', 'regression-dataset.csv'][0m[32;1m[1;3mThe available CSV files are `classification-dataset.csv` and `regression-dataset.csv`.[0m

[1m> Finished chain.[0m


In [25]:
type(response)

dict

In [26]:
action, tool_result = response["intermediate_steps"][0]

print("ðŸ§  Agent decided to call a tool:")
print("Tool Name:", action.tool)
print("Tool Input:", action.tool_input)
print("Log:\n", action.log.strip())

print("\nðŸ“¤ Tool Output:")
print(tool_result)


ðŸ§  Agent decided to call a tool:
Tool Name: list_csv_files
Tool Input: {}
Log:
 Invoking: `list_csv_files` with `{}`

ðŸ“¤ Tool Output:
['classification-dataset.csv', 'regression-dataset.csv']


In [29]:
print("ðŸ“Š Ask questions about your dataset (type 'exit' to quit):")

while True:
    user_input=input(" You:")
    if user_input.strip().lower() in ['exit','quit']:
        print("see ya later")
        break
    result=agent_executor.invoke({"input":user_input})
    print(f"my Agent: {result['output']}")

ðŸ“Š Ask questions about your dataset (type 'exit' to quit):


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `list_csv_files` with `{}`


[0m[36;1m[1;3m['classification-dataset.csv', 'regression-dataset.csv'][0m[32;1m[1;3mThe available CSV files are `classification-dataset.csv` and `regression-dataset.csv`.[0m

[1m> Finished chain.[0m
my Agent: The available CSV files are `classification-dataset.csv` and `regression-dataset.csv`.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `get_dataset_summaries` with `{'dataset_paths': ['regression-dataset.csv']}`


[0m[38;5;200m[1;3m[{'file_name': 'regression-dataset.csv', 'column_names': ['Unnamed: 0', 'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'target'], 'data_types': {'Unnamed: 0': 'int64', 'MedInc': 'float64', 'HouseAge': 'float64', 'AveRooms': 'float64', 'AveBedrms': 'float64', 'Population': 'float64', 'AveOccup': 'float64', 'La