In [2]:
from langchain.agents import initialize_agent
from langchain.agents.agent_types import AgentType
from langchain.tools import Tool
from langchain.memory import ConversationBufferMemory
import pandas as pd
import numpy as np
import re
from pydantic import BaseModel, Field
from typing import Optional
from langchain.tools import BaseTool
import os
from typing import ClassVar, Type
from pydantic import BaseModel, Field
from langchain.tools import BaseTool

In [4]:
pip install -qU "langchain[groq]"

Note: you may need to restart the kernel to use updated packages.


In [18]:
import getpass
import os

if not os.environ.get("GROQ_API_KEY"):
  os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

from langchain.chat_models import init_chat_model

llm = init_chat_model("deepseek-r1-distill-llama-70b", model_provider="groq")

In [8]:
from langchain.tools import tool
import pandas as pd
from scipy.stats import spearmanr, pearsonr
import re

# Cache the dataframe so the tool can reuse it
DATAFRAME = None

def load_dataset(filepath):
    global DATAFRAME
    DATAFRAME = pd.read_csv(filepath)

# LangChain tool
@tool
def correlation_tool(query: str) -> str:
    """
    Extracts two column names from a user query and computes Pearson and Spearman correlation.
    Example query: 'Find correlation between sales and profit'
    """

    global DATAFRAME
    if DATAFRAME is None:
        return "No dataset loaded. Please load a dataset first."

    def extract_columns(query, df_columns):
        found_columns = []
        for col in df_columns:
            if re.search(rf'\b{col}\b', query, re.IGNORECASE):
                found_columns.append(col)
        return found_columns if len(found_columns) == 2 else None

    def calculate_correlations(df, col1, col2):
        x = df[col1].dropna()
        y = df[col2].dropna()
        min_len = min(len(x), len(y))
        x = x[:min_len]
        y = y[:min_len]
        pearson_corr, _ = pearsonr(x, y)
        spearman_corr, _ = spearmanr(x, y)
        return pearson_corr, spearman_corr

    columns = extract_columns(query, DATAFRAME.columns)

    if columns:
        col1, col2 = columns
        pearson_corr, spearman_corr = calculate_correlations(DATAFRAME, col1, col2)
        return (
            f"Pearson correlation between '{col1}' and '{col2}': {pearson_corr:.4f}\n"
            f"Spearman correlation between '{col1}' and '{col2}': {spearman_corr:.4f}"
        )
    else:
        return "Could not identify two valid columns from your query."


In [10]:
from langchain.agents import initialize_agent, AgentType


In [12]:
load_dataset("assignment_data.csv")  # Replace with actual path


In [14]:
tools = [correlation_tool]


In [24]:
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)


In [26]:
response = agent.run("Find correlation between orders_all_np and DisplayAll_spend")
print(response)

  response = agent.run("Find correlation between orders_all_np and DisplayAll_spend")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m<think>
Okay, so I need to find the correlation between two columns: orders_all_np and DisplayAll_spend. Hmm, I'm not entirely sure what these columns represent, but I guess orders_all_np could be the number of orders, and DisplayAll_spend might be the amount spent on displaying something, maybe ads or products. 

First, I should figure out what kind of data these columns have. Are they numerical? I hope so because correlation usually works with numbers. If they're not, I might have to convert them or find another way to measure the relationship. 

I remember there are two main types of correlation: Pearson and Spearman. Pearson is for linear relationships and assumes that the data is normally distributed. Spearman is non-parametric, so it works better when the data isn't normal or when dealing with ranks. I'm not sure which one to use yet, so maybe I'll calculate both and see which one makes more sense.

Using the correlatio