In [1]:
import kuzu
import pandas as pd
import numpy as np
import json

In [2]:
purchase_frequency_data = {
    'PurchaseFrequency': [3.0, 3.5, 4.0, 4.2, 4.3, 4.4, 4.5, 4.6, 4.6, 4.5],
    'ProductSubcategory': ['Mountain Bikes', 'Road Bikes', 'Hybrid Bikes', 'Hybrid Bikes', 'BMX Bikes',
                            'Mountain Bikes', 'Road Bikes', 'Hybrid Bikes', 'Hybrid Bikes', 'BMX Bikes'],
    'Color': ['Silver', 'Black', 'Red', 'Blue', 'White', 'Silver', 'Black', 'Red', 'Blue', 'White'],
    'SalesDate': ['2023-11-05', '2023-07-21', '2022-12-15', '2022-09-30', '2022-06-14', '2021-08-09', '2021-04-18', 
                   '2020-11-25', '2020-07-30', '2020-03-08']
}

df_PurchaseFrequency = pd.DataFrame(purchase_frequency_data)

churn_risk_score_data = {
    'ChurnRiskScore': [0.15, 0.14, 0.13, 0.12, 0.11, 0.11, 0.12, 0.13, 0.12, 0.12],
    'Region': ['Northwest', 'Northeast', 'Midwest', 'South', 'West', 'Northwest', 'Northeast', 'Midwest', 'South', 'Northwest'],
    'Segment': ['Individual', 'Corporate', 'Individual', 'Individual', 'Individual', 'Corporate', 'Individual', 
                'Corporate', 'Individual', 'Individual']
}

df_ChurnRiskScore = pd.DataFrame(churn_risk_score_data)

cltv_data = {
    'CLTV': [1200.0, 1250.0, 1300.0, 1350.0, 1400.0, 1450.0, 1475.0, 1490.0, 1495.0, 1500.0],
    'Segment': ['Individual', 'Corporate', 'Individual', 'Corporate', 'Corporate', 'Corporate', 'Corporate', 
                'Corporate', 'Individual', 'Corporate'],
    'IncomeBracket': ['Low', 'Low-Medium', 'Medium', 'Medium-High', 'High', 'Medium-High', 'Low-Medium', 'Medium', 'Medium-High', 'Medium-High']
}

df_CLTV = pd.DataFrame(cltv_data)
aov_data = {
    'AOV': [100.0, 105.0, 110.0, 115.0, 120.0, 125.0, 130.0, 132.0, 134.0, 135.8],
    'AgeGroup': ['18-24', '25-34', '35-44', '45-54', '55+', '18-24', '25-34', '35-44', '45-54', '55+'],
    'PaymentType': ['Credit Card', 'Cash', 'Check', 'Online', 'Credit Card', 'Online', 'Online', 'Online', 'Credit Card', 'Cash'],
    'IncomeBracket': ['Low', 'Low-Medium', 'Medium', 'Medium-High', 'High', 'Low', 'Low-Medium', 'Medium', 'Medium-High', 'High']
}

df_AOV = pd.DataFrame(aov_data)
revenue_data = {
    'Revenue': [450000.0, 460000.0, 470000.0, 480000.0, 490000.0, 495000.0, 500000.0, 510000.0, 520000.0, 530000.0],
    'Region': ['Northwest', 'Northeast', 'Northwest', 'Northwest', 'West', 'Northwest', 'Northeast', 'Midwest', 'South', 'West'],
    'Segment': ['Individual', 'Corporate', 'Individual', 'Corporate', 'Individual', 'Corporate', 'Individual', 
                'Corporate', 'Individual', 'Corporate'],
    'ProductCategory': ['Bikes', 'Components', 'Bikes', 'Accessories', 'Helmets', 'Bikes', 'Components', 
                         'Clothing', 'Bikes', 'Helmets']
}

df_Revenue = pd.DataFrame(revenue_data)

In [3]:
from langchain_kuzu.graphs.kuzu_graph import KuzuGraph
from langchain_kuzu.chains.graph_qa.kuzu import KuzuQAChain
from langchain_groq import ChatGroq
from langchain.agents import AgentType, initialize_agent
from langchain.tools import Tool
from langgraph.prebuilt import create_react_agent


In [4]:
# Initialize database
db = kuzu.Database("./knowledge_graph")
conn = kuzu.Connection(db)
graph = KuzuGraph(db, allow_dangerous_requests=True)

In [12]:
#LLM instance
llm_groq=ChatGroq(
    model="llama3-70b-8192", 
    temperature=0.3, 
    api_key="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"
    )

In [13]:
#Kuzu Tool
qa_chain = KuzuQAChain.from_llm(
    llm=llm_groq,
    graph=graph,
    verbose=True,
    allow_dangerous_requests=True
)

kuzu_tool = Tool(
    name="Kuzu Query Tool",
    description="A tool for querying the Kùzu graph database.",
    func=qa_chain.run,
    )


In [14]:
#Correlation tool
def calculate_correlation(metric, dimensions):
    mapping = {
    'Revenue': df_Revenue,
    'AOV': df_AOV,
    'CLTV':df_CLTV,
    'ChurnRiskScore':df_ChurnRiskScore,
    'PurchaseFrequency':df_PurchaseFrequency
    }
    if metric not in mapping:
        raise ValueError(f"Metric '{metric}' not found in the mapping.")
    
    df = mapping[metric]
    
    selected_columns = [metric] + [dim for dim in dimensions if dim in df.columns]
    
    if len(selected_columns) < 2:
        raise ValueError("Not enough valid columns found in the DataFrame for correlation calculation.")
    
    df_selected = df[selected_columns].copy()
    
    # Encoding non-numeric columns
    for col in df_selected.columns:
        if not pd.api.types.is_numeric_dtype(df_selected[col]):
            df_selected[col] = df_selected[col].astype('category').cat.codes
    
    correlation_matrix = df_selected.corr()
    correlation_matrix.drop([metric],axis=1,inplace=True)
    return correlation_matrix.loc[metric]

def calculate_correlation_tool(input_str):
    """
    Expects a JSON string with keys 'metric' and 'dimensions'. For example:
    {"metric": "Revenue", "dimensions": ["Region", "Segment", "ProductCategory"]}
    """
    try:
        data = json.loads(input_str)
        if isinstance(data, dict) and "__arg1" in data:
            inner_input = data["__arg1"]
            # If inner_input is a string, try parsing it as JSON
            if isinstance(inner_input, str):
                data = json.loads(inner_input)
            else:
                data = inner_input
        metric = data.get("metric")
        dimensions = data.get("dimensions")
        if not metric or not dimensions:
            return "Error: Input must contain both 'metric' and 'dimensions'."
        
        result = calculate_correlation(metric, dimensions)
        return result.to_json()
    except Exception as e:
        return f"Error: {str(e)}"

#print(calculate_correlation_tool())
    
correlation_tool = Tool(
    name="calculate_correlation_tool",
    description=(
        "A tool for calculating the correlation between a given metric and dimensions. "
        "Input must be a JSON string with keys 'metric' and 'dimensions'. For example:"
        '{"metric":, "dimensions": []}'
    ),
    func=calculate_correlation_tool
    )


In [15]:
#REACT AGENT
prompt = (
    "You are an AI analyst specializing in data analytics for metrics and dimensions. Your goal is to provide clear, concise answers to the user queries. "
    "When answering questions, you may internally use various tools to retrieve or analyze data, but you must not mention the names of these tools in your final answer. "
    "Instead, simply provide the insights and data that directly address the user's query. \n\n"
    
    "Your job is to answer queries by choosing the most appropriate tool:\n\n"
    
    "1. Use the 'Kuzu Query Tool' for queries that request data retrieval or general insights about changes in metrics. "
    "For example, if the query asks for 'changes in metrics' or 'what has happened', use the Kùzu Query Tool.\n\n"
    
    "2. Use the 'Correlation Tool' only when the query explicitly asks for the cause of a change in a metric, or for a correlation or relationship analysis between a metric and dimensions. "
    "In these cases, first retrieve the relevant dimensions that drive the metric and are associated with it from the graph (using the Kùzu Query Tool) and then pass these dimensions along with the metric to the Correlation Tool. "
    "When calling the Correlation Tool, provide a JSON object with keys 'metric' and 'dimensions'. For example: "
    '{"metric": "Revenue", "dimensions": ["Region", "Segment", "ProductCategory"]}.\n\n'
    
    "Always think step by step and decide which tool best suits the user's query. "
    "If the query does not explicitly request a correlation analysis, do not call the Correlation Tool."
)



agent = create_react_agent(
    model=llm_groq,
    tools=[kuzu_tool,correlation_tool],
    prompt=prompt
)

In [16]:
agent.invoke({"messages": [("user","give the significant changes in metrics for user 1")]})



[1m> Entering new KuzuQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (u:User {user_id: '1'})-[:Associated]->(m:Metrics) RETURN m.metric_id, m.metric_name, m.cur_value, m.change ORDER BY m.change DESC;[0m
Full Context:
[32;1m[1;3m[{'m.metric_id': 'Revenue', 'm.metric_name': 'Revenue', 'm.cur_value': 500000.0, 'm.change': 10000.0}, {'m.metric_id': 'AOV', 'm.metric_name': 'AOV', 'm.cur_value': 135.8, 'm.change': 12.8}, {'m.metric_id': 'CLTV', 'm.metric_name': 'CLTV', 'm.cur_value': 1500.0, 'm.change': 5.0}, {'m.metric_id': 'PurchaseFrequency', 'm.metric_name': 'PurchaseFrequency', 'm.cur_value': 4.5, 'm.change': 2.2}, {'m.metric_id': 'ChurnRiskScore', 'm.metric_name': 'ChurnRiskScore', 'm.cur_value': 0.12, 'm.change': -1.5}][0m

[1m> Finished chain.[0m


{'messages': [HumanMessage(content='give the significant changes in metrics for user 1', additional_kwargs={}, response_metadata={}, id='4f0fada3-52d3-44b8-bddb-271232562ce9'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_ctq0', 'function': {'arguments': '{"__arg1":"changes in metrics for user 1"}', 'name': 'Kuzu Query Tool'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 75, 'prompt_tokens': 1373, 'total_tokens': 1448, 'completion_time': 0.235860983, 'prompt_time': 0.163460015, 'queue_time': 0.06277755300000001, 'total_time': 0.399320998}, 'model_name': 'llama3-70b-8192', 'system_fingerprint': 'fp_2f30b0b571', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-c95eddb2-af2c-409f-8d69-f47ff543238c-0', tool_calls=[{'name': 'Kuzu Query Tool', 'args': {'__arg1': 'changes in metrics for user 1'}, 'id': 'call_ctq0', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1373, 'output_tokens': 75, 'total_tokens': 1448}),
  Too

In [17]:
agent.invoke({"messages": [("user","which dimension caused the change in metric AOV"),
                           ]})



[1m> Entering new KuzuQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (u:User)-[:Associated]->(m:Metrics) WHERE LOWER(u.user_name) = 'aov' RETURN m.metric_name, m.cur_value, m.change, m.trend, m.goal[0m
Full Context:
[32;1m[1;3m[][0m

[1m> Finished chain.[0m


[1m> Entering new KuzuQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (u:User {user_name: LOWER("AOV"}))-[:Associated]->(m:Metrics)-[:Impacts|Drives]-(d:Dimensions) RETURN d.dimension_name AS dimension_name, d.dimension_description AS dimension_description ORDER BY d.dimension_name ASC[0m


[1m> Entering new KuzuQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (d:Dimensions)-[:Drives]->(m:Metrics) WHERE LOWER(m.metric_name) = 'aov' RETURN d.dimension_name, m.cur_value[0m
Full Context:
[32;1m[1;3m[{'d.dimension_name': 'PaymentType', 'm.cur_value': 135.8}, {'d.dimension_name': 'AgeGroup', 'm.cur_value': 135.8}, {'d.dimension_name': 'IncomeBracket', 'm.cur_value': 135.8}][0m

[1m> Finish

{'messages': [HumanMessage(content='which dimension caused the change in metric AOV', additional_kwargs={}, response_metadata={}, id='6c94e7f7-d71a-491b-b217-93f75de04444'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_gejj', 'function': {'arguments': '{"__arg1":"AOV"}', 'name': 'Kuzu Query Tool'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 48, 'prompt_tokens': 1372, 'total_tokens': 1420, 'completion_time': 0.145856778, 'prompt_time': 0.107954638, 'queue_time': 0.43370683299999996, 'total_time': 0.253811416}, 'model_name': 'llama3-70b-8192', 'system_fingerprint': 'fp_2f30b0b571', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-81ae57b6-6cf6-4580-b9f5-65ec981b896d-0', tool_calls=[{'name': 'Kuzu Query Tool', 'args': {'__arg1': 'AOV'}, 'id': 'call_gejj', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1372, 'output_tokens': 48, 'total_tokens': 1420}),
  ToolMessage(content="I don't know the answer.", name='Kuzu