In [1]:
import kuzu
import pandas as pd
import numpy as np
import json

In [19]:
df_PurchaseFrequency = pd.read_csv("PurchaseFrequency.csv")

df_ChurnRiskScore = pd.read_csv("ChurnRiskScore.csv")

df_CLTV = pd.read_csv("CLTV.csv")

df_AOV = pd.read_csv("AOV.csv")

df_Revenue = pd.read_csv("revenue.csv")

In [3]:
from langchain_kuzu.graphs.kuzu_graph import KuzuGraph
from langchain_kuzu.chains.graph_qa.kuzu import KuzuQAChain
from langchain_groq import ChatGroq
from langchain.agents import AgentType, initialize_agent
from langchain.tools import Tool
from langgraph.prebuilt import create_react_agent


In [5]:
# Initialize database
db = kuzu.Database("./knowledge_graph")
conn = kuzu.Connection(db)
graph = KuzuGraph(db, allow_dangerous_requests=True)

In [6]:
#LLM instance
llm_groq=ChatGroq(
    model="llama3-70b-8192", 
    temperature=0.3, 
    api_key="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"
    )

In [7]:
#Kuzu Tool
qa_chain = KuzuQAChain.from_llm(
    llm=llm_groq,
    graph=graph,
    verbose=True,
    allow_dangerous_requests=True
)

kuzu_tool = Tool(
    name="Kuzu Query Tool",
    description="A tool for querying the Kùzu graph database.",
    func=qa_chain.run,
    )


In [15]:
# #Correlation tool
# def calculate_correlation(metric, dimensions):
#     mapping = {
#     'Revenue': df_Revenue,
#     'AOV': df_AOV,
#     'CLTV':df_CLTV,
#     'ChurnRiskScore':df_ChurnRiskScore,
#     'PurchaseFrequency':df_PurchaseFrequency
#     }
#     if metric not in mapping:
#         raise ValueError(f"Metric '{metric}' not found in the mapping.")
    
#     df = mapping[metric]
    
#     selected_columns = [metric] + [dim for dim in dimensions if dim in df.columns]
    
#     if len(selected_columns) < 2:
#         raise ValueError("Not enough valid columns found in the DataFrame for correlation calculation.")
    
#     df_selected = df[selected_columns].copy()
    
#     # Encoding non-numeric columns
#     for col in df_selected.columns:
#         if not pd.api.types.is_numeric_dtype(df_selected[col]):
#             df_selected[col] = df_selected[col].astype('category').cat.codes
    
#     correlation_matrix = df_selected.corr()
#     correlation_matrix.drop([metric],axis=1,inplace=True)
#     return correlation_matrix.loc[metric]

# def calculate_correlation_tool(input_str):
#     """
#     Expects a JSON string with keys 'metric' and 'dimensions'. For example:
#     {"metric": "Revenue", "dimensions": ["Region", "Segment", "ProductCategory"]}
#     """
#     try:
#         data = json.loads(input_str)
#         if isinstance(data, dict) and "__arg1" in data:
#             inner_input = data["__arg1"]
#             # If inner_input is a string, try parsing it as JSON
#             if isinstance(inner_input, str):
#                 data = json.loads(inner_input)
#             else:
#                 data = inner_input
#         metric = data.get("metric")
#         dimensions = data.get("dimensions")
#         if not metric or not dimensions:
#             return "Error: Input must contain both 'metric' and 'dimensions'."
        
#         result = calculate_correlation(metric, dimensions)
#         return result.to_json()
#     except Exception as e:
#         return f"Error: {str(e)}"
    
# c_tool = Tool(
    # name="calculate_correlation_tool",
    # description=(
    #     "A tool for calculating the correlation between a given metric and dimensions. "
    #     "Input must be a JSON string with keys 'metric' and 'dimensions'. For example:"
    #     '{"metric":, "dimensions": []}'
    # ),
    # func=calculate_correlation_tool
    # )


In [58]:
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from scipy.stats import pearsonr
from scipy.spatial.distance import euclidean
from dtaidistance import dtw
import pandas as pd
import json

def analyze_similarity_causality(input_str):
    """Analyzes similarity and causality for a given metric from time-series data."""

    # Parse JSON input safely
    try:
        data = json.loads(input_str)
        if isinstance(data, dict) and "__arg1" in data:
            data = json.loads(data["__arg1"])
    except json.JSONDecodeError:
        return {"error": "Invalid JSON input."}

    metric = data.get("metric")
    
    # Mapping of available datasets
    mapping = {
        'Revenue': df_Revenue,
        'AOV': df_AOV,
        'CLTV': df_CLTV,
        'ChurnRiskScore': df_ChurnRiskScore,
        'PurchaseFrequency': df_PurchaseFrequency
    }
    
    # Validate metric input
    if metric not in mapping:
        return {"error": f"Metric '{metric}' not found in dataset mapping."}
    
    df_used = mapping[metric]

    def check_stationarity(df_pair):
        """Check stationarity using ADF Test and apply differencing if needed."""
        try:
            res1 = adfuller(df_pair.iloc[:, 0])  
            res2 = adfuller(df_pair.iloc[:, 1])
        except Exception:
            return df_pair  # Return original if ADF fails

        if res1[1] > 0.05 or res2[1] > 0.05:
            return df_pair.diff().dropna()  # Apply differencing if needed
        return df_pair  

    cols = [col for col in df_used.columns if col not in [metric, 'day']]
    
    results = []
    for col in cols:
        df_pair = df_used[[metric, col]].dropna()
        df_pair = check_stationarity(df_pair) 
        corr,corr_p_value= pearsonr(df_pair.iloc[:, 0], df_pair.iloc[:, 1])  # Pearson Correlation


        # Compute Granger Causality
        max_lags = 3
        granger_results = grangercausalitytests(df_pair, max_lags, verbose=False)
        granger_p_values = [granger_results[lag][0]['ssr_chi2test'][1] for lag in range(1, max_lags + 1)]
        
        best_granger_p = min(granger_p_values)

        # Store results
        results.append({
            'Variable': col,
            'Pearson Correlation': corr,
            'Best Granger p-value': best_granger_p
        })

    df_results = pd.DataFrame(results)

            # # Compute Similarity Metrics
            # try:
            #     corr, corr_p_value = pearsonr(df_pair.iloc[:, 0], df_pair.iloc[:, 1])  
            #     dtw_distance = dtw.distance(df_pair.iloc[:, 0], df_pair.iloc[:, 1])  
            #     euclidean_distance = euclidean(df_pair.iloc[:, 0], df_pair.iloc[:, 1])  
            # except Exception:
            #     print("not done similarity")
            #     continue  # Skip variable if similarity calculation fails

            # # Compute Granger Causality (safely extract p-values)
            # max_lags = 3
            # granger_p_values = []
            
            # try:
            #     granger_results = grangercausalitytests(df_pair, max_lags, verbose=False)
            #     for lag in range(1, max_lags + 1):
            #         if lag in granger_results and 'ssr_chi2test' in granger_results[lag][0]:
            #             p_value = granger_results[lag][0]['ssr_chi2test'][1]
            #             granger_p_values.append(p_value)
            # except Exception:
            #     print("not done similarity")

            # best_granger_p = min(granger_p_values) if granger_p_values else 1.0  # Handle missing values

            # Store results
        #     results.append({
        #         'Variable': col,
        #         'Pearson Correlation': corr,
        #         'Correlation p-value': corr_p_value,
        #         'DTW Distance': dtw_distance,
        #         'Euclidean Distance': euclidean_distance,
        #         'Best Granger p-value': best_granger_p
        #     })

        # df_results = pd.DataFrame(results)

    #Extract Top 2 variables by Pearson Correlation & Granger Causality
    top_pearson = df_results.nlargest(2, 'Pearson Correlation')[['Variable', 'Pearson Correlation']]
    top_granger = df_results.nsmallest(2, 'Best Granger p-value')[['Variable', 'Best Granger p-value']]
    
    return {
        "Top 2 Pearson Correlation Variables": top_pearson.to_dict(orient='records'),
        "Top 2 Granger Causality Variables": top_granger.to_dict(orient='records')
    }


In [None]:
sample_input = json.dumps({"metric": "Revenue"})  
output = analyze_similarity_causality(sample_input)  
print(output)  # Displays the results

In [59]:
sample_input = json.dumps({"metric": "AOV"})  
output = analyze_similarity_causality(sample_input)  
print(output)  # Displays the results



TypeError: Column 'Pearson Correlation' has dtype object, cannot use method 'nlargest' with this dtype

In [9]:
analyze_similarity_causality_tool = Tool(
    name="analyze_similarity_causality_tool",
    description=(
        "A tool for calculating the attributing change in a metric to certain dimension based in the pearson relation coefficient and granger p value "
        "Input must be a JSON string with keys 'metric'. For example:"
        '{"metric"}'
    ),
    func=analyze_similarity_causality
    )

In [10]:
#REACT AGENT
prompt = (
    "You are an AI analyst specializing in data analytics for metrics and dimensions. Your goal is to provide clear, concise answers to the user queries. "
    "When answering questions, you may internally use various tools to retrieve or analyze data, but you must not mention the names of these tools in your final answer. "
    "Instead, simply provide the insights and data that directly address the user's query. \n\n"
    
    "Your job is to answer queries by choosing the most appropriate tool:\n\n"
    
    "1. Use the 'Kuzu Query Tool' for queries that request data retrieval or general insights about changes in metrics from the graph. "
    "For example, if the query asks for 'changes in metrics for a certain' or 'what has happened', use the Kùzu Query Tool.\n\n"
    
    "2. Use the 'analyze_similarity_causality_tool' only when the query explicitly asks for the cause of a change in a metric.It is used to determine which variable caused the change in the metric. "
    "In these cases,pass the metric to the analyze_similarity_causality_tool do not call the Kuzu Query Tool. "
    "When calling the analyze_similarity_causality_tool, provide a JSON object with key 'metric'. For example: "
    '{"metric": "Revenue"}.\n\n'
    "It will return the a dictionary with a top 2 variables based on pearson correlation coeeficient and granger p_value"
    "The higher coefficient the value the stronger the correlation, the smaller the p_value the more the causality"
    "Always think step by step and decide which tool best suits the user's query. "
    "If the query does not explicitly request a cause or relationship analysis, do not call the analyze_similarity_causality_tool."
)



agent = create_react_agent(
    model=llm_groq,
    tools=[kuzu_tool,analyze_similarity_causality_tool],
    prompt=prompt
)

In [71]:
agent.invoke({"messages": [("user","give the significant changes in metrics for user 1")]})



[1m> Entering new KuzuQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (u:User {user_id: '1'})-[:Associated]->(m:Metrics) RETURN m.metric_id, m.metric_name, m.cur_value, m.change ORDER BY m.change DESC;[0m
Full Context:
[32;1m[1;3m[{'m.metric_id': 'Revenue', 'm.metric_name': 'Revenue', 'm.cur_value': 500000.0, 'm.change': 10000.0}, {'m.metric_id': 'AOV', 'm.metric_name': 'AOV', 'm.cur_value': 135.8, 'm.change': 12.8}, {'m.metric_id': 'CLTV', 'm.metric_name': 'CLTV', 'm.cur_value': 1500.0, 'm.change': 5.0}, {'m.metric_id': 'PurchaseFrequency', 'm.metric_name': 'PurchaseFrequency', 'm.cur_value': 4.5, 'm.change': 2.2}, {'m.metric_id': 'ChurnRiskScore', 'm.metric_name': 'ChurnRiskScore', 'm.cur_value': 0.12, 'm.change': -1.5}][0m

[1m> Finished chain.[0m


{'messages': [HumanMessage(content='give the significant changes in metrics for user 1', additional_kwargs={}, response_metadata={}, id='8c2c8d13-4ea2-4862-9539-6f2d7d9b6a8d'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_9f5a', 'function': {'arguments': '{"__arg1":"changes in metrics for user 1"}', 'name': 'Kuzu Query Tool'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 53, 'prompt_tokens': 1403, 'total_tokens': 1456, 'completion_time': 0.156249751, 'prompt_time': 0.087662892, 'queue_time': 0.02258013199999999, 'total_time': 0.243912643}, 'model_name': 'llama3-70b-8192', 'system_fingerprint': 'fp_753a4aecf6', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-00b8a0e2-b28a-4754-9dd1-7eab77daba13-0', tool_calls=[{'name': 'Kuzu Query Tool', 'args': {'__arg1': 'changes in metrics for user 1'}, 'id': 'call_9f5a', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1403, 'output_tokens': 53, 'total_tokens': 1456}),
  Too

In [84]:
agent.invoke({"messages": [("user","what caused the change in metric Revenue"),
                           ]})


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.1990  , p=0.2969  , df_denom=11, df_num=1
ssr based chi2 test:   chi2=1.5261  , p=0.2167  , df=1
likelihood ratio test: chi2=1.4485  , p=0.2288  , df=1
parameter F test:         F=1.1990  , p=0.2969  , df_denom=11, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=2.1091  , p=0.1838  , df_denom=8, df_num=2
ssr based chi2 test:   chi2=6.8546  , p=0.0325  , df=2
likelihood ratio test: chi2=5.5053  , p=0.0638  , df=2
parameter F test:         F=2.1091  , p=0.1838  , df_denom=8, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=2.7233  , p=0.1542  , df_denom=5, df_num=3
ssr based chi2 test:   chi2=19.6075 , p=0.0002  , df=3
likelihood ratio test: chi2=11.6219 , p=0.0088  , df=3
parameter F test:         F=2.7233  , p=0.1542  , df_denom=5, df_num=3

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=0.9378  , p=0.3537  , df_denom

{'messages': [HumanMessage(content='what caused the change in metric Revenue', additional_kwargs={}, response_metadata={}, id='f40b2c38-3a3a-457e-b6ae-2eac667e5864'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_9hxh', 'function': {'arguments': '{"__arg1":"{\\"metric\\": \\"Revenue\\"}"}', 'name': 'analyze_similarity_causality_tool'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 55, 'prompt_tokens': 1401, 'total_tokens': 1456, 'completion_time': 0.161971716, 'prompt_time': 0.125951915, 'queue_time': 1.6985534070000001, 'total_time': 0.287923631}, 'model_name': 'llama3-70b-8192', 'system_fingerprint': 'fp_753a4aecf6', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-d78cf6e1-d169-4fe2-b370-6050df36e7b9-0', tool_calls=[{'name': 'analyze_similarity_causality_tool', 'args': {'__arg1': '{"metric": "Revenue"}'}, 'id': 'call_9hxh', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1401, 'output_tokens': 55, 'total_token

In [12]:
agent.invoke({"messages": [("user","what caused the change in metric AOV"),
                           ]})

  corr, corr_p_value = pearsonr(df_pair.iloc[:, 0], df_pair.iloc[:, 1])
  corr, corr_p_value = pearsonr(df_pair.iloc[:, 0], df_pair.iloc[:, 1])
  corr, corr_p_value = pearsonr(df_pair.iloc[:, 0], df_pair.iloc[:, 1])
  corr, corr_p_value = pearsonr(df_pair.iloc[:, 0], df_pair.iloc[:, 1])




[1m> Entering new KuzuQAChain chain...[0m
Generated Cypher:
[32;1m[1;3mMATCH (m:Metrics {LOWER(metric_name): 'aov'}) RETURN m.cur_value, m.change, m.trend, m.goal[0m


{'messages': [HumanMessage(content='what caused the change in metric AOV', additional_kwargs={}, response_metadata={}, id='a981e05b-5498-4ec3-8ab0-8cbe17d1e39f'),
  AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'call_nvbk', 'function': {'arguments': '{"__arg1":"{\\"metric\\": \\"AOV\\"}"}', 'name': 'analyze_similarity_causality_tool'}, 'type': 'function'}]}, response_metadata={'token_usage': {'completion_tokens': 55, 'prompt_tokens': 1402, 'total_tokens': 1457, 'completion_time': 0.19092022, 'prompt_time': 0.065710045, 'queue_time': 0.22743977799999998, 'total_time': 0.256630265}, 'model_name': 'llama3-70b-8192', 'system_fingerprint': 'fp_2e0feca3c9', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-a67c3c00-122d-4a49-a6f1-341482e2c237-0', tool_calls=[{'name': 'analyze_similarity_causality_tool', 'args': {'__arg1': '{"metric": "AOV"}'}, 'id': 'call_nvbk', 'type': 'tool_call'}], usage_metadata={'input_tokens': 1402, 'output_tokens': 55, 'total_tokens': 1457}),
