In [2]:
# install packages
%pip install -U langchain-ollama
%pip install langchain.prompts
%pip install langchain_community
%pip install langchain_community.llms
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.
[31mERROR: Could not find a version that satisfies the requirement langchain.prompts (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for langchain.prompts[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
Collecting langchain_community
  Using cached langchain_community-0.3.18-py3-none-any.whl.metadata (2.4 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Using cached pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Using cached httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Using cached marshmallow-3.26.1-py3-none-any.whl.meta

# Import Packages

In [3]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain.prompts import ChatPromptTemplate
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype, is_object_dtype
import os
from pydantic import BaseModel
import json
import numpy as np

# LLM

## Test

Try out to see how to run it

In [None]:
template = """Question: {question}

Answer: Let's think step by step."""

prompt = ChatPromptTemplate.from_template(template)

model = OllamaLLM(model="llama3.2")

chain = prompt | model

chain.invoke({"question": "What is LangChain?"})

## Preprocessing
- Return JSON's Headers and Description of Data

### Return JSON's Headers and Description of Data

- Tells LLM what files are available and what headers are in each file

In [6]:
def extract_headers(file_path):
    """
    Extract headers from a JSON file.

    Parameters:
        file_path (str): The path to the JSON file.

    Returns:
        list: List of headers from the JSON file.
    """
    with open(file_path, "r") as f:
        data = json.load(f)

    headers = []

    if isinstance(data, dict):
        # For a dictionary, get the keys
        headers = list(data.keys())
    elif isinstance(data, list) and data:
        first_item = data[0]
        if isinstance(first_item, dict):
            # For a list of dictionaries, get the keys from the first item
            headers = list(first_item.keys())

    return headers


def recursive_json_schema_extractor(directory):
    """
    Recursively walks through the given directory and extracts headers
    from all JSON files found.

    Parameters:
        directory (str): The root directory to start the recursive search.

    Returns:
        dict: A dictionary mapping JSON file paths to their header lists
    """
    schemas = {}
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if os.path.isdir(full_path):
            # Recursively process subdirectories
            schemas.update(recursive_json_schema_extractor(full_path))
        elif entry.lower().endswith(".json"):
            try:
                headers = extract_headers(full_path)
                schemas[full_path] = headers
            except Exception as e:
                print(f"Error processing file {full_path}: {e}")
    return schemas


# Extract headers from all JSON files
data_dir = "./data"
json_schemas = recursive_json_schema_extractor(data_dir)

# Print the results
for file_path, headers in json_schemas.items():
    print(f"\nFile: {file_path}")
    print("Headers:")
    for header in headers:
        print(f"  {header}")
    print("-" * 40)

FileNotFoundError: [Errno 2] No such file or directory: './data'

## Query Classifier

- Decides which function to call based on the user's query

In [7]:
# Pydantic model for query classification
class QueryType(BaseModel):
    original_query: str
    query_type: str  # Can be "description", "report", or "chart"


# Template for query classification
template = """Classify the following query into one of these categories:
- description: Queries asking for specific details or explanations about certain aspects
- report: Queries requesting comprehensive analysis of all aspects of the dataset
- chart: Queries specifically requesting visual representation or graphs

Query: {query}

Respond only in this exact format:
original_query: [the original query]
query_type: [description/report/chart]"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2")


# Function to parse LLM response into QueryType
def parse_llm_response(response: str) -> QueryType:
    lines = response.strip().split("\n")
    parsed = {}
    for line in lines:
        key, value = line.split(": ")
        parsed[key] = value.strip()

    return QueryType(**parsed)


# Create the chain with structured output
chain = prompt | model | parse_llm_response


# Example usage
def classify_query(user_query: str) -> QueryType:
    result = chain.invoke({"query": user_query})
    return result


# Test examples
test_queries = [
    "Show me a graph of sales over time",
    "What is the average age of our customers?",
    "Give me a complete analysis of our customer data",
]

for query in test_queries:
    result = classify_query(query)
    print(f"\nQuery: {result.original_query}")
    print(f"Type: {result.query_type}")


Query: Show me a graph of sales over time
Type: chart

Query: What is the average age of our customers?
Type: description

Query: Give me a complete analysis of our customer data
Type: report


## Visualisation Creator

### Select the file and columns to create the visualization

In [None]:
# Pydantic model for chart information
class ChartInfo(BaseModel):
    file_name: str
    x_axis: str
    y_axis: str


# Format json_schemas into a readable string for the prompt (without type info)
def format_schemas_for_prompt(schemas):
    formatted_str = ""
    for file_path, headers in schemas.items():
        file_name = os.path.basename(file_path)  # Get just the filename without path
        # Assume headers is now a list; join just the header names.
        headers_str = ", ".join(headers)
        formatted_str += f"{file_name}: [{headers_str}]\n"
    return formatted_str


# Template that includes instructions for the LLM
template = """Given the following JSON file headers, determine the most appropriate file and columns to create the visualization.

Available JSON files and their headers:
{json_headers}

Chart request: {query}

Respond only with the following information in this exact format:
file_name: [selected json file name]
x_axis: [column name for x-axis]
y_axis: [column name for y-axis]"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2")


# Function to parse LLM output into ChartInfo
def parse_llm_response(response: str) -> ChartInfo:
    lines = response.strip().split("\n")
    parsed = {}
    for line in lines:
        key, value = line.split(": ")
        parsed[key] = value.strip()

    return ChartInfo(**parsed)


# Create the chain with structured output
chain = prompt | model | parse_llm_response

# Example usage with json_schemas:
formatted_headers = format_schemas_for_prompt(json_schemas)
result = chain.invoke(
    {"json_headers": formatted_headers, "query": "Show me the revenue trends over time"}
)

print("Selected file:", result.file_name)
print("X-axis:", result.x_axis)
print("Y-axis:", result.y_axis)

### Dataframe for selected file

In [7]:
df = pd.read_json(f"./Data/{result.file_name}")

### Code for Chart Generator

1. Properly imports all necessary data type checking functions from pandas
2. Handles different data type combinations:
    - Time series data (both datetime and string-based time columns)
    - Numeric vs numeric (scatter plots)
    - Categorical vs numeric (box plots)
    - Categorical vs categorical (heatmaps)
3. Includes automatic handling of:
    - Label rotation for better readability
    - Layout adjustments to prevent cutoff
    - Proper sorting for time series data


In [8]:
def generate_graph(df: pd.DataFrame, x_col: str, y_col: str) -> None:
    """
    Generate an appropriate visualization based on the data types of input columns.

    Parameters:
        df (pd.DataFrame): Input dataframe
        x_col (str): Column name for x-axis
        y_col (str): Column name for y-axis
    """
    # Set figure size and style
    plt.figure(figsize=(12, 6))
    sns.set_style("whitegrid")

    # Determine data types
    x_is_datetime = is_datetime64_any_dtype(df[x_col])
    x_is_numeric = is_numeric_dtype(df[x_col])
    x_is_categorical = is_object_dtype(df[x_col])

    y_is_datetime = is_datetime64_any_dtype(df[y_col])
    y_is_numeric = is_numeric_dtype(df[y_col])
    y_is_categorical = is_object_dtype(df[y_col])

    # Time series plot
    if x_is_datetime and y_is_numeric:
        sns.lineplot(data=df, x=x_col, y=y_col)
        plt.xticks(rotation=90)

    # Scatter plot for numeric vs numeric
    elif x_is_numeric and y_is_numeric:
        sns.scatterplot(data=df, x=x_col, y=y_col)

    # Box plot for categorical vs numeric
    elif x_is_categorical and y_is_numeric:
        sns.boxplot(data=df, x=x_col, y=y_col)
        plt.xticks(rotation=90)

    # Bar plot for categorical vs numeric (alternative to box plot)
    elif y_is_categorical and x_is_numeric:
        sns.barplot(data=df, x=x_col, y=y_col)
        plt.xticks(rotation=90)

    # Heatmap for categorical vs categorical
    elif x_is_categorical and y_is_categorical:
        # Create contingency table
        contingency = pd.crosstab(df[x_col], df[y_col])
        sns.heatmap(contingency, annot=True, fmt="d", cmap="YlOrRd")
        plt.xticks(rotation=90)

    else:
        raise ValueError("Unsupported combination of data types")

    # Add labels and adjust layout
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.title(f"{y_col} vs {x_col}")
    plt.tight_layout()  # Prevent label cutoff

    # Show the plot
    plt.show()

### Plot Chart

In [None]:
generate_graph(df, result.x_axis, result.y_axis)

## Description Generator

In [None]:
# Pydantic model for column selection
class ColumnSelections(BaseModel):
    selections: list[tuple[str, str]]  # List of (filename, column) pairs


# Template for column selection
template = """Given the following JSON file headers and their data types, determine which columns would be relevant to answer the query.

Available JSON files and their headers (with types):
{json_headers}

Query: {query}

Respond only with a list of [filename, column] pairs, one per line, in this exact format:
[file1.json, column1]
[file1.json, column2]
[file2.json, column3]
...etc

Each pair should be unique and relevant to the query."""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2")


# Function to parse LLM output into ColumnSelections
def parse_llm_response(response: str) -> ColumnSelections:
    lines = response.strip().split("\n")
    selections = []
    seen_pairs = set()  # To ensure uniqueness

    for line in lines:
        # Remove brackets and split by comma
        clean_line = line.strip("[]").split(",")
        if len(clean_line) == 2:
            filename = clean_line[0].strip()
            column = clean_line[1].strip()
            pair = (filename, column)

            # Only add if we haven't seen this combination before
            if pair not in seen_pairs:
                selections.append(pair)
                seen_pairs.add(pair)

    return ColumnSelections(selections=selections)


# Create the chain with structured output
chain = prompt | model | parse_llm_response

# Example usage with json_schemas:
formatted_headers = format_schemas_for_prompt(json_schemas)
result = chain.invoke(
    {
        "json_headers": formatted_headers,
        "query": "What information do we have about customer purchases and their demographics?",
    }
)

# Print results
print("\nRelevant columns:")
for filename, column in result.selections:
    print(f"- {filename}: {column}")

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """
    Custom JSON Encoder that converts numpy data types
    into native Python types so they can be serialized.
    """

    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        elif isinstance(obj, np.floating):
            return float(obj)
        elif isinstance(obj, np.ndarray):
            return obj.tolist()
        return super(NumpyEncoder, self).default(obj)


def collect_statistics_report(result: ColumnSelections) -> list:
    """
    Collects comprehensive statistical report data for selected columns,
    returning a JSON-serializable list of statistics per (filename, column) pair.

    Parameters:
        result (ColumnSelections): The Pydantic model containing the list of (filename, column) pairs.

    Returns:
        list: A list of dictionaries with detailed statistics for each selected column.
    """
    report = []
    for filename, column in result.selections:
        try:
            # Read the JSON file
            df = pd.read_json(f"./Data/{filename}")

            if pd.api.types.is_numeric_dtype(df[column]):
                stats = {
                    "Count": df[column].count(),
                    "Missing Values": df[column].isnull().sum(),
                    "Mean": df[column].mean(),
                    "Median": df[column].median(),
                    "Mode": (
                        df[column].mode().iloc[0]
                        if not df[column].mode().empty
                        else None
                    ),
                    "Std Dev": df[column].std(),
                    "Min": df[column].min(),
                    "Max": df[column].max(),
                    "Q1 (25th percentile)": df[column].quantile(0.25),
                    "Q3 (75th percentile)": df[column].quantile(0.75),
                    "IQR": df[column].quantile(0.75) - df[column].quantile(0.25),
                }
            else:
                # For categorical columns, calculate unique count, missing values, and frequency counts.
                value_counts = df[column].value_counts().to_dict()
                stats = {
                    "Unique Values": df[column].nunique(),
                    "Missing Values": df[column].isnull().sum(),
                    "Value Frequencies": value_counts,
                }
        except Exception as e:
            stats = {"error": str(e)}

        report.append({"filename": filename, "column": column, "statistics": stats})
    return report


# Example usage: Store the statistics in a JSON-serializable data structure.
stats_report = collect_statistics_report(result)

# Convert the report to a JSON string using the custom NumpyEncoder.
stats_report_json = json.dumps(stats_report, indent=4, cls=NumpyEncoder)
print(stats_report_json)

In [None]:
template = """Based on the following JSON statistics report for various columns, generate a comprehensive written summary of the findings.

JSON Statistics Report:
{stats_report}

Please provide a detailed summary including key insights, trends, and any anomalies in the data.
"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2")

chain = prompt | model
final_report = chain.invoke({"stats_report": stats_report_json})

print(final_report)

## Report Generator

- With the query this is what it does:
    1. It will brainstorm what columns are relevant to the query
    2. It will generate a list of queries that are relevant to the report requested
    3. It will generate a report based on the queries and the data using the previous 2 functions
    4. It will return the report

In [16]:
# Pydantic model for query generation
class QueryList(BaseModel):
    queries: list[tuple[str, str]]


# Format json_schemas into a readable string for the prompt (without type info)
def format_schemas_for_prompt(schemas):
    print("\n[DEBUG] Formatting schemas:")
    formatted_str = ""
    for file_path, headers in schemas.items():
        file_name = os.path.basename(file_path)
        headers_str = ", ".join(headers)
        formatted_str += f"{file_name}: [{headers_str}]\n"
        print(f"- Processing file: {file_name}")
        print(f"  Headers: {headers_str}")
    return formatted_str


# Template for generating sub-queries
template = """Given the following JSON file headers and a user query, generate a list of specific sub-queries that would help analyze the data comprehensively.

Available JSON files and their headers:
{json_headers}

User Query: {query}

Generate a list of sub-queries, each either requesting a chart or asking for descriptions.
Each line MUST be in this exact format (use only these two types):
[chart, "generate a chart showing correlation between X and Y"]
[description, "analyze the distribution of X"]

Rules:
- Use ONLY 'chart' or 'description' as query types
- For numeric columns, suggest charts comparing relevant pairs
- For categorical columns, suggest descriptions of distributions
- Keep queries focused and specific
- Each query should be unique and relevant to the main question
- Limit to 3-5 most relevant queries
- Do not include any additional explanatory text"""

prompt = ChatPromptTemplate.from_template(template)
model = OllamaLLM(model="llama3.2")


# Function to parse LLM output into QueryList
def parse_llm_response(response: str) -> QueryList:
    print("\n[DEBUG] Parsing LLM response:")
    print(f"Raw response:\n{response}")

    lines = response.strip().split("\n")
    queries = []
    seen_queries = set()

    print("\nProcessing lines:")
    for line in lines:
        try:
            clean_line = line.strip("[]")
            query_type, query_text = clean_line.split(",", 1)
            query_type = query_type.strip().lower()
            query_text = query_text.strip(' "')

            print(f"- Line: {line}")
            print(f"  Type: {query_type}")
            print(f"  Text: {query_text}")

            if query_type not in ["chart", "description"]:
                print("  ⚠️ Invalid query type - skipping")
                continue

            query_pair = (query_type, query_text)
            if query_pair not in seen_queries:
                queries.append(query_pair)
                seen_queries.add(query_pair)
                print("  ✓ Added to queries")
            else:
                print("  ⚠️ Duplicate query - skipping")
        except ValueError:
            print(f"  ⚠️ Error parsing line: {line}")
            continue

    print(f"\nTotal queries parsed: {len(queries)}")
    return QueryList(queries=queries)


# Create the chain with structured output
chain = prompt | model | parse_llm_response


# Example usage
def generate_analysis_queries(user_query: str, json_schemas: dict) -> QueryList:
    print("\n[DEBUG] Generating analysis queries:")
    print(f"User query: {user_query}")

    formatted_headers = format_schemas_for_prompt(json_schemas)
    print(f"\nFormatted headers:\n{formatted_headers}")

    print("\nInvoking LLM chain...")
    result = chain.invoke({"json_headers": formatted_headers, "query": user_query})

    return result


# Test example
test_query = "Generate a report"
result = generate_analysis_queries(test_query, json_schemas)

# Print results
print("\nGenerated Analysis Queries:")
for query_type, query in result.queries:
    print(f"[{query_type}] {query}")


[DEBUG] Generating analysis queries:
User query: Generate a report

[DEBUG] Formatting schemas:
- Processing file: Adjusted_Ad_Campaign_Performance_Data.json
  Headers: Time, Campaign_ID, Age_group, Channel_Name, Spending, Number_of_Views, Number_of_Leads
- Processing file: Banking_KPI_Data.json
  Headers: Time, Number_of_New_Accounts, Total_Base_Mn

Formatted headers:
Adjusted_Ad_Campaign_Performance_Data.json: [Time, Campaign_ID, Age_group, Channel_Name, Spending, Number_of_Views, Number_of_Leads]
Banking_KPI_Data.json: [Time, Number_of_New_Accounts, Total_Base_Mn]


Invoking LLM chain...

[DEBUG] Parsing LLM response:
Raw response:
Here are some sub-queries that would help analyze the data comprehensively:

[chart, "generate a chart showing correlation between Spending and Number_of_Views"]
[description, "analyze the distribution of Age_group"]
[chart, "generate a bar chart comparing spending across different Channel_Name categories"]
[chart, "generate a scatter plot showing relati