In [1]:
import requests
import json
from pathlib import Path
import re
import os
import psycopg2

In [1]:
# Start the query middleware API (used by query_database) before running the cells below: uvicorn Middleware:app --reload

# 1. Define Functions & Parameters

In [2]:
# Ollama chat endpoint. Defaults to a local Ollama server; set the
# OLLAMA_API_URL environment variable to point the notebook at another host.
OLLAMA_API_URL = os.environ.get("OLLAMA_API_URL", "http://localhost:11434/api/chat")

In [3]:
# Define initial setup function. This function only needs to be run once per conversation.
def initial_setup():
    """Start a new conversation and return its message history.

    Sends the system instruction to the Ollama chat endpoint and records the
    model's reply. The returned list is what ``generate_sql_query`` and
    ``generate_insight`` expect as ``chat_hist``.

    Returns:
        list[dict]: ``[system_message, assistant_message]`` in Ollama chat format.

    Raises:
        requests.HTTPError: if the Ollama server answers with an error status.
    """
    instruction = 'You are a friendly and helpful data analyst. The user will ask you question about data analysis, be prepared.'
    chat = [{'role': 'system', 'content': instruction}]
    payload = {
        "model": "deepseek-r1:8b",
        "messages": chat,
        "stream": False,
        # Ollama reads sampling parameters from "options"; a top-level
        # "temperature" key is silently ignored.
        "options": {"temperature": 0.0},
    }
    response = requests.post(OLLAMA_API_URL, json=payload)
    # Fail with the HTTP error instead of an opaque KeyError on 'message'.
    response.raise_for_status()
    reply = response.json()['message']['content']

    # NOTE(review): with only a system message the model has no question to
    # answer, and the recorded output of this notebook shows it replying to an
    # imagined task. That reply is kept in the history sent with later prompts;
    # consider returning just the system message instead.
    chat.append({'role': 'assistant', 'content': reply})
    return chat

In [None]:
# Define natural language query function. This function takes natural language as input and outputs SQL query.
def generate_sql_query(chat_hist, natural_language, context_path=None):
    """Ask the model to translate a natural-language request into SQL.

    Args:
        chat_hist: List of chat messages so far. On success the new user prompt
            and the model reply are appended to this same list.
        natural_language: The user's question.
        context_path: Optional path to the schema description inserted into the
            prompt. Defaults to ``<cwd>/../Context/MASTER_FUNDING.txt``.

    Returns:
        tuple: ``(chat_hist, sql_query)``. ``sql_query`` is the content of the
        first fenced ``sql`` block in the model's answer, or the string
        ``"No SQL query detected"`` when there is none.

    Raises:
        OSError: if the schema file cannot be read.
        requests.HTTPError: if the Ollama server answers with an error status.
    """
    if context_path is None:
        # Relative to the current working directory (not the notebook file):
        # go up one level, then into the Context folder.
        context_path = Path.cwd().parent / "Context" / "MASTER_FUNDING.txt"

    # Explicit encoding so the schema reads the same on every platform.
    content = Path(context_path).read_text(encoding="utf-8")

    prompt = f"""
You are an expert SQL generator. Translate the following natural language request into a valid SQL query. 
Make sure the query is properly formatted and is tailored for our database schema.
The database we are using is PostgreSQL. Do not use backtick for column names.
The table in the database is "master_funding", it uses the following schema:
{content}

Request: {natural_language}

SQL Query:
"""
    user_message = {"role": "user", "content": prompt}
    payload = {
        "model": "deepseek-r1:8b",
        # Send a copy so a failed request leaves chat_hist untouched.
        "messages": chat_hist + [user_message],
        "stream": False,
        # Ollama reads sampling parameters from "options"; a top-level
        # "temperature" key is silently ignored.
        "options": {"temperature": 0.0},
    }
    response = requests.post(OLLAMA_API_URL, json=payload)
    response.raise_for_status()
    reply = response.json()['message']['content']

    # Record the exchange only once it has succeeded.
    chat_hist.append(user_message)
    chat_hist.append({"role": "assistant", "content": reply})

    # deepseek-r1 replies can open with a <think>...</think> block; drop it so a
    # draft query written while reasoning is not mistaken for the answer.
    answer = re.sub(r"<think>.*?</think>", "", reply, flags=re.DOTALL)

    # Accept "```sql" in any case, with trailing spaces or CRLF line endings.
    match = re.search(r"```sql[ \t]*\r?\n(.*?)```", answer, re.DOTALL | re.IGNORECASE)
    sql_query = match.group(1).strip() if match else ""

    if not sql_query:
        return chat_hist, "No SQL query detected"
    return chat_hist, sql_query

In [16]:
# Define insight generating function. This function takes a query result and outputs insight.
def generate_insight(chat_hist, natural_language, sql_query, query_result):
    """Ask the model to explain a query result.

    Args:
        chat_hist: List of chat messages so far. On success the new user prompt
            and the model reply are appended to this same list.
        natural_language: The user's original question.
        sql_query: The SQL that was executed for that question.
        query_result: The result returned for ``sql_query``; it is rendered as
            indented JSON inside the prompt.

    Returns:
        tuple: ``(chat_hist, insight)`` where ``insight`` is the model's reply text.

    Raises:
        requests.HTTPError: if the Ollama server answers with an error status.
    """
    # default=str keeps values JSON cannot encode natively (Decimal, dates, ...)
    # from raising; ensure_ascii=False shows non-ASCII text to the model as
    # written instead of as \uXXXX escapes.
    formatted_result = json.dumps(query_result, indent=2, ensure_ascii=False, default=str)

    prompt = f"""
You are a data analyst. You first received the following natural language request:
"{natural_language}"

Based on that, an SQL query was generated and executed:
{sql_query}

The query returned the following result:
{formatted_result}

Please provide a detailed explanation of the insights, trends, or key points that can be derived from these results.
"""
    user_message = {"role": "user", "content": prompt}
    payload = {
        "model": "deepseek-r1:8b",
        # Send a copy so a failed request leaves chat_hist untouched.
        "messages": chat_hist + [user_message],
        "stream": False,
        # Ollama reads sampling parameters from "options"; a top-level
        # "temperature" key is silently ignored.
        "options": {"temperature": 0.7},
    }
    response = requests.post(OLLAMA_API_URL, json=payload)
    response.raise_for_status()
    insight = response.json()['message']['content']

    # Record the exchange only once it has succeeded.
    chat_hist.append(user_message)
    chat_hist.append({"role": "assistant", "content": insight})
    return chat_hist, insight

# Define the database query function. It takes a SQL query and returns the query result.

In [6]:
# Database query helper: forwards a SQL string to the query middleware and returns its JSON reply.
def query_database(sql_query):
    """POST ``sql_query`` to the local query middleware and return the decoded JSON body.

    The HTTP status is not checked; whatever JSON the middleware sends back is
    returned as-is. In this notebook's recorded outputs that is ``{'data': [...]}``
    for a successful query and ``{'detail': '<message>'}`` for a failed one.
    """
    endpoint = "http://127.0.0.1:8000/query"
    body = {"query": sql_query}
    return requests.post(endpoint, json=body).json()

# Testing

In [7]:
# Start a new conversation; chat_hist is passed to (and returned by) every later model call.
chat_hist = initial_setup()

In [8]:
chat_hist

[{'role': 'system',
  'content': 'You are a friendly and helpful data analyst. The user will ask you question about data analysis, be prepared.'},
 {'role': 'assistant',
  'content': '<think>\nOkay, so I\'m trying to figure out how to analyze the dataset provided by my instructor for this statistics class. The dataset is about customers of a retail company, containing information like age, gender, annual income, shopping frequency, and purchase amount. My task is to identify patterns or trends that might help the company improve their strategies.\n\nFirst, I think I should start by understanding the basic characteristics of the dataset. I remember from class that descriptive statistics can give me an overview. So, maybe I should calculate the mean, median, mode for numerical variables like age and income. That will tell me the central tendency. For categorical variables like gender and shopping frequency, I can count the frequencies to see how each category is represented.\n\nNext, I w

In [9]:
# Ask the user for a natural-language question about the data.
naturalQuery = input("Enter your data related question: ")

In [10]:
# Translate the question into SQL. sql_query is the string "No SQL query detected"
# when the model's reply contains no fenced sql block.
chat_hist, sql_query = generate_sql_query(chat_hist,naturalQuery)

In [11]:
chat_hist

[{'role': 'system',
  'content': 'You are a friendly and helpful data analyst. The user will ask you question about data analysis, be prepared.'},
 {'role': 'assistant',
  'content': '<think>\nOkay, so I\'m trying to figure out how to analyze the dataset provided by my instructor for this statistics class. The dataset is about customers of a retail company, containing information like age, gender, annual income, shopping frequency, and purchase amount. My task is to identify patterns or trends that might help the company improve their strategies.\n\nFirst, I think I should start by understanding the basic characteristics of the dataset. I remember from class that descriptive statistics can give me an overview. So, maybe I should calculate the mean, median, mode for numerical variables like age and income. That will tell me the central tendency. For categorical variables like gender and shopping frequency, I can count the frequencies to see how each category is represented.\n\nNext, I w

In [12]:
sql_query

"SELECT \n    cif_no AS customer_id,\n    cof_mtd AS monthly_cost_of_fund\nFROM \n    master_funding\nWHERE \n    base_dt BETWEEN '20250101' AND '20250301'\nORDER BY \n    cof_mtd DESC \nLIMIT 10;"

In [13]:
# Run the generated SQL through the query middleware (must be running, see query_database).
query_result = query_database(sql_query)

In [36]:
# Scratch check of how the middleware reports a failing query.
# NOTE(review): this statement is invalid PostgreSQL. FETCH FIRST n ROWS must be
# followed by ONLY (or WITH TIES), which matches the recorded
# 'syntax error at or near ";"'. Once that is fixed it would presumably still fail:
# the outer SELECT names region_name / customer_name, which the subquery does not
# expose (it selects region / gcif_name), and mtd_avg_amt_fix is neither grouped
# nor aggregated.
testquery= query_database("""
SELECT 
    region_name,
    customer_name,
    mtd_avg_amt_fix
FROM (
    SELECT
        region,
        gcif_name,
        mtd_avg_amt_fix,
        base_dt
    FROM master_funding
    WHERE base_dt = '20241231'
) AS top_customers
GROUP BY region, gcif_name
ORDER BY mtd_avg_amt_fix DESC
FETCH FIRST 5 ROWS;""")

In [37]:
testquery

{'detail': 'syntax error at or near ";"'}

In [28]:
# NOTE(review): this lookup raises KeyError. query_database returns only the JSON
# body, and the error body recorded above has a single 'detail' key; the HTTP
# status code is not part of it.
testquery['status_code']

KeyError: 'status_code'

In [14]:
query_result

{'data': [{'customer_id': '8402093193', 'monthly_cost_of_fund': 0.067},
  {'customer_id': '0296418205', 'monthly_cost_of_fund': 0.067},
  {'customer_id': '3447465723', 'monthly_cost_of_fund': 0.065},
  {'customer_id': '1745120444', 'monthly_cost_of_fund': 0.065},
  {'customer_id': '8581019913', 'monthly_cost_of_fund': 0.065},
  {'customer_id': '6538896988', 'monthly_cost_of_fund': 0.065},
  {'customer_id': '0461726637', 'monthly_cost_of_fund': 0.065},
  {'customer_id': '8581019913', 'monthly_cost_of_fund': 0.065},
  {'customer_id': '4760295276', 'monthly_cost_of_fund': 0.065},
  {'customer_id': '8853886466', 'monthly_cost_of_fund': 0.065}]}

In [17]:
# Ask the model to interpret the query result in light of the original question.
# NOTE(review): check the generated text against query_result. The recorded insight
# says all customer IDs are unique, yet 8581019913 appears twice in the result.
chat_hist,insight = generate_insight(chat_hist,naturalQuery,sql_query,query_result)

In [18]:
chat_hist

[{'role': 'system',
  'content': 'You are a friendly and helpful data analyst. The user will ask you question about data analysis, be prepared.'},
 {'role': 'assistant',
  'content': '<think>\nOkay, so I\'m trying to figure out how to analyze the dataset provided by my instructor for this statistics class. The dataset is about customers of a retail company, containing information like age, gender, annual income, shopping frequency, and purchase amount. My task is to identify patterns or trends that might help the company improve their strategies.\n\nFirst, I think I should start by understanding the basic characteristics of the dataset. I remember from class that descriptive statistics can give me an overview. So, maybe I should calculate the mean, median, mode for numerical variables like age and income. That will tell me the central tendency. For categorical variables like gender and shopping frequency, I can count the frequencies to see how each category is represented.\n\nNext, I w

In [19]:
insight

'\n\n**Analysis and Insights:**\n\nThe query retrieves data on the top 10 customers based on their monthly cost of fund for January 2025. Here are the key observations:\n\n1. **Top Customers by Monthly Cost**: The top two customers have a monthly cost of fund (MCF) of approximately 0.067, slightly higher than the next eight customers who all have an MCF of around 0.065.\n\n2. **Customer Distribution**: Among the top 10, customer IDs "8402093193" and "0296418205" lead with the highest MCF. The rest of the customers are tightly clustered in terms of their MCF, indicating a fairly uniform distribution among them.\n\n3. **Close Competition**: The difference between the first two customers\' MCF and the next highest is minimal, suggesting that there isn\'t a significant gap in their performance metrics during the month of January 2025.\n\n4. **Customer Identification**: All customer IDs are unique within the top 10, indicating no overlap or duplication among them.\n\n5. **Data Precision**: 