In [None]:

import sqlite3
import io
import csv
import os
import sys
from pathlib import Path

# Make the directory above the current working directory importable.
# The membership check keeps sys.path free of duplicate entries when this
# cell is executed more than once in the same kernel.
current_dir = Path(os.getcwd())
parent_dir = str(current_dir.parent)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import json
from typing import Union, List

from dotenv import load_dotenv
from openai import OpenAI
from pydantic import BaseModel, ConfigDict, Field

# Load variables from a local .env file first, so that both the schema-path
# override and the API key read below can be supplied through it.
load_dotenv()

# Location of the JSON schema description that is embedded in the prompt.
# The original absolute path remains the default; set TEXT2SQL_SCHEMA_PATH to
# point somewhere else without editing the notebook.
schema_path = os.environ.get(
    "TEXT2SQL_SCHEMA_PATH",
    "/Users/virounikamina/Desktop/PIMCO-Text2SQL/chatgpt_api/schema.json",
)
# Read the file as UTF-8 explicitly instead of relying on the platform's
# default text encoding.
with open(schema_path, 'r', encoding="utf-8") as f:
    schema_info = json.load(f)

# OpenAI client used by generate_sql(); the key is taken from the environment.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Simple Reasonings Schema
def _reasonings_schema() -> dict:
    """Build the draft-07 JSON Schema for a list of reasoning steps.

    Each list item is an object with a string ``thought`` and a boolean
    ``helpful`` property.
    """
    step_properties = {}
    step_properties["thought"] = {
        "type": "string",
        "description": "A thought about the user's question",
    }
    step_properties["helpful"] = {
        "type": "boolean",
        "description": "Whether the thought is helpful to solving the user's question",
    }
    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "array",
        "items": {"type": "object", "properties": step_properties},
    }


# Serialized once at import time; the resulting text is what gets pasted into
# the prompt.
reasonings_schema_json = json.dumps(_reasonings_schema())

# Simple Final Output Schema
def _final_output_schema() -> dict:
    """Build the draft-07 JSON Schema for the complete answer object.

    The object carries the original question, the list of reasoning steps
    and the generated SQL text.
    """

    def text_field(description: str) -> dict:
        # Every plain-text property has the same two-key shape.
        return {"type": "string", "description": description}

    reasoning_step = {
        "type": "object",
        "properties": {
            "thought": text_field("A thought about the user's question"),
            "helpful": {
                "type": "boolean",
                "description": "Whether the thought is helpful to solving the user's question",
            },
        },
    }
    answer_properties = {
        "user_nlp_query": text_field(
            "The original natural language query to be translated into SQL"
        ),
        "reasonings": {
            "type": "array",
            "items": reasoning_step,
            "description": "Step-by-step reasoning process for query generation",
        },
        "generated_sql_query": text_field(
            "The final SQL query that answers the natural language question"
        ),
    }
    return {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "type": "object",
        "properties": answer_properties,
    }


# Serialized once at import time into JSON text.
final_output_schema_json = json.dumps(_final_output_schema())

# Prompt fragment that tells the model how to structure its chain of thoughts
# before writing SQL. It is a plain string (there is nothing to interpolate,
# so no f-prefix). The text is read by the model, so wording defects matter:
# "whether a the query", "should on" and an unbalanced backtick around
# `select` were corrected; everything else is unchanged.
thought_instructions = """
```
Thought Instructions:
```

```
Generate thoughts of increasing complexity.
Each thought should build on the previous ones and thoughts 
should progressively cover the nuances of the problem at hand.
```

```
First set of thoughts should be on whether the query requires 
Common Table Expressions (CTEs) to calculate the
results for sub queries. 

Prefer using Common Table Expressions rather than
case when statements or nested subqueries.

If CTEs are required then for each CTE, an analysis of the purpose of each
CTE should be done.
An overall structure should be outlined as to what will be calculated in 
each CTE.
```

```
Next set of thoughts should be on 
extracting out the names of as many of 
the relevant columns as possible for all CTEs and for all the sql clauses such as the 
`select`, `where` and `group_by` clauses.
There might be additions or deletions from this list based on the 
following additional thoughts to be generated.
```


```
Generate a thought to figure out the possible phrases in the query 
which can be used as values of the columns present in the table so as to use them 
in the `where` clause.
```

```
Generate a thought to compare these extracted values with the list of possible values
of columns listed in the information for the columns so as to use the exact string
in the `where` clause.
```

```
Generate a thought to reason whether `IS_TOP_TIER_ENTITY` flag is required or not.
```

```
Generate a thought to figure out which time period is being queried.
If nothing is specified use `PERIOD_ID = 2023Y`.
```

```
Generate a thought to figure out if a group_by clause is required.
Since the table is structured so that for a single entity multiple securities are listed,
`group_by` is often required over `INS_ENTITY_NAME_LONG` column.
```

```
The above thoughts about 
1. phrases for values of columns
2. query phrase to column value mapping
3. filters such as `IS_TOP_TIER_ENTITY` and others in the where clause
4. Period_id value to use
5. Group by column

should be generated for each of the CTE separately.
```

```
If the input question is similar to any of the examples given above,
then a thought should be generated to detect that and then that example 
should be followed closely to get the SQL for the input question given.
```

```
Closing Thoughts and Observations
```
These should summarize:
1. The structure of the SQL query:
    - This states whether the query has any nested query.
    If so, the structure of the nested query is also mentioned.
    If not, a summary of the function of each of the `select`, `where`, `group_by` etc. clauses
    should be mentioned.
2. An explanation of how the query solves the user question.
"""

# Prompt fragment listing what the model's reasoning has to cover (nested-query
# choice, query plan / column mapping, per-clause justification, use of
# aggregate functions) and how the generated SQL should be indented.
# generate_sql() interpolates it into the user prompt under "Schema for Reasoning".
# NOTE(review): the parentheses in the formatting paragraph look misplaced
# ("the columns in the (`select` statement ... clause.)"); the text is sent to
# the model verbatim, so it is left as written here.
reasoning_instructions = """
```
1. Reasoning you provide should first focus on why a nested query was chosen or why it wasn't chosen.
2. It should give a query plan on how to solve this question - explain 
the mapping of the columns to the words in the input question.
3. It should explain each of the clauses and why they are structured the way they are structured. 
For example, if there is a `group_by`, an explanation should be given as to why it exists.
4. If there's any sum() or any other function used it should be explained as to why it was required.
```

```
Format the generated sql with proper indentation - the columns in the
(`select` statement should have more indentation than keyword `select`
and so on for each SQL clause.)
```
"""
# NOTE(review): a function with this same name is defined again further down
# in this file; in a top-to-bottom run the later definition is the one that
# stays bound. Keep the two in sync or remove one of them.
def load_schema_from_json(schema_path: Union[str, Path, None] = None) -> dict:
    """Read the database schema description from a JSON file.

    Args:
        schema_path: Location of the schema JSON file. When omitted, the
            original hard-coded location is used, so existing zero-argument
            calls behave as before.

    Returns:
        The parsed JSON content of the file.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            printed and then re-raised unchanged.
    """
    if schema_path is None:
        schema_path = "/Users/virounikamina/Desktop/PIMCO-Text2SQL/chatgpt_api/schema.json"
    try:
        # Read as UTF-8 explicitly rather than with the platform default.
        with open(schema_path, 'r', encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading schema: {str(e)}")
        # A bare raise re-raises the active exception as-is.
        raise

# Reasoning-step model holding free-text context for the user's question.
# It is one of the three step types accepted by FinalQueryOutput.reasonings.
# NOTE(review): pydantic normally publishes the class docstring as the
# "description" in the generated JSON schema, so rewording it would change
# the schema text — confirm before editing the docstring.
class Background(BaseModel):
    """A setup description providing context for the user's question"""
    # Validation rejects values shorter than 10 characters (min_length=10).
    background: str = Field(
        description="Background for the user's question",
        min_length=10
    )

# Reasoning-step model for a single thought plus a flag saying whether that
# thought helps answer the question. It is one of the three step types
# accepted by FinalQueryOutput.reasonings.
# NOTE(review): pydantic normally publishes the class docstring as the
# "description" in the generated JSON schema — confirm before rewording it.
class Thought(BaseModel):
    """A thought about the user's question"""
    # Free-text content of the thought.
    thought: str = Field(
        description="Text of the thought"
    )
    # Required boolean; no default is declared, so it must always be supplied.
    helpful: bool = Field(
        description="Whether the thought is helpful to solving the user's question"
    )

# Reasoning-step model for a summarizing observation. It is one of the three
# step types accepted by FinalQueryOutput.reasonings.
# NOTE(review): pydantic normally publishes the class docstring as the
# "description" in the generated JSON schema — confirm before rewording it.
class Observation(BaseModel):
    """An observation summarizing insights from the reasoning process"""
    # Free-text content of the observation.
    observation: str = Field(
        description="An insightful observation on the sequence of thoughts and observations generated so far"
    )

class FinalQueryOutput(BaseModel):
    """Complete output structure containing the query, reasoning, and SQL"""
    # The docstring above is kept byte-for-byte on purpose.
    # NOTE(review): pydantic normally publishes the class docstring as the
    # schema "description", and generate_sql() embeds this model's JSON
    # schema in its prompt — confirm before rewording it.

    # Pydantic v2 configuration. This replaces the class-based `Config`,
    # which is the deprecated v1 spelling; the settings themselves are
    # unchanged.
    model_config = ConfigDict(
        arbitrary_types_allowed=True,
        json_schema_extra={
            "example": {
                "user_nlp_query": "Show top funds by total assets",
                "reasonings": [
                    {"background": "Analyzing fund asset query"},
                    {"thought": "Need to sort by total assets", "helpful": True},
                    {"observation": "Simple ranking query required"}
                ],
                "generated_sql_query": "SELECT * FROM fund_table ORDER BY total_assets DESC LIMIT 10"
            }
        },
    )

    user_nlp_query: str = Field(
        description="The original natural language query to be translated into SQL"
    )

    # Ordered list of reasoning steps; each element is one of the three step
    # models declared in this file.
    reasonings: List[Union[Background, Thought, Observation]] = Field(
        description="Step-by-step reasoning process for query generation"
    )

    generated_sql_query: str = Field(
        description="The final SQL query that answers the natural language question"
    )

    def get_sql(self) -> str:
        """Return the generated SQL text."""
        return self.generated_sql_query

    def get_reasoning_steps(self) -> List[str]:
        """Render the reasoning steps as labelled, human-readable lines.

        Returns:
            One string per step, in the original order, prefixed with
            ``Background:``, ``Thought:`` or ``Observation:``. Thought lines
            also carry the ``helpful`` flag. Items of any other type are
            skipped.
        """
        steps = []
        for item in self.reasonings:
            if isinstance(item, Background):
                steps.append(f"Background: {item.background}")
            elif isinstance(item, Thought):
                steps.append(f"Thought: {item.thought} (Helpful: {item.helpful})")
            elif isinstance(item, Observation):
                steps.append(f"Observation: {item.observation}")
        return steps

# NOTE(review): this repeats a function of the same name defined earlier in
# this file; being the later definition, it is the one that stays bound after
# a top-to-bottom run. Keep the two in sync or remove one of them.
def load_schema_from_json(schema_path: Union[str, Path, None] = None) -> dict:
    """Read the database schema description from a JSON file.

    Args:
        schema_path: Location of the schema JSON file. When omitted, the
            original hard-coded location is used, so existing zero-argument
            calls behave as before.

    Returns:
        The parsed JSON content of the file.

    Raises:
        Exception: Any error raised while opening or parsing the file is
            printed and then re-raised unchanged.
    """
    if schema_path is None:
        schema_path = "/Users/virounikamina/Desktop/PIMCO-Text2SQL/chatgpt_api/schema.json"
    try:
        # Read as UTF-8 explicitly rather than with the platform default.
        with open(schema_path, 'r', encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"Error loading schema: {str(e)}")
        # A bare raise re-raises the active exception as-is.
        raise

def execute_sql(query: str, db_path: Union[str, Path, None] = None) -> str:
    """Run one SQL statement against a SQLite database and return CSV text.

    Args:
        query: SQL statement to execute.
        db_path: SQLite database file. When omitted, the original hard-coded
            location is used, so existing one-argument calls behave as before.

    Returns:
        CSV text with a header row of column names followed by one line per
        result row. An empty string is returned when the statement produces
        no result set.

    Raises:
        sqlite3.Error: Database errors are printed and then re-raised.
    """
    # NOTE(review): the statement is executed as given on a normal (writable)
    # connection and nothing is committed here. If `query` can come from an
    # untrusted source such as model output, consider a read-only connection.
    if db_path is None:
        db_path = "/Users/virounikamina/Desktop/PIMCO-Text2SQL/sqlite/nport.db"
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute(query)
        # description is None when the statement returns no rows at all
        # (e.g. DDL); iterating it would raise TypeError.
        if cursor.description is None:
            return ""
        columns = [column[0] for column in cursor.description]
        output = io.StringIO()
        writer = csv.writer(output)
        writer.writerow(columns)
        writer.writerows(cursor.fetchall())
        return output.getvalue()
    except sqlite3.Error as e:
        print(f"Database error: {str(e)}")
        raise
    finally:
        # Runs on success and on error alike, so the connection never leaks.
        if conn is not None:
            conn.close()

def generate_sql(question: str):
    """Ask the chat model to translate a natural-language question into SQL.

    The user prompt is assembled from the question, the database schema held
    in ``schema_info``, the reasoning and thought instructions, and two JSON
    schemas describing the expected reasoning list and overall output.

    Args:
        question: Natural-language question to answer with a SQL query.

    Returns:
        The message content of the first choice returned by the chat
        completion call. A JSON object is requested through
        ``response_format``, and the prompt asks for the keys
        ``user_nlp_query``, ``reasonings`` and ``generated_sql_query``.
    """
    # Plain string: there is nothing to interpolate in the system prompt.
    system_prompt = """
You are the most intelligent person in the world.

You will receive a $500 tip if you follow ALL the instructions specified.

Instructions:
Provide an explanation of why the given sql query is correct based 
on the input request and the description of the columns.

Use step by step reasoning and at each step generate thoughts of increasing complexity.

Getting this answer right is important for my career. Please do your best.
"""

    # Serialize both dictionaries to JSON text before interpolation. Dropping
    # the dict objects straight into the f-string would insert their Python
    # repr (single quotes, True/False/None) into a prompt that announces a
    # JSON schema.
    database_schema_text = json.dumps(schema_info)
    output_schema_text = json.dumps(FinalQueryOutput.model_json_schema())

    user_prompt = f"""
Generate a SQL query that retrieves from the database the answer to this question: {question}

Database Schema:
{database_schema_text}

Use the following JSON Schema as the grammar to create the structure 
for the step by step reasoning, and then to create the final SQL query.

Schema for Reasoning:
{reasoning_instructions}
{reasonings_schema_json}

The instructions on how to structure the reasoning is provided below:
{thought_instructions}

Schema for Overall Output:
{output_schema_text}

The final response should be a json with names as:
- user_nlp_query: exactly the same as the user query in string format
- reasonings: reasoning steps adhering to the Reasonings schema
- generated_sql_query: the SQL query generated in string format

This is the final answer format required.
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        response_format={"type": "json_object"}
    )

    # Only the first choice is used.
    return response.choices[0].message.content


In [None]:
# Use here
# Example run: send one question to the model through generate_sql().
response = generate_sql("List the top 5 PIMCO fund series by total assets")
# The reply is JSON text; parse it so individual fields can be accessed.
response_parsed = json.loads(response)
# Print the raw reply first, then only the SQL statement taken from it.
# The raw reply is therefore still shown even if the key lookup below fails.
print(response)
print(response_parsed["generated_sql_query"])
