In [2]:
from t2j.prem_sdk import PremSDK  
from t2j.prompts import Prompts
from t2j.chunker import DocumentChunker
from t2j.decomposer import SchemaDecomposer
from t2j.utils import *
import json

In [2]:
# Shared setup for the step-by-step walkthrough below.
# NOTE(review): chunk_size_lines is not referenced by any cell visible in this
# notebook — confirm whether it is still needed.
chunk_size_lines = 100
# One prompts object and one model client, reused by the chunker here and by
# the FieldExtractor in step 4.
promptsClass = Prompts()
model = PremSDK()
chunker = DocumentChunker(prompts=promptsClass, model=model)

# Input document and target schema. These are absolute paths on one specific
# Windows machine; adjust them before running anywhere else.
FILE_PATH = r"C:\Users\Pratyush\Desktop\text-to-json\examples\text2SQL-solutions\chase-sql.pdf"
SCHEMA_PATH = r"C:\Users\Pratyush\Desktop\text-to-json\examples\text2SQL-solutions\schema.json"

In [3]:
# step 1
# Chunk the source document. The next cell indexes the result by heading and
# serialises it with json.dumps, so it is used as a dict of JSON-compatible values.
chunks = chunker.smart_chunk(FILE_PATH)
# Extra items taken from the same file; the next cell consumes each one as a
# dict with 'heading' and 'content' keys.
# NOTE(review): the recorded progress bars ("Generating responses") suggest
# this step calls the model — confirm before re-running it freely.
other_data = chunker.extract_other_info(FILE_PATH)

Generating responses: 100%|██████████| 15/15 [00:02<00:00,  5.39it/s]
100%|██████████| 28/28 [00:00<?, ?it/s]


In [4]:
# Fold the extra items into the heading-keyed `chunks` mapping so the later
# steps work from a single structure. An item whose heading already exists
# in `chunks` replaces that entry.
for d in other_data:
    chunks[d['heading']] = {
        # 'content' may be a single string or a sequence of strings; normalise
        # it to one newline-joined string. isinstance() is the idiomatic type
        # check and, unlike `type(x) == str`, also accepts str subclasses
        # instead of trying to join them character by character.
        'content': d['content'] if isinstance(d['content'], str) else "\n".join(d['content']),
        "sub-headings": []
    }

# Show the merged mapping as indented JSON.
print(json.dumps(chunks, indent=4))

{
    "Abstract": {
        "content": " In tackling the challenges of large language model (LLM) performance for Text-to-SQL tasks, we introduce CHASE-SQL, a new framework that employs innovative strategies, using test-time compute in multi-agent modeling to improve candidate generation and selection. CHASE-SQL leverages LLMs intrinsic knowledge to generate diverse and high-quality SQL candidates using different LLM generators with: (1) a divide-and-conquer method that decomposes complex queries into manageable sub-queries in a single LLM call; (2) chain-of-thought reasoning based on query execution plans, reflecting the steps a database engine takes during execution; and (3) a unique instance-aware synthetic example generation technique, which offers specific few-shot demonstrations tailored to test questions. To identify the best candidate, a selection agent is employed to rank the candidates through pairwise comparisons with a fine-tuned binary-candidates selection LLM. This select

In [5]:
# step 2
with open(SCHEMA_PATH, 'r') as f:
    schema_dict = json.load(f)

decomposer = SchemaDecomposer(schema_dict, traversal_limit=1)
res = decomposer.decompose()

for r in res:
    print(r)

{'path': 'title', 'type': 'string', 'description': 'Title of the paper or document', 'node_type': 'trunk'}
{'path': 'authors[]', 'type': 'array<string>', 'description': 'List of authors of the paper', 'node_type': 'trunk'}
{'path': 'abstract', 'type': 'string', 'description': 'Up to 5 bullet points describing what is new in the approach', 'node_type': 'trunk'}
{'path': 'date_written', 'type': 'string', 'description': 'Date when paper was written', 'node_type': 'trunk'}
{'path': 'related_work[]', 'type': 'array<string>', 'description': 'List of strings, with related works done; keep the points short and precise', 'node_type': 'trunk'}
{'path': 'approach[]', 'type': 'array<object>', 'description': 'List of methods used in the approach', 'node_type': 'trunk'}
{'path': 'approach[].approach_name', 'type': 'string', 'description': 'Name of the method or technique', 'node_type': 'branch'}
{'path': 'approach[].description', 'type': 'string', 'description': 'Brief summary of what the method is'

In [6]:
# Step 3
# Print the decomposed fields as a tree; in the recorded output each line shows
# the node type (trunk/branch), a nesting level and the field name.
decomposer.print_schema_tree(res, indent=4)

 trunk 0  title
 trunk 0  authors[]
 trunk 0  abstract
 trunk 0  date_written
 trunk 0  related_work[]
 trunk 0  approach[]
branch 1 ──── approach_name
branch 1 ──── description
branch 1 ──── steps[]
branch 2 ──────── step_title
branch 2 ──────── details
branch 1 ──── improvements
branch 2 ──────── metric
branch 2 ──────── value_added
 trunk 0  dataset[]
branch 1 ──── name
branch 1 ──── source
branch 1 ──── preprocessing
branch 2 ──────── steps
branch 2 ──────── tools_used
 trunk 0  experiment_results[]
branch 1 ──── experiment_name
branch 1 ──── metrics[]
branch 2 ──────── metric_name
branch 2 ──────── value
 trunk 0  references[]


In [7]:
# Inspection only: a bare expression, so the notebook shows the repr of the
# whole merged mapping (the output is large).
chunks

{'Abstract': {'content': ' In tackling the challenges of large language model (LLM) performance for Text-to-SQL tasks, we introduce CHASE-SQL, a new framework that employs innovative strategies, using test-time compute in multi-agent modeling to improve candidate generation and selection. CHASE-SQL leverages LLMs intrinsic knowledge to generate diverse and high-quality SQL candidates using different LLM generators with: (1) a divide-and-conquer method that decomposes complex queries into manageable sub-queries in a single LLM call; (2) chain-of-thought reasoning based on query execution plans, reflecting the steps a database engine takes during execution; and (3) a unique instance-aware synthetic example generation technique, which offers specific few-shot demonstrations tailored to test questions. To identify the best candidate, a selection agent is employed to rank the candidates through pairwise comparisons with a fine-tuned binary-candidates selection LLM. This selection approach h

In [8]:
# step 4
from t2j.extractor import FieldExtractor

# Reuse the model client and prompts object created in the setup cell.
e = FieldExtractor(model, promptsClass)
# Extract a value for the decomposed schema fields from the chunks. In the
# recorded output the result is a list of records, each pairing a field
# descriptor ('schema_field') with its extracted value ('data').
extracted_data = e.extract(chunks, res)

In [12]:
# Inspection only: display the raw extraction records before they are merged.
extracted_data

[{'schema_field': {'path': 'title',
   'type': 'string',
   'description': 'Title of the paper or document',
   'node_type': 'trunk'},
  'data': 'CHASE-SQL: Multi-Path Reasoning and Preference Optimized Candidate Selection in Text-to-SQL'},
 {'schema_field': {'path': 'authors[]',
   'type': 'array<string>',
   'description': 'List of authors of the paper',
   'node_type': 'trunk'},
  'data': ['Mohammadreza Pourreza',
   'Hailong Li',
   'Ruoxi Sun',
   'Yeounoh Chung',
   'Shayan Talaei',
   'Gaurav Tarlok Kakkar',
   'Yu Gan',
   'Amin Saberi',
   'Fatma zcan',
   'Sercan . Ark']},
 {'schema_field': {'path': 'abstract',
   'type': 'string',
   'description': 'Up to 5 bullet points describing what is new in the approach',
   'node_type': 'trunk'},
  'data': '- Introduction of CHASE-SQL, a new framework for Text-to-SQL tasks.\n- Utilizes test-time compute in multi-agent modeling for improved candidate generation and selection.\n- Employs a divide-and-conquer method and chain-of-thought 

In [10]:
# step 5: aggregator

In [None]:
# Combine the per-field extraction records into one result, which the next
# cell serialises with json.dumps.
# NOTE(review): `merge` is not defined in any visible cell; presumably it
# comes from `from t2j.utils import *` — confirm.
final_output = merge(extracted_data)

In [6]:
# Pretty-print the merged result as indented JSON. `json` is already imported
# in the notebook's first cell, so it is not re-imported here.
print(json.dumps(final_output, indent=4))

{
    "title": "CHASE-SQL: Multi-Path Reasoning and Preference Optimized Candidate Selection in Text-to-SQL",
    "authors": [
        "Mohammadreza Pourreza",
        "Hailong Li",
        "Ruoxi Sun",
        "Yeounoh Chung",
        "Shayan Talaei",
        "Gaurav Tarlok Kakkar",
        "Yu Gan",
        "Amin Saberi",
        "Fatma zcan",
        "Sercan . Ark"
    ],
    "abstract": "- Introduction of CHASE-SQL, a new framework for Text-to-SQL tasks.\n- Utilizes test-time compute in multi-agent modeling for improved candidate generation and selection.\n- Employs a divide-and-conquer method and chain-of-thought reasoning for SQL generation.\n- Features a unique instance-aware synthetic example generation technique for few-shot demonstrations.\n- Achieves state-of-the-art execution accuracy on the BIRD Text-to-SQL dataset benchmark.",
    "date_written": "October 4, 2024",
    "related_work": [],
    "approach": [
        {
            "approach": [
                {
            

In [None]:
# NOTE(review): Workflow presumably wraps the manual steps above behind a
# single run() call — confirm against t2j.workflow.
from t2j.workflow import Workflow

# NOTE(review): "trace-id" looks like a placeholder identifier — confirm what
# value Workflow expects here.
w = Workflow("trace-id")

In [None]:
# Same values as the FILE_PATH / SCHEMA_PATH defined in the setup cell,
# repeated here; keep the two copies in sync. These are absolute,
# machine-specific paths — adjust them before running elsewhere.
FILE_PATH = r"C:\Users\Pratyush\Desktop\text-to-json\examples\text2SQL-solutions\chase-sql.pdf"
SCHEMA_PATH = r"C:\Users\Pratyush\Desktop\text-to-json\examples\text2SQL-solutions\schema.json"

# Run the workflow on the document and schema above.
w.run(FILE_PATH, SCHEMA_PATH)

In [None]:
# TOTAL_TIME = 1.04 min
# TOTAL PRICE = approx 15rs