# ObjectIndex Method (Doesn't Work)
- Retrieval results are inconsistent before and after persisting the index.

In [None]:
from sqlalchemy import create_engine, text, inspect
from llama_index.core import SQLDatabase
from llama_index.core.objects import ObjectIndex, SQLTableNodeMapping
from llama_index.core.objects import SQLTableSchema
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.retrievers import SQLRetriever
import os
# from flask import current_app

In [None]:
# Name of the OpenAI embedding model; passed as OpenAIEmbedding(model=...) when the index is built.
embedding_model = "text-embedding-3-small"

In [None]:
# Connect to the DuckDB database file through SQLAlchemy.
# The URL is written as a plain string literal: nesting double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12, and the f-string
# only interpolated a constant anyway. The resulting URL is unchanged.
# To point at a different file, build the path first, e.g.
#   db_path = os.path.join("../duckdb", "data.db")
#   engine = create_engine(f"duckdb:///{db_path}")
engine = create_engine("duckdb:///../data.db")

In [None]:

# Reflect the database schema and wrap the engine for LlamaIndex.
inspector = inspect(engine)
table_names = inspector.get_table_names()
sql_database = SQLDatabase(engine, include_tables=table_names)
table_node_mapping = SQLTableNodeMapping(sql_database)

# One SQLTableSchema per table; its context string lists every column as "name (type)".
table_schema_objs = []
for table in table_names:
    columns = inspector.get_columns(table)
    column_info = [f"{column['name']} ({column['type']})" for column in columns]
    context_str = f"Table columns: {', '.join(column_info)}"
    table_schema_objs.append(
        SQLTableSchema(table_name=table, context_str=context_str)
    )

# Build the object index over the table schemas, embedding with the configured model.
obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex,
    embed_model=OpenAIEmbedding(model=embedding_model),
)

In [None]:
# Get a retriever from the freshly built object index; similarity_top_k=3 is forwarded to as_retriever.
table_retriever = obj_index.as_retriever(similarity_top_k=3)

In [None]:
# Run a sample natural-language question through the retriever and print the raw result
# (this is the "before persisting" result).
tables_retireved = table_retriever.retrieve("How many product lines are there in the database?")
print(tables_retireved)

In [None]:
import os

# Report whether a persisted index is already present on disk.
persist_dir = "../index/tables_index"
print(f"Checking if persist directory exists: {persist_dir}")

# Check once and reuse the answer. isdir (rather than exists) ensures a stray regular
# file at this path is never handed to os.listdir, which would raise.
persist_dir_exists = os.path.isdir(persist_dir)

# Only show the check mark when the directory is really there; the original printed
# "✅ Directory exists: False" for a missing directory.
status_mark = "✅" if persist_dir_exists else "❌"
print(f"{status_mark} Directory exists: {persist_dir_exists}")
if persist_dir_exists:
    print(f"✅ Directory contents: {os.listdir(persist_dir)}")

In [None]:
# Save the object index to disk (same path as the `persist_dir` checked in the previous cell).
obj_index.persist("../index/tables_index")

In [None]:
# Reload the persisted index with the SAME embedding model it was built with.
# The original ObjectIndex.from_persist_dir(...) call was given no embedding model, so
# the reloaded index would embed queries with whatever default model is configured
# globally instead of `embedding_model`; that makes retrieval differ before and after
# persisting. Loading the underlying index explicitly lets us pass embed_model.
from llama_index.core import StorageContext, load_index_from_storage

object_storage_context = StorageContext.from_defaults(
    persist_dir="../index/tables_index"
)
persisted_index = load_index_from_storage(
    object_storage_context,
    embed_model=OpenAIEmbedding(model=embedding_model),
)

# The table node mapping created earlier is supplied explicitly, exactly as the
# original from_persist_dir call did.
obj_index = ObjectIndex(
    index=persisted_index,
    object_node_mapping=table_node_mapping,
)

In [None]:
# Rebuild the retriever, this time from the object index reloaded from disk in the previous cell.
table_retriever = obj_index.as_retriever(similarity_top_k=3)

In [None]:
# Ask the same question as before persisting so the two printed results can be compared.
tables_retireved = table_retriever.retrieve("How many product lines are there in the database?")
print(tables_retireved)

# VectorStoreIndex Method (Preferred)

In [None]:
from sqlalchemy import create_engine, text, inspect
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import StorageContext, load_index_from_storage
import os
# from flask import current_app

In [2]:
# Name of the OpenAI embedding model; passed as OpenAIEmbedding(model=...) when building and loading the index.
embedding_model = "text-embedding-3-small"
# Passed as similarity_top_k to vector_index.as_retriever(...) in later cells.
similarity_top_k = 4

In [3]:
# Connect to the DuckDB database file through SQLAlchemy.
# Plain string literal instead of an f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12, and nothing dynamic
# was being interpolated. The resulting URL is unchanged.
engine = create_engine("duckdb:///../data.db")

In [4]:
# Create documents for each table
from llama_index.core import Document

# Reflect the schema of every table in the database.
inspector = inspect(engine)
table_names = inspector.get_table_names()
# sql_database = SQLDatabase(engine, include_tables=table_names)
documents = []
for table in table_names:
    columns = inspector.get_columns(table)
    # Each column is rendered as "name (type)".
    column_info = [f"{col['name']} ({col['type']})" for col in columns]
    # Named `table_text` rather than `text` so that the `text` function imported from
    # sqlalchemy at the top of this section is not overwritten by a string.
    table_text = f"Table: '{table}'. This table containes Columns: {', '.join(column_info)}"
    documents.append(
        Document(
            text=table_text,
            # Metadata is read back from retrieved nodes in a later cell.
            metadata={
                "table_name": table,
                "table_context": f"This table has Columns: {column_info}",
            },
        )
    )

# Create vector index
vector_index = VectorStoreIndex.from_documents(documents, embed_model=OpenAIEmbedding(model=embedding_model))

In [5]:
# Get a retriever from the freshly built vector index, using the configured similarity_top_k.
table_retriever = vector_index.as_retriever(similarity_top_k=similarity_top_k)

In [6]:
# Run a sample natural-language question through the retriever and print the raw result
# (this is the "before persisting" result).
tables_retireved = table_retriever.retrieve("How many product lines are there in the database?")
print(tables_retireved)

[NodeWithScore(node=TextNode(id_='697c310a-a117-4378-9ad8-f2200ea12a22', embedding=None, metadata={'table_name': 'product', 'table_context': "This table has Columns: ['Key (VARCHAR)', 'ParentId (VARCHAR)', 'Name (VARCHAR)', 'Product Line (VARCHAR)']"}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='87a2963d-74b0-4f1e-ac36-42018fddbfb7', node_type='4', metadata={'table_name': 'product', 'table_context': "This table has Columns: ['Key (VARCHAR)', 'ParentId (VARCHAR)', 'Name (VARCHAR)', 'Product Line (VARCHAR)']"}, hash='3633f9e5bf91cd023d31a61317a3cd52630be6cee0d00051c2714d53b9011b92')}, metadata_template='{key}: {value}', metadata_separator='\n', text="Table: 'product'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VARCHAR), Product Line (VARCHAR)", mimetype='text/plain', start_char_idx=0, end_char_idx=121, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'

In [7]:
# Compact view of the result: one string per retrieved item holding its score and node text.
[f"Score={node.score}\n{node.node.text}" for node in tables_retireved]

["Score=0.48172868452768125\nTable: 'product'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VARCHAR), Product Line (VARCHAR)",
 "Score=0.3492820594313677\nTable: 'customer'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VARCHAR), Channel (VARCHAR), Channel Parent (VARCHAR), Customer Since (VARCHAR), Industry (VARCHAR), Location (VARCHAR), Sales Manager (VARCHAR), Salesperson (VARCHAR)",
 "Score=0.3364055523860449\nTable: 'account'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VARCHAR), UNARY_OPERATOR (VARCHAR), AccountType (VARCHAR), CalculationMethod (VARCHAR), DebitCredit (VARCHAR), LineItemRequired (VARCHAR), NonLeafInput (VARCHAR), NumericFormat (VARCHAR), PreventDataEntry (VARCHAR), TCMethod (VARCHAR), TCFormulaMDX (VARCHAR), ProceduralCalc (VARCHAR), CurrencyConversionMethod (VARCHAR)",
 "Score=0.3257116989133628\nTable: 'version'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VA

In [8]:
# Persist the vector index's storage context to disk under ../index/tables_index.
vector_index.storage_context.persist("../index/tables_index")

In [9]:
# Load the persisted index back from the same directory, passing the same embedding
# model configuration that was used when the index was built.
storage_context = StorageContext.from_defaults(persist_dir="../index/tables_index")
vector_index = load_index_from_storage(storage_context, embed_model=OpenAIEmbedding(model=embedding_model))

Loading llama_index.core.storage.kvstore.simple_kvstore from ../index/tables_index\docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ../index/tables_index\index_store.json.


In [10]:
# Rebuild the retriever, this time from the vector index reloaded from disk in the previous cell.
table_retriever = vector_index.as_retriever(similarity_top_k=similarity_top_k)

In [11]:
# Ask the same question as before persisting so the two printed results can be compared.
tables_retireved = table_retriever.retrieve("How many product lines are there in the database?")
print(tables_retireved)

[NodeWithScore(node=TextNode(id_='697c310a-a117-4378-9ad8-f2200ea12a22', embedding=None, metadata={'table_name': 'product', 'table_context': "This table has Columns: ['Key (VARCHAR)', 'ParentId (VARCHAR)', 'Name (VARCHAR)', 'Product Line (VARCHAR)']"}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='87a2963d-74b0-4f1e-ac36-42018fddbfb7', node_type='4', metadata={'table_name': 'product', 'table_context': "This table has Columns: ['Key (VARCHAR)', 'ParentId (VARCHAR)', 'Name (VARCHAR)', 'Product Line (VARCHAR)']"}, hash='3633f9e5bf91cd023d31a61317a3cd52630be6cee0d00051c2714d53b9011b92')}, metadata_template='{key}: {value}', metadata_separator='\n', text="Table: 'product'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VARCHAR), Product Line (VARCHAR)", mimetype='text/plain', start_char_idx=0, end_char_idx=121, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'

In [12]:
# Compact view of the reloaded index's result: one string per retrieved item with its score and node text.
[f"Score={node.score}\n{node.node.text}" for node in tables_retireved]

["Score=0.48174056358218065\nTable: 'product'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VARCHAR), Product Line (VARCHAR)",
 "Score=0.3493270837946805\nTable: 'customer'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VARCHAR), Channel (VARCHAR), Channel Parent (VARCHAR), Customer Since (VARCHAR), Industry (VARCHAR), Location (VARCHAR), Sales Manager (VARCHAR), Salesperson (VARCHAR)",
 "Score=0.3364456668185508\nTable: 'account'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VARCHAR), UNARY_OPERATOR (VARCHAR), AccountType (VARCHAR), CalculationMethod (VARCHAR), DebitCredit (VARCHAR), LineItemRequired (VARCHAR), NonLeafInput (VARCHAR), NumericFormat (VARCHAR), PreventDataEntry (VARCHAR), TCMethod (VARCHAR), TCFormulaMDX (VARCHAR), ProceduralCalc (VARCHAR), CurrencyConversionMethod (VARCHAR)",
 "Score=0.325720812688284\nTable: 'version'. This table containes Columns: Key (VARCHAR), ParentId (VARCHAR), Name (VAR

In [13]:
# Read back the metadata attached to each Document at build time, here for the
# second retrieved item (index 1).
print(tables_retireved[1].metadata["table_name"])
print(tables_retireved[1].metadata["table_context"])

customer
This table has Columns: ['Key (VARCHAR)', 'ParentId (VARCHAR)', 'Name (VARCHAR)', 'Channel (VARCHAR)', 'Channel Parent (VARCHAR)', 'Customer Since (VARCHAR)', 'Industry (VARCHAR)', 'Location (VARCHAR)', 'Sales Manager (VARCHAR)', 'Salesperson (VARCHAR)']
