# Load environment variables from `.env` and define global variables

## Load environment variables from `.env`

In [1]:
from dotenv import load_dotenv

# Load the variables defined in the local `.env` file; the cell's output
# below is the call's return value.
load_dotenv()

True

In [2]:
import os

# Read credentials and the database connection string from the process
# environment. Each value is None when the corresponding variable is unset.
openai_api_key = os.environ.get('OPENAI_API_KEY')
google_api_key = os.environ.get('GOOGLE_API_KEY')
connection_string = os.environ.get('CONNECTION_STRING')

## Define global variables

In [3]:
from llama_index.embeddings import GeminiEmbedding
from llama_index.callbacks import CallbackManager, LlamaDebugHandler

# Embedding model shared by the rest of the notebook.
embed_model = GeminiEmbedding()

# Callback manager with a single debug handler that prints the trace when a
# traced operation ends.
callback_manager = CallbackManager(
    handlers=[LlamaDebugHandler(print_trace_on_end=True)]
)

# Define functions

## Load source documents

In [4]:
from typing import List
from llama_index import Document, SimpleDirectoryReader

def load_documents(input_files: List[str] | None = None) -> List[Document]:
    """Load the source files and merge them into a single document.

    Args:
        input_files: Paths of the files to read. When omitted, the event
            export originally hard-coded in this notebook
            ('./documents/_event__202401120928.json') is used.

    Returns:
        A one-element list holding a `Document` whose text is the content of
        every loaded document joined by blank lines. Only the text is carried
        over into the merged document.
    """
    if input_files is None:
        input_files = ['./documents/_event__202401120928.json']

    loader = SimpleDirectoryReader(input_files=input_files)
    loaded_docs = loader.load_data(show_progress=True)

    # Keep the loaded list and the merged result under separate names so the
    # same variable never holds two different kinds of object.
    merged_text = '\n\n'.join(doc.get_content() for doc in loaded_docs)

    return [Document(text=merged_text)]

In [5]:
# Load the source file into a one-element list holding the merged Document.
documents = load_documents()

Loading files: 100%|███████████████████████████████████████████████| 1/1 [00:00<00:00, 166.66file/s]


## Parse source documents into smaller chunks (nodes)

In [6]:
from typing import List
from llama_index.node_parser import JSONNodeParser
from llama_index.schema import BaseNode

def build_nodes(documents: List[Document]) -> List[BaseNode]:
    """Split documents into nodes using a `JSONNodeParser`.

    The parser is created with the notebook-level `callback_manager`.

    Args:
        documents: Documents to parse.

    Returns:
        The nodes produced by the parser for the given documents.
    """
    parser = JSONNodeParser.from_defaults(callback_manager=callback_manager)
    return parser.get_nodes_from_documents(
        documents=documents,
        show_progress=True,
    )

In [7]:
# Parse the loaded documents into nodes.
nodes = build_nodes(documents)

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

## Construct ServiceContext and StorageContext

In [8]:
from typing import Tuple
from llama_index import ServiceContext, StorageContext
from llama_index.callbacks import CallbackManager
from llama_index.embeddings import BaseEmbedding
from llama_index.vector_stores.postgres import PGVectorStore
from sqlalchemy import make_url

def build_context(
    embed_model: BaseEmbedding | None,
    callback_manager: CallbackManager | None,
    embed_dim: int = 768,
) -> Tuple[ServiceContext, StorageContext]:
    """Build the service context and the Postgres-backed storage context.

    The database location is taken from the notebook-level
    `connection_string` (read from the `CONNECTION_STRING` environment
    variable).

    Args:
        embed_model: Embedding model passed to the service context.
        callback_manager: Callback manager passed to the service context.
        embed_dim: Dimension of the stored embedding vectors. Defaults to 768,
            the value this notebook used originally; per the original note,
            use 1536 when switching to an OpenAI embedding model.

    Returns:
        A `(service_context, storage_context)` tuple.

    Raises:
        ValueError: If `connection_string` is missing or empty.
    """
    service_context = ServiceContext.from_defaults(
        embed_model=embed_model, callback_manager=callback_manager
    )

    # os.getenv returns None for an unset variable; fail with a clear message
    # here instead of an obscure error while parsing the URL.
    if not connection_string:
        raise ValueError(
            'CONNECTION_STRING is not set; define it in `.env` before '
            'building the storage context.'
        )

    uri = make_url(connection_string)
    vector_store = PGVectorStore.from_params(
        host=uri.host,
        port=str(uri.port),
        database=uri.database,
        user=uri.username,
        password=uri.password,
        embed_dim=embed_dim,
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    return service_context, storage_context

In [9]:
# `context` is the (ServiceContext, StorageContext) tuple returned by build_context.
context = build_context(embed_model, callback_manager)

## Construct index

In [10]:
from typing import List, Tuple
from llama_index import ServiceContext, StorageContext, VectorStoreIndex
from llama_index.schema import BaseNode

def build_index(
    nodes: List[BaseNode], context: Tuple[ServiceContext, StorageContext]
) -> VectorStoreIndex:
    """Create a `VectorStoreIndex` over the given nodes.

    Args:
        nodes: Nodes to index.
        context: Tuple whose first element is the service context and whose
            last element is the storage context.

    Returns:
        The constructed index.
    """
    service_context = context[0]
    storage_context = context[-1]

    return VectorStoreIndex(
        nodes=nodes,
        service_context=service_context,
        storage_context=storage_context,
        show_progress=True,
    )

In [13]:
# REMEMBER TO CLEAN THE DATABASE BEFORE RUNNING INDEX CONSTRUCTION,
# otherwise the new embedding vectors will be appended to the old ones already stored.

In [12]:
# Build the vector index from the parsed nodes using the contexts created above;
# the debug handler prints the embedding trace shown in the output.
index = build_index(nodes, context)

Generating embeddings:   0%|          | 0/65 [00:00<?, ?it/s]

**********
Trace: index_construction
    |_embedding ->  6.125391 seconds
    |_embedding ->  5.731168 seconds
    |_embedding ->  5.832827 seconds
    |_embedding ->  5.700512 seconds
    |_embedding ->  5.97814 seconds
    |_embedding ->  4.164257 seconds
    |_embedding ->  2.206623 seconds
**********
