In [19]:
import json
import os
from pathlib import Path

import tomli as tomlib
from dotenv import load_dotenv
from llama_index.core import Settings, VectorStoreIndex, get_response_synthesizer, SimpleDirectoryReader, StorageContext
from llama_index.core.node_parser import TokenTextSplitter, SentenceSplitter
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.neo4jvector import Neo4jVectorStore
from llama_index.core.indices.vector_store.retrievers import VectorIndexRetriever
from llama_index.core.postprocessor import SimilarityPostprocessor, NERPIINodePostprocessor, PrevNextNodePostprocessor

In [20]:
# Load variables from a .env file into the process environment; override=True lets
# values from the file replace variables that are already set.
load_dotenv(override=True)

True

In [21]:
# Read the project configuration from the TOML file one directory above the
# working directory and keep the parsed mapping in CFG.
with Path('../pyproject.toml').open("rb") as config_file:
    CFG = tomlib.load(config_file)

## Chunk preparation

In [22]:
# Build the embedding model from the project configuration and register it as
# the default embedding model on the global LlamaIndex Settings object.
# NOTE(review): the key is read from AZURE_OPENAI_API_KEY but passed to the plain
# OpenAIEmbedding class, and os.getenv returns None when the variable is unset —
# confirm this is the intended credential.
cfg_root = CFG['configuration']
Settings.embed_model = embed_model = OpenAIEmbedding(
    model=cfg_root['models']['embedding_model'],
    api_key=os.getenv('AZURE_OPENAI_API_KEY'),
    dimensions=cfg_root['embedding_dimension'],
)

In [23]:
# Resolve every configured source document against the raw-data directory.
data_cfg = CFG['configuration']['data']
raw_data_dir = Path(data_cfg['raw_data_path'])
document_paths = [raw_data_dir / file_name for file_name in data_cfg['source_docs']]

# Read exactly those files and turn them into LlamaIndex Documents.
reader = SimpleDirectoryReader(input_files=document_paths)
documents = reader.load_data()

In [24]:
def id_func(index, document):
    """Build a deterministic chunk id of the form ``<file stem>[-p<page>]-<index>``.

    Args:
        index: Position of the chunk, as supplied by the splitter.
        document: Source document. ``metadata['file_name']`` is required;
            ``metadata['page_label']`` is used when present.

    Returns:
        ``"<stem>-<index>"`` when the document carries no page label,
        otherwise ``"<stem>-p<page_label>-<index>"``.

    Raises:
        KeyError: If ``metadata`` has no ``'file_name'`` entry.
    """
    metadata = document.metadata
    document_name = Path(metadata['file_name']).stem
    # Readers that emit one Document per page (presumably the PDF reader — TODO confirm
    # against the actual source files) give every page the same file_name, while the
    # chunk index restarts for each Document. Without the page label those chunks
    # would share an id.
    page_label = metadata.get('page_label')
    if page_label is None:
        return f"{document_name}-{index}"
    return f"{document_name}-p{page_label}-{index}"

# Sentence-based splitter: size, overlap and separator come from the project
# configuration, and id_func supplies the id of every produced chunk.
chunk_cfg = CFG['configuration']
splitter_options = {
    option: chunk_cfg[option]
    for option in ('chunk_size', 'chunk_overlap', 'separator')
}
parser = SentenceSplitter(id_func=id_func, **splitter_options)

# Split the loaded documents into nodes (chunks).
nodes = parser.get_nodes_from_documents(documents)

In [25]:
# Number of chunks produced by the splitter (quick sanity check).
len(nodes)

36

## DB setup

In [26]:
# Connect the Neo4j-backed vector store using the settings from the project configuration.
# NOTE(review): the DB password is read from pyproject.toml, whereas the API key above
# comes from the environment — confirm the config file is not committed with real credentials.
app_cfg = CFG['configuration']
db_cfg = app_cfg['db']
neo4j_vector = Neo4jVectorStore(
    username=db_cfg['username'],
    password=db_cfg['password'],
    url=db_cfg['url'],
    embedding_dimension=app_cfg['embedding_dimension'],
    hybrid_search=app_cfg['hybrid_search'],
)

# Storage context that routes vector storage to the Neo4j store.
storage_context = StorageContext.from_defaults(vector_store=neo4j_vector)

# Build the index over the chunks; this embeds them and writes them to the DB.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    show_progress=True,
)

Generating embeddings: 100% 36/36 [00:01<00:00, 27.53it/s]
