In [None]:
import os

import nltk
from llama_index import (
    ServiceContext,
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex
)
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.llms import AzureOpenAI
from llama_index.node_parser import SentenceWindowNodeParser
from llama_index.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.text_splitter import SentenceSplitter
from llama_index.vector_stores import (
    OpensearchVectorStore,
    OpensearchVectorClient,
)

In [None]:
# Load the source data: read every file under ./data/ into Document objects.
documents = SimpleDirectoryReader("./data/").load_data()

# Quick preview: document count, then the text and metadata of the first
# document, each separated by a 50-character dashed rule.
print(len(documents))
print("-" * 50)
print(documents[0].text, "-" * 50, documents[0].metadata, sep="\n")

In [None]:
# Sentence-window parser: window size 3, with the window stored under the
# "window" metadata key and the original text under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)

# Turn the loaded documents into nodes with the parser configured above.
nodes = node_parser.get_nodes_from_documents(documents)

In [None]:
# Sanity check on the parser output: number of nodes, then the first node's text.
node_count = len(nodes)
print(node_count)
first_node = nodes[0]
print(first_node.text)

In [None]:
# Show what the parser stored under the "window" metadata key of the first node.
first_window = nodes[0].metadata["window"]
print(first_window)

In [None]:
# Prepare the embedding model and embed the first node as a smoke test.
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-small")
embeddings = embed_model.get_text_embedding(nodes[0].text)
print(len(embeddings))  # length of the embedding vector
print(embeddings[:5])   # only the first few components, to keep the output short

# Prepare the LLM.
# Azure OpenAI settings are read from environment variables so that real
# credentials never have to be written into (and committed with) the notebook.
# When a variable is unset, the original placeholder value is used, so the
# behaviour without any environment configuration is unchanged.
api_key = os.environ.get("AZURE_OPENAI_API_KEY", "YOUR_AZURE_API_KEY")
azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT", "YOUR_AZURE_ENDPOINT")
api_version = os.environ.get("OPENAI_API_VERSION", "2023-09-15-preview")
model = os.environ.get("AZURE_OPENAI_MODEL", "YOUR_MODEL")
deploy_name = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "YOUR_DEPLOY_NAME")

llm = AzureOpenAI(
    model=model,
    deployment_name=deploy_name,
    api_key=api_key,
    azure_endpoint=azure_endpoint,
    api_version=api_version,
)

In [None]:
# Prepare the vector store (OpenSearch).
host = "https://localhost:9200"
index_name = "demo_test"  # custom index name
# OpensearchVectorClient stores text in this field by default
text_field = "content"
# OpensearchVectorClient stores embeddings in this field by default
embedding_field = "embedding"

# Take the vector dimension from the sample embedding computed earlier instead
# of hard-coding it, so it stays correct if the embedding model is swapped.
embedding_dim = len(embeddings)

# Credentials can be overridden through the environment; the defaults match the
# values that were previously hard-coded here.
opensearch_auth = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD", "admin"),
)

# NOTE(review): the host URL uses the https scheme while use_ssl=False is passed
# below - confirm which of the two the OpenSearch client actually honours.
client = OpensearchVectorClient(
    host,
    index_name,
    embedding_dim,
    embedding_field=embedding_field,
    text_field=text_field,
    http_auth=opensearch_auth,
    use_ssl=False,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Initialize the vector store and the contexts that wire it to the LLM/embedder.
vector_store = OpensearchVectorStore(client)
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Build the index from the sentence-window `nodes` rather than from the raw
# documents. Indexing the raw documents bypasses the SentenceWindowNodeParser
# output, so the stored nodes would have no "window" metadata for the
# MetadataReplacementPostProcessor used at query time.
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    service_context=service_context,
)

In [None]:
# Post-processors for the query engine: one targets the "window" metadata key,
# the other is a cross-encoder reranker that keeps the top 3 nodes.
postprocessors_matadata = MetadataReplacementPostProcessor(
    target_metadata_key="window",
)
postprocessors_rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2",
    top_n=3,
)

# Retrieve 5 candidates per query; the post-processors are passed with the
# reranker listed first and the metadata replacement second.
query_engine = index.as_query_engine(
    similarity_top_k=5,
    node_postprocessors=[
        postprocessors_rerank,
        postprocessors_matadata,
    ],
)

# Run the demo question through the full query pipeline.
response = query_engine.query("請問企業導入AI案例與建議有什麼")

In [None]:
# Print only the answer text carried by the response object.
answer_text = response.response
print(answer_text)

In [None]:
# Query the index through a retriever (top 3 matches) with the same question.
retrieve_engine = index.as_retriever(similarity_top_k=3)
node_retrieve = retrieve_engine.retrieve(
    "請問企業導入AI案例與建議有什麼"
)
# Index 1 is the second entry of the returned results.
print(node_retrieve[1])

In [None]:
# Persist the index's storage context to the local ./storage/ directory.
index.storage_context.persist(persist_dir="./storage/")