# Data and Data storage



In [1]:
# setup data needed for the notes
query_1 = "What are the benefits of renewable energy?" # gt is [0, 3]
query_2 = "How do solar panels impact the environment?" # gt is [1, 2]

org_documents =[
    {
        "title": "The Impact of Renewable Energy on the Economy",
        "content": "Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure."
    },
    {
        "title": "Understanding Solar Panels",
        "content": "Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from atoms, generating a flow of electricity. Solar panels are a type of renewable energy technology that has been found to have a significant positive effect on the environment by reducing the reliance on fossil fuels."
    },
    {
        "title": "Pros and Cons of Solar Energy",
        "content": "While solar energy offers substantial environmental benefits, such as reducing carbon footprints and pollution, it also has downsides. The production of solar panels can lead to hazardous waste, and large solar farms require significant land, which can disrupt local ecosystems."
    },
    {
        "title":  "Renewable Energy and Its Effects",
        "content": "Renewable energy sources like wind, solar, and hydro power play a crucial role in combating climate change. They do not produce greenhouse gases during operation, making them essential for sustainable development. However, the initial setup and material sourcing for these technologies can still have environmental impacts."
    }
]

turns = [
    {
        "user": "What are the benefits of renewable energy?",
        "system": "I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.",
        "user_time": "2021-09-01T12:00:00Z",
        "system_time": "2021-09-01T12:00:01Z"
    },
    {
        "user": "How do solar panels impact the environment?",
        "system": "Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from atoms, generating a flow of electricity. Solar panels are a type of renewable energy technology that has been found to have a significant positive effect on the environment by reducing the reliance on fossil fuels.",
        "user_time": "2021-09-01T12:00:02Z",
        "system_time": "2021-09-01T12:00:03Z"
    }
]

## Create Document and DialogTurn objects


In [2]:
# create Document objects
from lightrag.core.types import Document

# we will save the content to text and title in the meta_data
documents  = [Document(text=doc['content'], meta_data={'title': doc['title']}) for doc in org_documents]
print(documents)

[Document(id=864d2eb7-1c67-4e7c-bebf-762b217d088f, text='Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute...', meta_data={'title': 'The Impact of Renewable Energy on the Economy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=007a8715-ab8a-4a87-b085-ab289c7abd7a, text='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock ele...', meta_data={'title': 'Understanding Solar Panels'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=5d99864b-5ac1-4346-8542-1a13aba0c6e4, text='While solar energy offers substantial environmental benefits, such as reducing carbon footprints and...', meta_data={'title': 'Pros and Cons of Solar Energy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=1bdc9c7f-7ea0-462d-a1bc-105ead1308a8, text='Renewable energy sources like wind, solar, and hydro power play a crucial role in combating climate ...', meta_dat

In [3]:
# create DialogTurn objects

from lightrag.core.types import DialogTurn, UserQuery, AssistantResponse

dialog_turns = [
    DialogTurn(
        user_query=UserQuery(query_str=turn["user"]),
        assistant_response=AssistantResponse(response_str=turn["system"]),
        user_query_timestamp=turn["user_time"],
        assistant_response_timestamp=turn["system_time"],
    )
    for turn in turns
]
print(dialog_turns)

[DialogTurn(id='28f59f32-7ec3-4677-9d48-d7672c7b841c', user_id=None, session_id=None, order=None, user_query=UserQuery(query_str='What are the benefits of renewable energy?', metadata=None), assistant_response=AssistantResponse(response_str='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', metadata=None), user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='c6c423b1-2c89-4ef6-93d0-453fd7264696', user_id=None, session_id=None, order=None, user_query=UserQuery(query_str='How do solar panels impact the environment?', metadata=None), assistant_response=AssistantResponse(response_str='Sola

## Demonstrating the data pipleine creation and application



In [4]:
# prepare the data pipeline

from lightrag.core.embedder import Embedder 
from lightrag.core.types import ModelClientType
from lightrag.components.data_process import DocumentSplitter, ToEmbeddings
from lightrag.core.component import Sequential


model_kwargs = {
    "model": "text-embedding-3-small",
    "dimensions": 256,
    "encoding_format": "float",
}

splitter_config = {
    "split_by": "word",
    "split_length": 50,
    "split_overlap": 10
}

splitter = DocumentSplitter(**splitter_config)
embedder = Embedder(model_client =ModelClientType.OPENAI(), model_kwargs=model_kwargs)
embedder_transformer = ToEmbeddings(embedder, batch_size=2)
data_transformer = Sequential(splitter, embedder_transformer)
print(data_transformer)

Sequential(
  (0): DocumentSplitter(split_by=word, split_length=50, split_overlap=10)
  (1): ToEmbeddings(
    batch_size=2
    (embedder): Embedder(
      model_kwargs={'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}, 
      (model_client): OpenAIClient()
    )
    (batch_embedder): BatchEmbedder(
      (embedder): Embedder(
        model_kwargs={'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}, 
        (model_client): OpenAIClient()
      )
    )
  )
)


In [5]:
# prepare mapping functions to map the data to Document object for the pipeline

from typing import Dict
# mapping function for org_documents
def map_to_document(doc: Dict) -> Document:
    return Document(text=doc['content'], meta_data={'title': doc['title']})

def map_dialogturn_to_document(turn: DialogTurn) -> Document:
    # it can be important to keep the original data's id
    return Document(id=turn.id, text=turn.user_query.query_str + ' ' + turn.assistant_response.response_str)

In [6]:
# apply data transformation to the dialog_turns

dialog_turns_as_documents = [map_dialogturn_to_document(turn) for turn in dialog_turns]
print(dialog_turns_as_documents)

# apply data transformation to the documents
output = data_transformer(dialog_turns_as_documents)
print(output)

[Document(id=28f59f32-7ec3-4677-9d48-d7672c7b841c, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector=[], parent_doc_id=None, order=None, score=None), Document(id=c6c423b1-2c89-4ef6-93d0-453fd7264696, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector=[], parent_doc_id=None, order=None, score=None)]


Splitting documents: 100%|██████████| 2/2 [00:00<00:00, 740.78it/s]
Batch embedding documents: 100%|██████████| 2/2 [00:00<00:00,  3.22it/s]
Adding embeddings to documents from batch: 2it [00:00, 5845.72it/s]

[Document(id=ba2db344-c31c-4a23-9ac6-33454e719be6, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=28f59f32-7ec3-4677-9d48-d7672c7b841c, order=0, score=None), Document(id=c6b375bd-e43f-445d-9706-63f0873af524, text='and installation sectors. The growth in renewable energy usage boosts local economies through increa...', meta_data=None, vector='len: 256', parent_doc_id=28f59f32-7ec3-4677-9d48-d7672c7b841c, order=1, score=None), Document(id=61d3dd92-23fb-4364-ab5c-6d383425b3d2, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 256', parent_doc_id=c6c423b1-2c89-4ef6-93d0-453fd7264696, order=0, score=None), Document(id=44f62c0c-c638-44b0-9be6-d5b8b92bbbd0, text='has been found to have a significant positive effect on the environment by reducing the reliance on ...', meta_data=None, vector='




In [7]:
# apply data transformation to the documents
org_documents_as_documents = [map_to_document(doc) for doc in org_documents]

# apply data transformation to the documents
output = data_transformer(org_documents_as_documents)
print(output)

Splitting documents: 100%|██████████| 4/4 [00:00<00:00, 3968.12it/s]
Batch embedding documents: 100%|██████████| 3/3 [00:00<00:00,  5.80it/s]
Adding embeddings to documents from batch: 3it [00:00, 9546.97it/s]

[Document(id=25379585-a84c-4ec4-9045-2dcd5aee0c2f, text='Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute...', meta_data={'title': 'The Impact of Renewable Energy on the Economy'}, vector='len: 256', parent_doc_id=de5e15e3-d63d-4ebe-9e8f-24e6a63bb583, order=0, score=None), Document(id=f6b6351b-67b8-4689-957b-6f7fe3421605, text='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock ele...', meta_data={'title': 'Understanding Solar Panels'}, vector='len: 256', parent_doc_id=3458c575-5d72-455f-ac46-a4e4ff7c4938, order=0, score=None), Document(id=98aad464-29c3-4d47-81bc-aff69ab3df49, text='positive effect on the environment by reducing the reliance on fossil fuels.', meta_data={'title': 'Understanding Solar Panels'}, vector='len: 256', parent_doc_id=3458c575-5d72-455f-ac46-a4e4ff7c4938, order=1, score=None), Document(id=6975f2a0-44c7-454e-9485-8983d4039e70, text='While solar energy offers substan




## Use LocalDB to help with the in-memory CRUD, and more importantly to keep track of data pipeline and its transformed data and with data storage and restore

In [7]:
# create a db for the dialog_turns
from lightrag.core.db import LocalDB

dialog_turn_db = LocalDB('dialog_turns')
print(dialog_turn_db)

dialog_turn_db.load(dialog_turns)
print(dialog_turn_db)

LocalDB(name='dialog_turns', items=[], transformed_items={}, transformer_setups={}, mapper_setups={})
LocalDB(name='dialog_turns', items=[DialogTurn(id='28f59f32-7ec3-4677-9d48-d7672c7b841c', user_id=None, session_id=None, order=None, user_query=UserQuery(query_str='What are the benefits of renewable energy?', metadata=None), assistant_response=AssistantResponse(response_str='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', metadata=None), user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='c6c423b1-2c89-4ef6-93d0-453fd7264696', user_id=None, session_id=None, order=None, user_query=U

In [8]:
# apply data transformation to the dialog_turn_db

key = "split_and_embed"
dialog_turn_db.transform(data_transformer, map_fn=map_dialogturn_to_document, key=key)
print(dialog_turn_db.transformed_items[key])
print(dialog_turn_db.transformer_setups[key])
print(dialog_turn_db.mapper_setups[key])

Splitting documents: 100%|██████████| 2/2 [00:00<00:00, 2652.94it/s]
Batch embedding documents: 100%|██████████| 2/2 [00:00<00:00,  4.18it/s]
Adding embeddings to documents from batch: 2it [00:00, 57456.22it/s]

[Document(id=d77e0863-83f1-4a56-afc6-ec49cf1151d7, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=28f59f32-7ec3-4677-9d48-d7672c7b841c, order=0, score=None), Document(id=cc1926a4-f20b-4314-8205-fb9f2c3ff327, text='and installation sectors. The growth in renewable energy usage boosts local economies through increa...', meta_data=None, vector='len: 256', parent_doc_id=28f59f32-7ec3-4677-9d48-d7672c7b841c, order=1, score=None), Document(id=a15ea431-64d5-4776-8e23-9bbfaf01f65e, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 256', parent_doc_id=c6c423b1-2c89-4ef6-93d0-453fd7264696, order=0, score=None), Document(id=f0cd2864-e0aa-42e0-9ff4-8712bf870ed9, text='has been found to have a significant positive effect on the environment by reducing the reliance on ...', meta_data=None, vector='




In [9]:
# save the state of the dialog_turn_db
dialog_turn_db.save_state('dialog_turn_db_state.pkl')

print(dialog_turn_db)

LocalDB(name='dialog_turns', items=[DialogTurn(id='28f59f32-7ec3-4677-9d48-d7672c7b841c', user_id=None, session_id=None, order=None, user_query=UserQuery(query_str='What are the benefits of renewable energy?', metadata=None), assistant_response=AssistantResponse(response_str='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', metadata=None), user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='c6c423b1-2c89-4ef6-93d0-453fd7264696', user_id=None, session_id=None, order=None, user_query=UserQuery(query_str='How do solar panels impact the environment?', metadata=None), assistant_response=A

In [10]:
# restore the state of the restored_dialog_turn_db
restored_dialog_turn_db = LocalDB.load_state('dialog_turn_db_state.pkl')
print(restored_dialog_turn_db)


LocalDB(name='dialog_turns', items=[DialogTurn(id='28f59f32-7ec3-4677-9d48-d7672c7b841c', user_id=None, session_id=None, order=None, user_query=UserQuery(query_str='What are the benefits of renewable energy?', metadata=None), assistant_response=AssistantResponse(response_str='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', metadata=None), user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='c6c423b1-2c89-4ef6-93d0-453fd7264696', user_id=None, session_id=None, order=None, user_query=UserQuery(query_str='How do solar panels impact the environment?', metadata=None), assistant_response=A

In [11]:
# check if the restored_dialog_turn_db is the same as the dialog_turn_db

print(str(dialog_turn_db.__dict__) == str(restored_dialog_turn_db.__dict__))

True


# CRUD operations

We will use a real user conversation by adding a generator

In [12]:
# prepare the generator for the dialog turns 

from lightrag.core import Generator

llm_kwargs = {
    "model": "gpt-3.5-turbo"
}

# we will use the default prompt, and using input_str and chat_history_str for the final prompt
generator = Generator(model_client = ModelClientType.OPENAI(), model_kwargs=llm_kwargs)
print(generator)



Generator(
  model_kwargs={'model': 'gpt-3.5-turbo'}, 
  (prompt): Prompt(
    template: 
    {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}
    <SYS>
    {% endif %}
    {# task desc #}
    {% if task_desc_str %}
    {{task_desc_str}}
    {% endif %}
    {# output format #}
    {% if output_format_str %}
    <OUTPUT_FORMAT>
    {{output_format_str}}
    </OUTPUT_FORMAT>
    {% endif %}
    {# tools #}
    {% if tools_str %}
    <TOOLS>
    {{tools_str}}
    </TOOLS>
    {% endif %}
    {# example #}
    {% if examples_str %}
    <EXAMPLES>
    {{examples_str}}
    </EXAMPLES>
    {% endif %}
    {# chat history #}
    {% if chat_history_str %}
    <CHAT_HISTORY>
    {{chat_history_str}}
    </CHAT_HISTORY>
    {% endif %}
    {#contex#}
    {% if context_str %}
    <CONTEXT>
    {{context_str}}
    </CONTEXT>
    {% endif %}
    {# steps #}
    {% if steps_str %}
    <STEPS>
    {{steps_str}}
    </STEPS>
    {%

In [13]:
# lets see how the prompt will be if we pass the input_str and chat_history_str
input_str = "What are the benefits of renewable energy? Did I ask this before?" 

def format_chat_history_str(turns: list) -> str:
    chat_history_str = []
    for turn in turns:
        chat_history_str.append(turn.to_yaml()) # format as yaml
    # join with newline
    chat_history_str = '\n_________\n'.join(chat_history_str)
    return chat_history_str

chat_history_str = format_chat_history_str(dialog_turns)
print(generator.print_prompt(input_str=input_str, chat_history_str=chat_history_str))

Prompt:

<SYS>
<CHAT_HISTORY>
id: "28f59f32-7ec3-4677-9d48-d7672c7b841c"
user_id: null
session_id: null
order: null
user_query: 
  metadata: null
  query_str: What are the benefits of renewable energy?
assistant_response: 
  metadata: null
  response_str: I can see you are interested in renewable energy. Renewable energy technologies
    not only help in reducing greenhouse gas emissions but also contribute significantly
    to the economy by creating jobs in the manufacturing and installation sectors. The
    growth in renewable energy usage boosts local economies through increased investment
    in technology and infrastructure
user_query_timestamp: "2021-09-01T12:00:00Z"
assistant_response_timestamp: "2021-09-01T12:00:01Z"
metadata: null
vector: null
_________
id: "c6c423b1-2c89-4ef6-93d0-453fd7264696"
user_id: null
session_id: null
order: null
user_query: 
  metadata: null
  query_str: How do solar panels impact the environment?
assistant_response: 
  metadata: null
  response_str:

In [14]:
# as we have quite a bit of empty fields, lets exclude them 
from typing import List

input_str = "What are the benefits of renewable energy? Did I ask this before?" 

def format_chat_history_str(turns: List[DialogTurn]) -> str:
    chat_history_str = []
    for turn in turns:
        chat_history_str.append(
                    turn.to_yaml(
                        exclude=[
                            "id",
                            "user_id",
                            "session_id",
                            "user_query_timestamp",
                            "assistant_response_timestamp",
                            "order",
                            "metadata",
                            "vector",
                        ],
                    )
                )  
    chat_history_str = '\n_________\n'.join(chat_history_str)
    return chat_history_str

chat_history_str = format_chat_history_str(dialog_turn_db.items[0:1])
print(generator.print_prompt(input_str=input_str, chat_history_str=chat_history_str))

Prompt:

<SYS>
<CHAT_HISTORY>
user_query: 
  metadata: null
  query_str: What are the benefits of renewable energy?
assistant_response: 
  metadata: null
  response_str: I can see you are interested in renewable energy. Renewable energy technologies
    not only help in reducing greenhouse gas emissions but also contribute significantly
    to the economy by creating jobs in the manufacturing and installation sectors. The
    growth in renewable energy usage boosts local economies through increased investment
    in technology and infrastructure
</CHAT_HISTORY>
</SYS>
<User>
What are the benefits of renewable energy? Did I ask this before?
</User>
You:

None


In [15]:
print(dialog_turn_db.length, len(dialog_turn_db.transformed_items[key]))

2 4


In [16]:
prompt_kwargs = {"input_str": input_str, "chat_history_str": chat_history_str}

response = generator(prompt_kwargs=prompt_kwargs)
print(response)

GeneratorOutput(data='Yes, you asked about the benefits of renewable energy before. Renewable energy technologies help in reducing greenhouse gas emissions and contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage also boosts local economies through increased investment in technology and infrastructure.', error=None, usage=None, raw_response='Yes, you asked about the benefits of renewable energy before. Renewable energy technologies help in reducing greenhouse gas emissions and contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage also boosts local economies through increased investment in technology and infrastructure.')


In [17]:
# create a turn from the last dialog

new_turn = DialogTurn(
    user_query=UserQuery(query_str=input_str),
    assistant_response=AssistantResponse(response_str=response.data),
)
dialog_turn_db.add(new_turn, apply_transformer=True)

Splitting documents: 100%|██████████| 1/1 [00:00<00:00, 1477.39it/s]
Batch embedding documents: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
Adding embeddings to documents from batch: 1it [00:00, 23831.27it/s]


In [18]:
print(dialog_turn_db.length, len(dialog_turn_db.transformed_items[key]))

3 6


In [20]:
print(dialog_turn_db.transformed_items[key])

# we will find them by comparing the id of new_turn to the parent_doc_id in the transformed_items
new_turn_id = new_turn.id
print(new_turn_id)
for item in dialog_turn_db.transformed_items[key]:
    if item.parent_doc_id == new_turn_id:
        print(item)
        print(item.text)

[Document(id=64987b2b-b6c6-4eb4-9122-02448e3fd394, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=f2eddc77-4667-43f5-87e0-fd11f12958b3, order=0, score=None), Document(id=9a424d4c-4bd0-48ce-aba9-7a4f86892556, text='and installation sectors. The growth in renewable energy usage boosts local economies through increa...', meta_data=None, vector='len: 256', parent_doc_id=f2eddc77-4667-43f5-87e0-fd11f12958b3, order=1, score=None), Document(id=45efa517-8e52-4780-bdbd-2329ffa8d4b6, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 256', parent_doc_id=b2dbdf2f-f513-493d-aaa8-c77c98ac260f, order=0, score=None), Document(id=bc0ff7f6-27cc-4e24-8c3e-9435ed755e20, text='has been found to have a significant positive effect on the environment by reducing the reliance on ...', meta_data=None, vector='

When the conversation history is getting too long for the promp, we will use a retriever to get the most relevant part of the conversation to chat with user.

In [23]:
from lightrag.components.retriever.faiss_retriever import FAISSRetriever

retriever = FAISSRetriever(top_k=3, embedder=embedder)

In [24]:
dialog_turn_db.transformed_items[key]

[Document(id=d77e0863-83f1-4a56-afc6-ec49cf1151d7, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=28f59f32-7ec3-4677-9d48-d7672c7b841c, order=0, score=None),
 Document(id=cc1926a4-f20b-4314-8205-fb9f2c3ff327, text='and installation sectors. The growth in renewable energy usage boosts local economies through increa...', meta_data=None, vector='len: 256', parent_doc_id=28f59f32-7ec3-4677-9d48-d7672c7b841c, order=1, score=None),
 Document(id=a15ea431-64d5-4776-8e23-9bbfaf01f65e, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 256', parent_doc_id=c6c423b1-2c89-4ef6-93d0-453fd7264696, order=0, score=None),
 Document(id=f0cd2864-e0aa-42e0-9ff4-8712bf870ed9, text='has been found to have a significant positive effect on the environment by reducing the reliance on ...', meta_data=None, vecto

In [25]:
# we will use the retriever to find top_k chunked documents, from its partent_doc_id, we will find the initial dialog_turn, and feed that to the generator
from lightrag.utils.logger import enable_library_logging
enable_library_logging()

embeddings = [item.vector for item in dialog_turn_db.transformed_items[key]]
print(embeddings)
retriever.build_index_from_documents(documents=embeddings)

# top_k_documents = retriever(input=input_str)
# print(top_k_documents)


[[0.038091756, 0.08130312, 0.10887385, 0.09140143, 0.02799345, 0.016509559, -0.060777724, 0.07463354, 0.0887242, 0.083369754, 0.03921901, 0.04715675, -0.09553469, 0.05082032, 0.06218679, -0.016533043, -0.0772638, 0.027077556, 0.09825888, 0.07082906, 0.024470784, 0.075244136, 0.0071099135, 0.053262703, 0.009652103, 0.0072566913, -0.064394325, -0.02148826, 0.030036595, -0.013421356, 0.018329604, -0.021781815, -0.121085756, -0.07472748, 0.0879727, 0.012693338, -0.045747682, -0.13132498, 0.015159205, 0.018482253, 0.038467508, -0.07731077, -0.07660624, -0.06664883, -0.062233757, -0.017448938, -0.076935016, -0.023578376, 0.07200328, 0.09407865, 0.023472695, -0.056080837, -0.0056039738, -0.13677336, -0.027547246, -0.06434736, -0.007996451, 0.04652267, -0.109625354, -0.014912617, 0.12681596, -0.0016101517, 0.07120481, 0.14006118, -0.09468925, 0.07740471, -0.040087935, 0.01868187, -0.09346806, -0.005096123, 0.044620432, -0.034850907, 0.054906614, -0.034991812, 0.0056979116, -0.057771716, 0.0273

In [27]:
top_k_documents = retriever(input=input_str)
print(top_k_documents)

2024-06-22 15:35:09 - openai_client - INFO - [openai_client.py:185:call] - api_kwargs: {'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float', 'input': ['What are the benefits of renewable energy? Did I ask this before?']}
2024-06-22 15:35:09 - _client - INFO - [_client.py:1026:_send_single_request] - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
[RetrieverOutput(doc_indices=[4, 0, 5], doc_scores=[0.9079999923706055, 0.8050000071525574, 0.6890000104904175], query='What are the benefits of renewable energy? Did I ask this before?', documents=None)]


In [29]:
# get the parent_doc_id from the top_k_documents

parent_doc_ids = set([dialog_turn_db.transformed_items[key][doc_index].parent_doc_id for doc_index in top_k_documents[0].doc_indices])
print(parent_doc_ids)


{'2ec3d54e-4579-4ff5-acb9-65abfad58a20', '28f59f32-7ec3-4677-9d48-d7672c7b841c'}


In [31]:
condition_fn = lambda item: item.id in parent_doc_ids

fetched_dialog_turns = [item for item in dialog_turn_db.items if condition_fn(item)]

In [32]:
chat_history_str = format_chat_history_str(fetched_dialog_turns)

output = generator(prompt_kwargs={"input_str": input_str, "chat_history_str": chat_history_str})
print(output)

2024-06-22 15:46:48 - generator - INFO - [generator.py:224:call] - prompt_kwargs: {'input_str': 'What are the benefits of renewable energy? Did I ask this before?', 'chat_history_str': 'user_query: \n  metadata: null\n  query_str: What are the benefits of renewable energy?\nassistant_response: \n  metadata: null\n  response_str: I can see you are interested in renewable energy. Renewable energy technologies\n    not only help in reducing greenhouse gas emissions but also contribute significantly\n    to the economy by creating jobs in the manufacturing and installation sectors. The\n    growth in renewable energy usage boosts local economies through increased investment\n    in technology and infrastructure\n_________\nuser_query: \n  metadata: null\n  query_str: What are the benefits of renewable energy? Did I ask this before?\nassistant_response: \n  metadata: null\n  response_str: Yes, you asked about the benefits of renewable energy before. Renewable\n    energy technologies help i