# Data and Data storage



## Document and management

In [2]:
query_1 = "What are the benefits of renewable energy?" # gt is [0, 3]
query_2 = "How do solar panels impact the environment?" # gt is [1, 2]

org_documents =[
    {
        "title": "The Impact of Renewable Energy on the Economy",
        "content": "Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure."
    },
    {
        "title": "Understanding Solar Panels",
        "content": "Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from atoms, generating a flow of electricity. Solar panels are a type of renewable energy technology that has been found to have a significant positive effect on the environment by reducing the reliance on fossil fuels."
    },
    {
        "title": "Pros and Cons of Solar Energy",
        "content": "While solar energy offers substantial environmental benefits, such as reducing carbon footprints and pollution, it also has downsides. The production of solar panels can lead to hazardous waste, and large solar farms require significant land, which can disrupt local ecosystems."
    },
    {
        "title":  "Renewable Energy and Its Effects",
        "content": "Renewable energy sources like wind, solar, and hydro power play a crucial role in combating climate change. They do not produce greenhouse gases during operation, making them essential for sustainable development. However, the initial setup and material sourcing for these technologies can still have environmental impacts."
    }
]

In [3]:
from lightrag.core.types import Document

# we will save the content to text and title in the meta_data
documents  = [Document(text=doc['content'], meta_data={'title': doc['title']}) for doc in org_documents]
print(documents)

[Document(id=d257ea6c-caf4-4a8d-b581-a0809c309bb5, text='Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute...', meta_data={'title': 'The Impact of Renewable Energy on the Economy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=158bbb1e-be94-4c2f-aaaa-4a833295e949, text='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock ele...', meta_data={'title': 'Understanding Solar Panels'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=4497bbee-6275-468d-8719-a4180be46cfb, text='While solar energy offers substantial environmental benefits, such as reducing carbon footprints and...', meta_data={'title': 'Pros and Cons of Solar Energy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=3037dd65-e135-4b4b-aea7-59c585037134, text='Renewable energy sources like wind, solar, and hydro power play a crucial role in combating climate ...', meta_dat

In [4]:
# do dialogturns 

from lightrag.core.types import DialogTurn

turns = [
    {
        "user": query_1,
        "system": "I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.",
        "user_time": "2021-09-01T12:00:00Z",
        "system_time": "2021-09-01T12:00:01Z"
    },
    {
        "user": query_2,
        "system": "Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from atoms, generating a flow of electricity. Solar panels are a type of renewable energy technology that has been found to have a significant positive effect on the environment by reducing the reliance on fossil fuels.",
        "user_time": "2021-09-01T12:00:02Z",
        "system_time": "2021-09-01T12:00:03Z"
    }
]

dialog_turns = [DialogTurn(user_query = turn['user'], assistant_response = turn['system'], user_query_timestamp = turn['user_time'], assistant_response_timestamp = turn['system_time']) for turn in turns]
print(dialog_turns)

[DialogTurn(id='ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a', user_id=None, session_id=None, order=None, user_query='What are the benefits of renewable energy?', assistant_response='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='0858d694-3158-460b-be3a-6925e4afc9c1', user_id=None, session_id=None, order=None, user_query='How do solar panels impact the environment?', assistant_response='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from atoms, generating a flow of electrici

In [5]:
# use LocalDB to assist the CRUD of the documents and the persisting of the documents

from lightrag.core.db import LocalDB

db = LocalDB(  )
print(db)

LocalDB(name=None, items=[], transformed_items={}, mapped_items={}, transformer_setups={}, mapper_setups={})


### Loading and CRUD operations on DialogTurns



In [6]:
dialog_turn_db = LocalDB('dialog_turns')
print(dialog_turn_db)
dialog_turn_db.load(dialog_turns)
print(dialog_turn_db)

LocalDB(name='dialog_turns', items=[], transformed_items={}, mapped_items={}, transformer_setups={}, mapper_setups={})
LocalDB(name='dialog_turns', items=[DialogTurn(id='ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a', user_id=None, session_id=None, order=None, user_query='What are the benefits of renewable energy?', assistant_response='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='0858d694-3158-460b-be3a-6925e4afc9c1', user_id=None, session_id=None, order=None, user_query='How do solar panels impact the environment?', assistant_response='

In [7]:
# embedder pipeline, the data can be large, its better to use toEmbedding instead of directly the embedder who is doing only one batch at a time

from lightrag.core.embedder import Embedder 
from lightrag.core.types import ModelClientType, EmbedderOutput


model_kwargs = {
    "model": "text-embedding-3-small",
    "dimensions": 256,
    "encoding_format": "float",
}

embedder = Embedder(model_client =ModelClientType.OPENAI(), model_kwargs=model_kwargs)
embedder

Embedder(
  model_kwargs={'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}, 
  (model_client): OpenAIClient()
)

In [8]:
# embedder as a transformer
from lightrag.core.data_components import ToEmbeddings

def map_fn(dialog_turn: DialogTurn):
    # map to a document to use our data components pipeline
    return Document(text=dialog_turn.user_query + ' ' + dialog_turn.assistant_response, id=dialog_turn.id)

def add_fn(x: DialogTurn, y):
    x.vector = y

embedder_transformer =ToEmbeddings(embedder=embedder, batch_size=1)

dialog_as_documents = [map_fn(turn) for turn in dialog_turn_db.items]
outputs = embedder_transformer(dialog_as_documents)

print(outputs)

# dialog_turn_db.transform_data(transformer=embedder, map_func=map_fn)
# print(dialog_turn_db)

Batch embedding documents: 100%|██████████| 2/2 [00:00<00:00,  2.53it/s]
Adding embeddings to documents from batch: 2it [00:00, 3960.63it/s]

[Document(id=ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=None, order=None, score=None), Document(id=0858d694-3158-460b-be3a-6925e4afc9c1, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 256', parent_doc_id=None, order=None, score=None)]





In [9]:
# create a pipeline -> transformed in a Document

dialog_turn_db.transform_data(transformer=embedder_transformer, map_func=map_fn)

Batch embedding documents: 100%|██████████| 2/2 [00:00<00:00,  5.44it/s]
Adding embeddings to documents from batch: 2it [00:00, 18315.74it/s]


'ToEmbeddings__embedder_embedder.model_client_batch_embedder_'

In [10]:
print(dialog_turn_db.transformed_items)

{'ToEmbeddings__embedder_embedder.model_client_batch_embedder_': [Document(id=ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=None, order=None, score=None), Document(id=0858d694-3158-460b-be3a-6925e4afc9c1, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 256', parent_doc_id=None, order=None, score=None)]}


In [14]:
# adding text splitter

from lightrag.core.document_splitter import DocumentSplitter
from lightrag.core.component import Sequential

splitter_config = {
    "split_by": "word",
    "split_length": 50,
    "split_overlap": 10
}

splitter = DocumentSplitter(**splitter_config)
data_transformer = Sequential(splitter, embedder_transformer)
data_transformer

Sequential(
  (0): DocumentSplitter(split_by=word, split_length=50, split_overlap=10)
  (1): ToEmbeddings(
    batch_size=1
    (embedder): Embedder(
      model_kwargs={'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}, 
      (model_client): OpenAIClient()
    )
    (batch_embedder): BatchEmbedder(
      (embedder): Embedder(
        model_kwargs={'model': 'text-embedding-3-small', 'dimensions': 256, 'encoding_format': 'float'}, 
        (model_client): OpenAIClient()
      )
    )
  )
)

In [15]:
# transform the dialog_turns

dialog_turn_db.transform_data(transformer=data_transformer, map_func=map_fn)
print(dialog_turn_db.transformed_items)

Splitting documents: 100%|██████████| 2/2 [00:00<00:00, 436.77it/s]
Batch embedding documents: 100%|██████████| 4/4 [00:00<00:00,  5.51it/s]
Adding embeddings to documents from batch: 4it [00:00, 12418.37it/s]

{'ToEmbeddings__embedder_embedder.model_client_batch_embedder_': [Document(id=ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=None, order=None, score=None), Document(id=0858d694-3158-460b-be3a-6925e4afc9c1, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 256', parent_doc_id=None, order=None, score=None)], 'Sequential__0_1_1.embedder_1.embedder.model_client_1.batch_embedder_': [Document(id=a5e93e64-12ae-409c-995d-03481d5fb2c0, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a, order=0, score=None), Document(id=79e9d034-f744-4a78-a9fe-d4d7e4c87462, text='and installation sectors. The growth in renewable




In [17]:
dialog_turn_db.transformed_items.keys()
key = list(dialog_turn_db.transformed_items.keys())[-1]
dialog_turn_db.transformed_items[key]

[Document(id=a5e93e64-12ae-409c-995d-03481d5fb2c0, text='What are the benefits of renewable energy? I can see you are interested in renewable energy. Renewab...', meta_data=None, vector='len: 256', parent_doc_id=ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a, order=0, score=None),
 Document(id=79e9d034-f744-4a78-a9fe-d4d7e4c87462, text='and installation sectors. The growth in renewable energy usage boosts local economies through increa...', meta_data=None, vector='len: 256', parent_doc_id=ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a, order=1, score=None),
 Document(id=0b9fc3f6-ba3c-484e-9321-0a59acd80878, text='How do solar panels impact the environment? Solar panels convert sunlight into electricity by allowi...', meta_data=None, vector='len: 256', parent_doc_id=0858d694-3158-460b-be3a-6925e4afc9c1, order=0, score=None),
 Document(id=6b5dcc8a-11e3-4f66-88e6-034a2ebd1fa6, text='has been found to have a significant positive effect on the environment by reducing the reliance on ...', meta_data=None, vecto

In [18]:
dialog_turn_db.save_state(filepath='.storage/dialog_turns.pkl')

In [19]:
reloaded_dialog_turn_db = LocalDB.load_state(filepath='.storage/dialog_turns.pkl')
print(reloaded_dialog_turn_db)

LocalDB(name='dialog_turns', items=[DialogTurn(id='ca330bbd-bfa9-4fe0-bf36-801cd8f0db5a', user_id=None, session_id=None, order=None, user_query='What are the benefits of renewable energy?', assistant_response='I can see you are interested in renewable energy. Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.', user_query_timestamp='2021-09-01T12:00:00Z', assistant_response_timestamp='2021-09-01T12:00:01Z', metadata=None, vector=None), DialogTurn(id='0858d694-3158-460b-be3a-6925e4afc9c1', user_id=None, session_id=None, order=None, user_query='How do solar panels impact the environment?', assistant_response='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock electrons free from at

### Loading and CRUD operations on documents

In [5]:
db.load(documents)
print(db)

LocalDB(name=None, items=[Document(id=6583698e-c676-4966-a0a3-91a0f3de3796, text='Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute...', meta_data={'title': 'The Impact of Renewable Energy on the Economy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=fab80c66-5da8-4639-9431-3f64f5829ab1, text='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock ele...', meta_data={'title': 'Understanding Solar Panels'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=204e97ed-bd37-4d6f-b14e-e7518b4d1dc8, text='While solar energy offers substantial environmental benefits, such as reducing carbon footprints and...', meta_data={'title': 'Pros and Cons of Solar Energy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=880bf5f9-7dde-4d4c-af74-83bdc82cbfba, text='Renewable energy sources like wind, solar, and hydro power play a crucial role in combati

In [6]:
# add and delete documents

new_document = Document(
    text="This is a new document", meta_data={'title': 'New Document'})

db.add(new_document)
print(db)

LocalDB(name=None, items=[Document(id=6583698e-c676-4966-a0a3-91a0f3de3796, text='Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute...', meta_data={'title': 'The Impact of Renewable Energy on the Economy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=fab80c66-5da8-4639-9431-3f64f5829ab1, text='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock ele...', meta_data={'title': 'Understanding Solar Panels'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=204e97ed-bd37-4d6f-b14e-e7518b4d1dc8, text='While solar energy offers substantial environmental benefits, such as reducing carbon footprints and...', meta_data={'title': 'Pros and Cons of Solar Energy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=880bf5f9-7dde-4d4c-af74-83bdc82cbfba, text='Renewable energy sources like wind, solar, and hydro power play a crucial role in combati

In [7]:
db.delete(indices=[-1])

In [8]:
len(db.documents)

AttributeError: 'LocalDB' object has no attribute 'documents'

In [None]:
db.add(new_document, index=1)
print(db)
len(db.documents)

LocalDB(documents=[Document(id=b349a519-1139-42cb-9362-aa4a6a6919c4, text='Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute...', meta_data={'title': 'The Impact of Renewable Energy on the Economy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=49e80879-5b41-4f81-a236-ec512f2c60ad, text='This is a new document', meta_data={'title': 'New Document'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=12d12f6e-e269-4482-8b5b-7438a12d89b0, text='Solar panels convert sunlight into electricity by allowing photons, or light particles, to knock ele...', meta_data={'title': 'Understanding Solar Panels'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=3ed98cb6-1ee2-4a96-b7fb-94bc5dcd3913, text='While solar energy offers substantial environmental benefits, such as reducing carbon footprints and...', meta_data={'title': 'Pros and Cons of Solar Energy'}, vector=[], parent_doc_id=None, ord

5

In [None]:
db.delete(indices=[1])
len(db.documents)

4

In [None]:
db.delete(indices=[1])
len(db.documents)

3

In [None]:
db

LocalDB(documents=[Document(id=b349a519-1139-42cb-9362-aa4a6a6919c4, text='Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute...', meta_data={'title': 'The Impact of Renewable Energy on the Economy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=3ed98cb6-1ee2-4a96-b7fb-94bc5dcd3913, text='While solar energy offers substantial environmental benefits, such as reducing carbon footprints and...', meta_data={'title': 'Pros and Cons of Solar Energy'}, vector=[], parent_doc_id=None, order=None, score=None), Document(id=7e13dac8-3a2f-4dcb-aeba-2941d2249e1d, text='Renewable energy sources like wind, solar, and hydro power play a crucial role in combating climate ...', meta_data={'title': 'Renewable Energy and Its Effects'}, vector=[], parent_doc_id=None, order=None, score=None)], transformed_documents={}, mapped_documents={}, transformer_setups={})

## data transformation (processing)

In [None]:
from lightrag.core.base_data_class import DataClassFormatType
first_document = db.documents[0]
json_str = first_document.format_example_str(DataClassFormatType.EXAMPLE_JSON)
print(json_str)

{
    "text": "Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure.",
    "meta_data": {
        "title": "The Impact of Renewable Energy on the Economy"
    },
    "vector": [],
    "id": "b349a519-1139-42cb-9362-aa4a6a6919c4",
    "order": "None",
    "score": "None",
    "parent_doc_id": "None",
    "estimated_num_tokens": 47
}


In [None]:
yaml_str = first_document.format_example_str(DataClassFormatType.EXAMPLE_YAML)
print(yaml_str)

text: "Renewable energy technologies not only help in reducing greenhouse gas emissions but also contribute significantly to the economy by creating jobs in the manufacturing and installation sectors. The growth in renewable energy usage boosts local economies through increased investment in technology and infrastructure."
meta_data: title: The Impact of Renewable Energy on the Economy
vector: []
id: "b349a519-1139-42cb-9362-aa4a6a6919c4"
order: null
score: null
parent_doc_id: null
estimated_num_tokens: 47
