In [20]:
# Install the package that provides `llama_index.extractors.entity`, imported further down.
%pip install llama-index-extractors-entity

Note: you may need to restart the kernel to use updated packages.


In [21]:
# Install the package that provides `span_marker` (SpanMarkerModel), imported further down.
%pip install span-marker

Note: you may need to restart the kernel to use updated packages.


In [22]:
# Install `nest_asyncio`, which is imported and applied in the next cell.
%pip install nest_asyncio

Note: you may need to restart the kernel to use updated packages.


In [23]:
# Apply the nest_asyncio patch before any async work is started.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and later cells drive async code on top of it — confirm.
import nest_asyncio
nest_asyncio.apply()

In [24]:
from span_marker import SpanMarkerModel

# Load the SpanMarker NER checkpoint by its model id.
# NOTE(review): `model` is not referenced again in the visible cells; the
# entity extractor below is given the same model name as a string, so this
# load may only serve to fetch/cache the weights up front — confirm.
model = SpanMarkerModel.from_pretrained(
    "tomaarsen/span-marker-mbert-base-multinerd"
)



In [25]:
import torch

# Report whether PyTorch can see a CUDA device. The device name is only
# queried when CUDA is available: asking for device 0 on a CPU-only machine
# raises an error instead of returning a value.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # True when a usable GPU is present
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Name of the first GPU
else:
    print("CUDA is not available; running on CPU.")


True
NVIDIA GeForce RTX 3080


In [26]:
import torch
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import SentenceSplitter

# Use the GPU when PyTorch can see one and fall back to CPU otherwise, so the
# notebook still runs on machines without CUDA.
extractor_device = "cuda" if torch.cuda.is_available() else "cpu"

# Entity extractor backed by a SpanMarker NER model.
entity_extractor = EntityExtractor(
    prediction_threshold=0.5,
    label_entities=False,  # when True, the entity label is included in the metadata (can be erroneous)
    device=extractor_device,
    model_name="tomaarsen/span-marker-mbert-base-multinerd",  # same model id as loaded with SpanMarkerModel above
    # NOTE(review): `entity_types` does not appear to be a documented
    # EntityExtractor option, and the outputs further down contain values such
    # as 'SQL' and 'Lisp', so this filter is probably ignored — confirm against
    # the installed version.
    entity_types=["PERSON", "ORG", "GPE"],
)




In [27]:
from llama_index.core import Document

# Two small in-memory sample documents used to try out the entity extractor.
# The triple-quoted literals keep their leading newline and indentation, so
# that whitespace is part of each document's text.

# Sample 1: text about LLMs, context augmentation and RAG.
test_document_1 = Document(text="""
                        LLMs offer a natural language interface between humans and data. LLMs come pre-trained on huge amounts of publicly available data, but they are not trained on your data. Your data may be private or specific to the problem you're trying to solve. It's behind APIs, in SQL databases, or trapped in PDFs and slide decks.
                        Context augmentation makes your data available to the LLM to solve the problem at hand. LlamaIndex provides the tools to build any of context-augmentation use case, from prototype to production. Our tools allow you to ingest, parse, index and process your data and quickly implement complex query workflows combining data access with LLM prompting.
                        The most popular example of context-augmentation is Retrieval-Augmented Generation or RAG, which combines context with LLMs at inference time.""")
# Sample 2: text about agents.
test_document_2 = Document(text=""" 
                        Agents are LLM-powered knowledge assistants that use tools to perform tasks like research, data extraction, and more. Agents range from simple question-answering to being able to sense, decide and take actions in order to complete tasks.
                        LlamaIndex provides a framework for building agents including the ability to use RAG pipelines as one of many tools to complete a task.""")

# Both samples as a list, the form passed to the extractor and pipeline below.
test_documents = [test_document_1,test_document_2]




In [None]:
import nltk

# Fetch NLTK's 'punkt_tab' tokenizer data.
# NOTE(review): presumably required by the tokenization performed during
# sentence splitting / entity extraction in the following cells — confirm.
nltk.download("punkt_tab")

In [28]:

entity_result = await entity_extractor.aprocess_nodes(test_documents)



Extracting entities: 100%|██████████| 2/2 [00:00<00:00,  5.92it/s]


In [29]:
# Show the full objects returned by the extractor, including their metadata.
entity_result

[Document(id_='267728c4-76bf-4190-9fb1-e60e05a3f450', embedding=None, metadata={'entities': ['LlamaIndex', 'SQL']}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text="\n                        LLMs offer a natural language interface between humans and data. LLMs come pre-trained on huge amounts of publicly available data, but they are not trained on your data. Your data may be private or specific to the problem you're trying to solve. It's behind APIs, in SQL databases, or trapped in PDFs and slide decks.\n                        Context augmentation makes your data available to the LLM to solve the problem at hand. LlamaIndex provides the tools to build any of context-augmentation use case, from prototype to production. Our tools allow you to ingest, parse, index and process your data and quickly implement complex query workflows combining data access with LLM prompting.\n                

In [30]:

# One summary line per returned document: its id and the extracted entities
# (or a placeholder when the 'entities' metadata key is missing).
print("\nEntity Output:")
for node in entity_result:
    entities = node.metadata.get('entities', 'No entities available')
    print(f"Node ID: {node.node_id}, Entities: {entities}")


Entity Output:
Node ID: 267728c4-76bf-4190-9fb1-e60e05a3f450, Entities: ['LlamaIndex', 'SQL']
Node ID: 6a14268f-e551-42a3-904d-1a3559d35800, Entities: ['LlamaIndex']


In [32]:
# Splitter with default settings; it runs before the extractor so entities are
# extracted per chunk rather than per whole document.
node_parser = SentenceSplitter()

# Ordered stages for the ingestion pipeline built in the next cell.
transformations = [
    node_parser,
    entity_extractor,
]

In [33]:
from llama_index.core.ingestion import IngestionPipeline

# Build a pipeline from the stages defined above and run it on the two sample
# documents; the result is a list of nodes.
pipeline = IngestionPipeline(
    transformations=transformations,
)
nodes = pipeline.run(
    documents=test_documents,
)

Extracting entities: 100%|██████████| 2/2 [00:00<00:00,  5.31it/s]


In [34]:
# Show the full node objects produced by the pipeline run on the sample documents.
nodes

[TextNode(id_='846cac92-5745-474a-afaa-8d5f397cade3', embedding=None, metadata={'entities': ['SQL']}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='267728c4-76bf-4190-9fb1-e60e05a3f450', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'entities': ['LlamaIndex', 'SQL']}, hash='cc6b43aa8335c6efe1e351005bb7ee363a12e227222b42d4682c185062d7ae45')}, metadata_template='{key}: {value}', metadata_separator='\n', text="LLMs offer a natural language interface between humans and data. LLMs come pre-trained on huge amounts of publicly available data, but they are not trained on your data. Your data may be private or specific to the problem you're trying to solve. It's behind APIs, in SQL databases, or trapped in PDFs and slide decks.\n                        Context augmentation makes your data available to the LLM to solve the problem at hand. LlamaIndex provides the tools to build any of context-augmentation

In [35]:
# One summary line per node (id and extracted entities), each followed by a
# separator line.
for node in nodes:
    entities = node.metadata.get('entities', 'No entities available')
    print(f"Node ID: {node.node_id}, Entities: {entities}")
    print("======================================================")


Node ID: 846cac92-5745-474a-afaa-8d5f397cade3, Entities: ['SQL']
Node ID: dbebeca3-ac5d-4fca-99b3-d4bb81e086e5, Entities: ['LlamaIndex']


In [36]:
from llama_index.core import SimpleDirectoryReader

# Load the essay text file as the input documents.
# Alternative input kept for reference: '../data/2022 Q3 AAPL.pdf'
documents = SimpleDirectoryReader(
    input_files=['../data/paul_graham_essay3.txt'],
).load_data()


In [37]:
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.node_parser import SentenceSplitter

# Pipeline for the essay: split first, then extract entities per chunk.
pipeline = IngestionPipeline(
    transformations=[
        # Split text into manageable chunks
        SentenceSplitter(chunk_overlap=50, chunk_size=1024),
        # Extract entities and store them in each chunk's metadata
        entity_extractor,
    ]
)


In [38]:
# Run the essay pipeline with progress bars enabled; the result replaces the
# `nodes` produced by the earlier sample run.
nodes = pipeline.run(documents=documents, in_place=True, show_progress=True)


Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 22.22it/s]
Extracting entities: 100%|██████████| 18/18 [00:04<00:00,  3.90it/s]


In [39]:
# Show the full node objects produced by the pipeline run on the essay.
nodes

[TextNode(id_='73c51d9f-e8d4-4131-a662-b9691f7a9cc5', embedding=None, metadata={'file_path': '..\\data\\paul_graham_essay3.txt', 'file_name': 'paul_graham_essay3.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-11-21', 'last_modified_date': '2024-11-21', 'entities': ['Rich Draves', 'IBM']}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='10cf2deb-dc5f-4485-9d1a-8ff6102aa283', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '..\\data\\paul_graham_essay3.txt', 'file_name': 'paul_graham_essay3.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-11-21', 'last_modified_date': '2024-11-21'}, hash='a86f961a164bba987be1869d2a6d71500fb50aa620683ded7971c

In [40]:
# Summarise the essay chunks: id and extracted entities per node, with a
# separator line after each.
for node in nodes:
    entities = node.metadata.get('entities', 'No entities available')
    print(f"Node ID: {node.node_id}, Entities: {entities}")
    print("======================================================")


Node ID: 73c51d9f-e8d4-4131-a662-b9691f7a9cc5, Entities: ['Rich Draves', 'IBM']
Node ID: 7490cba1-792e-4a23-b616-caadefee6397, Entities: ['Winograd', 'Cornell']
Node ID: 8a48ff31-054a-4ce7-89df-880ffe7f6133, Entities: ['Rich Draves', 'Xerox Dandelions', 'Carnegie Institute', 'CMU']
Node ID: e91f4b7d-20f5-4e92-987d-618f2b7d4ab7, Entities: ['Cezanne', 'Florence']
Node ID: 78185e2a-99f3-4995-b636-ad50bb9328a0, Entities: ['Ulivi', 'Interleaf', 'Microsoft Word', 'Italian']
Node ID: 1ad59f24-2091-41ce-bcc2-95a9d8444dc6, Entities: ['Roy Lichtenstein']
Node ID: 50ff204e-81f4-46a0-88b0-fd63307afc42, Entities: ['Y Combinator', 'Robert']
Node ID: 8da892a2-e6e9-46d9-97ed-f374a99536b6, Entities: ['Robert', 'Trevor', 'Trevor Blackwell']
Node ID: 7880e173-88c2-4a2b-b432-8263d93410a0, Entities: ['Y Combinator']
Node ID: 59400b9b-bd52-4cc8-a145-b094e18e8c5f, Entities: ['California', 'New York', 'Santa Cruz Mountains']
Node ID: 2842aa0f-4e65-4548-b818-a6af97c44e81, Entities: ['Y Combinator', 'Lisp']
Nod