In [None]:
# Install the LlamaIndex entity-extractor integration into the kernel's environment.
%pip install llama-index-extractors-entity

In [None]:
# Install span-marker, which provides the SpanMarkerModel class imported later in this notebook.
%pip install span-marker

In [None]:
# Install nest_asyncio, which is applied in the next cell.
%pip install nest_asyncio

In [None]:
# Patch asyncio so that event loops can be nested. The notebook kernel already
# runs an event loop; presumably the synchronous pipeline calls further down
# need to start their own loop on top of it — TODO confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
from span_marker import SpanMarkerModel

# Load the multilingual SpanMarker NER model by its identifier.
# NOTE(review): `model` is not referenced again in the visible cells; this cell
# presumably serves to download/cache the weights ahead of time — confirm.
model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd")



In [None]:
import torch

# Report whether PyTorch can use a CUDA GPU. The device name is only queried
# when CUDA is available, because torch.cuda.get_device_name(0) raises on a
# machine without a usable GPU and would abort a "Run All".
cuda_available = torch.cuda.is_available()
print(cuda_available)  # True when a CUDA-capable GPU is usable by PyTorch
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Should display your GPU's name
else:
    print("No CUDA device detected; computations will need to run on the CPU.")


In [None]:
import torch

from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import SentenceSplitter

# Pick the inference device at run time so the notebook also works on machines
# without a CUDA GPU instead of failing when "cuda" is requested unconditionally.
extractor_device = "cuda" if torch.cuda.is_available() else "cpu"

# Entity extractor that is used as a transformation in the pipelines below.
entity_extractor = EntityExtractor(
    prediction_threshold=0.5,
    label_entities=False,  # when True, the entity label is included in the metadata (labels can be erroneous)
    device=extractor_device,  # "cuda" if a GPU is available, otherwise "cpu"
    model_name="tomaarsen/span-marker-mbert-base-multinerd",  # same identifier as passed to SpanMarkerModel.from_pretrained
    # NOTE(review): confirm that `entity_types` is a parameter EntityExtractor
    # actually honours, and that these label names match the model's label set.
    entity_types=["PERSON", "ORG", "GPE"],
)




In [None]:
from llama_index.core import Document

# Two short in-memory sample documents used to try out the entity extractor
# before running it on a file from disk. The text literals keep their leading
# indentation exactly as written.
test_document_1 = Document(text="""
                        LLMs offer a natural language interface between humans and data. LLMs come pre-trained on huge amounts of publicly available data, but they are not trained on your data. Your data may be private or specific to the problem you're trying to solve. It's behind APIs, in SQL databases, or trapped in PDFs and slide decks.
                        Context augmentation makes your data available to the LLM to solve the problem at hand. LlamaIndex provides the tools to build any of context-augmentation use case, from prototype to production. Our tools allow you to ingest, parse, index and process your data and quickly implement complex query workflows combining data access with LLM prompting.
                        The most popular example of context-augmentation is Retrieval-Augmented Generation or RAG, which combines context with LLMs at inference time.""")
test_document_2 = Document(text=""" 
                        Agents are LLM-powered knowledge assistants that use tools to perform tasks like research, data extraction, and more. Agents range from simple question-answering to being able to sense, decide and take actions in order to complete tasks.
                        LlamaIndex provides a framework for building agents including the ability to use RAG pipelines as one of many tools to complete a task.""")

# Collect both sample documents so they can be processed together.
test_documents = [test_document_1,test_document_2]




In [None]:
import nltk

# Download the 'punkt_tab' resource through NLTK's downloader
# (presumably needed for sentence tokenization further down — TODO confirm).
nltk.download('punkt_tab')

In [None]:

# Run the extractor directly on the two sample documents through its async API.
# Top-level `await` works here because the cell executes inside the notebook's event loop.
entity_result = await entity_extractor.aprocess_nodes(test_documents)



In [None]:
# Display the raw value returned by aprocess_nodes.
entity_result

In [None]:

# Print a header, then one summary line per processed node.
print("\nEntity Output:")
for node in entity_result:
    # Use a placeholder when the node's metadata has no 'entities' key.
    found_entities = node.metadata.get('entities', 'No entities available')
    print(f"Node ID: {node.node_id}, Entities: {found_entities}")

In [None]:
# Sentence splitter created with its default settings.
node_parser = SentenceSplitter()

# Transformations for the ingestion pipeline: the splitter is listed first,
# followed by the entity extractor.
transformations = [node_parser, entity_extractor]

In [None]:
from llama_index.core.ingestion import IngestionPipeline

# Build an ingestion pipeline from the transformations list defined above.
pipeline = IngestionPipeline(transformations=transformations)

# Run the pipeline synchronously on the two sample documents and keep the result.
nodes = pipeline.run(documents=test_documents)

In [None]:
# Display the value returned by the pipeline run on the sample documents.
nodes

In [None]:
# Print each node's id with its extracted entities, followed by a separator line.
for node in nodes:
    # Use a placeholder when the node's metadata has no 'entities' key.
    node_entities = node.metadata.get('entities', 'No entities available')
    print(f"Node ID: {node.node_id}, Entities: {node_entities}")
    print("======================================================")


In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the essay text file from the data directory as the input documents.
# Alternative input kept for reference (a PDF instead of the text file):
#documents = SimpleDirectoryReader(input_files=['../data/2022 Q3 AAPL.pdf']).load_data()
documents = SimpleDirectoryReader(input_files=['../data/paul_graham_essay3.txt']).load_data()


In [None]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

# Pipeline for the file-based documents: split into chunks, then run the
# entity extractor on the result. This rebinds the `pipeline` name used earlier.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=1024, chunk_overlap=50),  # Split text into manageable chunks
        entity_extractor,  # Extract named entities into node metadata
    ]
)


In [None]:
# Run the pipeline on the loaded documents with a progress display enabled.
# This rebinds `nodes`, replacing the result from the sample-document run.
# NOTE(review): in_place=True presumably lets the transformations modify the
# input objects rather than copies — confirm against the IngestionPipeline docs.
nodes = pipeline.run(
    documents=documents,
    in_place=True,
    show_progress=True
)


In [None]:
# Display the value returned by the pipeline run on the loaded file.
nodes

In [None]:
# Report, for every node from the file-based run, its id and extracted entities,
# with a separator line after each node.
for node in nodes:
    # Use a placeholder when the node's metadata has no 'entities' key.
    extracted = node.metadata.get('entities', 'No entities available')
    print(f"Node ID: {node.node_id}, Entities: {extracted}")
    print("======================================================")
