In [2]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from llama_index.extractors.entity import EntityExtractor


In [None]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

# Register a locally served Ollama model as the global `Settings.llm`.
OLLAMA_MODEL = "llama3.2:latest"
OLLAMA_BASE_URL = "http://localhost:11434"

Settings.llm = Ollama(
    base_url=OLLAMA_BASE_URL,
    model=OLLAMA_MODEL,
    # temperature=0.1,  # optional tuning knob, left at the library default
)

In [4]:

from span_marker import SpanMarkerModel

# Load the pretrained SpanMarker checkpoint by its model id.
# NOTE(review): `model` is not referenced again in the visible cells, and the
# EntityExtractor below is given the same id via `model_name` — confirm this
# explicit load is still needed.
span_marker_checkpoint = "tomaarsen/span-marker-mbert-base-multinerd"
model = SpanMarkerModel.from_pretrained(span_marker_checkpoint)


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [None]:
import torch

# Run entity extraction on the GPU when PyTorch can see one, otherwise fall
# back to the CPU so this cell also works on machines without CUDA.
ner_device = "cuda" if torch.cuda.is_available() else "cpu"

entity_extractor = EntityExtractor(
    prediction_threshold=0.5,
    label_entities=False,  # include the entity label in the metadata (can be erroneous)
    device=ner_device,
    # Same SpanMarker model id that is passed to `SpanMarkerModel.from_pretrained`.
    model_name="tomaarsen/span-marker-mbert-base-multinerd",
    # NOTE(review): `entity_types` and `batch_size` do not appear among the
    # constructor parameters in the EntityExtractor documentation — confirm
    # they are actually honoured rather than silently ignored.
    entity_types=["PERSON", "ORG", "GPE"],
    batch_size=1,
)

In [11]:
# Ordered ingestion steps: chunk the documents first, then let each extractor
# add its own metadata to the resulting nodes.
transformations = [SentenceSplitter()]
transformations.extend(
    [
        # Title is inferred from the first 5 nodes of each document (per the llama_index docs).
        TitleExtractor(nodes=5),
        # 3 questions generated per node.
        QuestionsAnsweredExtractor(questions=3),
        # Summaries of the previous node and of the node itself.
        SummaryExtractor(summaries=["prev", "self"]),
        # 10 keywords per node.
        KeywordExtractor(keywords=10),
    ]
)
# The named-entity extractor configured earlier goes last in the list.
transformations.append(entity_extractor)


In [6]:
# Install nest_asyncio (imported in the next cell) into the kernel's environment.
# NOTE(review): the version is unpinned; consider pinning it and moving this
# cell to the top of the notebook so a fresh run installs it before first use.
%pip install nest_asyncio




In [6]:
# Patch asyncio so that an event loop can be re-entered. Presumably required
# because the notebook kernel already runs a loop while the ingestion pipeline
# drives the extractors' async code — confirm against the pipeline's behaviour.
import nest_asyncio
nest_asyncio.apply()

In [12]:
from llama_index.core.ingestion import IngestionPipeline

# Build the ingestion pipeline from the ordered `transformations` list.
pipeline = IngestionPipeline(transformations=transformations)


In [None]:
from llama_index.core import SimpleDirectoryReader

# Read the single essay file; `load_data()` returns the loaded documents.
essay_reader = SimpleDirectoryReader(
    input_files=['../data/paul_graham_short.txt'],
)
documents = essay_reader.load_data()


In [None]:
# Run the pipeline over the loaded documents and keep the resulting nodes.
# This step is slow: each LLM-backed extractor makes its own pass over the nodes.
nodes = pipeline.run(documents=documents)


100%|██████████| 5/5 [00:06<00:00,  1.23s/it]
100%|██████████| 21/21 [00:44<00:00,  2.12s/it]
100%|██████████| 21/21 [00:39<00:00,  1.90s/it]
100%|██████████| 21/21 [00:24<00:00,  1.18s/it]


Extracting entities:   0%|          | 0/21 [00:00<?, ?it/s]

In [None]:
# Spot-check the metadata attached to one node (index 2, i.e. the third node).
nodes[2].metadata

In [None]:
# Print a plain-text report of the extracted metadata for every node.
# Each entry pairs the printed label with the metadata key it is read from;
# a missing key is reported as 'N/A'.
field_rule = "-------------------------------------------------------------"
node_rule = "======================================================"
metadata_fields = (
    ("Document Title", "document_title"),
    ("Questions Answered", "questions_this_excerpt_can_answer"),
    ("Section Summary", "section_summary"),
    ("Excerpt Keywords", "excerpt_keywords"),
    ("Entities", "entities"),
)

for node in nodes:
    print(f"Node ID: {node.node_id}")
    for label, key in metadata_fields:
        # A rule precedes every field, separating it from the line above.
        print(field_rule)
        print(f"{label}: {node.metadata.get(key, 'N/A')}")
    # A double rule closes each node's section.
    print(node_rule)
    print(node_rule)


In [None]:
from llama_index.core.extractors import BaseExtractor

class CustomExtractor(BaseExtractor):
    """Extractor that combines a node's title and keywords into one metadata entry."""

    async def aextract(self, nodes) -> list:
        """Build one metadata dict per node, in the same order as ``nodes``.

        Each dict has a single ``"custom_metadata"`` key whose value is the
        node's ``document_title`` followed by ``"\\nKeywords: "`` and its
        ``excerpt_keywords``. Either value falls back to ``"N/A"`` when the
        key is absent from ``node.metadata``.
        """
        combined = []
        for node in nodes:
            title = node.metadata.get("document_title", "N/A")
            keywords = node.metadata.get("excerpt_keywords", "N/A")
            # Plain concatenation: both values are expected to be strings.
            combined.append({"custom_metadata": title + "\nKeywords: " + keywords})
        return combined


In [None]:
# Minimal pipeline for the custom extractor: chunk, then add custom metadata.
# Note: this rebinds `pipeline`, replacing the full extraction pipeline built
# earlier in the notebook.
custom_extractor = CustomExtractor()
custom_steps = [SentenceSplitter(), custom_extractor]

pipeline = IngestionPipeline(transformations=custom_steps)
