In [17]:
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
)
from llama_index.extractors.entity import EntityExtractor


In [18]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama

# Register an Ollama-served model as the global LlamaIndex LLM.
# The base_url points at localhost:11434, so an Ollama server must be
# listening there with the `llama3.2:latest` model available.
Settings.llm = Ollama(
    model='llama3.2:latest',
    base_url='http://localhost:11434',
    # Optional settings, currently disabled:
    # temperature=0.1,
    # mean_resizing=False,  # NOTE(review): unclear that Ollama accepts this option — confirm before enabling
)

In [19]:

from span_marker import SpanMarkerModel

# Load the pretrained SpanMarker model.
# NOTE(review): `model` is not referenced again in the visible cells; the
# EntityExtractor is configured with the same model *name* instead. Presumably
# this call only downloads/caches the weights up front — confirm or remove.
model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd")


In [20]:
import torch

# Named-entity extractor backed by the SpanMarker multinerd model.
# The device is chosen at runtime so the notebook also runs on CPU-only
# machines instead of failing on a hard-coded "cuda".
entity_extractor = EntityExtractor(
    prediction_threshold=0.5,  # minimum prediction score for an entity to be kept
    label_entities=False,  # when True, the entity label is included in the metadata (labels can be erroneous)
    device="cuda" if torch.cuda.is_available() else "cpu",
    model_name="tomaarsen/span-marker-mbert-base-multinerd",  # same model id as loaded with SpanMarkerModel
    # NOTE(review): `entity_types` and `batch_size` are passed through unchanged;
    # confirm that the installed EntityExtractor actually honours them.
    entity_types=["PERSON", "ORG", "GPE"],
    batch_size=1,
)



In [23]:
# Ordered list of transformations applied by the ingestion pipeline:
# the splitter comes first, followed by the metadata extractors.
transformations = [
    SentenceSplitter(),  # Splits documents into nodes using the library's default settings.
    TitleExtractor(nodes=5),  # Derives the document title from the first 5 nodes (`nodes` is a node count, not a title count).
    QuestionsAnsweredExtractor(questions=3),  # Generates 3 questions that each node can answer.
    SummaryExtractor(summaries=["prev", "self"]),  # Summarizes the previous and the current node.
    KeywordExtractor(keywords=10),  # Extracts up to 10 keywords.
    entity_extractor,  # Named-entity extractor configured in an earlier cell.
]


In [22]:
# Installs nest_asyncio into the kernel's environment; a later cell imports it.
# NOTE(review): the install is unpinned, and this cell's execution count (22)
# is lower than the cell above it (23), i.e. the notebook was run out of order.
# Consider moving dependency setup to the top and pinning the version.
%pip install nest_asyncio

Note: you may need to restart the kernel to use updated packages.


In [24]:
import nest_asyncio
# Patch asyncio so that an already-running event loop can be re-entered.
# Presumably needed because Jupyter runs its own loop while the pipeline's
# extractors execute async code — TODO confirm it is still required.
nest_asyncio.apply()

In [25]:
from llama_index.core.ingestion import IngestionPipeline

# Build the ingestion pipeline from the `transformations` list defined earlier
# (sentence splitter followed by the metadata extractors).
pipeline = IngestionPipeline(transformations=transformations)


In [26]:
from llama_index.core import SimpleDirectoryReader

# Load the single essay file; the path is relative, so it resolves against
# the notebook's current working directory.
documents = SimpleDirectoryReader(input_files=['../data/paul_graham_short.txt']).load_data()


In [27]:
# Run the pipeline over the loaded documents and keep the resulting nodes.
# The LLM-backed extractors make this the slow step of the notebook.
nodes = pipeline.run(documents=documents)


100%|██████████| 5/5 [00:05<00:00,  1.20s/it]
100%|██████████| 5/5 [00:11<00:00,  2.25s/it]
100%|██████████| 5/5 [00:08<00:00,  1.66s/it]
100%|██████████| 5/5 [00:06<00:00,  1.20s/it]


Extracting entities:   0%|          | 0/5 [00:00<?, ?it/s]

In [28]:
# Display every node with its full metadata.
# NOTE(review): this dumps the whole list; `len(nodes)` or `nodes[0]` would
# keep the output readable.
nodes

[TextNode(id_='ddc7bcd9-61d9-498a-b043-3f01d4baa75f', embedding=None, metadata={'file_path': '..\\data\\paul_graham_short.txt', 'file_name': 'paul_graham_short.txt', 'file_type': 'text/plain', 'file_size': 16447, 'creation_date': '2024-11-26', 'last_modified_date': '2024-11-26', 'document_title': '"Observations from YC: Essays, Misinterpretations, Departures, Language Evolution, and Acknowledgments"', 'questions_this_excerpt_can_answer': "Here are three potential questions that this context can provide specific answers to:\n\n1. What was the primary input method for programming on the IBM 1401, and how did it limit Paul Graham's ability to create useful programs?\n\nThis question is likely to yield a detailed answer about the technical limitations of the IBM 1401 and how they constrained Paul Graham's early programming experiences.\n\n2. In what year did Paul Graham convince his father to buy him a TRS-80, marking the beginning of his serious involvement with computers?\n\nThis questio

In [29]:
# Inspect the metadata dict of the third node (index 2) on its own.
nodes[2].metadata

{'file_path': '..\\data\\paul_graham_short.txt',
 'file_name': 'paul_graham_short.txt',
 'file_type': 'text/plain',
 'file_size': 16447,
 'creation_date': '2024-11-26',
 'last_modified_date': '2024-11-26',
 'document_title': '"Observations from YC: Essays, Misinterpretations, Departures, Language Evolution, and Acknowledgments"',
 'questions_this_excerpt_can_answer': "Here are three questions that can provide specific answers to which are unlikely to be found elsewhere:\n\n1. What was the exact location where Paul Graham lived in Florence, Italy, and how did he walk to the Accademia?\n\n(This question is likely to yield a precise location, such as Piazza San Felice 4, and details about his daily commute.)\n\n2. What specific features of Viaweb's code editor allowed users to define their own page styles without realizing it was an editing experience for Lisp expressions?\n\n(This question is likely to reveal the intricacies of Viaweb's code editor and its hidden features.)\n\n3. How did

In [30]:
# Print a plain-text report of the extracted metadata for every node.
# Each field is preceded by a dashed rule; a metadata key that is missing
# from a node is shown as 'N/A'. Two '=' rules close each node's section.
for node in nodes:
    print(f"Node ID: {node.node_id}")
    for label, key in (
        ("Document Title", "document_title"),
        ("Questions Answered", "questions_this_excerpt_can_answer"),
        ("Section Summary", "section_summary"),
        ("Excerpt Keywords", "excerpt_keywords"),
        ("Entities", "entities"),
    ):
        print("-------------------------------------------------------------")
        print(f"{label}: {node.metadata.get(key, 'N/A')}")
    for _ in range(2):
        print("======================================================")


Node ID: ddc7bcd9-61d9-498a-b043-3f01d4baa75f
-------------------------------------------------------------
Document Title: "Observations from YC: Essays, Misinterpretations, Departures, Language Evolution, and Acknowledgments"
-------------------------------------------------------------
Questions Answered: Here are three potential questions that this context can provide specific answers to:

1. What was the primary input method for programming on the IBM 1401, and how did it limit Paul Graham's ability to create useful programs?

This question is likely to yield a detailed answer about the technical limitations of the IBM 1401 and how they constrained Paul Graham's early programming experiences.

2. In what year did Paul Graham convince his father to buy him a TRS-80, marking the beginning of his serious involvement with computers?

This question provides an opportunity for a specific date and context that can help flesh out Paul Graham's personal story and transition from high schoo

In [None]:
from llama_index.core.extractors import BaseExtractor

class CustomExtractor(BaseExtractor):
    """Combine the title and keywords already stored on each node.

    Reads the ``document_title`` and ``excerpt_keywords`` metadata entries of
    every node and joins them into a single ``custom_metadata`` string. It is
    therefore only informative when those entries were populated before this
    extractor runs; otherwise both parts fall back to ``"N/A"``.
    """

    async def aextract(self, nodes) -> list:
        """Build one metadata dict per node.

        Args:
            nodes: Iterable of nodes, each exposing a ``metadata`` mapping.

        Returns:
            A list, in input order, of dicts with the single key
            ``"custom_metadata"`` whose value has the form
            ``"<title>\\nKeywords: <keywords>"``.
        """
        return [
            {
                # f-string formatting (instead of ``+``) avoids a TypeError
                # when a stored metadata value is not a str.
                "custom_metadata": (
                    f"{node.metadata.get('document_title', 'N/A')}"
                    f"\nKeywords: {node.metadata.get('excerpt_keywords', 'N/A')}"
                )
            }
            for node in nodes
        ]


In [None]:
custom_extractor = CustomExtractor()

# CustomExtractor only reads the `document_title` and `excerpt_keywords`
# metadata entries, so the extractors that produce them must run before it.
# Without them every node would get the "N/A" fallback for both parts.
# Note: this rebinds `pipeline`, replacing the pipeline built earlier.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(),
        TitleExtractor(nodes=5),
        KeywordExtractor(keywords=10),
        custom_extractor,
    ]
)
