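# Extract Metadata

Split a PDF into token-sized chunks and use LlamaIndex metadata extractors, backed by local Ollama models, to attach a generated title and a set of answerable questions to each chunk.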

In [1]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings

In [2]:
from llama_index.core.schema import MetadataMode

In [3]:
# Local Ollama models used for the whole notebook: one for text generation,
# one for embeddings. Change the names here to try different models.
LLM_MODEL_NAME = "granite3.2:2b"
EMBED_MODEL_NAME = "nomic-embed-text"

Settings.llm = Ollama(model=LLM_MODEL_NAME)
Settings.embed_model = OllamaEmbedding(model_name=EMBED_MODEL_NAME)

In [4]:
from llama_index.core.extractors import (SummaryExtractor,QuestionsAnsweredExtractor,TitleExtractor,KeywordExtractor,BaseExtractor)

In [5]:
# One-time setup: EntityExtractor (imported in the next cell) comes from the
# separate llama-index-extractors-entity package. Uncomment the line below to
# install it into the kernel's environment.
# NOTE(review): --break-system-packages bypasses pip's externally-managed-
# environment guard; prefer installing inside a virtual environment.
# %pip install llama-index-extractors-entity --break-system-packages

In [6]:
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import TokenTextSplitter

In [7]:
# Token-based chunker: breaks text on single spaces, targeting chunks of
# 1200 tokens with a 100-token overlap between neighbouring chunks.
test_splitter = TokenTextSplitter(
    separator=' ',
    chunk_size=1200,
    chunk_overlap=100,
)

In [8]:
# Metadata extractors to run on the chunks. Only the title and question
# extractors are enabled; SummaryExtractor, KeywordExtractor and
# EntityExtractor are imported but left out -- append them to the list to
# turn them on. (BaseExtractor is presumably the shared base class for custom
# extractors rather than something to list here directly.)
#   nodes=5     -> per the LlamaIndex docs, how many leading nodes feed the title
#   questions=3 -> how many questions to generate per chunk
extractors = [
    TitleExtractor(nodes=5),
    QuestionsAnsweredExtractor(questions=3),
]

In [30]:
# Pipeline steps: the splitter first, followed by every enabled extractor.
transformations = [test_splitter, *extractors]

In [31]:
from llama_index.core import SimpleDirectoryReader
# Read the source paper into a list of Document objects.
pdf_path = './data/Tulu_Language_Text_Recognition_and_Translation.pdf'
docs = SimpleDirectoryReader(input_files=[pdf_path]).load_data()

In [34]:
# Preview only the first Document: echoing the whole list would dump the full
# text and metadata of every loaded Document into the notebook output.
docs[:1]

[Document(id_='8e68779c-1900-4b8c-9903-98384a62283c', embedding=None, metadata={'page_label': '1', 'file_name': 'Tulu_Language_Text_Recognition_and_Translation.pdf', 'file_path': 'data/Tulu_Language_Text_Recognition_and_Translation.pdf', 'file_type': 'application/pdf', 'file_size': 1427571, 'creation_date': '2025-02-28', 'last_modified_date': '2025-02-28'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="Tulu Language Text Recognition and\nTranslation\nPRATHWINI1, ANISHA P RODRIGUES2, P. VIJAYA3, ROSHAN FERNANDES4*\n1Department of Master of Computer Applications, NMAM Institute of Technology, NITTE(Deemed to be University), India\n(e-mail:

In [35]:
# The first three Documents returned by the reader.
front_page = docs[:3]

In [36]:
# Every Document after the first three.
content = docs[3:]

In [37]:
# Recombine the two slices; the result holds the same Documents as `docs`,
# in the same order.
main_docs = [*front_page, *content]

In [32]:
from llama_index.core.ingestion import IngestionPipeline

In [38]:
# Build the ingestion pipeline from the splitter + extractor list and run it
# over all loaded Documents, producing the enriched nodes.
# This is the slow cell: in the recorded run the final progress bar alone took
# about a minute and a half (the extractors presumably call Settings.llm).
pipeline = IngestionPipeline(transformations=transformations)
nodes = pipeline.run(documents=main_docs)

100%|██████████| 1/1 [00:02<00:00,  2.31s/it]
100%|██████████| 2/2 [00:06<00:00,  3.29s/it]
100%|██████████| 1/1 [00:01<00:00,  1.15s/it]
100%|██████████| 1/1 [00:04<00:00,  4.58s/it]
100%|██████████| 1/1 [00:04<00:00,  4.83s/it]
100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
100%|██████████| 1/1 [00:00<00:00,  1.27it/s]
100%|██████████| 1/1 [00:08<00:00,  8.03s/it]
100%|██████████| 1/1 [00:02<00:00,  2.23s/it]
100%|██████████| 2/2 [00:04<00:00,  2.50s/it]
100%|██████████| 2/2 [00:05<00:00,  2.75s/it]
100%|██████████| 1/1 [00:02<00:00,  2.37s/it]
100%|██████████| 15/15 [01:27<00:00,  5.83s/it]


In [41]:
# Inspect the first node's metadata: the reader's file fields plus the
# extractor-generated 'document_title' and 'questions_this_excerpt_can_answer'
# entries shown in the output below.
nodes[0].metadata

{'page_label': '1',
 'file_name': 'Tulu_Language_Text_Recognition_and_Translation.pdf',
 'file_path': 'data/Tulu_Language_Text_Recognition_and_Translation.pdf',
 'file_type': 'application/pdf',
 'file_size': 1427571,
 'creation_date': '2025-02-28',
 'last_modified_date': '2025-02-28',
 'document_title': 'Title: "Comparative Exploration of Advanced Machine Translation Techniques for Tulu Language: A Comprehensive Analysis of Convolutional Neural Networks, Rule-based Method, and Hybrid Approach"\n\nThis title encapsulates the core elements of the described document. It highlights the focus on Tulu language translation using advanced machine techniques, specifically detailing three approaches: a) Convolutional Neural Networks (CNN), b) rule-based method, and c) hybrid approach. The title also emphasizes the comparative nature by including "comprehensive analysis."',
 'questions_this_excerpt_can_answer': '1. **What is the current status and recognition of the Tulu language according to the