# Extract MetaData

In [1]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
from llama_index.core.schema import MetadataMode

In [4]:
Settings.llm = Ollama(model = "granite3.2:2b",request_timeout=600)
Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text")

In [5]:
from llama_index.core.extractors import (SummaryExtractor,QuestionsAnsweredExtractor,TitleExtractor,KeywordExtractor,BaseExtractor)

In [6]:
# %pip install llama-index-extractors-entity --break-system-packages

In [7]:
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import TokenTextSplitter

In [8]:
test_splitter = TokenTextSplitter(separator=' ', chunk_size=1200, chunk_overlap=100)

In [9]:
extractors = [
    TitleExtractor(nodes = 5),
    QuestionsAnsweredExtractor(questions=3),
    SummaryExtractor(summaries=["self"]),
    KeywordExtractor(),
    # EntityExtractor(),
    # BaseExtractor()
]

In [10]:
transformations = [test_splitter] + extractors

In [19]:
from llama_index.core import SimpleDirectoryReader
docs = SimpleDirectoryReader(input_files=['./data/Tulu_Language_Text_Recognition_and_Translation.pdf']).load_data()

In [20]:
docs

[Document(id_='655d59ca-3a76-4b11-9f03-ca7aa9eb30a3', embedding=None, metadata={'page_label': '1', 'file_name': 'Tulu_Language_Text_Recognition_and_Translation.pdf', 'file_path': 'data/Tulu_Language_Text_Recognition_and_Translation.pdf', 'file_type': 'application/pdf', 'file_size': 1427571, 'creation_date': '2025-02-28', 'last_modified_date': '2025-02-28'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="Tulu Language Text Recognition and\nTranslation\nPRATHWINI1, ANISHA P RODRIGUES2, P. VIJAYA3, ROSHAN FERNANDES4*\n1Department of Master of Computer Applications, NMAM Institute of Technology, NITTE(Deemed to be University), India\n(e-mail:

In [12]:
from llama_index.core.ingestion import IngestionPipeline

In [13]:
async def run_pipeline(documents):
    pipeline = IngestionPipeline(transformations=transformations)
    nodes = await pipeline.arun(documents=documents)
    return nodes

In [14]:
import asyncio
nodes = asyncio.run(run_pipeline(documents=docs))

100%|██████████| 1/1 [00:03<00:00,  3.59s/it]
100%|██████████| 2/2 [00:08<00:00,  4.39s/it]
100%|██████████| 1/1 [00:01<00:00,  1.78s/it]
100%|██████████| 1/1 [00:04<00:00,  4.72s/it]
100%|██████████| 1/1 [00:01<00:00,  1.96s/it]
100%|██████████| 1/1 [00:03<00:00,  3.29s/it]
100%|██████████| 1/1 [00:00<00:00,  1.46it/s]
100%|██████████| 1/1 [00:02<00:00,  2.58s/it]
100%|██████████| 1/1 [00:00<00:00,  1.71it/s]
100%|██████████| 2/2 [00:07<00:00,  3.63s/it]
100%|██████████| 2/2 [00:05<00:00,  2.74s/it]
100%|██████████| 1/1 [00:01<00:00,  1.55s/it]
100%|██████████| 15/15 [01:28<00:00,  5.88s/it]
100%|██████████| 15/15 [02:15<00:00,  9.04s/it]
100%|██████████| 15/15 [00:39<00:00,  2.66s/it]


In [18]:
nodes[0]

TextNode(id_='43952eeb-3710-443c-a4b6-8d8455475e1f', embedding=None, metadata={'page_label': '1', 'file_name': 'Tulu_Language_Text_Recognition_and_Translation.pdf', 'file_path': 'data/Tulu_Language_Text_Recognition_and_Translation.pdf', 'file_type': 'application/pdf', 'file_size': 1427571, 'creation_date': '2025-02-28', 'last_modified_date': '2025-02-28', 'document_title': '"Comparative Evaluation of Machine Learning Methodologies: A Comprehensive Study on Rule-Based and Neural Network Approaches for Enhancing Tulu Language Translation, Facilitating Communication Access in Karnataka\'s Dravidian Dialects, and Its Implications for Linguistic Preservation and Technological Advancements"\n\nThis title encapsulates the core elements of the document:\n\n- "Comparative Evaluation": Indicates that the study will compare rule-based and neural network techniques.\n  \n- "Machine Learning Methodologies": Specifies that machine learning approaches are under investigation.\n\n- "Tulu Language Tran

In [23]:
from llama_index.core import VectorStoreIndex, Settings
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import TextNode
from llama_index.llms.ollama import Ollama
import asyncio

# System prompt for LLM to decide response type
system_prompt = """
You are an AI assistant capable of both general conversation and retrieving information from a specific PDF document.
If a query requires information from the PDF, retrieve and return relevant details from the indexed document.
Otherwise, respond as a general AI assistant.
"""

# Create an index from extracted nodes
def create_index_from_nodes(nodes):
    index = VectorStoreIndex(nodes)
    return index

# Function to generate a response where the LLM decides the response type
async def generate_response(query, index=None):
    prompt = f"{system_prompt}\nUser Query: {query}\n"
    if index:
        retriever = index.as_retriever()
        query_engine = RetrieverQueryEngine(retriever=retriever)
        retrieved_info = await query_engine.aquery(query)
        prompt += f"\nRetrieved Information:\n{retrieved_info.response}"
    return await Settings.llm.acomplete(prompt)


In [None]:

# Usage example (Assuming nodes are preprocessed)
nodes = asyncio.run(run_pipeline(documents=docs))  # Extracted metadata nodes
index = create_index_from_nodes(nodes)

100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
100%|██████████| 2/2 [00:06<00:00,  3.41s/it]
100%|██████████| 1/1 [00:01<00:00,  1.39s/it]
100%|██████████| 1/1 [00:00<00:00,  1.44it/s]
100%|██████████| 1/1 [00:07<00:00,  7.14s/it]
100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
100%|██████████| 1/1 [00:00<00:00,  1.53it/s]
100%|██████████| 1/1 [00:00<00:00,  1.55it/s]
100%|██████████| 1/1 [00:00<00:00,  1.31it/s]
100%|██████████| 2/2 [00:01<00:00,  1.11it/s]
100%|██████████| 2/2 [00:03<00:00,  1.67s/it]
100%|██████████| 1/1 [00:03<00:00,  3.44s/it]
100%|██████████| 15/15 [01:36<00:00,  6.42s/it]
100%|██████████| 15/15 [02:12<00:00,  8.85s/it]


In [25]:
# Sample user query
user_query = "How does CNN compare to rule-based translation in the PDF?"
response = asyncio.run(generate_response(user_query, index))
print(response)

Based on the information from Volume 2023's study "Comparative Analysis and Performance Evaluation of Deep Learning Models vs. Rule-Based Methods for Tulu Language's 'a' Character Recognition and Correlated Blue Score Assessment," here's how CNN compares to rule-based translation:

1. **Character Recognition Accuracy**: The Convolutional Neural Network (CNN) model surpasses traditional rule-based translations with an accuracy of 92% on the validation set, significantly higher than other algorithms. This indicates that deep learning models like CNN are more effective at recognizing 'a' characters in this context.

2. **Performance Metrics**: The CNN outperforms rule-based methods in terms of f1-score (harmonic mean of precision and recall). F1-scores around 90% demonstrate that the CNN offers superior, consistent accuracy across different character classifications compared to rule-based translations.

3. **Impact on 'Blue Score'**: The Blue score, a measure of quality or reliability for

In [26]:
user_query = "How are you? can you tell me a story please..."
response = asyncio.run(generate_response(user_query, index))
print(response)

I'm an AI and don't have feelings, but I'm here to engage in conversation and provide information as needed. Let me tell you a story now:

In the verdant hills of Karnataka, where ancient Dravidian scripts thrived, there was a team of dedicated scholars who took on the mission to preserve an important language - Tulu. Their hearts were filled with reverence for this native tongue, threatened by dominant languages like Kannada.

These scholars crafted a state-of-the-art machine learning model, a deep convolutional neural network (CNN), as their eyes into the world of Tulu script. This wasn't an ordinary algorithm; it was meticulously trained with 30,500 handwritten Tulu characters. This digital sentinel could perceive and decipher Tulu text remarkably like humans.

One crisp morning, they tested this creation to see how well it could translate complexities of the Tulu script into a language that could be understood globally - English or vice versa. They were delighted as their model not

In [27]:
user_query = "Who are you?"
response = asyncio.run(generate_response(user_query, index))
print(response)

I am Granite, an AI assistant developed by IBM. I specialize in understanding and explaining complex topics based on the data provided to me. In this context, we're discussing machine learning models designed for deciphering unique scripts like Tulu, with a focus on improving translation outcomes and detecting emotions from historical documents.


In [28]:
user_query = "who are the authors of the pdf attched?"
response = asyncio.run(generate_response(user_query, index))
print(response)

The authors of this PDF are:

1. Manimozhi
2. Seshikala et al.
3. Anush Bijoor
4. Savitha (or a team)
5. Rao et al.
6. Memon et al.
7. Albahli
8. Bora et al.
9. Deore et al.
10. Khandokar et al.
11. Guha et al.
12. Hamdan et al. (possibly repeated)
13. Vinjit et al.
14. Athira
15. Yadav et al.

(Note: There is some repetition in the list of authors, possibly due to multiple authors contributing to various sections or aspects within this document.)


In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings
import nest_asyncio
nest_asyncio.apply()

# Initialize LlamaIndex settings
Settings.llm = Ollama(model="granite3.2:2b", request_timeout=600)
Settings.embed_model = OllamaEmbedding(model_name="nomic-embed-text")

# Define extractors and text splitter
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
# Uncomment if needed: from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import TokenTextSplitter

test_splitter = TokenTextSplitter(separator=' ', chunk_size=1200, chunk_overlap=100)
extractors = [
    TitleExtractor(nodes=5),
    QuestionsAnsweredExtractor(questions=4),
    SummaryExtractor(summaries=["self"]),
    # KeywordExtractor(),
    # EntityExtractor(),
    # BaseExtractor()
]
transformations = [test_splitter] + extractors

# Read documents from PDF
from llama_index.core import SimpleDirectoryReader
docs = SimpleDirectoryReader(input_files=['./data/Tulu_Language_Text_Recognition_and_Translation.pdf']).load_data()

# Batch processing: create nodes and build the vector index in one step
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core import VectorStoreIndex
import asyncio

async def create_index(documents, batch_size=5):
    pipeline = IngestionPipeline(transformations=transformations)
    # Divide documents into batches
    batches = [documents[i:i+batch_size] for i in range(0, len(documents), batch_size)]
    # Process batches concurrently
    tasks = [pipeline.arun(documents=batch) for batch in batches]
    batch_nodes = await asyncio.gather(*tasks)
    # Flatten the list of lists into a single list of nodes
    all_nodes = [node for batch in batch_nodes for node in batch]
    index = VectorStoreIndex(all_nodes)
    return index

# System prompt for LLM to decide the response type
system_prompt = """
You are an AI assistant capable of both general conversation and retrieving information from a specific PDF document.
If a query requires information from the PDF, retrieve and return relevant details from the indexed document.
Otherwise, respond as a general AI assistant.
"""

# Function to generate a response using the system prompt and, if available, retrieved PDF info
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.schema import TextNode

async def generate_response(query, index=None):
    prompt = f"{system_prompt}\nUser Query: {query}\n"
    if index:
        retriever = index.as_retriever()
        query_engine = RetrieverQueryEngine(retriever=retriever)
        retrieved_info = await query_engine.aquery(query)
        prompt += f"\nRetrieved Information:\n{retrieved_info.response}"
        
    return await Settings.llm.acomplete(prompt)

# Usage example: create the index and generate responses
index = asyncio.run(create_index(docs, batch_size=5))

# Example queries:
user_query = "How does CNN compare to rule-based translation in the PDF?"
response = asyncio.run(generate_response(user_query, index))
print("Response 1:", response)

user_query = "How are you? Can you tell me a story please..."
response = asyncio.run(generate_response(user_query, index))
print("Response 2:", response)

user_query = "Who are you?"
response = asyncio.run(generate_response(user_query, index))
print("Response 3:", response)

user_query = "Who are the authors of the pdf attached?"
response = asyncio.run(generate_response(user_query, index))
print("Response 4:", response)


  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:02<00:00,  2.59s/it]

[A
100%|██████████| 2/2 [00:14<00:00,  7.02s/it]
100%|██████████| 1/1 [00:14<00:00, 14.91s/it]
  0%|          | 0/2 [00:00<?, ?it/s]
100%|██████████| 2/2 [00:16<00:00,  8.42s/it]
100%|██████████| 1/1 [00:14<00:00, 14.14s/it]

100%|██████████| 1/1 [00:12<00:00, 12.47s/it]
  0%|          | 0/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:03<00:00,  3.77s/it]
100%|██████████| 3/3 [00:31<00:00, 10.56s/it]
  0%|          | 0/3 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:31<00:00, 31.94s/it]
100%|██████████| 3/3 [00:44<00:00, 14.93s/it]
100%|██████████| 1/1 [00:40<00:00, 40.93s/it]
100%|██████████| 1/1 [00:03<00:00,  3.21s/it]
100%|██████████| 1/1 [00:08<00:00,  8.84s/it]
100%|██████████| 2/2 [00:10<00:00,  5.32s/it]
100%|██████████| 6/6 [01:12<00:00, 12.04s/it]
100%|██████████| 6/6 [01:18<00:00, 13.06s/it]
100%|██████████| 6/6 [01:40<00:00, 16.67s/it]
100%|██████████| 6/6 [01:44<00:00, 17.35s/it]


Response 1: Based on the information from the PDF:

**Comparison of CNN and Rule-Based Translation:**

1. **Architecture**:
   - CNN layers have varying configurations, including filter numbers, kernel sizes, dropout rates, max pooling operations, and dense layers with specific activation functions (ReLU or Softmax). These characteristics enable more nuanced understanding of language structures compared to rule-based systems, which typically rely on predefined rules for translation.

2. **Hyperparameters**:
   - For the rule-based system: Batch size = 32, Loss function = Categorical Cross-Entropy, Learning rate = 0.001, Epochs = 150, and Optimizer = Adam. These hyperparameters were likely chosen to optimize the performance of this neural network component within a language translation context but are not as flexible for handling diverse linguistic nuances as CNN does through its deep learning architecture.

3. **Rule-Based Translation System**:
   - This system functions in four sequen