In [None]:
!pip install llama-index
!pip install llama-index-core
!pip install llama-index-embeddings-huggingface
!pip install llama-parse
!pip install llama-index-llms-ollama

In [1]:
# llama-parse is async-first, running the async code in a notebook requires the use of nest_asyncio
import nest_asyncio
from dotenv import load_dotenv
import os
nest_asyncio.apply()

load_dotenv()

# API access to llama-cloud
os.environ["LLAMA_CLOUD_API_KEY"] = os.getenv("LLAMA_CLOUD_API_KEY")

In [2]:
from llama_index.llms.ollama import Ollama
llm = Ollama(model="llama3.2:3b",request_timeout=420.0, temperature= 0.75)

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

  return torch._C._cuda_getDeviceCount() > 0


In [None]:
from llama_index.core import Settings
Settings.llm = llm
Settings.embed_model = embed_model

In [25]:
pdf_path = "/home/nitheesh/Desktop/FinStatAI/DATA/Small_Cap/2023_annual_report_22_32.pdf"
# pdf_path = "/home/nitheesh/Desktop/FinStatAI/DATA/Large Cap/Annual-Report-for-the-Financial-Year-2023-2024.pdf"

In [26]:
import pdfplumber

def detect_tables_in_pdf(pdf_path):
    pages_with_tables = []
    pages_without_tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            if page.find_tables():
                pages_with_tables.append(page_num)
            else:
                pages_without_tables.append(page_num)
    return pages_with_tables,pages_without_tables

pages_with_tables, pages_without_tables = detect_tables_in_pdf(pdf_path)

In [27]:
print("Total pages without Tables :",len(pages_without_tables))
print("Total pages with Tables :",len(pages_with_tables))

Total pages without Tables : 8
Total pages with Tables : 3


In [28]:
def extract_text_using_pdfplumber(pdf_path,pages_without_tables):
    extracted_pages = {}
    with pdfplumber.open(pdf_path) as pdf:
        for page_num in pages_without_tables:
            if 1 <= page_num <= len(pdf.pages):
                page = pdf.pages[page_num - 1]  # pdfplumber uses 0-indexing
                text = page.extract_text()
                if text:
                    extracted_pages[page_num] = text.strip()
            else:
                print(f"Warning: Page {page_num} is out of range and will be skipped.")
    return extracted_pages

extracted_pages_pdfplumb = extract_text_using_pdfplumber(pdf_path,pages_without_tables)

In [29]:
from llama_parse import LlamaParse
def extract_mds_using_llamaparse(pdf_path, pages_with_tables):
    extracted_pages = {}
    target_pages = ",".join([str(i-1) for i in pages_with_tables])
    # parsing_instruction = "Extract table headers and link each cell to its corresponding header. Apply any specified instructions provided for the table (e.g., '000s omitted', 'values in crore rupees') to the table values where applicable. Capture numerical values (e.g., currency, percentages) and dates in a standard format."
    if len(pages_with_tables):
        parser = LlamaParse(target_pages=target_pages,result_type="markdown",verbose=False)
        page_markdowns = parser.load_data(pdf_path)
        for page_num,item in zip(pages_with_tables,page_markdowns):
            extracted_pages[page_num] = item
    return extracted_pages
extracted_pages_llamaparse = extract_mds_using_llamaparse(pdf_path,pages_with_tables)

In [12]:
def create_md_file(extracted_pages,filename):
    with open(filename, 'w', encoding="utf-8") as md_file:
        for page_num,page in extracted_pages.items():
            md_file.write(page.text)
            md_file.write("\n\n---\n\n")
create_md_file(extracted_pages_llamaparse,"parsed_file_with_instruction")

In [30]:
from llama_index.core import Document, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.schema import MetadataMode
from typing import List

def create_nodes_from_text(extracted_pages):
    nodes = []
    for page_num, text in extracted_pages.items():
        # Create a Document object for each page
        embedding = embed_model.get_text_embedding(text)
        doc = Document(
            text=text,
            metadata={
                "page_number": page_num
            },
            embedding=embedding
        )
        nodes.append(doc)
    return nodes

def create_nodes_from_markdown_pages(extracted_pages):
    # Create a MarkdownNodeParser
    markdown_parser = MarkdownNodeParser()
        
    # Parse documents into nodes
    nodes = []
    for page_num, doc in extracted_pages.items():
        doc.metadata={"page_number": page_num}
        doc.embedding = embed_model.get_text_embedding(doc.text)
        page_nodes = markdown_parser.get_nodes_from_documents([doc])
        for node in page_nodes:
            node.metadata["page_number"] = doc.metadata["page_number"]
        nodes.extend(page_nodes)
    
    return nodes

In [31]:
md_nodes = create_nodes_from_markdown_pages(extracted_pages_llamaparse)
text_nodes = create_nodes_from_text(extracted_pages_pdfplumb)
nodes = md_nodes + text_nodes

In [34]:
md_nodes[0]

TextNode(id_='2f6bb8b1-40e8-43ad-99ec-646bc8718cf7', embedding=[-0.056055229157209396, -0.010347251780331135, -0.004116282798349857, -0.07599608600139618, -0.04172581061720848, 0.01914050057530403, 0.04949688911437988, -0.02028152532875538, 0.04906217008829117, -0.016588274389505386, 0.0017678203294053674, 0.01741931587457657, 0.0006817742832936347, -0.06077667698264122, 0.020769581198692322, -0.029767174273729324, -0.04147212207317352, 0.004349495284259319, 0.00929032452404499, 0.05472235381603241, -0.01720578409731388, -0.07275687903165817, -0.006431830581277609, -0.009680824354290962, 0.055719103664159775, -0.02916606515645981, -0.0019891755655407906, -0.027885625138878822, -0.10082194954156876, -0.179658442735672, 0.07896465808153152, 0.018243703991174698, 0.015504921786487103, 0.0237293541431427, -0.02749352715909481, -0.05593299865722656, -0.05256479233503342, 0.002666562795639038, -0.04250722751021385, -0.08403332531452179, -0.04000169783830643, 0.017341746017336845, 0.021868946

# Pinecone Vector database

In [None]:
!pip install -qU "pinecone[grpc]"==5.1.0
!pip install -qU llama-index-vector-stores-pinecone==0.2.1

In [25]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

In [26]:
from pinecone.grpc import PineconeGRPC
from pinecone import ServerlessSpec
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import StorageContext

# Initialize connection to Pinecone
pc = PineconeGRPC(api_key=PINECONE_API_KEY)
index_name = "fin-statement-index"

In [27]:
pc.create_index(
    index_name,
    dimension=384,
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': '04ba5a1678059946dbf4e5063c93afc0', 'Date': 'Sun, 13 Oct 2024 13:19:17 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [28]:
pinecone_index = pc.Index(index_name)
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

In [None]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(nodes,storage_context=storage_context)

In [29]:
vector_store.add(nodes)

Upserted vectors:   0%|          | 0/56 [00:00<?, ?it/s]

['73a9536d-ab38-4503-9781-bb4de1045bdd',
 'a5895732-685b-47ec-ba9f-6860977c0f9b',
 'ee143912-562e-46f0-be5e-939ac4d9a773',
 '54f047cf-1d8c-485b-9feb-3871e029a1da',
 'd7804ef2-1972-496f-8f4b-2edd74526876',
 'db301ed7-494b-4e9c-b49d-af8d5b2c8cfa',
 '39c94c3a-5131-4506-bc65-16a7456e7aad',
 'd2de27a3-b35f-4e2d-83b8-2cf8ae5f7db9',
 'a242eba7-bdb1-4f85-91d8-948b5cdf7f35',
 'd2321ab5-fdc6-4a3d-8f38-62c7a9204592',
 'c31086bc-0aff-4d7e-b02f-05801ef1245d',
 '52a7f6af-fcb7-4c87-a801-cbec8f35f6e8',
 'ca56ff09-3fc4-4131-80ce-550f6b461a37',
 '56471b30-bd80-4026-bda5-4f9d825f9db9',
 '903cd96f-e35c-4d48-94ff-a64d31d3cebd',
 '2174a90c-1539-4701-92f0-7e77927a2a20',
 '3b910c25-4919-4415-a827-47b6ce160fcf',
 '4a7c794e-783f-4ddf-9bbe-0212c2eb3fbd',
 '79641c85-89b8-4bbb-87bf-a6d5fd82e573',
 'e5ef8201-dffc-49e5-bb78-44539771fffa',
 '0831f093-d3ac-43ea-a595-4efe18ef6a04',
 'fa6a9c35-5a26-421e-b575-31edd796323e',
 '4c9dc913-0642-4a75-a3cf-6193353474c6',
 '2e3658e5-4986-44d8-86b4-b3e078a30a78',
 '1b7e4117-6f24-

In [30]:
pinecone_index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 56}},
 'total_vector_count': 56}

In [34]:
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever

# Instantiate VectorStoreIndex object from your vector_store object
vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)

# Grab 5 search results
retriever = VectorIndexRetriever(index=vector_index,embed_model=embed_model ,similarity_top_k=5)

# Query vector DB
answer = retriever.retrieve("What are SCHEDULE 1 - CAPITAL")

# Inspect results
print([i.get_content() for i in answer])

# Response:
# ['some relevant search result 1', 'some relevant search result 1'...]


AttributeError: 'SentenceTransformer' object has no attribute 'get_agg_embedding_from_queries'

In [46]:
# questions = [
#     "Who is Shri Vinay M. Tonse?",
#     "What is the capital reserves deductions during the year as at 31.03.2024?",
#     "What is the Minority Interest on the date of balance sheet as at 31.03.2024?",
#     "What is the total deposits as at 31.03.2024?",
#     "How much is the Borrowings in India from capital instruments?",
#     "What is the Balances with Reserve Bank of India in both current and other accounts as of year 2023"
# ]

In [31]:
question = "What are SCHEDULE 1 - CAPITAL"
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

NameError: name 'index' is not defined

In [27]:
pc.delete_index(index_name)