# Import libraries

In [None]:
# Install/upgrade (quietly, bypassing pip's cache) LlamaIndex, LlamaParse, the Ollama
# embedding + LLM integrations and the MongoDB vector-store integration into this kernel.
%pip install -qU llama-index llama-parse llama-index-embeddings-ollama llama_index-llms-ollama llama-index-vector-stores-mongodb --no-cache-dir

In [2]:
import json
import os

from dotenv import load_dotenv
from pymongo import MongoClient

# Read the local .env file into the process environment before any
# variable is looked up below.
load_dotenv()

# Shared MongoDB client for the whole notebook. The connection string is
# taken from MONGODB_URI; a missing variable raises KeyError right here.
mongo_client = MongoClient(os.environ["MONGODB_URI"])


# Data acquisition:   
Fetch all relevant documents

## **If you directly use a pdf as data**:
- You need access to LlamaIndex
    - Specifically, you need a LlamaCloud account as well as your own LlamaCloud API key
- Once you have your own API key
    - You can save it in a `.env` file according to `.env.example`

In [3]:
# API key handed to LlamaParse below; this is None when LLAMA_PARSE_API_KEY is not set.
llamaparse_api_key = os.environ.get("LLAMA_PARSE_API_KEY")

### Setup llama parse from llama index
- A freemium document parser/data converter that can help parse **66** pages per day FOR FREE 

In [4]:
import nest_asyncio
# Applied before the parser is used; presumably needed so `await` works inside
# the notebook's already-running event loop.
nest_asyncio.apply()
from llama_parse import LlamaParse  # pip install llama-parse
from llama_index.core import SimpleDirectoryReader, Document  # pip install llama-index

# Output format requested from LlamaParse; later cells branch on this value.
results_encoding = 'markdown'

# Natural-language hint sent along with every parsing job.
table_fix_instruction = (
    "This is an insurance document. "
    "SOME tables have structural issues like headers as columns or multi-line headers. "
    "RESTRUCTURE those tables, leave the rest as is."
)

parser = LlamaParse(
    api_key=llamaparse_api_key,
    result_type=results_encoding,
    premium_mode=True,
    disable_image_extraction=True,
    take_screenshot=False,
    parsing_instruction=table_fix_instruction,
)

### If you had already parsed your document in llamaCloud's online Dashboard or with python
Get the job id, and the result type that specifies the encoding used for parsing the given document

#### Get history of parsing requests

In [None]:
history = None
async with parser.client_context() as client:
    headers = {"Authorization": f"Bearer {parser.api_key}"}
    history = await client.get("https://api.cloud.llamaindex.ai//api/v1/parsing/history", headers=headers)
    history = history.json()
print("usable records:")
for record in history:
    if record['expired'] == False: print(f"day: {record['day']} \t job: {record['job_id']}")
print("\nexpired records (records that are automatically deleted after 2 days in llama-parse): ")
for record in history[:3]:
    if record['expired'] == True: print(f"day: {record['day']} \t job: {record['job_id']}")
print('\nolder records ommited')

#### Save it for further use

In [6]:
async with parser.client_context() as client:
    headers = {"Authorization": f"Bearer {parser.api_key}"}
    history = await client.get("https://api.cloud.llamaindex.ai//api/v1/parsing/history", headers=headers)
    history = history.json()

You can use the fresh (usable) data from the history

In [None]:
job_id = 'e6301071-cf33-444a-b447-b11d2d4e5e40' # replace with the job_id you want to get the result for
response = await parser._get_job_result(job_id=job_id,result_type=results_encoding)
if results_encoding == 'json':
    results = response['pages']
    for i, pages in enumerate(response['pages']):
        with open(f'./outputs/brochure_{i}.json', 'w') as f:
            f.write(json.dumps(pages['items']))
else:
    results = response['markdown']
    with open("../data/outputs/data.md", 'w') as f:
        f.write(response['markdown'])
    markdowns = response['markdown'].split('\n---\n')
    documents = [Document(text=markdown, metadata = response['job_metadata']) for markdown in markdowns]
for document in documents:
    print(document.get_content())

### Or you can parse a document that you have via the API

In [None]:
file_extractor = {".pdf": parser}

reader = SimpleDirectoryReader(input_files=['../data/sources/BROCHURE.pdf'], file_extractor=file_extractor)
documents = await reader.aload_data()

# Data ingestion:   

## Option 3: MongoDB only

[mongodb pipeline setup](https://medium.com/@abdulsomad.me/how-to-build-rag-app-with-mongodb-atlas-database-llama-index-gemini-llm-and-embedding-and-8e82df16d6bf)   
the cheapest but most limited option

as for setup, we need
1. Mongodb client
    - for setting up a client connection to the storage or project cluster
    - uses a cluster-specific connection string provided in mongodb Atlas
2. A storage context
    - A llamaindex container to prepare data for storage
3. Embedding model
    - Can come from a hosted provider (e.g. `OpenAI`) or be self-hosted (e.g. `ollama`)

In [10]:
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings

# Ollama-served models: one for embeddings, one for text generation.
embed_model = OllamaEmbedding(
    model_name='nomic-embed-text',
    ollama_additional_kwargs={"mirostat": 0},
)
llm = Ollama(model='llama3.2', request_timeout=60.0)

# Register both on the global LlamaIndex Settings object.
Settings.embed_model = embed_model
Settings.llm = llm

In [None]:
# Get the chunk collection from the "product1Chunked" database, creating it on first run.
collection_name = "llamaIndexChunk"
database = mongo_client["product1Chunked"]
if collection_name in database.list_collection_names():
    print('getting collection')
    collection = database[collection_name]
else:
    print('creating collection')
    # Create under the same name that was just checked, so changing
    # `collection_name` cannot test one collection and create another.
    collection = database.create_collection(collection_name)

In [12]:
from pymongo.operations import SearchIndexModel

index_name = 'vector_index'

# Names of the search indexes that already exist on the collection.
existing_index_names = [index['name'] for index in collection.list_search_indexes()]

# Only create the index when no index with this name exists yet.
if index_name not in existing_index_names:
    # One vector field on `embedding` (768 dimensions, cosine similarity --
    # presumably the output size of the embedding model; confirm if it changes)
    # plus a filter field on `metadata.page_label`.
    vector_index_definition = {
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 768,
                "similarity": "cosine",
            },
            {
                "type": "filter",
                "path": "metadata.page_label",
            },
        ]
    }
    search_index_model = SearchIndexModel(
        definition=vector_index_definition,
        name=index_name,
        type="vectorSearch",
    )
    collection.create_search_index(model=search_index_model)

In [None]:
# Show the collection's search indexes as this cell's output.
list(collection.list_search_indexes())

In [14]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_index.core import StorageContext

# Instantiate the vector store on the very collection the search index was
# created on. The names are read back from `collection` rather than re-typed,
# so the store and the index cannot silently point at different places.
atlas_vector_store = MongoDBAtlasVectorSearch(
    mongodb_client=mongo_client,
    db_name=collection.database.name,
    collection_name=collection.name,
    vector_index_name=index_name,
)
# Storage context wrapping the vector store.
vector_store_context = StorageContext.from_defaults(vector_store=atlas_vector_store)

reset the index if it is not empty


In [None]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.core.schema import MetadataMode
from tqdm import tqdm

# Split the parsed documents into nodes with the embedding-based splitter.
semantic_splitter = SemanticSplitterNodeParser(
    buffer_size=1,
    breakpoint_percentile_threshold=98,
    embed_model=embed_model,
)
nodes = semantic_splitter.get_nodes_from_documents(documents)

# Embed every node; iterating the tqdm wrapper advances the bar once per node.
pbar = tqdm(nodes, total=len(nodes), desc="Embedding Progress", unit="node")
for node in pbar:
    # Embed the node content rendered in EMBED metadata mode.
    embed_text = node.get_content(metadata_mode=MetadataMode.EMBED)
    node.embedding = embed_model.get_text_embedding(embed_text)

# Close the progress bar
pbar.close()

In [None]:
# Reset the collection if it is not empty, so re-running the notebook does not
# store a second copy of every chunk next to the previous run's.
stale_chunk_count = collection.count_documents({})
if stale_chunk_count > 0:
    collection.delete_many({})
    print(f"removed {stale_chunk_count} existing chunks before re-ingesting")

# Write the freshly embedded nodes; the call's return value is the cell output.
atlas_vector_store.add(nodes=nodes)

Instantiate a vector store and Store your data into it

# Data inference
To query our data, we need to configure our storage and indexes into objects that can be used to infer our data,
for example:


In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever

# Build an index object on top of the existing Atlas vector store.
vector_index = VectorStoreIndex.from_vector_store(vector_store=atlas_vector_store)

# Retriever configured with similarity_top_k=5.
retriever = VectorIndexRetriever(index=vector_index, similarity_top_k=5)

# Run one sample question through the retriever.
question = "is the product shariah compliant"
answer = retriever.retrieve(question)

# Print each retrieved node's text followed by a separator line.
node_separator = "\n ------------- NEW NODE"
for retrieved_node in answer:
    print(retrieved_node.text)
    print(node_separator)



In [None]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core import get_response_synthesizer

# Response synthesizer created with streaming enabled.
synth = get_response_synthesizer(streaming=True)

# Query engine combining the retriever defined earlier with that synthesizer.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=synth,
)

# Ask a question, then print the streamed response.
llm_query = query_engine.query('what are the covered illnesses')
llm_query.print_response_stream()

## Option 3: MongoDB only

Fetch the vector store as an index for semantic search,
then wrap it as a tool that our base language model can later use for tool calling.

In [20]:
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core import get_response_synthesizer

# Streaming query engine over the Atlas vector store (similarity_top_k=5).
synth = get_response_synthesizer(streaming=True)
index = VectorStoreIndex.from_vector_store(atlas_vector_store)
query_engine = index.as_query_engine(similarity_top_k=5, llm=llm, response_synthesizer=synth)

# Expose the query engine as a tool named "knowledge_base".
query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name="knowledge_base",
        # Adjacent literals are concatenated verbatim, so the first sentence
        # needs its own trailing space to keep the two sentences apart.
        description=(
            "Provides information about Group Multiple Benefits Insurance Scheme (GMBIS). "
            "Use a detailed plain text question as input to the tool."
        ),
    ),
)

In [None]:
# Run one sample question through the query engine and print its streamed response.
query_engine.query("What are the covered illnesses").print_response_stream()
