## Installing Dependencies

In [1]:
import os
import subprocess
import sys

# Pin NumPy to one specific release. Whenever an install is performed, the
# current process is killed afterwards so the new build is picked up on restart.
desired_version = "1.26.4"
pip_install_numpy = [sys.executable, "-m", "pip", "install", f"numpy=={desired_version}"]

try:
    import numpy as np

    current_version = np.__version__
    print(f"Current NumPy version: {current_version}")

    if current_version == desired_version:
        print("NumPy is already the desired version.")
    else:
        print(f"Installing NumPy version {desired_version}...")
        subprocess.check_call(pip_install_numpy)

        print("Restarting runtime to apply changes...")
        # Send signal 9 to our own PID: terminates this process immediately.
        os.kill(os.getpid(), 9)

except ImportError:
    # NumPy could not be imported at all: install the pinned release, then restart.
    print("NumPy is not installed. Installing...")
    subprocess.check_call(pip_install_numpy)
    os.kill(os.getpid(), 9)


Current NumPy version: 1.26.4
NumPy is already the desired version.


In [2]:
# Install the notebook's third-party dependencies with the %pip magic (quiet output).
# NOTE(review): every requirement below carries an exact version pin except
# psycopg2-binary — consider pinning it as well so installs are reproducible.
%pip install --quiet \
    chainlit==1.3.2 \
    chromadb==0.5.20 \
    dataclasses-json==0.6.7 \
    fastapi==0.115.5 \
    kaleido==0.2.1 \
    langchain==0.3.0 \
    langchain-chroma==0.1.4 \
    langchain-community==0.3.0 \
    langchain-nvidia-ai-endpoints==0.3.5 \
    langchain-unstructured==0.1.6 \
    protobuf==4.25.2 \
    pydantic==2.9.2 \
    pymupdf==1.25.3 \
    "unstructured[all-docs]"==0.17.2 \
    psycopg2-binary

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Input API Key

In [3]:
import getpass
import os

def set_ngc_api_key():
    """Prompt until a plausible NVIDIA API key is entered, then export it.

    The function always prompts; callers decide whether prompting is needed.
    Input is read with ``getpass.getpass`` so the key is not echoed.
    Surrounding whitespace (a common copy/paste artifact) is stripped before
    validation, so it neither causes a spurious rejection nor ends up stored
    in the environment.

    A key is accepted when it starts with ``"nvapi-"``. The accepted value is
    written to ``os.environ["NVIDIA_API_KEY"]``. Returns ``None``.
    """
    while True:
        nvapi_key = getpass.getpass("Enter your NVIDIA API key: ").strip()

        if nvapi_key.startswith("nvapi-"):
            os.environ["NVIDIA_API_KEY"] = nvapi_key
            print("NVIDIA API Key has been successfully set!")
            break
        else:
            # Keep asking until the prefix check passes.
            print("Invalid API key. Please try again.")

# Reuse a key that is already exported when it carries the expected prefix;
# otherwise fall through to the interactive prompt.
current_key = os.environ.get("NVIDIA_API_KEY", "")

if current_key.startswith("nvapi-"):
    print("NVIDIA API Key is already set.")
    # Give the user the option to replace the existing key.
    change_key = input("Would you like to enter a different key? (yes/no): ").strip().lower()

    if change_key in ("yes", "y"):
        set_ngc_api_key()
else:
    print("NVIDIA API Key is missing or invalid. Please enter a new key.")
    set_ngc_api_key()


NVIDIA API Key is missing or invalid. Please enter a new key.
NVIDIA API Key has been successfully set!


# Query the Database

In [4]:
import getpass
import os

import psycopg2

# One row per matching combination of event, environmental conditions,
# response action and historical record. These are inner joins, so an event
# missing from any of the joined tables does not appear in the result.
WILDFIRE_QUERY = """
    SELECT we.event_id, we.date, we.location, we.cause, we.area_burned, we.duration,
           ec.temperature, ec.humidity, ec.wind_speed, ec.precipitation,
           ra.action_type, ra.resources_used, ra.outcome,
           hd.lessons_learned, hd.recommendations
    FROM wildfire_events we
    JOIN environmental_conditions ec ON we.event_id = ec.event_id
    JOIN response_actions ra ON we.event_id = ra.event_id
    JOIN historical_data hd ON we.event_id = hd.event_id;
"""

# Never keep the database password in the notebook: take it from the
# PGPASSWORD environment variable, or ask for it without echoing.
db_password = os.environ.get("PGPASSWORD") or getpass.getpass("Enter the PostgreSQL password: ")

conn = psycopg2.connect(
    dbname="Test-DB",
    user="postgres",
    password=db_password,
    host="localhost",
)

try:
    # The cursor context manager closes the cursor even if the query fails.
    with conn.cursor() as cur:
        cur.execute(WILDFIRE_QUERY)
        results = cur.fetchall()
finally:
    # Always release the connection, whether or not the query succeeded.
    conn.close()

# Preprocessing the Data

In [5]:
from langchain.docstore.document import Document

def _row_to_document(row):
    """Convert one 15-column query row into a ``Document``.

    The row is rendered as labelled text for ``page_content``; the event id
    is also kept in the metadata alongside a fixed source label.
    """
    (
        event_id, date, location, cause, area_burned, duration,
        temperature, humidity, wind_speed, precipitation,
        action_type, resources_used, outcome,
        lessons_learned, recommendations,
    ) = row

    content = f"""
    Event ID: {event_id}
    Date: {date}
    Location: {location}
    Cause: {cause}
    Area Burned: {area_burned} hectares
    Duration: {duration} days
    Temperature: {temperature}°C
    Humidity: {humidity}%
    Wind Speed: {wind_speed} km/h
    Precipitation: {precipitation} mm
    Action Type: {action_type}
    Resources Used: {resources_used}
    Outcome: {outcome}
    Lessons Learned: {lessons_learned}
    Recommendations: {recommendations}
    """

    return Document(
        page_content=content,
        metadata={'source': "PostgreSQL Database", 'event_id': event_id},
    )


# One document per database row, in the order the rows were fetched.
documents = [_row_to_document(row) for row in results]

print(f"Loaded {len(documents)} document elements from the database.")

Loaded 2 document elements from the database.


# Generating Embeddings

In [6]:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings

# Embedding client used to vectorize the documents.
# NOTE(review): truncate="END" presumably trims over-long inputs from the end —
# confirm against the NVIDIAEmbeddings documentation.
embedding_model = "nvidia/nv-embedqa-e5-v5"
embedder = NVIDIAEmbeddings(
    model=embedding_model,
    truncate="END",
)

# Storing Embeddings in a Vector Database

In [7]:
from langchain_community.vectorstores import Chroma
import time

# Embed the documents into the "docs" collection stored under ./chroma_db,
# timing the whole operation.
# NOTE(review): with a persist_directory, re-running this cell may add the same
# documents to the existing collection again — confirm and deduplicate if so.
start_time = time.time()
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedder,
    persist_directory="./chroma_db",
    collection_name="docs",
)

# NOTE(review): failures in from_documents presumably surface as exceptions, so
# verify whether the "Failed" branch is ever reachable. The reported total is
# len(documents), i.e. the number of inputs, not a count read back from the store.
if not vectorstore:
    print("Failed to create the vector database. Please check your input data.")
else:
    print(f"Vector database was successfully created! Total embeddings indexed: {len(documents)}")

elapsed = time.time() - start_time
print(f"--- {elapsed} seconds ---")

Vector database was successfully created! Total embeddings indexed: 2
--- 2.828407049179077 seconds ---


# Adding a Reranker

In [8]:
from langchain_nvidia_ai_endpoints import NVIDIARerank

# Reranking client; top_n=10 presumably caps how many documents it returns
# per query (confirm against the NVIDIARerank documentation).
NV_rerank = NVIDIARerank(
    model='nvidia/nv-rerankqa-mistral-4b-v3',
    top_n=10,
)

# Set Up the Query

In [9]:
# Natural-language question passed to the retrieval chain.
question = (
    "What are the most common causes of wildfires in the dataset?"
)

In [10]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import Runnable, RunnablePassthrough, RunnableConfig
from langchain_core.runnables import RunnableParallel

from langchain_nvidia_ai_endpoints import ChatNVIDIA

# Chat model that writes the final answer.
llm = ChatNVIDIA(model="nvidia/llama-3.1-nemotron-ultra-253b-v1")

# Request up to 100 candidates from the vector store; the reranker narrows them afterwards.
retriever = vectorstore.as_retriever(search_kwargs={'k': 100})

# The system message carries the retrieved context; the user message carries the question.
system_template = "Answer solely based on the following context:\n<Documents>\n{context}\n</Documents>"
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_template),
        ("user", "{question}"),
    ]
)


def reranker(payload):
    """Rerank the retrieved documents in ``payload['context']`` against ``payload['question']``."""
    return NV_rerank.compress_documents(
        query=payload['question'],
        documents=payload['context'],
    )


def _pick_question(payload):
    """Forward the question unchanged to the prompt step."""
    return payload['question']


# Step 1: fetch candidate documents while passing the raw question through.
retrieve_step = RunnableParallel({"context": retriever, "question": RunnablePassthrough()})
# Step 2: replace the candidates with the reranked list, keeping the question.
rerank_step = {"context": reranker, "question": _pick_question}

chain = retrieve_step | rerank_step | prompt | llm | StrOutputParser()



In [11]:
# Run the chain on the question; as the cell's last expression, the returned
# answer string is displayed as the cell output.
chain.invoke(question)

Number of requested results 100 is greater than number of elements in index 2, updating n_results = 2


"<think>\nOkay, let's look at the documents provided. There are two events here. For Event ID 1, the cause is listed as Lightning. For Event ID 2, the cause is Human Activity. So, there are two different causes, each appearing once. Since the dataset only has two entries, the most common causes would be both, but they occur with the same frequency. So the answer is that the most common causes are Lightning and Human Activity, each occurring once.\n</think>\n\nThe dataset includes two wildfire events with the following causes:\n\n1. **Event ID 1**: Cause = **Lightning**  \n2. **Event ID 2**: Cause = **Human Activity**  \n\nBoth causes occur **once**, making them equally the most common in this specific dataset."