# RAG for semi-structured data

In [19]:
# import modules
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers.self_query.base import SelfQueryRetriever

# MongoDB
from dotenv import load_dotenv
from pymongo import MongoClient
import os

# Azure OpenAI
from langchain_openai import AzureChatOpenAI

In [20]:
# config: MongoDB + Azure OpenAI settings

# load_dotenv() runs first so the lookups below can see values from a local .env file
load_dotenv()

# MongoDB settings (each is None when the variable is not set)
MONGO_USERNAME = os.getenv('MONGO_USERNAME')
MONGO_PASSWORD = os.getenv('MONGO_PASSWORD')
MONGO_HOST = os.getenv('MONGO_HOST')
MONGO_DB_CONNECTION_STRING = os.getenv('MONGO_DB_CONNECTION_STRING')
MONGO_DB = os.getenv('MONGO_DB')
# The collection name is fixed in code rather than read from the environment
MONGO_DB_COLLECTION = "InventoryData"

# Azure OpenAI settings (each is None when the variable is not set)
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_VERSION = os.getenv('AZURE_OPENAI_VERSION')
AZURE_OPENAI_DEPLOYMENT_NAME = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME')

In [21]:
# connect to Mongo DB

client = MongoClient(MONGO_DB_CONNECTION_STRING)

# Named `mongo_db` rather than `db`: a later cell binds `db` to the Chroma
# vector store, and sharing one name made re-running this cell silently
# break every retriever cell that expects `db` to be the vector store.
mongo_db = client[MONGO_DB]
collections = mongo_db.list_collection_names()

# Use the collection name from the config cell instead of repeating the literal
inventory_coll = mongo_db[MONGO_DB_COLLECTION]

print(f'Collection: {inventory_coll.name}')
print(f"Total no. of docs: {inventory_coll.count_documents({})}")

Collection: InventoryData
Total no. of docs: 538


In [22]:
# user query embedding function

# Embedding model instance; it is used by get_embedding() and passed to Chroma as the `embedding` argument
emb_model = SentenceTransformerEmbeddings(model_name="thenlper/gte-large")

def get_embedding(text: str) -> list[float]:
    """Embed a single piece of text with the notebook-level ``emb_model``.

    Args:
        text: The text (typically a user query) to embed.

    Returns:
        The embedding as a plain list of floats, or an empty list when
        ``text`` is ``None``, empty, or whitespace-only (a message is printed
        in that case).
    """
    if not text or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []

    # LangChain embedding wrappers expose embed_query()/embed_documents();
    # `.encode()` is a method of the raw sentence-transformers model, not of
    # the wrapper, and embed_query() already returns a list (no `.tolist()`).
    embedding = emb_model.embed_query(text)

    return list(embedding)

## Classical Search

In [23]:
# keyword search

def search(search_text, limit=3):
    """Run a MongoDB ``$text`` keyword search against ``inventory_coll``.

    Args:
        search_text: The keyword(s) to search for.
        limit: Maximum number of documents to return. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        A list with the matching documents (at most ``limit`` of them).
    """
    cursor = inventory_coll.find({"$text": {"$search": search_text}}).limit(limit)
    return list(cursor)

data = search('Servo-i')

# Print the field names of every hit; the set of fields can differ between documents
for hit in data:
    print(hit.keys())

dict_keys(['_id', 'Item Code', 'Item Description', 'Part No', 'Part Description', 'Model', 'Unit Of Measurement', 'Spare Part Type', 'Location', 'Part Category', 'Is Expiry date Required', 'Min Nos', 'Max Nos', ' Minimum Price Per Nos (RM) ', ' Maximum Price Per Nos (RM) ', 'Status', 'Expiry Age (In Month)', 'Current Stock Level'])
dict_keys(['_id', 'Item Code', 'Item Description', 'Part No', 'Part Description', 'Model', 'Unit Of Measurement', 'Spare Part Type', 'Location', 'Part Category', 'Is Expiry date Required', 'Min Nos', 'Max Nos', ' Minimum Price Per Nos (RM) ', ' Maximum Price Per Nos (RM) ', 'Brand', 'Status', 'Expiry Age (In Month)', 'Current Stock Level'])


## Vector Store (Chroma DB)

In [24]:
# 1. fetch all docs from the InventoryData collection
def fetch_all_documents():
    """Return every document in ``inventory_coll`` as a list (empty filter)."""
    return [record for record in inventory_coll.find({})]

# Load the whole collection into memory once; later cells convert `raw_data` to LangChain documents
raw_data = fetch_all_documents()
# Print the number of documents fetched
print(len(raw_data))

538


In [25]:
# 2. convert all 538 items to the Langchain Document data type (custom function)

# parse price
def parse_price(price_str):
    """Convert a price such as ``"MYR 3.622,50"`` or ``"MYR 3,622.50"`` to a float.

    The rightmost ``.`` or ``,`` decides how the number is read:

    * followed by exactly three digits -> it is a thousands separator
      (``"1.500"`` -> ``1500.0``, ``"3,622"`` -> ``3622.0``);
    * otherwise -> it is the decimal mark (``"9,60"`` -> ``9.6``,
      ``"3,622.50"`` -> ``3622.5``).

    Every other separator to its left is treated as a thousands separator.

    Args:
        price_str: The raw price. Normally a string, optionally carrying an
            ``MYR`` marker; plain numbers are accepted and returned as floats.

    Returns:
        The price as a float, or ``None`` when the value is ``None``, empty,
        or contains nothing besides whitespace / the ``MYR`` marker.

    Raises:
        ValueError: If the remaining text is not a number.
    """
    if price_str is None:
        return None

    # The value may already be numeric; nothing to clean up then
    if isinstance(price_str, (int, float)):
        return float(price_str)

    # remove the 'MYR' marker and surrounding whitespace
    clean_price = str(price_str).replace('MYR', '').strip()
    if not clean_price:
        return None

    # locate the rightmost separator, whichever character it is
    last_sep = max(clean_price.rfind('.'), clean_price.rfind(','))
    if last_sep == -1:
        return float(clean_price)

    # everything left of the rightmost separator can only contain grouping marks
    leading = clean_price[:last_sep].replace('.', '').replace(',', '')
    trailing = clean_price[last_sep + 1:]

    if len(trailing) == 3:
        # a three-digit tail is a thousands group, not a fraction
        return float(leading + trailing)

    return float(f"{leading}.{trailing}")

# convert to Langchain Documents
def _extract_part_no(raw_part_no):
    """Return the part number whether it is stored nested (``{'': value}``) or as a plain value."""
    if isinstance(raw_part_no, dict):
        return raw_part_no.get('', '')
    if raw_part_no is None:
        return ''
    return raw_part_no


def convert_to_langchain_docs(docs):
    """Turn raw inventory records into LangChain ``Document`` objects.

    ``page_content`` is built from the two descriptive fields
    (``Item Description`` and ``Part Description``); every other attribute is
    placed in ``metadata``.

    Args:
        docs: Iterable of dict-like inventory records.

    Returns:
        A list with one ``Document`` per input record, in input order.

    Raises:
        KeyError: If a record lacks one of the mandatory fields. Only
            ``Brand`` and ``Expiry Age (In Month)`` are optional and default
            to an empty string.
    """
    documents = []

    for doc in docs:

        # "Part No" is usually a nested dict keyed by '' in the source data;
        # a plain value is accepted too instead of raising AttributeError
        part_no = _extract_part_no(doc['Part No'])

        # the price columns carry surrounding spaces in their source names
        max_price_per_nos = parse_price(doc[' Maximum Price Per Nos (RM) '])
        min_price_per_nos = parse_price(doc[' Minimum Price Per Nos (RM) '])

        # construct page_content (from descriptive attributes)
        content = f"{doc['Item Description']}, {doc['Part Description']}"

        # construct metadata (keys are normalised: no surrounding spaces)
        metadata = {
            "Item Code": doc['Item Code'],
            "Location": doc['Location'],
            "Part No": part_no,
            "Part Category": doc['Part Category'],
            "Unit Of Measurement": doc['Unit Of Measurement'],
            "Spare Part Type": doc['Spare Part Type'],
            "Is Expiry Date Required": doc['Is Expiry date Required'],
            "Min Nos": doc['Min Nos'],
            "Max Nos": doc['Max Nos'],
            "Minimum Price Per Nos (RM)": min_price_per_nos,
            "Maximum Price Per Nos (RM)": max_price_per_nos,
            "Status": doc['Status'],
            "Brand": doc.get('Brand', ""),
            "Expiry Age (In Month)": doc.get('Expiry Age (In Month)', ""),
            "Current Stock Level": doc['Current Stock Level']
        }

        # Append to list as a new document
        documents.append(
            Document(page_content=content, metadata=metadata)
        )

    return documents

# Convert every fetched record; `transformed_docs` is what gets indexed into Chroma in the next cell
transformed_docs = convert_to_langchain_docs(raw_data)

In [26]:
# load documents (JSONLoader seems to be NOT supported for Windows)

# Build the Chroma store from the converted documents using `emb_model`.
# From this cell on, `db` is the Chroma vector store used by the retriever cells below.
# No persist directory is passed here.
db = Chroma.from_documents(documents=transformed_docs, embedding=emb_model)

In [27]:
# 3. test query

query = "I need a Fresenius 4008S Motor Assy"

# Each result is a (document, score) pair
docs = db.similarity_search_with_score(query)
best_doc, best_score = docs[0]

print(f"TOP MATCH: {best_doc.page_content}.\nSCORE: {best_score}. \nMETADATA: {best_doc.metadata}")
print('TOTAL DOCS: ', len(docs))

TOP MATCH: Motors/ Micromotors/ Carbon brushes, Fresenius 4008S Stepper Motor.
SCORE: 0.19810137152671814. 
METADATA: {'Brand': 'Fresenius', 'Current Stock Level': 0, 'Expiry Age (In Month)': 0, 'Is Expiry Date Required': 'No', 'Item Code': 'BP0018', 'Location': 'Company Site Office', 'Max Nos': 5, 'Maximum Price Per Nos (RM)': 3622.5, 'Min Nos': 1, 'Minimum Price Per Nos (RM)': 3150.0, 'Part Category': 'Biomedical', 'Part No': '01-0802-0502-005', 'Spare Part Type': 'Fast Moving Item', 'Status': 'Active', 'Unit Of Measurement': 'Unit'}
TOTAL DOCS:  4


## Basic Retriever

In [28]:
# 1. init retriever from Chroma DB

# Wrap the Chroma store (`db`) as a retriever that uses the "mmr" search type
retriever = db.as_retriever(search_type="mmr") # maximal marginal relevance

In [29]:
# 2. design system prompt

# The template has two placeholders the chain must supply:
#   {context} - filled from the retriever output
#   {query}   - the user's question, passed through unchanged
system_prompt_template = """ 

    You are the most powerful and skillfull expert in querying documents to find answers to user's questions.

    Your main task is to answer the USER QUERY based only on the provided CONTEXT.

    # CONTEXT
    {context}

    # USER QUERY
    {query}

"""

# Compile the raw string into a prompt object that the chains below pipe into the LLM
system_prompt = ChatPromptTemplate.from_template(system_prompt_template)

In [30]:
# 3. init OpenAI (or any other open source model)

# Only the API version and the deployment name are passed explicitly.
# NOTE(review): AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT are read in the
# config cell but not passed here - presumably the client picks them up from
# the environment populated by load_dotenv(); confirm.
oai = AzureChatOpenAI(
    openai_api_version=AZURE_OPENAI_VERSION,
    azure_deployment=AZURE_OPENAI_DEPLOYMENT_NAME,
)

In [31]:
# create chain

# Inputs for the prompt: the retriever supplies {context}; the user's
# question is forwarded unchanged as {query}
prompt_inputs = {"context": retriever, "query": RunnablePassthrough()}

# Reduce the chat model's reply to a plain string
answer_parser = StrOutputParser()

chain = prompt_inputs | system_prompt | oai | answer_parser

In [32]:
# Ask the basic (MMR retriever) chain a stock-level question
query = "How many parts in stock do we have for Door 22MM Bubble C.Holder 4008SV10?"
response = chain.invoke(query)

print(response)

The current stock level for Door 22MM Bubble C.Holder 4008SV10 is 3.


## Self-Querying Retriever

A self-querying retriever can query itself: it uses a query-constructing LLM chain (see below) to turn the user's question into a structured query, and then runs that structured query against its underlying vector store.

**BIGGEST BENEFIT**: The retriever searches **not only by semantic similarity**; it also **extracts filters** from the user query and **executes them** against the metadata of the stored documents.

You can also let the user query specify the **number of elements (n)** to fetch by passing `enable_limit=True` when creating the retriever.

!["Self-querying Retriever"](../images/self-querying.jpg)

In [33]:
# 1. metadata to adjust

# Inspect the metadata of the first retrieved document to see which fields
# need a description for the self-querying retriever.
# `invoke` is the entry point this notebook already uses for the self-query
# retriever; `get_relevant_documents` is deprecated in recent LangChain releases.
retriever.invoke(query)[0].metadata

{'Brand': 'Fresenius',
 'Current Stock Level': 3,
 'Expiry Age (In Month)': 0,
 'Is Expiry Date Required': 'No',
 'Item Code': 'BP0005',
 'Location': 'Company Site Office',
 'Max Nos': 3,
 'Maximum Price Per Nos (RM)': 826.8,
 'Min Nos': 1,
 'Minimum Price Per Nos (RM)': 800.0,
 'Part Category': 'Biomedical',
 'Part No': '01-0802-0504-011',
 'Spare Part Type': 'Just In Time',
 'Status': 'Active',
 'Unit Of Measurement': 'Unit'}

In [34]:
# 2. new metadata descriptions for all the fields/attributes

# One row per metadata field: (name, description, type).
# The names match the metadata keys written by convert_to_langchain_docs().
_FIELD_SPECS = [
    ("Brand", "The brand of the spare part", "string"),
    ("Current Stock Level", "The current stock level of the spare part", "integer"),
    ("Expiry Age (In Month)", "The expiry age of the spare part in months", "integer"),
    ("Is Expiry Date Required", "Indicates whether an expiry date is required for the spare part", "string"),
    ("Item Code", "The code assigned to the spare part", "string"),
    ("Location", "The location where the spare part is stored", "string"),
    ("Max Nos", "The maximum number of units of the spare part", "integer"),
    ("Maximum Price Per Nos (RM)", "The maximum price per unit of the spare part in RM", "float"),
    ("Min Nos", "The minimum number of units of the spare part", "integer"),
    ("Minimum Price Per Nos (RM)", "The minimum price per unit of the spare part in RM", "float"),
    ("Part Category", "The category of the spare part", "string"),
    ("Part No", "The part number of the spare part", "string"),
    ("Spare Part Type", "The type of the spare part (e.g., Fast Moving Item, Just In Time)", "string"),
    ("Status", "The status of the spare part (e.g., Active, Inactive)", "string"),
    ("Unit Of Measurement", "The unit of measurement for the spare part (e.g., Unit)", "string"),
]

# Field descriptions handed to the self-querying retriever
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in _FIELD_SPECS
]

# Short description of what the stored documents contain
content_description = "Spare Part data"


In [35]:
# 3. init SelfQueryRetriever w/ AzureOpenAI

# Same arguments as before, spelled out by keyword so each role is visible
sqretriever = SelfQueryRetriever.from_llm(
    llm=oai,
    vectorstore=db,
    document_contents=content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)


In [36]:
# 4. test the SelfQueryRetriever

query = "How many items of a Waterman brand are the in stock?"

response = sqretriever.invoke(query)

# Show the first hit and how many documents came back in total
first_hit = response[0]
print(f"TOP MATCH: {first_hit.page_content}")
print(f"METADATA: {first_hit.metadata}")
print(len(response))

TOP MATCH: Filters, Black Carbon Filter
METADATA: {'Brand': 'Waterman', 'Current Stock Level': 1, 'Expiry Age (In Month)': '', 'Is Expiry Date Required': 'No', 'Item Code': 'BP0012', 'Location': 'Company Site Office', 'Max Nos': 10, 'Maximum Price Per Nos (RM)': 9.6, 'Min Nos': 0, 'Minimum Price Per Nos (RM)': 8.0, 'Part Category': 'Biomedical', 'Part No': '01-1805-0101-004', 'Spare Part Type': 'One Off', 'Status': 'Active', 'Unit Of Measurement': 'Unit'}
4


In [37]:
# 5. create chain with a SelfQueryRetriever --> can query not only by content, but also by metadata

# Same pipeline as the basic chain, but {context} now comes from the self-querying retriever
self_query_inputs = {"context": sqretriever, "query": RunnablePassthrough()}
chain = self_query_inputs | system_prompt | oai | StrOutputParser()

query = "What's the min and max price per nos for a spare part with the code BP0002?"
response = chain.invoke(query)

print(response)

The minimum and maximum price per Nos for a spare part with the code BP0002 are RM 500.0 and RM 650.0 respectively.


!["Self-querying results from MongoDB (reference)"](../images/self-querying-results.png)