In [2]:
import os
from openai import OpenAI

In [18]:
# Read the OpenRouter credential from the environment. An unset variable
# yields "" so the status line below reports it instead of raising.
open_router_key = os.getenv("OPENROUTER_API_KEY", "")
print('OpenRouter key found.' if open_router_key else 'OpenRouter key not found.')

# Endpoint and model identifier used for every chat completion request.
BASE_URL = "https://openrouter.ai/api/v1"
MODEL = "google/gemini-3-flash-preview"

OpenRouter key found.


In [4]:
# OpenAI SDK client configured with BASE_URL as its endpoint and the key read
# from the OPENROUTER_API_KEY environment variable.
openrouter = OpenAI(api_key=open_router_key, base_url=BASE_URL)

In [None]:
# response = openrouter.chat.completions.create(model = MODEL, messages = [
#     {"role": "user", "content": "Hello from OpenRouter via OpenAI SDK"}])
# print(response.choices[0].message.content)

In [None]:
# eval_parameters = [
#     {"Manufacturing Capacity Utilization": "Closer to 100% utilization (to witness economies of scale and reduce cost per unit)"}
#     ]   

In [5]:
import sys
from pathlib import Path
import chromadb
from sentence_transformers import SentenceTransformer

# Location of the persistent ChromaDB store. Set IPO_CHECKER_CHROMA_DB_PATH to
# point at a different directory; the default is the original developer path,
# so an environment without the variable behaves exactly as before.
DB_PATH = os.getenv(
    "IPO_CHECKER_CHROMA_DB_PATH",
    r"C:\Users\rauna\projects\My Projects\IPO_Checker\ChromaDB",
)
chroma_client = chromadb.PersistentClient(path=DB_PATH)

# Name of the collection looked up on the client in a later cell.
COLLECTION_NAME = 'dhrp_embeddings_collection'

# DRHP PDF file path (override with IPO_CHECKER_DRHP_DIR).
PDF_FILE_PATH = os.getenv(
    "IPO_CHECKER_DRHP_DIR",
    r'C:\Users\rauna\projects\My Projects\IPO_Checker\DRHP',
)

# Sentence-transformers model used to embed questions before querying.
EMBEDDING_MODEL = 'multi-qa-mpnet-base-dot-v1'
embedding_model = SentenceTransformer(EMBEDDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Fetch the collection named COLLECTION_NAME from the persistent Chroma client.
dhrp_doc_collection = chroma_client.get_collection(COLLECTION_NAME)

In [7]:
# Single sample question, keyed by the evaluation parameter it belongs to.
question = {
    "Manufacturing Capacity Utilization":
        "What is the current manufacturing capacity utilization rate?",
}

In [8]:
# print(questions[0])
# print([key for key in questions[0].keys()])
# print([value for value in questions[0].values()])

# Print each parameter label followed by its question text.
# NOTE: the loop variable reuses the name `question`, so after this cell runs
# `question` is bound to the last question *string*, not the dict that was
# iterated. The later `get_relevant_docs(question, ...)` cells hand that name
# straight to `embedding_model.encode`, so they rely on this rebinding;
# renaming the loop variable here alone would change what they receive.
# It also means this cell cannot be re-run without first re-running the cell
# that defines the dict (a str has no `.items()`).
for key,question in question.items():
    print(key)
    print(question)

Manufacturing Capacity Utilization
What is the current manufacturing capacity utilization rate?


In [9]:
def get_relevant_docs(question, collection, top_k=5):
    """Retrieve the ``top_k`` stored entries most similar to ``question``.

    Args:
        question: Question text; it is embedded with the module-level
            ``embedding_model`` before querying.
        collection: Object exposing
            ``query(query_embeddings=..., n_results=...)``. This is the
            collection that gets searched.
        top_k: Number of results requested from the collection.

    Returns:
        Whatever ``collection.query`` returns, unmodified.
    """
    print('reteriving sentence documents for company...')

    embedded_question = embedding_model.encode(question)
    print("Embedded question:")

    # Query the collection supplied by the caller. The previous version
    # ignored this argument and always searched the global
    # ``dhrp_doc_collection``, so passing any other collection had no effect.
    return collection.query(
        query_embeddings=[embedded_question],
        n_results=top_k,
    )

In [None]:
# question = {"Manufacturing Capacity Utilization": "What is the current manufacturing capacity utilization rate?"}
# results = get_relevant_docs(question, dhrp_doc_collection, top_k=5)
# print(results)

In [None]:
# `question` has to be the plain question string at this point (the earlier
# print loop rebinds the name from the dict to its value), because
# get_relevant_docs embeds whatever it is given as-is.
documents = get_relevant_docs(question, dhrp_doc_collection)
print(documents)

In [None]:
# Inspect the raw query result. Each field is read as a list of lists (hence
# the `[[]]` default) and `[0]` takes the first inner list.
docs = documents.get("documents", [[]])[0]
print(docs)
metadatas = documents.get("metadatas", [[]])[0] 
print(metadatas)
# Earlier experiment for extracting the source name from each metadata entry:
# metadatas = item for item in documents.get("metadatas", [[]])[0] 
# print(metadatas.get("source", "unknown_source.txt").replace(".txt", ""))

In [11]:
import json

In [12]:
def _first_query_hits(result, field):
    """Return the first inner list stored under ``result[field]``, or ``[]``."""
    per_query = result.get(field) or []
    return (per_query[0] or []) if per_query else []


def build_context_json(documents):
    """Serialize a query result into the JSON context string used in the prompt.

    Args:
        documents: Mapping whose ``"documents"`` and ``"metadatas"`` entries
            are lists of lists; only the first inner list of each is used.
            A missing, ``None`` or empty field is treated as "no hits".

    Returns:
        A JSON string (4-space indent, non-ASCII characters kept as-is) of the
        form ``{"context_results": [{"source": ..., "page": ..., "content": ...}]}``
        with one entry per document, in result order.
    """
    print('building context in JSON...')

    docs = _first_query_hits(documents, "documents")
    metadatas = _first_query_hits(documents, "metadatas")

    context_list = []
    for index, doc in enumerate(docs):
        # Metadata may be absent for the whole result or None for a single
        # hit; fall back to an empty dict so the document is still included
        # with placeholder source/page instead of raising or being dropped.
        meta = (metadatas[index] if index < len(metadatas) else None) or {}
        context_list.append({
            "source": meta.get("source", "unknown_source.txt").replace(".txt", ""),
            "page": meta.get("page", "Unknown page"),
            # A None document becomes empty content rather than an AttributeError.
            "content": (doc or "").strip(),
        })

    return json.dumps({"context_results": context_list}, indent=4, ensure_ascii=False)


In [None]:
# End-to-end check of retrieval plus JSON formatting for the sample question.
print(
    build_context_json(
        get_relevant_docs(question, dhrp_doc_collection)
    )
)

In [14]:
def call_llm(MODEL, system_message, user_message):
    """Send one system + user exchange through the ``openrouter`` client.

    Args:
        MODEL: Model identifier passed as ``model`` to the completion call.
        system_message: Content of the system turn.
        user_message: Content of the user turn.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    system_turn = dict(role="system", content=system_message)
    user_turn = dict(role="user", content=user_message)
    completion = openrouter.chat.completions.create(
        model=MODEL,
        messages=[system_turn, user_turn],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [15]:
## rag pipeline

def rag_pipeline(parameters, collection):
    """Answer every question in ``parameters`` using retrieved context.

    Args:
        parameters: Iterable of dicts, each mapping a label to its question
            text.
        collection: Passed through to ``get_relevant_docs`` for retrieval.

    Returns:
        Dict mapping each label to the text returned by ``call_llm``. A label
        that occurs more than once keeps the answer from its last occurrence.
    """
    # The prompt's continuation lines carry a 16-space indent; it is spelled
    # out here so the text sent to the model stays exactly the same.
    pad = " " * 16
    prompt_prefix = (
        "You are a Senior IPO Investment Analyst and SEBI-registered Research Analyst equivalent. \n"
        + pad + "Use Indian IPO Draft Red Herring Prospectus (DRHP) documents to answer the questions accurately.\n"
        + pad + "\n"
        + pad + "Use the following context to answer the questions.\n\nContext:\n\n"
        + pad
    )

    # Flatten the list of dicts into one stream of (label, question text) pairs.
    labelled_questions = (
        (label, text)
        for group in parameters
        for label, text in group.items()
    )

    answers = {}
    for label, text in labelled_questions:
        retrieved = get_relevant_docs(text, collection)
        context_json = build_context_json(retrieved)
        system_message = f"{prompt_prefix}{context_json}"

        answer = call_llm(MODEL, system_message, text)
        print('llm answers')
        answers[label] = answer

    return answers

In [16]:
# Evaluation checklist passed to rag_pipeline: a list of single-entry dicts,
# each mapping a parameter label to the question asked about it. The label
# becomes the key of the corresponding answer in the pipeline's result.
eval_questions = [
    {"Manufacturing Capacity Utilization": "What is the current utilization level of the company’s manufacturing capacity (to witness economies of scale and reduce cost per unit)?"},

    {"Raw Material Source Reliability": "How reliable are the company’s raw material sources, and are there potential trade disruptions due to sourcing from sensitive locations such as China?"},

    {"Management Future Plans": "What are the company management’s clearly stated future plans and strategies?"},

    {"GDP Growth Rate (Annual)": "What is the current GDP growth rate and how does it compare with other relevant economies (e.g., India vs China)?"},

    {"Labor Cost": "What is the comparative labor cost for the regions in which the company operates (e.g., India vs China)?"},

    {"Macro Indicators": "What do the key macro indicators—unemployment rate, CRR, balance of trade, and current account to GDP percentage—indicate about the economic environment?"},

    {"Industry Growth Rate (CAGR)": "What is the current CAGR of the industry, and is the overall industry performing well compared to historical or expected growth?"},

    {"Expected Industry Growth Rate": "What is the expected future industry growth rate, and is it higher than the current growth rate?"},

    {"Revenue, EBITDA, and Profit After Tax (PAT) Growth Rate": "What are the company’s Revenue, EBITDA, and PAT growth rates, and are they higher than the industry growth rate?"},

    {"Debt to Equity Ratio": "What is the company’s current debt-to-equity ratio, and is it within the generally acceptable threshold of up to 2:1?"},

    {"Cash Flow from Operations (CFO)": "Is the company’s cash flow from operations positive and growing at a healthy rate?"},

    {"P/E Ratio (Price to Earning Ratio)": "How does the company’s P/E ratio compare with listed peers, and is the valuation justified (or should P/S be used if the company is loss-making)?"},

    {"Return on Net Worth (RONW)": "What is the company’s Return on Net Worth, and how does it compare to peer companies?"},

    {"Comparable Transaction Multiple (CTM)": "Is the IPO price justified when compared with recent share issuance prices based on Comparable Transaction Multiples?"},

    {"Gray Market Premium (GMP)": "What is the current Gray Market Premium (GMP), and what expected listing price does it imply (while noting it is not guaranteed)?"},

    {"Type of Issue": "Is the IPO structured as a Fresh Issue, an Offer For Sale (OFS), or a mix—and what is the proportion of each?"},

    {"Use of Proceeds": "How does the company intend to use the IPO proceeds, and what portion is allocated toward expansion versus general corporate purposes?"},

    {"Litigation/Investigation": "Are there any ongoing litigations or investigations involving the company or promoters, especially by serious agencies like CBI or SFIO?"},

    {"Customer Concentration": "What is the level of customer concentration, and do a small number of clients contribute disproportionately to revenue (e.g., top 10 customers = 90%)?"}
]


In [17]:
# Run every evaluation question through the pipeline. As the last expression
# in the cell, the returned {label: answer} dict is shown as the cell output.
rag_pipeline(eval_questions, dhrp_doc_collection)

reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:
building context in JSON...
llm answers
reteriving sentence documents for company...
Embedded question:


{'Manufacturing Capacity Utilization': "Based on the DRHP of KSH International Limited, here is the analysis of the company's production capacity and utilization levels:\n\n### **1. Capacity and Production Data**\nAs per the Operating KPIs provided in the document (pages 129 and 360), the production capacity and sales volumes are as follows:\n\n| Period | Production Capacity (MT) | Sales Volume (MT) | Estimated Utilization (%) |\n| :--- | :--- | :--- | :--- |\n| **Three-month ended June 30, 2025** | 29,045* | 6,114 | **84.20%** (Annualized) |\n| **Fiscal 2025** | 29,045 | 23,324 | **80.30%** |\n| **Fiscal 2024** | 28,436 | 21,495 | **75.59%** |\n| **Fiscal 2023** | 25,265 | 17,645 | **69.84%** |\n\n*\\*Note: The capacity for the three-month period is shown as the total installed capacity available for the year.*\n\n### **2. Quantitative Trends in Utilization**\n*   **Improving Efficiency:** The company has demonstrated a consistent upward trend in capacity utilization, rising from **69