In [1]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
import os
import pandas as pd
from groq import Groq
from docx import Document
from dotenv import load_dotenv


In [2]:
# Load .env file
load_dotenv()

# Initialize Groq client
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

### Data preprocessing

In [3]:
documents = SimpleDirectoryReader(r"C:\Users\Bollu\genai_rag\portifolio_docs").load_data()

In [80]:
# print(documents[0].text)

In [6]:
text_ = documents[0].text

prompt = f"""
### Instructions:

You are a data extraction and summarization assistant helping prepare data for a Retrieval-Augmented Generation (RAG) system.

Your goal is to extract and structure information from a mutual fund portfolio statement and convert it into **detailed, uniformly-sized, textual summaries** (approximately 500 tokens each). Ignore any disclaimers or legal boilerplate.

Follow these formatting and content rules carefully:

---

### For each mutual fund entry:
- Begin with owner's name.
- Title the fund.
- Summarize the following in a detailed narrative form (same chunk size of ~500 tokens):
  • Fund name and category (e.g., Equity, Liquid)
  • Number of units held
  • Purchase NAV and value
  • Current NAV and value
  • Absolute gain and % return
  • CAGR and holding period in days
  • Any special notes (e.g., underperformance or exceptionally high gain)

### After summarizing all funds:
Provide a **final portfolio summary** (same chunk size of ~500 tokens), including:
  • Star with the owner's name.
  • Total number of funds across categories
  • Combined original and current investment value
  • Total gain and percentage return
  • Average CAGR
  • Any important insights (e.g., high-performing funds, poor performers)

### Additional Instructions:
  • All fund summaries and portfolio summary must be approximately the same length (around 500 tokens or ~350–400 words).
  • Do not include disclaimers or index values (e.g., Sensex or NIFTY).
  • Output in plain text format, suitable for use with a vector database.
   •Use all the synnonymns for the headings to ensure the user gets what is required.

---

### Document:
{text_}
"""



In [7]:
response = client.chat.completions.create(
messages=[
    {
        "role": "user",
        "content": prompt,
    }
],
model="llama-3.3-70b-versatile",
temperature=0.2,
max_completion_tokens=4096,
)

result = response.choices[0].message.content

In [6]:
print(result)

### Mutual Fund Summaries for Durgaprasad Bollu

#### HDFC Mid Cap Opportunities Fund (G)
Durgaprasad Bollu holds an investment in the HDFC Mid Cap Opportunities Fund (G), which falls under the Equity category. As of the valuation date, the number of units held is 1,000.0000, with an average purchase NAV of 10.00, resulting in a total purchase value of 10,000.00. The current NAV stands at 186.5390, leading to a current value of 1,86,539.00. This investment has yielded an absolute gain of 1,76,539.00 and a percentage return of 1,765.39%. The holding period is approximately 6,535 days, with a CAGR of 17.75%. This fund has performed exceptionally well, indicating a strong growth trajectory over the holding period.

#### Mirae Asset ELSS Tax Saver Fund Reg (G)
Durgaprasad Bollu is also invested in the Mirae Asset ELSS Tax Saver Fund Reg (G), categorized under Equity. The investment comprises 14,001.2930 units, purchased at an average NAV of 20.00, amounting to a purchase value of 2,80,000.

#### combined code:

In [8]:
# Paths
input_folder = os.path.expanduser(r"C:\Users\Bollu\genai_rag\portifolio_docs")
output_folder = os.path.expanduser(r"C:\Users\Bollu\genai_rag\processed_docs")

# Load PDF documents
documents = SimpleDirectoryReader(input_folder).load_data()

# Process each document
for doc in documents:
    # Get the document name (assuming metadata exists or infer from filename)
    filename = getattr(doc, "metadata", {}).get("file_name", None)
    if not filename:
        # fallback to index-based name if metadata is missing
        index = documents.index(doc)
        filename = f"document_{index + 1}.pdf"

    doc_name = os.path.splitext(filename)[0]
    word_doc_path = os.path.join(output_folder, f"{doc_name}.docx")

    # Skip if Word document already exists
    if os.path.exists(word_doc_path):
        print(f"Skipping {doc_name}: Word document already exists.")
        continue

    print(f"Processing {doc_name}...")

    text_ = doc.text

    prompt = f"""
    ### Instructions:

    You are a data extraction and summarization assistant helping prepare data for a Retrieval-Augmented Generation (RAG) system.

    Your goal is to extract and structure information from a mutual fund portfolio statement and convert it into **detailed, uniformly-sized, textual summaries** (approximately 500 tokens each). Ignore any disclaimers or legal boilerplate.

    Follow these formatting and content rules carefully:

    ---

    Do not use the word summary. Only use it for the final portfolio ummary.

    ### For each mutual fund entry:
    - Begin with owner's name.
    - Title the fund.
    - Summarize the following in a detailed narrative form (same chunk size of ~500 tokens):
    • Fund name and category (e.g., Equity, Liquid)
    • Number of units held
    • Purchase NAV and value
    • Current NAV and value
    • Absolute gain and % return
    • CAGR and holding period in days
    • Any special notes (e.g., underperformance or exceptionally high gain)

    ### After summarizing all funds:
    Provide a **final portfolio summary** (same chunk size of ~500 tokens), including:
    • Star with the owner's name.
    • Total number of funds across categories
    • Combined original and current investment value
    • Total gain and percentage return
    • Average CAGR
    • Any important insights (e.g., high-performing funds, poor performers)

    ### Additional Instructions:
    • All fund summaries and portfolio summary must be approximately the same length (around 500 tokens or ~350–400 words).
    • Do not include disclaimers or index values (e.g., Sensex or NIFTY).
    • Output in plain text format, suitable for use with a vector database.
    •Use all the synnonymns for the headings to ensure the user gets what is required.

    ---

    ### Document:
    {text_}"""

    try:
        # LLM Call
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama-3.3-70b-versatile",  # or your model
            temperature=0.2,
            max_tokens=4096
        )
        #print(result,"=================")
        result = response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error processing {doc_name}: {e}")
        continue

    # Write result to Word Document
    docx_doc = Document()
    docx_doc.add_heading(f"Extracted Portfolio Data - {doc_name}", level=1)
    docx_doc.add_paragraph(result)
    docx_doc.save(word_doc_path)

    print(f"Saved: {word_doc_path}")

Skipping durga prasad: Word document already exists.
Skipping Hariprasad: Word document already exists.
Skipping Lakshmi Devi: Word document already exists.
Skipping nagalaxmi: Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping Sireesha: Word document already exists.


### RAG

In [9]:
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

Settings.llm = None
Settings.chunk_size = 500
Settings.chunk_overlap = 0

LLM is explicitly disabled. Using MockLLM.


In [10]:
documents = SimpleDirectoryReader(r"C:\Users\Bollu\genai_rag\processed_docs").load_data()

In [11]:
index = VectorStoreIndex.from_documents(documents)

In [12]:
# set number of docs to retreive
top_k = 1

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=top_k,
)

In [13]:
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
)

In [14]:
post = "Final Portfolio Summary of Bollu Sireesha"
response = query_engine.query(post)
print(response.source_nodes[0].text)

BOLLU SIREESHA
Final Portfolio Overview
The portfolio consists of 13 equity mutual funds, with a total investment value of 9,82,501.73 and a current value of 12,44,803.65. The total gain is 2,62,301.91, and the percentage return is 26.70. The average CAGR is 17.27, and the average holding period is 497 days. The portfolio has performed well, with several funds showing significant gains in value over their respective holding periods. The top-performing funds are the Axis Small Cap Fund Reg (G), Kotak Emerging Equity (G), and Parag Parikh Flexi Cap Fund Reg (G), which have returns of 41.36, 47.72, and 42.46, respectively. The underperforming funds are the Edelweiss Business Cycle Fund Reg (G), Nippon India Small Cap Fund (G), and Quant Small Cap Fund (G), which have returns of 0.25, 2.62, and -1.36, respectively. Overall, the portfolio has shown a significant gain in value over the holding period, with an average CAGR of 17.27. The portfolio's performance is a positive indicator of its g

#### Function

In [15]:
def get_retrieved_context(user_query):
    response = query_engine.query(user_query)
    return response.source_nodes[0].text

In [16]:
user_input = input("You: ")
# if user_input.lower() in ['exit', 'quit']:
#     print("Chatbot: Goodbye!")

print("User:", user_input, "\n")

retrieved_text = get_retrieved_context(user_input)
print("Retrieved Context:", retrieved_text, "\n")

system_prompt = f"""
You are a helpful assistant. Use the following context to answer the user's question:

--- Start of context ---
{retrieved_text}
--- End of context ---

Now answer the following question accurately and helpfully:
{user_input}
"""

messages = [
    {
        "role": "system", 
        "content": system_prompt
    }]

response = client.chat.completions.create(
                messages=messages,
                model="llama-3.3-70b-versatile",  # Groq’s model name
                temperature=0.2,
                max_tokens=4096
            )

reply = response.choices[0].message.content
print(f"Chatbot: {reply}")

User: Summarize overall valuation of Sireesha Bollu 

Retrieved Context: BOLLU SIREESHA
Final Portfolio Overview
The portfolio consists of 13 equity mutual funds, with a total investment value of 9,82,501.73 and a current value of 12,44,803.65. The total gain is 2,62,301.91, and the percentage return is 26.70. The average CAGR is 17.27, and the average holding period is 497 days. The portfolio has performed well, with several funds showing significant gains in value over their respective holding periods. The top-performing funds are the Axis Small Cap Fund Reg (G), Kotak Emerging Equity (G), and Parag Parikh Flexi Cap Fund Reg (G), which have returns of 41.36, 47.72, and 42.46, respectively. The underperforming funds are the Edelweiss Business Cycle Fund Reg (G), Nippon India Small Cap Fund (G), and Quant Small Cap Fund (G), which have returns of 0.25, 2.62, and -1.36, respectively. Overall, the portfolio has shown a significant gain in value over the holding period, with an average CA

In [None]:
Hello my name is Sireesha, what is my value in HDFC Multi Cap Fund Reg fund?

Object `fund` not found.
