In [33]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
import os
import pandas as pd
from groq import Groq
from docx import Document
from dotenv import load_dotenv


In [34]:
# Load .env file
load_dotenv()

# Initialize Groq client
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

### Data preprocessing

In [23]:
documents = SimpleDirectoryReader(r"C:\Users\Bollu\genai_rag\portfolio_docs").load_data()

In [80]:
# print(documents[0].text)

In [35]:
text_ = documents[0].text

prompt = f"""
### Instructions:

You are a data extraction and summarization assistant helping prepare data for a Retrieval-Augmented Generation (RAG) system.

Your goal is to extract and structure information from a mutual fund portfolio statement and convert it into **detailed, uniformly-sized, textual summaries** (approximately 500 tokens each). Ignore any disclaimers or legal boilerplate.

Follow these formatting and content rules carefully:

---

### For each mutual fund entry:
- Begin with owner's name.
- Title the fund.
- Summarize the following in a detailed narrative form (same chunk size of ~500 tokens):
  • Fund name and category (e.g., Equity, Liquid)
  • Number of units held
  • Purchase NAV and value
  • Current NAV and value
  • Absolute gain and % return
  • CAGR and holding period in days
  • Any special notes (e.g., underperformance or exceptionally high gain)

### After summarizing all funds:
Provide a **final portfolio summary** (same chunk size of ~500 tokens), including:
  • Star with the owner's name.
  • Total number of funds across categories
  • Combined original and current investment value
  • Total gain and percentage return
  • Average CAGR
  • Any important insights (e.g., high-performing funds, poor performers)

### Additional Instructions:
  • All fund summaries and portfolio summary must be approximately the same length (around 500 tokens or ~350–400 words).
  • Do not include disclaimers or index values (e.g., Sensex or NIFTY).
  • Output in plain text format, suitable for use with a vector database.
   •Use all the synnonymns for the headings to ensure the user gets what is required.

---

### Document:
{text_}
"""



In [36]:
response = client.chat.completions.create(
messages=[
    {
        "role": "user",
        "content": prompt,
    }
],
model="llama-3.3-70b-versatile",
temperature=0.2,
max_completion_tokens=4096,
)

result = response.choices[0].message.content

In [7]:
print(result)

### Mutual Fund Summaries for Durgaprasad Bollu

#### HDFC Mid Cap Opportunities Fund (G)
Durgaprasad Bollu holds an investment in the HDFC Mid Cap Opportunities Fund (G), which falls under the Equity category. As of the valuation date, the number of units held is 1,000.0000, with an average purchase NAV of 10.00, resulting in a total purchase value of 10,000.00. The current NAV stands at 186.5390, leading to a current value of 1,86,539.00. There has been no dividend gain, and the absolute return is 1,76,539.00, with a percentage return of 1,765.39%. The holding period is 6,535 days, and the CAGR is 17.75%. This fund has shown significant growth, indicating a successful investment choice.

The performance of this fund is noteworthy, given its substantial absolute and percentage returns. The long holding period and considerable CAGR underscore the potential of this investment. It is essential to monitor the fund's future performance to ensure it continues to align with Durgaprasad Bollu

#### combined code:

In [25]:
# Paths
input_folder = os.path.expanduser(r"C:\Users\Bollu\genai_rag\portfolio_docs")
output_folder = os.path.expanduser(r"C:\Users\Bollu\genai_rag\processed_docs")

# Load PDF documents
documents = SimpleDirectoryReader(input_folder).load_data()

# Process each document
for doc in documents:
    # Get the document name (assuming metadata exists or infer from filename)
    filename = getattr(doc, "metadata", {}).get("file_name", None)
    if not filename:
        # fallback to index-based name if metadata is missing
        index = documents.index(doc)
        filename = f"document_{index + 1}.pdf"

    doc_name = os.path.splitext(filename)[0]
    word_doc_path = os.path.join(output_folder, f"{doc_name}.docx")

    # Skip if Word document already exists
    if os.path.exists(word_doc_path):
        print(f"Skipping {doc_name}: Word document already exists.")
        continue

    print(f"Processing {doc_name}...")

    text_ = doc.text

    prompt = f"""
    ### Instructions:

    You are a data extraction and summarization assistant helping prepare data for a Retrieval-Augmented Generation (RAG) system.

    Your goal is to extract and structure information from a mutual fund portfolio statement and convert it into **detailed, uniformly-sized, textual summaries** (approximately 500 tokens each). Ignore any disclaimers or legal boilerplate.

    Follow these formatting and content rules carefully:

    ---

    Do not use the word summary. Only use it for the final portfolio ummary.

    ### For each mutual fund entry:
    - Begin with owner's name.
    - Title the fund.
    - Summarize the following in a detailed narrative form (same chunk size of ~500 tokens):
    • Fund name and category (e.g., Equity, Liquid)
    • Number of units held
    • Purchase NAV and value
    • Current NAV and value
    • Absolute gain and % return
    • CAGR and holding period in days
    • Any special notes (e.g., underperformance or exceptionally high gain)

    ### After summarizing all funds:
    Provide a **final portfolio summary** (same chunk size of ~500 tokens), including:
    • Star with the owner's name.
    • Total number of funds across categories
    • Combined original and current investment value
    • Total gain and percentage return
    • Average CAGR
    • Any important insights (e.g., high-performing funds, poor performers)

    ### Additional Instructions:
    • All fund summaries and portfolio summary must be approximately the same length (around 500 tokens or ~350–400 words).
    • Do not include disclaimers or index values (e.g., Sensex or NIFTY).
    • Output in plain text format, suitable for use with a vector database.
    •Use all the synnonymns for the headings to ensure the user gets what is required.

    ---

    ### Document:
    {text_}"""

    try:
        # LLM Call
        response = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="llama-3.3-70b-versatile",  # or your model
            temperature=0.2,
            max_tokens=4096
        )
        #print(result,"=================")
        result = response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error processing {doc_name}: {e}")
        continue

    # Write result to Word Document
    docx_doc = Document()
    docx_doc.add_heading(f"Extracted Portfolio Data - {doc_name}", level=1)
    docx_doc.add_paragraph(result)
    docx_doc.save(word_doc_path)

    print(f"Saved: {word_doc_path}")

Skipping durga prasad: Word document already exists.
Skipping Hariprasad: Word document already exists.
Skipping Lakshmi Devi: Word document already exists.
Skipping nagalaxmi: Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping neelima (mirae): Word document already exists.
Skipping Sireesha: Word document already exists.


### RAG

In [10]:
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

Settings.llm = None
Settings.chunk_size = 500
Settings.chunk_overlap = 0

LLM is explicitly disabled. Using MockLLM.


In [26]:
documents = SimpleDirectoryReader(r"C:\Users\Bollu\genai_rag\processed_docs").load_data()

Failed to load file C:\Users\Bollu\genai_rag\processed_docs\~$rga prasad.docx with error: File is not a zip file. Skipping...


In [27]:
index = VectorStoreIndex.from_documents(documents)

In [28]:
# set number of docs to retreive
top_k = 1

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=top_k,
)

In [29]:
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
)

In [30]:
post = "Final Portfolio Summary of Bollu Sireesha"
response = query_engine.query(post)
print(response.source_nodes[0].text)

The average purchase NAV is 58.64, and the total purchase value is 1,12,500.24. As of the current date, the NAV is 83.5423, and the current value is 1,60,268.30. The absolute gain is 47,768.06, and the percentage return is 42.46. The holding period is 676 days, with a CAGR of 19.69. This fund has performed exceptionally well, with significant gains over a relatively short period.

BOLLU SIREESHA
Quant Small Cap Fund (G) Overview
The Quant Small Cap Fund (G) is an equity mutual fund with 137.2600 units held. The average purchase NAV is 254.99, and the total purchase value is 35,000.34. As of the current date, the NAV is 251.5130, and the current value is 34,522.67. The absolute gain is -477.66, and the percentage return is -1.36. The holding period is 208 days, with a CAGR of -2.40. This fund has underperformed, with losses over a short period.

BOLLU SIREESHA
Final Portfolio Summary
The portfolio consists of 13 equity mutual funds, with a total purchase value of 9,82,501.73 and a curre

#### Function

In [31]:
def get_retrieved_context(user_query):
    response = query_engine.query(user_query)
    return response.source_nodes[0].text

In [32]:
user_input = input("You: ")
# if user_input.lower() in ['exit', 'quit']:
#     print("Chatbot: Goodbye!")

print("User:", user_input, "\n")

retrieved_text = get_retrieved_context(user_input)
print("Retrieved Context:", retrieved_text, "\n")

system_prompt = f"""
You are a helpful assistant. Use the following context to answer the user's question:

--- Start of context ---
{retrieved_text}
--- End of context ---

Now answer the following question accurately and helpfully:
{user_input}
"""

messages = [
    {
        "role": "system", 
        "content": system_prompt
    }]

response = client.chat.completions.create(
                messages=messages,
                model="llama-3.3-70b-versatile",  # Groq’s model name
                temperature=0.2,
                max_tokens=4096
            )

reply = response.choices[0].message.content
print(f"Chatbot: {reply}")

User: Summarize overall portfolio valuation of DUrga Prasad Bollu 

Retrieved Context: The first investment consists of 2,699.8650 units, with an average purchase NAV of 10.00, resulting in a purchase value of 27,000.00. The current NAV is 10.6820, leading to a current value of 28,839.96. The dividend gain is 0.00, and the absolute return is 1,839.96, with a percentage return of 6.81%. The holding period for this investment is 105 days, and the CAGR is 23.69%. The second investment comprises 2,199.9050 units, with an average purchase NAV of 9.82, resulting in a purchase value of 21,599.99. The current NAV is 10.6820, leading to a current value of 23,499.39. The dividend gain is 0.00, and the absolute return is 1,899.39, with a percentage return of 8.79%. The holding period for this investment is 45 days, and the CAGR is 71.33%. These investments demonstrate the potential for growth in small-cap funds, despite the relatively short holding periods.

Durgaprasad Bollu 
Quant Small Cap Fun

In [None]:
Hello my name is Sireesha, what is my value in HDFC Multi Cap Fund Reg fund?

Object `fund` not found.
