<h1> Retrieve data from qdrant</h1>


In [1]:
from src.gen_pipeline import GenPipeline
from src.utils import make_filter

gen_pipeline = GenPipeline()
index = gen_pipeline._get_qdrant_index()

  from .autonotebook import tqdm as notebook_tqdm



In [14]:
query = "provide me Investment Portfolio of Alfalah GHP Islamic Dedicated Equity Fund - Compliance Report"
retriever = index.as_retriever(similarity_top_k=40, filters=make_filter(query))
retrieve_nodes = retriever.retrieve(query)
retrieve_nodes

[NodeWithScore(node=TextNode(id_='dd9d2192-650e-4b4e-8e01-553b232e0909', embedding=None, metadata={'filename': 'Alfalah_assist_all_tabulor_md_data.md', 'file_id': '0297846f-8d65-49d4-b6a6-dd5f8c77ba46'}, excluded_embed_metadata_keys=['file_id'], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='0297846f-8d65-49d4-b6a6-dd5f8c77ba46', node_type=None, metadata={'filename': 'Alfalah_assist_all_tabulor_md_data.md'}, hash=None)}, metadata_template='{key}: {value}', metadata_separator='\n', text="Alfalah GHP Alpha Fund - Compliance Report\nConventional Scheme  \n| Description                   | Value                     |\n| ----------------------------- | ------------------------- |\n| Applicable NAV Date           | Wednesday, April 30, 2025 |\n| Net Assets                    | 1,761,087,202             |\n| Equity Investment             | 1,760,004,443             |\n| Cash Balance                  | 53,021,832                |\n| Equit

<h1>Format retrieved chunks</h1>


In [4]:
def format_retrieved_chunks(retrieved_chunks):
    formatted_texts = []

    for node_with_score in retrieved_chunks:
        node = node_with_score.node
        metadata = node.metadata

        # Extract metadata
        year = metadata.get("year", None)
        month = metadata.get("month", None)
        filename = metadata.get("filename", "N/A")
        text_metadata = metadata.get("text_metadata", None)
        text_content = node.text

        # Format text according to the desired output
        formatted_text = ""

        if year:
            formatted_text += f"year: {year}\n"
        if month:
            formatted_text += f"month: {month}\n"

        formatted_text += f"filename: {filename}\n"

        if text_metadata:
            formatted_text += f"text_content: {text_content}\n{text_metadata}\n------------------------------"
        else:
            formatted_text += (
                f"text_content: {text_content}\n--------------------------"
            )
        formatted_texts.append(formatted_text)

    # Join all the formatted texts together
    return "\n".join(formatted_texts)


# Format the retrieved chunks and print them
formated_nodes = format_retrieved_chunks(retrieve_nodes)

In [5]:
formated_nodes

"filename: Alfalah_assist_all_tabulor_md_data.md\ntext_content: Alfalah GHP Alpha Fund - Compliance Report\nConventional Scheme  \n| Description                   | Value                     |\n| ----------------------------- | ------------------------- |\n| Applicable NAV Date           | Wednesday, April 30, 2025 |\n| Net Assets                    | 1,761,087,202             |\n| Equity Investment             | 1,760,004,443             |\n| Cash Balance                  | 53,021,832                |\n| Equity Investment as % of N.A | 99.94%                    |\n| Cash & Cash Equivalent        | 3.01%                     |  \nInvestment Portfolio  \n| Entity Detail                    | Valuation   | Single Entity (N.A) (%) | Paid-up Capital (N.A) (%) |\n| -------------------------------- | ----------- | ----------------------- | ------------------------- | --- |\n| United Bank Limited              | 131,031,701 | 7.44%                   | 0.03%                     |\n| Fauji Fertili

<h1>Llama index Token Counter</h1>


In [None]:
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.core import Settings
import tiktoken

token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)

Settings.callback_manager = CallbackManager([token_counter])
print("prompt: ", token_counter.llm_token_counts[0].prompt[:100], "...\n")
print(
    "prompt token count: ",
    token_counter.llm_token_counts[0].prompt_token_count,
    "\n",
)

print("completion: ", token_counter.llm_token_counts[0].completion[:100], "...\n")
print(
    "completion token count: ",
    token_counter.llm_token_counts[0].completion_token_count,
    "\n",
)

print("total token count", token_counter.llm_token_counts[0].total_token_count)

In [None]:
print(
    "Embedding Tokens: ",
    token_counter.total_embedding_token_count,
    "\n",
    "LLM Prompt Tokens: ",
    token_counter.prompt_llm_token_count,
    "\n",
    "LLM Completion Tokens: ",
    token_counter.completion_llm_token_count,
    "\n",
    "Total LLM Token Count: ",
    token_counter.total_llm_token_count,
    "\n",
    token_counter.llm_token_counts,
)

<h1>Cost Calculator</h1>


In [None]:
# Define the costs per token in dollars
prompt_token_cost = 0.50 / 1_000_000  # $0.50 per 1M tokens
completion_token_cost = 1.50 / 1_000_000  # $1.50 per 1M tokens

# Define the number of tokens
prompt_tokens = 5866
completion_tokens = 48

# Calculate the cost
total_prompt_cost = prompt_tokens * prompt_token_cost
total_completion_cost = completion_tokens * completion_token_cost
total_cost = total_prompt_cost + total_completion_cost

# Print the results
print(f"Total cost: ${total_cost:.6f}")

<h1>Reset token counter</h1>


In [None]:
token_counter.reset_counts()

<h1>LLM Rerank</h1>


In [1]:
from llama_index.core.postprocessor import LLMRerank
from pprint import pprint
from llama_index.llms.openai import OpenAI

llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
)

# llm = ChatOpenAI(
#     model=self.model_name,
#     temperature=0.0,
#     verbose=True,
#     streaming=True,
#     stream_usage=True,
# )

postprocessor = LLMRerank(choice_batch_size=20, top_n=5, llm=llm)
pprint(postprocessor)
# postprocessor = LLMRerank(choice_batch_size=10, top_n=5)
rerank_retrieve_nodes = postprocessor.postprocess_nodes(retrieve_nodes, query_str=query)
print(type(rerank_retrieve_nodes))


LLMRerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x742c80262b90>, top_n=5, choice_select_prompt=PromptTemplate(metadata={'prompt_type': <PromptType.CHOICE_SELECT: 'choice_select'>}, template_vars=['context_str', 'query_str'], kwargs={}, output_parser=None, template_var_mappings=None, function_mappings=None, template="A list of documents is shown below. Each document has a number next to it along with a summary of the document. A question is also provided. \nRespond with the numbers of the documents you should consult to answer the question, in order of relevance, as well \nas the relevance score. The relevance score is a number from 1-10 based on how relevant you think the document is to the question.\nDo not include any documents that are not relevant to the question. \nExample format: \nDocument 1:\n<summary of document 1>\n\nDocument 2:\n<summary of document 2>\n\n...\n\nDocument 10:\n<summary of document 10>\n\nQuestion: <question>\nAnswer:\nDoc

NameError: name 'retrieve_nodes' is not defined

<h1> LLMRank error handling functions</h1>


In [None]:
import logging as logger
from llama_index.core.postprocessor import LLMRerank


def rerank_retrieve_nodes(retrieve_nodes, query_str, reranker):
    attempts = 0
    retries = 3
    while attempts < retries:
        try:
            # Perform reranking
            reranked_nodes = reranker.postprocess_nodes(
                retrieve_nodes, query_str=query_str
            )
            print(f"reranked_nodes = {reranked_nodes}")
            logger.info(f"reranked_nodes = {reranked_nodes}")
            return reranked_nodes  # Exit loop on success
        except (ValueError, IndexError) as e:
            # Log specific error details
            attempts += 1
            logger.error(
                f"Error during reranking on attempt {attempts}/{retries}: {type(e).__name__} - {e}"
            )
        except Exception as e:
            # Catch any other unexpected errors
            attempts += 1
            logger.error(
                f"Unexpected error during reranking on attempt {attempts}/{retries}: {type(e).__name__} - {e}"
            )
    logger.error(f"Reranking failed after all retries. on query: '{query_str}'")
    return []

In [None]:
reranker = LLMRerank(choice_batch_size=10, top_n=10)
rerank_retrieve_nodes(retrieve_nodes, query, reranker)

In [None]:
import time


def rerank_retrieve_nodes(retrieve_nodes, query_str, llm):
    attempts = 0
    retries = 3
    while attempts < retries:
        try:
            # Initialize LLMRerank postprocessor
            postprocessor = LLMRerank(choice_batch_size=10, top_n=5, llm=llm)

            # Perform reranking
            reranked_nodes = postprocessor.postprocess_nodes(
                retrieve_nodes, query_str=query_str
            )
            print("--------------------------------")
            # print(reranked_nodes)
            return reranked_nodes  # Exit loop on success
        except (ValueError, IndexError) as e:
            # Log specific error details
            attempts += 1
            logging.error(
                f"Error during reranking on attempt {attempts}/{retries}: {type(e).__name__} - {e}"
            )
        except Exception as e:
            # Catch any other unexpected errors
            attempts += 1
            logging.error(
                f"Unexpected error during reranking on attempt {attempts}/{retries}: {type(e).__name__} - {e}"
            )
    logging.error("Reranking failed after all retries. Returning None.")
    return None

In [None]:
rerank_retrieve_nodes(retrieve_nodes, query, llm, retries=3, delay=2)

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# We choose a model with relatively high speed and decent accuracy.
postprocessor = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=5
)

retrieve_nodes = postprocessor.postprocess_nodes(retrieve_nodes, query_str=query)

<h1> SimilarityPostprocessor Reranker </h1>


In [None]:
from llama_index.core.postprocessor import SimilarityPostprocessor

postprocessor = SimilarityPostprocessor(similarity_cutoff=0.7)

postprocessor.postprocess_nodes(retrieve_nodes, query_str=query)

<h1> Long context reorder</h1>


In [None]:
from llama_index.core.postprocessor import LongContextReorder

postprocessor = LongContextReorder()

postprocessor.postprocess_nodes(retrieve_nodes, query_str=query)

<h1> Create nodes data and nodes from input dir(handle best performing funds too)</h1>


In [24]:
import re
import os
import pandas as pd

from langchain_text_splitters import MarkdownHeaderTextSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file.flat import FlatReader
from llama_index.core.schema import TextNode, RelatedNodeInfo, NodeRelationship
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)
from deep_translator import GoogleTranslator
from src.config import MONTH_FULL_NAMES, MONTH_PATTERN, YEAR_PATTERN
from src.utils import extract_month, extract_year


def creat_node_data_from_input_dir(inpur_dir):

    documents = SimpleDirectoryReader(
        input_dir=inpur_dir,
        file_extractor={
            ".md": FlatReader()
        },  # This disables the MarkdownReader for .md files
        recursive=True,
    ).load_data()

    nodes_data = []

    for document in documents:
        markdown_document = document.get_content()
        filename = document.metadata.get("filename")
        file_id = document.id_
        headers_to_split_on = [
            ("#", "Header 1"),
            # ("##", "Header 2"),
            # ("###", "Header 3"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on
        )
        md_header_splits = markdown_splitter.split_text(markdown_document)

        node_data = {"file_id": file_id, "filename": filename, "node_text": []}

        for text in md_header_splits:
            headers_combined = []

            # Loop through metadata and concatenate headers
            for _, header in text.metadata.items():
                if header:
                    headers_combined.append(header)

            headers_combined = " of ".join(headers_combined[::-1])
            # Concatenate headers and page content
            concat_text = headers_combined + "\n" + text.page_content
            node_data["node_text"].append(concat_text)
        nodes_data.append(node_data)

    return nodes_data


# pattern = r"(Top Performing Funds and Returns for .+)\n(title\|subtitle\|percent\|timeperiod\n([\w\s\-\–().|%–:]+))"
# pattern = r"(# .+?)\n+(\| title\s+\| subtitle\s+\| percent\s+\| timeperiod\s+\|[\s\S]+?)(?=\n\s*#|\Z)"
# pattern = r"^(Top Performing Funds and Returns for .+?)\n(\|.+?)(?=\n[A-Z#]|$)"


# def create_nodes_from_nodes_data(nodes_data):
#     nodes = []
#     for data in nodes_data:
#         filename = data["filename"]
#         month = extract_month(filename)
#         year = extract_year(filename)
#         if month and year:
#             node_text = data["node_text"][0]
#             match = re.search(pattern, node_text, re.DOTALL)
#             if match and len(data["node_text"]) == 1:
#                 print(f'lenght of nodes {len(data["node_text"])}')
#                 report_title = match.group(1)
#                 report_content = match.group(2)
#                 node = TextNode(
#                     text=report_title,
#                     metadata={
#                         "year": str(year),
#                         "month": month,
#                         "filename": filename,
#                         "file_id": data["file_id"],
#                         "text_metadata": report_content,
#                     },
#                 )
#                 node.excluded_embed_metadata_keys = ["text_metadata", "file_id"]
#                 print("iam here")
#                 node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
#                     node_id=data["file_id"], metadata={"filename": filename}
#                 )
#                 nodes.append(node)
#             else:
#                 for text in data["node_text"]:
#                     node = TextNode(
#                         text=(text),
#                         metadata={
#                             "year": str(year),
#                             "month": month,
#                             "filename": filename,
#                             "file_id": data["file_id"],
#                         },
#                     )
#                     node.excluded_embed_metadata_keys = ["file_id"]
#                     node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
#                         node_id=data["file_id"], metadata={"filename": filename}
#                     )
#                     nodes.append(node)
#         else:
#             for text in data["node_text"]:
#                 node = TextNode(
#                     text=(text),
#                     metadata={
#                         "filename": filename,
#                         "file_id": data["file_id"],
#                     },
#                 )
#                 node.excluded_embed_metadata_keys = ["file_id"]
#                 node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
#                     node_id=data["file_id"], metadata={"filename": filename}
#                 )

#                 nodes.append(node)

#     return nodes

In [25]:
# Define patterns for both types of reports
pattern_top_performing_with_month_and_year = (
    r"^(Top/Best Performing Funds and Returns for .+?)\n(\|.+?)(?=\n[A-Z#]|$)"
)
pattern_top_performing = (
    r"^(Top/Best Performing Funds and Returns)\n(\|.+?)(?=\n[A-Z#]|$)"
)
pattern_fund_names = r"(#?\s*Name of all Funds offered by AAML.*?Profile)\n([\s\S]*)"


def create_nodes_from_nodes_data(nodes_data):
    nodes = []
    for data in nodes_data:
        filename = data["filename"]
        month = extract_month(filename)
        year = extract_year(filename)

        node_text = data["node_text"][0]

        # Try matching "Top Performing Funds with month and year"
        match_top_performing__with_month_and_year = re.search(
            pattern_top_performing_with_month_and_year, node_text, re.DOTALL
        )

        # Try matching "Top Performing Funds"
        match_top_performing = re.search(pattern_top_performing, node_text, re.DOTALL)

        print(match_top_performing__with_month_and_year)
        print("---------------------------------------")

        # Try matching "Name of all Funds offered by AAML"
        match_fund_names = re.search(pattern_fund_names, node_text, re.DOTALL)

        if month and year:
            if match_top_performing__with_month_and_year:
                report_title = match_top_performing__with_month_and_year.group(1)
                report_content = match_top_performing__with_month_and_year.group(2)
            elif match_fund_names:
                report_title = match_fund_names.group(1)
                report_content = match_fund_names.group(2)
            else:
                report_title = None
                report_content = None

            if report_title and report_content and len(data["node_text"]) == 1:
                node = TextNode(
                    text=report_title,
                    metadata={
                        "year": str(year),
                        "month": month,
                        "filename": filename,
                        "file_id": data["file_id"],
                        "text_metadata": report_content,
                    },
                )
                node.excluded_embed_metadata_keys = ["text_metadata", "file_id"]
                node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
                    node_id=data["file_id"], metadata={"filename": filename}
                )
                nodes.append(node)

            else:
                for text in data["node_text"]:
                    node = TextNode(
                        text=text,
                        metadata={
                            "year": str(year),
                            "month": month,
                            "filename": filename,
                            "file_id": data["file_id"],
                        },
                    )
                    node.excluded_embed_metadata_keys = ["file_id"]
                    node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
                        node_id=data["file_id"], metadata={"filename": filename}
                    )
                    nodes.append(node)
        else:
            print("bro i am here eeee")

            if match_top_performing:
                report_title = match_top_performing.group(1)
                report_content = match_top_performing.group(2)

            elif match_fund_names:
                print("✅ Match Found for Fund Names")
                # print("Title:", match_fund_names.group(1))
                # print("Content:", match_fund_names.group(2))
                report_title = match_fund_names.group(1)
                print("text:", report_title)
                report_content = match_fund_names.group(2)
            else:
                report_title = None
                report_content = None
            if report_title and report_content and len(data["node_text"]) == 1:
                node = TextNode(
                    text=report_title,
                    metadata={
                        "filename": filename,
                        "file_id": data["file_id"],
                        "text_metadata": report_content,
                    },
                )
                node.excluded_embed_metadata_keys = ["text_metadata", "file_id"]
                node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
                    node_id=data["file_id"], metadata={"filename": filename}
                )
                nodes.append(node)
            else:
                for text in data["node_text"]:
                    node = TextNode(
                        text=text,
                        metadata={
                            "filename": filename,
                            "file_id": data["file_id"],
                        },
                    )
                    node.excluded_embed_metadata_keys = ["file_id"]
                    node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
                        node_id=data["file_id"], metadata={"filename": filename}
                    )
                    nodes.append(node)

    return nodes

In [None]:
from pprint import pprint

path = "latest_modified_fmr_data/Alfalah_assist_all_tabulor_md_data"
nodes_data = creat_node_data_from_input_dir(path)
nodes = create_nodes_from_nodes_data(nodes_data)
print(nodes)

In [None]:
import os

# Set the directory containing your .md files
input_directory = (
    "latest_modified_fmr_data/alfalah assist table data/Alfalah_assist_tabulor_md_data"
)
output_file = "Alfalah_assist_all_tabulor_md_data.md"

# Get all .md files sorted by name
md_files = sorted([f for f in os.listdir(input_directory) if f.endswith(".md")])

with open(output_file, "w", encoding="utf-8") as outfile:
    for filename in md_files:
        filepath = os.path.join(input_directory, filename)
        with open(filepath, "r", encoding="utf-8") as infile:
            outfile.write(f"\n\n# {filename}\n\n")  # Optional: file header
            outfile.write(infile.read())
            outfile.write("\n")  # Ensure separation between files

In [None]:
len(nodes)

In [None]:
from pprint import pprint

pprint(str(nodes[0]), width=200)

<h1> Delete data from Qdrant</h1>


In [17]:
import logging
from llama_index.core import VectorStoreIndex, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# from src.utils import creat_node_data_from_input_dir, create_nodes_from_nodes_data
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
import qdrant_client
from llama_index.embeddings.openai import OpenAIEmbedding

In [18]:
def delete_data_from_qdrant(filename: str):
    # Initialize Qdrant client
    client = qdrant_client.QdrantClient(url="http://65.0.229.53:6333", port=6333)

    COLLECTION_NAME = "alfalah_investment"
    embed_model_name = "text-embedding-3-small"

    # Set the embedding model
    embed_model = OpenAIEmbedding(model_name=embed_model_name)
    Settings.embed_model = embed_model

    # Perform the delete operation
    try:
        response = client.delete(
            collection_name=COLLECTION_NAME,  # Use the variable directly
            points_selector=models.FilterSelector(
                filter=models.Filter(
                    must=[
                        models.FieldCondition(
                            key="filename",
                            match=models.MatchValue(value=filename),
                        )
                    ]
                )
            ),
        )
        print(f"File for the year '{filename}' has been deleted successfully.")
    except Exception as e:
        print(f"Error deleting the file: {e}")

In [19]:
delete_data_from_qdrant("QUERIES_FOR_CHATBOT.md")

File for the year 'QUERIES_FOR_CHATBOT.md' has been deleted successfully.


<h1> Add data to Qdrant</h1>


In [20]:
from pprint import pprint


def add_data_to_qdrant(path):

    client = qdrant_client.QdrantClient(url="http://65.0.229.53:6333", port=6333)

    COLLECTION_NAME = "alfalah_investment"
    embed_model_name = "text-embedding-3-small"

    embed_model = OpenAIEmbedding(model_name=embed_model_name)
    Settings.embed_model = embed_model

    vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)
    nodes_data = creat_node_data_from_input_dir(path)
    nodes = create_nodes_from_nodes_data(nodes_data)

    print(nodes)

    logging.info("no collection found")
    storage_context = StorageContext.from_defaults(
        vector_store=vector_store,
    )
    index = VectorStoreIndex(nodes, storage_context=storage_context)
    return index

In [26]:
add_data_to_qdrant("latest_modified_fmr_data/FAQs/queries for chatbot")

None
---------------------------------------
bro i am here eeee
[TextNode(id_='d2bf1bc8-1ee3-4742-863b-9503d7fa0a00', embedding=None, metadata={'filename': 'QUERIES_FOR_CHATBOT.md', 'file_id': '36e00933-c67c-4e45-9667-188d97f95731'}, excluded_embed_metadata_keys=['file_id'], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='36e00933-c67c-4e45-9667-188d97f95731', node_type=None, metadata={'filename': 'QUERIES_FOR_CHATBOT.md'}, hash=None)}, metadata_template='{key}: {value}', metadata_separator='\n', text='Q7. What documents are required for opening a mutual fund account?\nIdentification proof (like an CNIC or passport), address proof, and a recent passport-sized photograph. Specific requirements:\nDocumentation RequiermentCDD:-\nSARMAYAKARIACCOUNT (INDIVIDUAL)  \n• AccountOpeningForm & Investment Form\n• Investment Cheque\n• CopyofCNIC\n• CopyofCNICofJoint Holder /Nominee -(if mention on AOF)\n• Last month/recent pay slip; or Annual s

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x700b00657af0>

<h1> Prcesssing md Files</h1>


In [None]:
import os


def replace_in_files(folder_path: str):
    """
    Reads all files in the given folder, replaces specific headers with their updated versions,
    and saves the changes back to the files.

    Args:
        folder_path (str): The path to the folder containing the files.
    """
    # Mapping of original headers to their replacements
    replacements = {
        "\nSindh Workers": "\n## Sindh Workers",
        "\n(Holdings as % of Total Assets)": "\n### (Holdings as % of Total Assets)",
        "\nHoldings as % of Total Assets": "\n### (Holdings as % of Total Assets)",
        "\nFund Statistics:": "\n### Fund Statistics:",
        "\nFund Statistic": "\n### Fund Statistic:",
        "\nFund Statistics": "\n### Fund Statistics:",
        "\nFund Stataistics": "\n### Fund Stataistics:",
        "\nfund statistics": "\n### fund statistics:",
        "\nTop Ten Holdings (as a % of total assets)": "\n### Top Ten Holdings (as a % of total assets)",
        "\nSector Allocation (as a % of total assets)": "\n### Sector Allocation (as a % of total assets)",
        "\nAsset Allocation (as % of Total Assets)": "\n### Asset Allocation (as % of Total Assets)",
        "\nRisk Profile:": "\n## Risk Profile:",
        "\nRisk Profile": "\n## Risk Profile:",
        "\nFund Performance": "\n### Fund Performance",
        "\nFund Performanace": "\n### Fund Performanace",
        "\nFund Performanace:": "\n### Fund Performanace:",
        "\nFund Perfomance": "\n### Fund Perfomance",
        "\nfund performance": "\n### Fund Performance",
        "\nPerformance": "\n### Performance",
        "\nAsset Allocation": "\n### Asset Allocation",
        "\n### RISK PROFILE OF ISLAMIC COLLECTIVE INVESTMENT SCHEMES/PLANS": "\n# RISK PROFILE OF ISLAMIC COLLECTIVE INVESTMENT SCHEMES/PLANS",
        "\n### RISK PROFILE OF CONVENTIONAL COLLECTIVE INVESTMENT SCHEMES/PLANS": "\n# RISK PROFILE OF CONVENTIONAL COLLECTIVE INVESTMENT SCHEMES/PLANS",
    }

    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                # Read the file
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()

                # Apply all replacements
                for original, replacement in replacements.items():
                    content = content.replace(original, replacement)

                # Write back the updated content
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(content)

                print(f"Processed file: {file_path}")
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")


# Example usage
folder_path = "latest_modified_fmr_data/single_modified_file"
replace_in_files(folder_path)

<h1> Create Filters</h1>


In [None]:
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)

In [None]:
def create_filters_for_all_data(month, year):
    filters_list = []

    # If a month is found, add a month filter
    if month:
        filters_list.append(
            MetadataFilter(key="month", operator=FilterOperator.EQ, value=month)
        )

    # If a year is found, add a year filter
    if year:
        filters_list.append(
            MetadataFilter(key="year", operator=FilterOperator.EQ, value=year)
        )

    # Return filters if any are found, otherwise None
    print(filters_list)
    if filters_list:
        return MetadataFilters(filters=filters_list, condition=FilterCondition.AND)
    return MetadataFilters(
        filters=[
            # MetadataFilter(key="year", operator=FilterOperator.NIN, value=list(range(2014, 2024))),
            MetadataFilter(
                key="year", operator=FilterOperator.IS_EMPTY, value=None
            )  # Avoid including `value`
        ],
        condition=FilterCondition.OR,
    )

In [None]:
create_filters_for_all_data("nov", "2024")

In [None]:
def create_filters_for_specific_files(file1, file2):
    filters_list = []

    # Add a filter for the first file
    if file1:
        filters_list.append(
            MetadataFilter(key="filename", operator=FilterOperator.EQ, value=file1)
        )

    # Add a filter for the second file
    if file2:
        filters_list.append(
            MetadataFilter(key="filename", operator=FilterOperator.EQ, value=file2)
        )

    # Combine the filters with an OR condition to get results from both files
    return MetadataFilters(filters=filters_list, condition=FilterCondition.OR)

In [None]:
def make_filter_for_specific_file(query):
    # Extract month and year from the query
    file1 = "Conventional_all_Fund_Data.md"
    file2 = "Islamic_all_Fund_Data.md"
    # Create and return filters based on extracted month and year
    return create_filters_for_specific_files(file1, file2)

In [None]:
from src.gen_pipeline import GenPipeline
from src.utils import make_filter

gen_pipeline = GenPipeline()
index = gen_pipeline._get_qdrant_index()

In [None]:
query = "tell me fund performance for all funds "
retriever = index.as_retriever(
    similarity_top_k=20, filters=make_filter_for_specific_file(query)
)
retrieve_nodes = retriever.retrieve(query)
retrieve_nodes