<h1> Retrieve data from qdrant</h1>


In [1]:
from src.gen_pipeline import GenPipeline
from src.utils import make_filter

gen_pipeline = GenPipeline()
index = gen_pipeline._get_qdrant_index()

  from .autonotebook import tqdm as notebook_tqdm



In [2]:
query = "Fund performance of Stock Fund for January 2025"
retriever = index.as_retriever(similarity_top_k=100, filters=make_filter(query))
retrieve_nodes = retriever.retrieve(query)
retrieve_nodes

[NodeWithScore(node=TextNode(id_='635e9ac1-494c-40b1-b504-94d5419faec9', embedding=None, metadata={'year': '2025', 'month': 'January', 'filename': 'fund_performance January 2025.md', 'file_id': '51105080-b74b-4b41-81c8-34eb8d5f7e6a', 'text_metadata': '| title                                          | subtitle                                       | percent                             | timeperiod                         |\n| :--------------------------------------------- | :--------------------------------------------- | :---------------------------------- | :--------------------------------- |\n| Alfalah GHP Money Market Fund                  | Money Market Scheme                            | 10.77%                              | Annualized MTD Return (Jan 2025)\\* |\n| Alfalah GHP Cash Fund                          | Money Market Scheme                            | 9.89%                               | Annualized MTD Return (Jan 2025)   |\n| Alfalah GHP Sovereign Fund               

<h1>Format retrieved chunks</h1>


In [None]:
def format_retrieved_chunks(retrieved_chunks):
    formatted_texts = []

    for node_with_score in retrieved_chunks:
        node = node_with_score.node
        metadata = node.metadata

        # Extract metadata
        year = metadata.get("year", None)
        month = metadata.get("month", None)
        filename = metadata.get("filename", "N/A")
        text_metadata = metadata.get("text_metadata", None)
        text_content = node.text

        # Format text according to the desired output
        formatted_text = ""

        if year:
            formatted_text += f"year: {year}\n"
        if month:
            formatted_text += f"month: {month}\n"

        formatted_text += f"filename: {filename}\n"

        if text_metadata:
            formatted_text += f"text_content: {text_content}\n{text_metadata}\n------------------------------"
        else:
            formatted_text += (
                f"text_content: {text_content}\n--------------------------"
            )
        formatted_texts.append(formatted_text)

    # Join all the formatted texts together
    return "\n".join(formatted_texts)


# Format the retrieved chunks and print them
formated_nodes = format_retrieved_chunks(retrieve_nodes)

<h1>Llama index Token Counter</h1>


In [None]:
from llama_index.core.callbacks import CallbackManager, TokenCountingHandler
from llama_index.core import Settings
import tiktoken

token_counter = TokenCountingHandler(
    tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode
)

Settings.callback_manager = CallbackManager([token_counter])
print("prompt: ", token_counter.llm_token_counts[0].prompt[:100], "...\n")
print(
    "prompt token count: ",
    token_counter.llm_token_counts[0].prompt_token_count,
    "\n",
)

print("completion: ", token_counter.llm_token_counts[0].completion[:100], "...\n")
print(
    "completion token count: ",
    token_counter.llm_token_counts[0].completion_token_count,
    "\n",
)

print("total token count", token_counter.llm_token_counts[0].total_token_count)

In [None]:
print(
    "Embedding Tokens: ",
    token_counter.total_embedding_token_count,
    "\n",
    "LLM Prompt Tokens: ",
    token_counter.prompt_llm_token_count,
    "\n",
    "LLM Completion Tokens: ",
    token_counter.completion_llm_token_count,
    "\n",
    "Total LLM Token Count: ",
    token_counter.total_llm_token_count,
    "\n",
    token_counter.llm_token_counts,>
)

<h1>Cost Calculator</h1>


In [None]:
# Define the costs per token in dollars
prompt_token_cost = 0.50 / 1_000_000  # $0.50 per 1M tokens
completion_token_cost = 1.50 / 1_000_000  # $1.50 per 1M tokens

# Define the number of tokens
prompt_tokens = 5866
completion_tokens = 48

# Calculate the cost
total_prompt_cost = prompt_tokens * prompt_token_cost
total_completion_cost = completion_tokens * completion_token_cost
total_cost = total_prompt_cost + total_completion_cost

# Print the results
print(f"Total cost: ${total_cost:.6f}")

<h1>Reset token counter</h1>


In [None]:
token_counter.reset_counts()

<h1>LLM Rerank</h1>


In [None]:
from llama_index.core.postprocessor import LLMRerank
from pprint import pprint
from llama_index.llms.openai import OpenAI

llm = OpenAI(
    model="gpt-4o-mini",
    temperature=0.0,
)

# llm = ChatOpenAI(
#     model=self.model_name,
#     temperature=0.0,
#     verbose=True,
#     streaming=True,
#     stream_usage=True,
# )

postprocessor = LLMRerank(choice_batch_size=10, top_n=5)
pprint(postprocessor)
# postprocessor = LLMRerank(choice_batch_size=10, top_n=5)
rerank_retrieve_nodes = postprocessor.postprocess_nodes(retrieve_nodes, query_str=query)
print(type(rerank_retrieve_nodes))
rerank_retrieve_nodes

<h1> LLMRank error handling functions</h1>


In [None]:
import logging as logger
from llama_index.core.postprocessor import LLMRerank


def rerank_retrieve_nodes(retrieve_nodes, query_str, reranker):
    attempts = 0
    retries = 3
    while attempts < retries:
        try:
            # Perform reranking
            reranked_nodes = reranker.postprocess_nodes(
                retrieve_nodes, query_str=query_str
            )
            print(f"reranked_nodes = {reranked_nodes}")
            logger.info(f"reranked_nodes = {reranked_nodes}")
            return reranked_nodes  # Exit loop on success
        except (ValueError, IndexError) as e:
            # Log specific error details
            attempts += 1
            logger.error(
                f"Error during reranking on attempt {attempts}/{retries}: {type(e).__name__} - {e}"
            )
        except Exception as e:
            # Catch any other unexpected errors
            attempts += 1
            logger.error(
                f"Unexpected error during reranking on attempt {attempts}/{retries}: {type(e).__name__} - {e}"
            )
    logger.error(f"Reranking failed after all retries. on query: '{query_str}'")
    return []

In [None]:
reranker = LLMRerank(choice_batch_size=10, top_n=5)
rerank_retrieve_nodes(retrieve_nodes, query, reranker)

In [None]:
import time


def rerank_retrieve_nodes(retrieve_nodes, query_str, llm):
    attempts = 0
    retries = 3
    while attempts < retries:
        try:
            # Initialize LLMRerank postprocessor
            postprocessor = LLMRerank(choice_batch_size=10, top_n=5, llm=llm)

            # Perform reranking
            reranked_nodes = postprocessor.postprocess_nodes(
                retrieve_nodes, query_str=query_str
            )
            print("--------------------------------")
            # print(reranked_nodes)
            return reranked_nodes  # Exit loop on success
        except (ValueError, IndexError) as e:
            # Log specific error details
            attempts += 1
            logging.error(
                f"Error during reranking on attempt {attempts}/{retries}: {type(e).__name__} - {e}"
            )
        except Exception as e:
            # Catch any other unexpected errors
            attempts += 1
            logging.error(
                f"Unexpected error during reranking on attempt {attempts}/{retries}: {type(e).__name__} - {e}"
            )
    logging.error("Reranking failed after all retries. Returning None.")
    return None

In [None]:
rerank_retrieve_nodes(retrieve_nodes, query, llm, retries=3, delay=2)

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# We choose a model with relatively high speed and decent accuracy.
postprocessor = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=5
)

retrieve_nodes = postprocessor.postprocess_nodes(retrieve_nodes, query_str=query)

<h1> SimilarityPostprocessor Reranker </h1>


In [None]:
from llama_index.core.postprocessor import SimilarityPostprocessor

postprocessor = SimilarityPostprocessor(similarity_cutoff=0.7)

postprocessor.postprocess_nodes(retrieve_nodes, query_str=query)

<h1> Long context reorder</h1>


In [None]:
from llama_index.core.postprocessor import LongContextReorder

postprocessor = LongContextReorder()

postprocessor.postprocess_nodes(retrieve_nodes, query_str=query)

<h1> Create nodes data and nodes from input dir(handle best performing funds too)</h1>


In [4]:
import re
import os
import pandas as pd

from langchain_text_splitters import MarkdownHeaderTextSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file.flat import FlatReader
from llama_index.core.schema import TextNode, RelatedNodeInfo, NodeRelationship
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)
from deep_translator import GoogleTranslator
from src.config import MONTH_FULL_NAMES, MONTH_PATTERN, YEAR_PATTERN
from src.utils import extract_month, extract_year


def creat_node_data_from_input_dir(inpur_dir):

    documents = SimpleDirectoryReader(
        input_dir=inpur_dir,
        file_extractor={
            ".md": FlatReader()
        },  # This disables the MarkdownReader for .md files
        recursive=True,
    ).load_data()

    nodes_data = []

    for document in documents:
        markdown_document = document.get_content()
        filename = document.metadata.get("filename")
        file_id = document.id_
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=headers_to_split_on
        )
        md_header_splits = markdown_splitter.split_text(markdown_document)

        node_data = {"file_id": file_id, "filename": filename, "node_text": []}

        for text in md_header_splits:
            headers_combined = []

            # Loop through metadata and concatenate headers
            for _, header in text.metadata.items():
                if header:
                    headers_combined.append(header)

            headers_combined = " of ".join(headers_combined[::-1])
            # Concatenate headers and page content
            concat_text = headers_combined + "\n" + text.page_content
            node_data["node_text"].append(concat_text)
        nodes_data.append(node_data)

    return nodes_data


# pattern = r"(Top Performing Funds and Returns for .+)\n(title\|subtitle\|percent\|timeperiod\n([\w\s\-\–().|%–:]+))"
# pattern = r"(# .+?)\n+(\| title\s+\| subtitle\s+\| percent\s+\| timeperiod\s+\|[\s\S]+?)(?=\n\s*#|\Z)"
pattern = r"^(Top/Best Performing Funds and Returns for .+?)\n(\|.+?)(?=\n[A-Z#]|$)"


def create_nodes_from_nodes_data(nodes_data):
    nodes = []
    for data in nodes_data:
        filename = data["filename"]
        month = extract_month(filename)
        year = extract_year(filename)
        if month and year:
            node_text = data["node_text"][0]
            match = re.search(pattern, node_text, re.DOTALL)
            if match and len(data["node_text"]) == 1:
                print(f'lenght of nodes {len(data["node_text"])}')
                report_title = match.group(1)
                report_content = match.group(2)
                node = TextNode(
                    text=report_title,
                    metadata={
                        "year": str(year),
                        "month": month,
                        "filename": filename,
                        "file_id": data["file_id"],
                        "text_metadata": report_content,
                    },
                )
                node.excluded_embed_metadata_keys = ["text_metadata", "file_id"]
                print("iam here")
                node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
                    node_id=data["file_id"], metadata={"filename": filename}
                )
                nodes.append(node)
            else:
                for text in data["node_text"]:
                    node = TextNode(
                        text=(text),
                        metadata={
                            "year": str(year),
                            "month": month,
                            "filename": filename,
                            "file_id": data["file_id"],
                        },
                    )
                    node.excluded_embed_metadata_keys = ["file_id"]
                    node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
                        node_id=data["file_id"], metadata={"filename": filename}
                    )
                    nodes.append(node)
        else:
            for text in data["node_text"]:
                node = TextNode(
                    text=(text),
                    metadata={
                        "filename": filename,
                        "file_id": data["file_id"],
                    },
                )
                node.excluded_embed_metadata_keys = ["file_id"]
                node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
                    node_id=data["file_id"], metadata={"filename": filename}
                )

                nodes.append(node)

    return nodes

In [2]:
path = "company policies"

In [3]:
nodes_data = creat_node_data_from_input_dir(path)
nodes = create_nodes_from_nodes_data(nodes_data)

In [8]:
nodes

[TextNode(id_='64303ffd-cbfd-4256-8086-a14ad472513f', embedding=None, metadata={'filename': 'Anti Harassment Policy.md', 'file_id': 'e3d656ad-2d43-49cb-8007-09726026de4b'}, excluded_embed_metadata_keys=['file_id'], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e3d656ad-2d43-49cb-8007-09726026de4b', node_type=None, metadata={'filename': 'Anti Harassment Policy.md'}, hash=None)}, metadata_template='{key}: {value}', metadata_separator='\n', text='Anti-Harassment Policy\n**Policy Number:** 013\n**Published Date:** March 2023\n**Revision Date:** January 2025\n**Department:** People & Culture\n**For:** All Employees', mimetype='text/plain', start_char_idx=None, end_char_idx=None, metadata_seperator='\n', text_template='{metadata_str}\n\n{content}'),
 TextNode(id_='e7d3a702-3134-4d9f-9ea2-fb84b14a5483', embedding=None, metadata={'filename': 'Anti Harassment Policy.md', 'file_id': 'e3d656ad-2d43-49cb-8007-09726026de4b'}, excluded_embed_m

<h1> Delete data from Qdrant</h1>


In [5]:
import logging
from llama_index.core import VectorStoreIndex, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# from src.utils import creat_node_data_from_input_dir, create_nodes_from_nodes_data
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
import qdrant_client
from llama_index.embeddings.openai import OpenAIEmbedding

In [9]:
def delete_data_from_qdrant(filename: str):
    # Initialize Qdrant client
    client = qdrant_client.QdrantClient(url="http://65.0.229.53:6333", port=6333)

    COLLECTION_NAME = "alfalah_investment"
    embed_model_name = "text-embedding-3-small"

    # Set the embedding model
    embed_model = OpenAIEmbedding(model_name=embed_model_name)
    Settings.embed_model = embed_model

    # Perform the delete operation
    try:
        response = client.delete(
            collection_name=COLLECTION_NAME,  # Use the variable directly
            points_selector=models.FilterSelector(
                filter=models.Filter(
                    must=[
                        models.FieldCondition(
                            key="filename",
                            match=models.MatchValue(value=filename),
                        )
                    ]
                )
            ),
        )
        print(f"File for the year '{filename}' has been deleted successfully.")
    except Exception as e:
        print(f"Error deleting the file: {e}")

In [11]:
delete_data_from_qdrant("all_conventional_fund.md")

File for the year 'all_conventional_fund.md' has been deleted successfully.


<h1> Add data to Qdrant</h1>


In [6]:
from pprint import pprint


def add_data_to_qdrant(path):

    client = qdrant_client.QdrantClient(url="http://65.0.229.53:6333", port=6333)

    COLLECTION_NAME = "alfalah_investment"
    embed_model_name = "text-embedding-3-small"

    embed_model = OpenAIEmbedding(model_name=embed_model_name)
    Settings.embed_model = embed_model

    vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)
    nodes_data = creat_node_data_from_input_dir(path)
    nodes = create_nodes_from_nodes_data(nodes_data)

    print(nodes)

    logging.info("no collection found")
    storage_context = StorageContext.from_defaults(
        vector_store=vector_store,
    )
    index = VectorStoreIndex(nodes, storage_context=storage_context)
    return index

In [13]:
add_data_to_qdrant("latest_modified_fmr_data/All funds latest data/June_all_fund")

[TextNode(id_='2f1688a8-6e49-4c6f-aad1-b12bf3a20249', embedding=None, metadata={'filename': 'all_conventional_fund.md', 'file_id': 'edfc0e27-6aaf-473e-822a-5872b24c4107'}, excluded_embed_metadata_keys=['file_id'], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='edfc0e27-6aaf-473e-822a-5872b24c4107', node_type=None, metadata={'filename': 'all_conventional_fund.md'}, hash=None)}, metadata_template='{key}: {value}', metadata_separator='\n', text="\nFund Managers’ Report  \nJune, 2025  \nAlfalah Investments  \n_Note: The content for this report is currently unavailable. Please check back later for updates._\nAlfalah Asset Management Limited  \nRisk Profile of Conventional Collective Investment Schemes/Plans  \n| Fund Name                                 | Scheme Type                           | Risk Profile | Principal Risk           |\n| ----------------------------------------- | ------------------------------------- | ------------ |

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7c2e3c52e020>

<h1> Prcesssing md Files</h1>


In [None]:
import os


def replace_in_files(folder_path: str):
    """
    Reads all files in the given folder, replaces specific headers with their updated versions,
    and saves the changes back to the files.

    Args:
        folder_path (str): The path to the folder containing the files.
    """
    # Mapping of original headers to their replacements
    replacements = {
        "\nSindh Workers": "\n## Sindh Workers",
        "\n(Holdings as % of Total Assets)": "\n### (Holdings as % of Total Assets)",
        "\nHoldings as % of Total Assets": "\n### (Holdings as % of Total Assets)",
        "\nFund Statistics:": "\n### Fund Statistics:",
        "\nFund Statistic": "\n### Fund Statistic:",
        "\nFund Statistics": "\n### Fund Statistics:",
        "\nFund Stataistics": "\n### Fund Stataistics:",
        "\nfund statistics": "\n### fund statistics:",
        "\nTop Ten Holdings (as a % of total assets)": "\n### Top Ten Holdings (as a % of total assets)",
        "\nSector Allocation (as a % of total assets)": "\n### Sector Allocation (as a % of total assets)",
        "\nAsset Allocation (as % of Total Assets)": "\n### Asset Allocation (as % of Total Assets)",
        "\nRisk Profile:": "\n## Risk Profile:",
        "\nRisk Profile": "\n## Risk Profile:",
        "\nFund Performance": "\n### Fund Performance",
        "\nFund Performanace": "\n### Fund Performanace",
        "\nFund Performanace:": "\n### Fund Performanace:",
        "\nFund Perfomance": "\n### Fund Perfomance",
        "\nfund performance": "\n### Fund Performance",
        "\nPerformance": "\n### Performance",
        "\nAsset Allocation": "\n### Asset Allocation",
        "\n### RISK PROFILE OF ISLAMIC COLLECTIVE INVESTMENT SCHEMES/PLANS": "\n# RISK PROFILE OF ISLAMIC COLLECTIVE INVESTMENT SCHEMES/PLANS",
        "\n### RISK PROFILE OF CONVENTIONAL COLLECTIVE INVESTMENT SCHEMES/PLANS": "\n# RISK PROFILE OF CONVENTIONAL COLLECTIVE INVESTMENT SCHEMES/PLANS",
    }

    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                # Read the file
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()

                # Apply all replacements
                for original, replacement in replacements.items():
                    content = content.replace(original, replacement)

                # Write back the updated content
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(content)

                print(f"Processed file: {file_path}")
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")


# Example usage
folder_path = "latest_modified_fmr_data/single_modified_file"
replace_in_files(folder_path)

<h1> Create Filters</h1>


In [None]:
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)

In [None]:
def create_filters_for_all_data(month, year):
    filters_list = []

    # If a month is found, add a month filter
    if month:
        filters_list.append(
            MetadataFilter(key="month", operator=FilterOperator.EQ, value=month)
        )

    # If a year is found, add a year filter
    if year:
        filters_list.append(
            MetadataFilter(key="year", operator=FilterOperator.EQ, value=year)
        )

    # Return filters if any are found, otherwise None
    print(filters_list)
    if filters_list:
        return MetadataFilters(filters=filters_list, condition=FilterCondition.AND)
    return MetadataFilters(
        filters=[
            # MetadataFilter(key="year", operator=FilterOperator.NIN, value=list(range(2014, 2024))),
            MetadataFilter(
                key="year", operator=FilterOperator.IS_EMPTY, value=None
            )  # Avoid including `value`
        ],
        condition=FilterCondition.OR,
    )

In [None]:
create_filters_for_all_data("nov", "2024")

In [None]:
def create_filters_for_specific_files(file1, file2):
    filters_list = []

    # Add a filter for the first file
    if file1:
        filters_list.append(
            MetadataFilter(key="filename", operator=FilterOperator.EQ, value=file1)
        )

    # Add a filter for the second file
    if file2:
        filters_list.append(
            MetadataFilter(key="filename", operator=FilterOperator.EQ, value=file2)
        )

    # Combine the filters with an OR condition to get results from both files
    return MetadataFilters(filters=filters_list, condition=FilterCondition.OR)

In [None]:
def make_filter_for_specific_file(query):
    # Extract month and year from the query
    file1 = "Conventional_all_Fund_Data.md"
    file2 = "Islamic_all_Fund_Data.md"
    # Create and return filters based on extracted month and year
    return create_filters_for_specific_files(file1, file2)

In [None]:
from src.gen_pipeline import GenPipeline
from src.utils import make_filter

gen_pipeline = GenPipeline()
index = gen_pipeline._get_qdrant_index()

In [None]:
query = "tell me fund performance for all funds "
retriever = index.as_retriever(
    similarity_top_k=20, filters=make_filter_for_specific_file(query)
)
retrieve_nodes = retriever.retrieve(query)
retrieve_nodes