<h1>Create node data and nodes from an input directory (also handles best-performing funds)</h1>


In [None]:
import re
import os
import pandas as pd

from langchain_text_splitters import MarkdownHeaderTextSplitter
from llama_index.core import SimpleDirectoryReader
from llama_index.readers.file.flat import FlatReader
from llama_index.core.schema import TextNode, RelatedNodeInfo, NodeRelationship
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.core.vector_stores import (
    MetadataFilter,
    MetadataFilters,
    FilterOperator,
    FilterCondition,
)
from deep_translator import GoogleTranslator
from src.config import MONTH_FULL_NAMES, MONTH_PATTERN, YEAR_PATTERN
from src.utils import extract_month, extract_year


def creat_node_data_from_input_dir(inpur_dir):
    """Load every file under ``inpur_dir`` and split its markdown on ``#`` headers.

    Files are read recursively with ``SimpleDirectoryReader``; ``.md`` files are
    routed through ``FlatReader``, which disables the default MarkdownReader.

    Args:
        inpur_dir: Directory to read documents from (searched recursively).

    Returns:
        A list with one dict per loaded document, shaped as
        ``{"file_id": <document id>, "filename": <"filename" metadata or None>,
        "node_text": [<text of each section>, ...]}``. Each section text is the
        section's header(s), a newline, then the section content. When a section
        has no header the text therefore starts with a bare newline.
    """
    documents = SimpleDirectoryReader(
        input_dir=inpur_dir,
        # FlatReader disables the MarkdownReader for .md files.
        file_extractor={".md": FlatReader()},
        recursive=True,
    ).load_data()

    # The splitter configuration is the same for every document, so build it
    # once instead of once per file. Only level-1 headers start a new section.
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "Header 1")]
    )

    nodes_data = []
    for document in documents:
        sections = markdown_splitter.split_text(document.get_content())

        node_texts = []
        for section in sections:
            # Collect the non-empty header values, then join them in reverse
            # order with " of " between them.
            headers = [header for header in section.metadata.values() if header]
            title = " of ".join(reversed(headers))
            node_texts.append(title + "\n" + section.page_content)

        nodes_data.append(
            {
                "file_id": document.id_,
                "filename": document.metadata.get("filename"),
                "node_text": node_texts,
            }
        )

    return nodes_data

In [None]:
# Regular expressions that recognise the special report layouts. Each one
# captures the report title as group 1 and the report body as group 2.

# "Top/Best Performing Funds and Returns for <period>" title line followed by a
# pipe-delimited table; the table ends before a line starting with an
# upper-case letter or "#", or at the end of the text.
pattern_top_performing_with_month_and_year = r"^(Top/Best Performing Funds and Returns for .+?)\n(\|.+?)(?=\n[A-Z#]|$)"

# Same layout, but the title carries no trailing "for <period>" part.
pattern_top_performing = r"^(Top/Best Performing Funds and Returns)\n(\|.+?)(?=\n[A-Z#]|$)"

# "Name of all Funds offered by AAML ... Profile" title (optionally prefixed
# with "#"); everything after the title line is the body.
pattern_fund_names = r"(#?\s*Name of all Funds offered by AAML.*?Profile)\n([\s\S]*)"


def _make_source_linked_node(text, metadata, excluded_embed_keys):
    """Build a TextNode whose SOURCE relationship points back at its file."""
    node = TextNode(text=text, metadata=metadata)
    node.excluded_embed_metadata_keys = excluded_embed_keys
    node.relationships[NodeRelationship.SOURCE] = RelatedNodeInfo(
        node_id=metadata["file_id"], metadata={"filename": metadata["filename"]}
    )
    return node


def create_nodes_from_nodes_data(nodes_data):
    """Convert per-file section texts into ``TextNode`` objects.

    For each entry, the first section text is checked against the report
    patterns: the dated "Top/Best Performing Funds" pattern when both a month
    and a year can be extracted from the filename, the undated one otherwise,
    falling back to the "Name of all Funds offered by AAML" pattern.

    * If a pattern matches with a non-empty title and body, and the file has
      exactly one section, a single node is produced. Its text is the report
      title; the report body is stored under the ``text_metadata`` metadata key
      and excluded from the embedded metadata.
    * Otherwise one node is produced per section, with the section text as the
      node text.

    Every node carries ``filename`` and ``file_id`` metadata (plus ``year`` and
    ``month`` when both were extracted), excludes ``file_id`` from the embedded
    metadata, and has a SOURCE relationship pointing at ``file_id``.

    Args:
        nodes_data: Iterable of dicts with ``"file_id"``, ``"filename"`` and
            ``"node_text"`` (list of section texts) keys.

    Returns:
        A flat list of ``TextNode`` objects. Entries whose ``"node_text"`` list
        is empty contribute no nodes.
    """
    nodes = []
    for data in nodes_data:
        node_texts = data["node_text"]
        if not node_texts:
            # A file that produced no sections has nothing to index; skip it
            # instead of failing on node_texts[0] below.
            continue

        filename = data["filename"]
        month = extract_month(filename)
        year = extract_year(filename)
        has_period = bool(month and year)

        # Only the first section is inspected for a report layout.
        first_text = node_texts[0]
        primary_pattern = (
            pattern_top_performing_with_month_and_year
            if has_period
            else pattern_top_performing
        )
        match = re.search(primary_pattern, first_text, re.DOTALL) or re.search(
            pattern_fund_names, first_text, re.DOTALL
        )
        report_title = match.group(1) if match else None
        report_content = match.group(2) if match else None

        # Key order is kept stable: year, month, filename, file_id[, text_metadata].
        period_metadata = {"year": str(year), "month": month} if has_period else {}
        common_metadata = {
            **period_metadata,
            "filename": filename,
            "file_id": data["file_id"],
        }

        if report_title and report_content and len(node_texts) == 1:
            nodes.append(
                _make_source_linked_node(
                    report_title,
                    {**common_metadata, "text_metadata": report_content},
                    ["text_metadata", "file_id"],
                )
            )
        else:
            # Fresh metadata dict and exclusion list per node so nodes never
            # share mutable state.
            nodes.extend(
                _make_source_linked_node(text, dict(common_metadata), ["file_id"])
                for text in node_texts
            )

    return nodes

In [None]:
from pprint import pprint

# Input directory for this run.
path = "latest_modified_fmr_data/Alfalah_assist_all_tabulor_md_data"
# Split each file into per-header section texts, then wrap them as nodes.
nodes_data = creat_node_data_from_input_dir(path)
nodes = create_nodes_from_nodes_data(nodes_data)
# Print the resulting node list for manual inspection.
print(nodes)

<h1> Delete data from Qdrant</h1>


In [None]:
import logging
from llama_index.core import VectorStoreIndex, StorageContext, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# from src.utils import creat_node_data_from_input_dir, create_nodes_from_nodes_data
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
import qdrant_client
from llama_index.embeddings.openai import OpenAIEmbedding

In [None]:
def delete_data_from_qdrant(
    filename: str,
    url: str = "http://65.0.229.53:6333",
    port: int = 6333,
    collection_name: str = "alfalah_investment",
):
    """Delete every point whose ``filename`` payload field equals ``filename``.

    Errors are reported on stdout rather than raised, so a failed delete does
    not abort the notebook run.

    Args:
        filename: Value to match against the ``filename`` payload field.
        url: Qdrant endpoint URL. Defaults to the server used by this notebook.
        port: Qdrant port passed to the client.
        collection_name: Collection to delete from.

    Returns:
        None.
    """
    client = qdrant_client.QdrantClient(url=url, port=port)

    # Side effect kept from the original: the global embedding model is set here
    # even though the delete call below does not use it.
    # NOTE(review): presumably constructing OpenAIEmbedding needs OpenAI
    # credentials; confirm whether this is required before a delete.
    Settings.embed_model = OpenAIEmbedding(model_name="text-embedding-3-small")

    try:
        client.delete(
            collection_name=collection_name,
            points_selector=models.FilterSelector(
                filter=models.Filter(
                    must=[
                        models.FieldCondition(
                            key="filename",
                            match=models.MatchValue(value=filename),
                        )
                    ]
                )
            ),
        )
    except Exception as e:
        print(f"Error deleting the file: {e}")
    else:
        # Reached whenever the request raised no exception.
        print(f"Points with filename '{filename}' have been deleted successfully.")

In [None]:
# Remove the points stored for this file (matched on the "filename" payload field).
delete_data_from_qdrant("QUERIES_FOR_CHATBOT.md")

<h1> Add data to Qdrant</h1>


In [None]:
from pprint import pprint


def add_data_to_qdrant(
    path,
    url="http://65.0.229.53:6333",
    port=6333,
    collection_name="alfalah_investment",
    embed_model_name="text-embedding-3-small",
):
    """Build nodes from the files under ``path`` and index them into Qdrant.

    Sets the global ``Settings.embed_model`` to an ``OpenAIEmbedding`` using
    ``embed_model_name`` before the index is built.

    Args:
        path: Directory passed to ``creat_node_data_from_input_dir``.
        url: Qdrant endpoint URL. Defaults to the server used by this notebook.
        port: Qdrant port passed to the client.
        collection_name: Collection backing the vector store.
        embed_model_name: OpenAI embedding model name.

    Returns:
        The ``VectorStoreIndex`` built over the created nodes.
    """
    client = qdrant_client.QdrantClient(url=url, port=port)

    Settings.embed_model = OpenAIEmbedding(model_name=embed_model_name)

    vector_store = QdrantVectorStore(client=client, collection_name=collection_name)
    nodes = create_nodes_from_nodes_data(creat_node_data_from_input_dir(path))

    # Report a count rather than dumping every node (and its full table text).
    print(f"Created {len(nodes)} node(s) from '{path}'")

    logging.info(
        "Indexing %d node(s) into collection '%s'", len(nodes), collection_name
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    return VectorStoreIndex(nodes, storage_context=storage_context)

In [None]:
# Index the files in this directory; the returned index is the cell's output.
add_data_to_qdrant("latest_modified_fmr_data/FAQs/queries for chatbot")