In [1]:
from dotenv import load_dotenv, find_dotenv
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import os

# Load variables from a discovered .env file into the process environment.
load_dotenv(find_dotenv())

# Gemini embedding client; the API key is read from the GEMINI_API_KEY environment variable.
embeddings = GoogleGenerativeAIEmbeddings(
    google_api_key=os.getenv("GEMINI_API_KEY"),
    model="models/embedding-001",
)

# Sanity check: embed a short string and display the first five components.
vector = embeddings.embed_query("hello, world!")
vector[:5]

  from .autonotebook import tqdm as notebook_tqdm


[0.05168594419956207,
 -0.030764883384108543,
 -0.03062233328819275,
 -0.02802734449505806,
 0.01813092641532421]

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language
from langchain_core.documents import Document
import os

# Root folder holding the markdown knowledge base. Set the KNOWLEDGE_BASE_PATH
# environment variable to point somewhere else; when it is unset the original
# machine-specific location is used, so existing behaviour is unchanged.
knowledge_base_path = os.environ.get(
    "KNOWLEDGE_BASE_PATH",
    r"D:\Github_personal\Knowledge_base_FB\chatbot\client\docs",
)  # alternative: os.path.join("../../../../knowledge_base")

# os.walk silently yields nothing for a missing directory, which would only
# surface later as an empty corpus; fail here with a clear message instead.
if not os.path.isdir(knowledge_base_path):
    raise FileNotFoundError(f"Knowledge base directory not found: {knowledge_base_path}")

# Splitter configured with the markdown-specific separators, producing chunks
# of at most 300 characters with a 20-character overlap.
seperators = RecursiveCharacterTextSplitter.get_separators_for_language(Language.MARKDOWN)
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=20,
    separators=seperators,
)

# Paths of every .md file under the knowledge base, relative to knowledge_base_path.
filepaths = []
for root, dirs, files in os.walk(knowledge_base_path):
    for file in files:
        if not file.endswith(".md"):
            continue
        filepaths.append(os.path.relpath(os.path.join(root, file), knowledge_base_path))

def get_docs_from_file_paths(filepaths):
    """Read each file, split its text into chunks, and wrap every chunk in a Document.

    Args:
        filepaths: Iterable of file paths relative to the module-level
            ``knowledge_base_path``.

    Returns:
        A list of ``Document`` objects, one per chunk, in input order. Each
        document's metadata stores the relative path under the ``"filepath"`` key.
    """
    chunked_docs = []
    for relative_path in filepaths:
        absolute_path = os.path.join(knowledge_base_path, relative_path)
        with open(absolute_path, "r", encoding="utf-8") as handle:
            text = handle.read()
        # Chunking is delegated to the module-level splitter.
        for piece in splitter.split_text(text):
            chunked_docs.append(
                Document(page_content=piece, metadata={"filepath": relative_path})
            )
    return chunked_docs

# Chunk every markdown file discovered above into Document objects.
docs = get_docs_from_file_paths(filepaths)
# Spot-check one chunk; index 100 is arbitrary and requires more than 100 chunks to exist.
docs[100]


Document(metadata={'filepath': 'Azure360\\Azure Best Practices\\Management Infrastructure.md'}, page_content='| Resource quotas and limits | Enforce [resource quotas and limits](https://learn.microsoft.com/en-us/azure/azure-resource-manager/management/azure-subscription-service-limits) to control usage and costs within specific projects, ensuring resources are allocated appropriately. |')

In [4]:
# Total number of chunks produced from the knowledge base.
len(docs)

5467

In [5]:
from langchain_community.vectorstores import FAISS

# Directory where the FAISS index is persisted. Set the EMBEDDINGS_PATH
# environment variable to point somewhere else; when it is unset the original
# machine-specific location is used, so existing behaviour is unchanged.
embeddings_path = os.environ.get(
    "EMBEDDINGS_PATH",
    r"D:\Github_personal\Knowledge_base_FB\chatbot\embeddings",
)  # alternative: os.path.join(knowledge_base_path, "..", "embeddings")

# Embed every chunk, build the FAISS index from the results, and write it to disk.
index = FAISS.from_documents(docs, embeddings)
index.save_local(embeddings_path)

In [6]:
# Reload the index from disk. allow_dangerous_deserialization opts in to
# deserializing the stored files, which is acceptable only for files this
# notebook saved itself; never point it at an untrusted directory.
index = FAISS.load_local(embeddings_path, embeddings, allow_dangerous_deserialization=True)

# Retriever returning at most 5 hits, keeping only those scoring at least 0.5.
retriever = index.as_retriever(
    search_kwargs={"k": 5, "score_threshold": 0.5},
    search_type="similarity_score_threshold",
)

In [7]:
# Smoke-test the retriever with a sample query. Despite the singular name,
# `doc` holds the list of documents returned for the query.
doc = retriever.invoke("Rag360")
doc

[Document(metadata={'filepath': 'AI360\\Overview.md'}, page_content='## Purpose of RAG360'),
 Document(metadata={'filepath': 'DevOps\\Kubernites\\Overview.md'}, page_content='## Purpose of RAG360'),
 Document(metadata={'filepath': 'RAG360\\Overview.md'}, page_content='## Purpose of RAG360'),
 Document(metadata={'filepath': 'RAG360\\Introduction.md'}, page_content='## Purpose of RAG360\n1. Knowledge on AI is not tied to an engineer or project, but available in the company for everyone to access and improve on.\n2. Collection of all knowledge that is gained by the Venture-AI team through the exploration and projects.'),
 Document(metadata={'filepath': 'RAG360\\Introduction.md'}, page_content='import ZoomImage from "../../src/components/Zooming/ZoomImage";\n\n**RAG360** is a comprehensive resource designed,')]

In [50]:

# Steps for incrementally updating the embeddings
# 1. Read the last embedded commit hash from /embeddings/commit_hash.txt.
# 2. Diff that commit against the current commit (HEAD).
# 3. Collect the files that were added, modified, or deleted.
# 4. Delete the existing embeddings of those added/modified/deleted files from the index.
# 5. Compute embeddings for the added and modified files as of the current commit.
# 6. Add those new embeddings to the index.
# 7. Save the current commit hash to /embeddings/commit_hash.txt.

In [49]:
from git import Repo
import os

# Path to your repository (relative to the notebook's working directory)
repo_path = '../../../../'
repo = Repo(repo_path)

# File recording the commit the saved embeddings were built from
commit_hash_file = os.path.join(embeddings_path, 'commit_hash.txt')

# Read the last commit hash
with open(commit_hash_file, 'r') as file:
    last_commit_hash = file.read().strip()

# Diff from the last embedded commit to the current HEAD
diff_index = repo.commit(last_commit_hash).diff('HEAD')

# Repository-relative prefix of the knowledge base. The trailing slash keeps
# sibling directories such as "knowledge_base_old/" from matching.
knowledge_base_prefix = 'knowledge_base/'

# Markdown files under the knowledge base touched by the diff, expressed
# relative to the knowledge base and normalised for the local OS.
# Both sides of each diff entry are inspected: for a renamed file a_path is
# the old location and b_path the new one, so looking at a_path alone would
# drop the old chunks without ever picking up the file at its new path.
# Only the leading prefix is stripped (slicing rather than str.replace), so a
# nested "knowledge_base/" segment deeper in the path is left intact.
affected_markdown_files = {
    os.path.normpath(path[len(knowledge_base_prefix):])
    for item in diff_index
    for path in (item.a_path, item.b_path)
    if path and path.startswith(knowledge_base_prefix) and path.endswith('.md')
}

def get_doc_ids_from_filepaths(filepaths, vector_index=None):
    """Return the docstore ids of every stored chunk that came from one of ``filepaths``.

    Args:
        filepaths: Iterable of file paths to match against each stored
            document's ``metadata["filepath"]``.
        vector_index: Object exposing ``index_to_docstore_id`` and ``docstore``
            to search. Defaults to the module-level ``index`` so existing
            one-argument calls keep working.

    Returns:
        A list of matching docstore ids, in the iteration order of
        ``index_to_docstore_id``.
    """
    store = index if vector_index is None else vector_index
    # Materialise once so membership tests stay cheap whatever iterable is passed.
    wanted = set(filepaths)
    matching_ids = []
    for doc_id in store.index_to_docstore_id.values():
        doc = store.docstore.search(doc_id)
        # The docstore may hand back a plain "not found" string instead of a
        # document, and a document may lack a "filepath" entry; skip both
        # rather than raising.
        metadata = getattr(doc, "metadata", None) or {}
        if metadata.get("filepath") in wanted:
            matching_ids.append(doc_id)
    return matching_ids

# Collect the docstore ids of every chunk that came from an affected markdown file;
# per the update steps, these are the entries to remove from the index before re-embedding.
doc_ids_to_delete = get_doc_ids_from_filepaths(affected_markdown_files)

def get_current_affected_files():
    """Return the affected markdown files that currently exist on disk.

    Walks the module-level ``knowledge_base_path``, computes each ``.md``
    file's path relative to that root, and keeps the ones that appear in the
    module-level ``affected_markdown_files``.

    Returns:
        A set of paths relative to ``knowledge_base_path``.
    """
    markdown_relpaths = (
        os.path.relpath(os.path.join(folder, name), knowledge_base_path)
        for folder, _subdirs, names in os.walk(knowledge_base_path)
        for name in names
        if name.endswith(".md")
    )
    return {relpath for relpath in markdown_relpaths if relpath in affected_markdown_files}

# Show which knowledge-base markdown files the diff touched.
print(affected_markdown_files)

# Re-chunk only the affected files that still exist on disk.
current_affected_files = get_current_affected_files()
docs_to_add = get_docs_from_file_paths(current_affected_files)

# Hash of the commit HEAD currently points at; this is the cell's displayed value.
latest_commit_hash = repo.head.object.hexsha
latest_commit_hash

{'AI\\ThinkPad.md', 'ThinkPad.md', 'Devops.md', 'Python\\Books.md', 'Python\\Profiling.md', 'General.md', 'Python\\Python.md'}


'cc245121dec183aad09d143646c79c5740e6c652'

In [32]:
# Materialise the affected paths as a list (no transformation is applied) and print them.
normalized_filepaths = list(affected_markdown_files)
print(normalized_filepaths)

['README.md', 'knowledge_base/AI/ThinkPad.md', 'knowledge_base/General.md', 'chatbot/README.md', 'knowledge_base/Python/Books.md', 'knowledge_base/ThinkPad.md', 'scripts/README_with_placeholders.md', 'CONTRIBUTING.md', 'CONTRIBUTORS.md', 'knowledge_base/Python/Python.md', 'knowledge_base/Devops.md', 'knowledge_base/Python/Profiling.md']
