## Data Preprocessing

1. Load the **PDF or Markdown** files in order
2. Extract the **data/content** from each file
3. Then perform chunking - *because the context window of an LLM is limited*
4. Then pass the chunks to the **Embedding Models**

In [1]:
# 1. Loading of the PDFs

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import os

# Directory that holds the source PDF/Markdown files.
# The machine-specific default below can be overridden by setting the
# ENTERPRISE_KNOWLEDGE_DATA_DIR environment variable, so the notebook can run
# on another machine without editing this cell. An unset or empty variable
# falls back to the original hard-coded path.
FILE_PATH = (
    os.environ.get("ENTERPRISE_KNOWLEDGE_DATA_DIR")
    or "C:/Users/sayan/OneDrive/Desktop/internship-project/enterprise-knowledge-copilot/data"
)

# Load the PDF files found in a directory
def load_pdf_file(file_path=FILE_PATH):
    """Load documents from the files matching ``*.pdf`` in ``file_path``.

    The directory is scanned with ``DirectoryLoader`` (``recursive=True``) and
    each match is parsed with ``PyPDFLoader``.

    Returns:
        The list produced by the loader, or an empty list (after printing an
        error line) when ``file_path`` is not an existing directory.
    """
    if os.path.isdir(file_path):
        pdf_loader = DirectoryLoader(
            file_path, glob="*.pdf", loader_cls=PyPDFLoader, recursive=True
        )
        return pdf_loader.load()

    # Not a directory: report the problem and hand back nothing to process.
    print(f"Error: The provided path '{file_path}' is not a directory.")
    return []


# Load the Markdown files found in a directory
def load_markdown_file(file_path=FILE_PATH):
    """Load documents from the files matching ``**/*.md`` in ``file_path``.

    Each match is read with ``TextLoader`` using UTF-8 encoding.

    Returns:
        The list produced by the loader, or an empty list (after printing an
        error line) when ``file_path`` is not an existing directory.
    """
    if os.path.isdir(file_path):
        markdown_loader = DirectoryLoader(
            file_path,
            glob="**/*.md",
            loader_cls=TextLoader,
            loader_kwargs={"encoding": "utf-8"},
        )
        return markdown_loader.load()

    # Not a directory: report the problem and hand back nothing to process.
    print(f"Error: The provided path '{file_path}' is not a directory.")
    return []

In [3]:
import re
import markdown
from bs4 import BeautifulSoup

# Matches a YAML front-matter block only when it opens on the very first line
# of the text and is closed by a line that consists solely of `---`.
# (A looser pattern such as `^---.*?---` would stop at the first `---` found
# anywhere, e.g. inside a YAML value, and leave part of the header behind.)
_FRONT_MATTER_RE = re.compile(
    r"\A---[ \t]*\r?\n(?:.*?\r?\n)?---[ \t]*(?:\r?\n|\Z)",
    flags=re.DOTALL,
)


# Function to clean markdown text
def clean_markdown(md_text: str) -> str:
    """Convert Markdown source into plain text.

    Steps, in order:
      1. Drop a leading YAML front-matter block, if present.
      2. Render the Markdown to HTML.
      3. Remove ``script``, ``style``, ``iframe``, ``img`` and ``table``
         elements from the HTML.
      4. Extract the remaining text, one text node per line.
      5. Replace any leftover ``[label](target)`` link syntax with ``label``.
      6. Collapse runs of blank lines and runs of spaces/tabs.

    Args:
        md_text: Raw Markdown text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # 1. Remove YAML front matter (only a block at the very start of the text)
    md_text = _FRONT_MATTER_RE.sub("", md_text, count=1)

    # 2. Convert markdown → HTML
    html = markdown.markdown(md_text)

    # 3. Parse HTML
    soup = BeautifulSoup(html, "html.parser")

    # 4. Remove unwanted tags together with their contents
    for tag in soup(["script", "style", "iframe", "img", "table"]):
        tag.decompose()

    # 5. Get text
    text = soup.get_text(separator="\n")

    # 6. Remove markdown links but keep text (link syntax that survived
    #    rendering and is still present in the extracted text)
    text = re.sub(r"\[(.*?)\]\(.*?\)", r"\1", text)

    # 7. Normalize whitespace
    text = re.sub(r"\n{2,}", "\n\n", text)
    text = re.sub(r"[ \t]+", " ", text)

    return text.strip()

In [4]:
# Reduce each document to its cleaned text plus the file name of its source.
def filter_to_minimal_docs(docs):
    """Build a new list of ``Document`` objects with minimal metadata.

    For every input document the ``source`` metadata entry is reduced to its
    base file name and the page content is passed through ``clean_markdown``.
    All other metadata is dropped.
    """
    def _to_minimal(entry):
        # Resolve the source name first, then clean the content.
        source_name = os.path.basename(entry.metadata['source'])
        return Document(
            page_content=clean_markdown(entry.page_content),
            metadata={"source": source_name},
        )

    return [_to_minimal(entry) for entry in docs]

In [5]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=100):
    """Split documents into overlapping chunks.

    Args:
        minimal_docs: Documents to split; handed to the splitter's
            ``split_documents`` method.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value that was previously hard-coded.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [6]:
# Load the Markdown files from the default directory (FILE_PATH).
# NOTE(review): load_pdf_file() is not called in this cell, so only Markdown
# content goes through the rest of the pipeline — confirm this is intended.
docs = load_markdown_file()

# Clean each document's text and keep only the source file name as metadata.
minimal_docs = filter_to_minimal_docs(docs)

# Split the minimized documents into text chunks
text_chunks = text_split(minimal_docs)

## Performing the Vector Embedding on Text Data:

The embedding model is `all-MiniLM-L6-v2`, loaded through **Sentence-Transformers**

1. The chunks are processed and converted to vectors

In [7]:
from sentence_transformers import SentenceTransformer
# Load the Sentence-Transformers model that is used to embed both the text
# chunks and the search query.
embeddingModel = SentenceTransformer('all-MiniLM-L6-v2')

Loading weights: 100%|██████████| 103/103 [00:01<00:00, 86.18it/s, Materializing param=pooler.dense.weight]                              
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [8]:
# Number of text chunks produced by the splitting step.
# NOTE(review): `last_index` is not referenced in the visible cells — confirm
# it is still needed.
last_index = len(text_chunks)

### Storing the vector embeddings for each text chunk

In [9]:
# Embed all chunk texts with one batched encode() call instead of one model
# call per chunk. The guard keeps an empty chunk list from reaching the model.
chunk_texts = [chunk.page_content for chunk in text_chunks]
chunk_vectors = embeddingModel.encode(chunk_texts) if chunk_texts else []

# One record per chunk: a 1-based id, the embedding as a plain list, and the
# chunk's source file name and text as metadata.
# The counter is named `chunk_id` (not `id`) so the builtin `id()` is not
# overwritten in the notebook's global namespace.
vectorEmbeddings = [
    {
        "id": chunk_id,
        "vector": vector.tolist(),
        "meta": {
            "source": chunk.metadata['source'],
            "text": chunk.page_content,
        },
    }
    for chunk_id, (chunk, vector) in enumerate(
        zip(text_chunks, chunk_vectors), start=1
    )
]

In [10]:
# Length of the first stored embedding vector; the same expression supplies
# the "dimension" field when the index is created further down.
len(vectorEmbeddings[0]["vector"])

384

### Setting up the connection to the Endee API (powered by Flask)

In [42]:
import requests

# Name of the index used for every Endee request in this notebook
# (create, get, upsert and query).
INDEX_NAME = "enterprise_knowledge_base"

# Base URL of the Endee API service (loopback address, port 8000);
# endpoint paths such as /index/create are appended to it.
ENDEE_URL = "http://127.0.0.1:8000"

### Creating an Index in the Endee Vector Database

In [43]:
# Payload for creating an index in Endee Vector DB.
# "dimension" is taken from the first embedding, so `vectorEmbeddings` must be
# non-empty when this cell runs (otherwise the [0] lookup raises IndexError).
# NOTE(review): the meaning of the "precision" value "INT16D" is not visible
# here — confirm it against the Endee API documentation.
payload_for_create_index = {
    "index_name": INDEX_NAME,
    "dimension": len(vectorEmbeddings[0]["vector"]),
    "precision": "INT16D"
}

In [44]:
# Ask the Endee API to create the index.
# An explicit timeout is passed because requests otherwise waits indefinitely,
# which would leave the cell hanging if the service is down or unresponsive.
response_for_create_index = requests.post(
    f"{ENDEE_URL}/index/create",
    json=payload_for_create_index,
    timeout=30,
)
print(f"Message for index creation: {response_for_create_index.json()}")

Message for index creation: {'index_name': 'enterprise_knowledge_base', 'status': 'index created'}


### Checking for the Existence of the Index in the Endee Vector Database

In [45]:
# Ask the Endee API for the index by name and print the JSON reply.
# An explicit timeout is passed because requests otherwise waits indefinitely,
# which would leave the cell hanging if the service is down or unresponsive.
response_for_get_index = requests.post(
    f"{ENDEE_URL}/index/get",
    json={
        "index_name": INDEX_NAME
    },
    timeout=30,
)
print(response_for_get_index.json())

{'index_name': 'enterprise_knowledge_base', 'status': 'index loaded'}


### Inserting the Embedded Vectors into Endee Vector DB 

In [97]:
# Upsert payload carrying every embedded chunk.
payload_to_insert_multiple_data = {
    "index_name": INDEX_NAME,
    "embedded_vectors": vectorEmbeddings
}

# Upsert payload carrying only the first embedded chunk; it is referenced only
# by the commented-out single-insert request in the following cell.
payload_to_insert_single_data = {
    "index_name": INDEX_NAME,
    "embedded_vectors": [vectorEmbeddings[0]]
}

In [98]:
# Send all embedded chunks to the Endee upsert endpoint in a single request.
# An explicit timeout is passed because requests otherwise waits indefinitely;
# it is more generous than for the other calls since this request carries
# every vector.
response_for_multiple_insert = requests.post(
    f"{ENDEE_URL}/index/upsert",
    json=payload_to_insert_multiple_data,
    timeout=120,
)

# Alternative kept for manual testing: upsert only the first vector.
# response_for_single_insert = requests.post(
#     f"{ENDEE_URL}/index/upsert",
#     json=payload_to_insert_single_data,
#     timeout=30,
# )

print(response_for_multiple_insert.json())
# print(response_for_single_insert.json())

{'count': 413, 'status': 'vectors upserted'}


### Retrieving the Top K most relevant data

In [99]:
# Example question; it is embedded with the same model as the stored chunks
# and converted to a plain list so it can be sent as JSON.
query = "How to become a developer at Gitlab?"
embedding_for_query = embeddingModel.encode(query).tolist()

In [100]:
# Query payload: the index to search, the embedded question, and how many
# results to request ("top_k").
payload = {
    "index_name": INDEX_NAME,
    "vector": embedding_for_query,
    "top_k": 5
}

In [101]:
# Sends a query to the Endee API
response = requests.post(
    f"{ENDEE_URL}/index/query",
    json=payload
)

In [105]:
# Decode the JSON body of the query response.
data = response.json()