# Chunking, Embedding, and Search (Markdown)
### Use this notebook to perform chunking, embedding, and search on your documents and get similarity scores for your query.

In [None]:
# LangChain packages: the text splitters and document loaders imported below.
%pip install langchain-community
%pip install langchain
# Presumably needed by UnstructuredMarkdownLoader (TODO confirm); install output is discarded.
%pip install unstructured > /dev/null
# Data handling, embedding models, and cosine-similarity utilities.
%pip install --quiet pandas sentence-transformers scikit-learn numpy
%pip install einops # For Nomic only


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.document_loaders import TextLoader

from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import numpy as np
import time
import os

In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth

from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive

In [None]:
def load_secrets(secrets_name, project_id):
    """Fetch the latest version of a secret from Google Cloud Secret Manager.

    The Colab user is authenticated on every call before the secret is read.

    :param secrets_name: Name of the secret inside the project.
    :param project_id: Google Cloud project that owns the secret.
    :return: The secret payload decoded as a UTF-8 string.
    """
    auth.authenticate_user()
    manager = secretmanager.SecretManagerServiceClient()

    # Always address the most recent version of the secret.
    latest_version_path = (
        f"projects/{project_id}/secrets/{secrets_name}/versions/latest"
    )
    reply = manager.access_secret_version(request={"name": latest_version_path})
    return reply.payload.data.decode('UTF-8')

In [None]:
# Google Cloud project that stores the secrets used by this notebook.
project_id = 'botchagalupep1'

# The OpenAI key is also exported through the process environment.
openai_api_key = load_secrets(secrets_name="openai_api_key", project_id=project_id)
os.environ['OPENAI_API_KEY'] = openai_api_key

# An older secret named "mdb_uri" was used here previously.
MONGODB_ATLAS_CLUSTER_URI = load_secrets(secrets_name="MDB_CLUSTER0_URI", project_id=project_id)
langsmith_api_key = load_secrets(secrets_name="langsmith_api_key", project_id=project_id)
hf_api_key = load_secrets(secrets_name="hf_api_key", project_id=project_id)
# Do not print these values: cell output is saved together with the notebook.

In [None]:
# Mount Google Drive at /content/gdrive; the Markdown source directory read later lives under this path.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
def read_markdown_files(directory_path):
    """
    Read every Markdown (``.md``) file located directly inside a directory.

    Files are read in sorted file-name order so that repeated runs return the
    documents in the same order (``os.listdir`` gives no ordering guarantee).
    Sub-directories are not searched. The path of each file is printed as it
    is read.

    :param directory_path: Path to the directory whose Markdown files are to be read.
    :return: A list of strings, one per Markdown file, holding that file's full
        text. The list is empty when the path is not a directory or contains
        no ``.md`` files.
    """
    markdown_contents = []

    # Report a bad path and return an empty result instead of raising.
    if not os.path.isdir(directory_path):
        print(f"The path {directory_path} is not a valid directory.")
        return markdown_contents

    # Regular files only; sorted to make the result deterministic.
    markdown_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.endswith('.md') and os.path.isfile(os.path.join(directory_path, name))
    )

    for file_name in markdown_files:
        file_path = os.path.join(directory_path, file_name)
        print(file_path)
        with open(file_path, 'r', encoding='utf-8') as md_file:
            markdown_contents.append(md_file.read())

    return markdown_contents

In [None]:
# Load every Markdown file from the catalog folder on the mounted Drive.
directory_path = '/content/gdrive/MyDrive/GAI/catalog-yaml-format'
markdown_document = read_markdown_files(directory_path)

# Join the documents with a blank line between them. Joining with an empty
# string would glue the last line of one file to the first heading of the
# next whenever a file lacks a trailing newline, and that heading would then
# no longer start a line for the header-based splitter.
md = '\n\n'.join(markdown_document)

In [None]:
# Split on the top two heading levels only; level three ("###") is left disabled.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]

# strip_headers=False is passed so heading lines are not stripped from the chunks.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

md_header_splits = markdown_splitter.split_text(md)
data = md_header_splits
print(data)



In [None]:
# Column layout of every frame returned by tokenize_text.
_CHUNK_COLUMNS = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']


def _chunks_to_frame(chunks):
    """Build the per-chunk statistics table from a list of chunk strings."""
    records = [
        {
            'Chunk #': index,
            'Text Chunk': chunk,
            'Character Count': len(chunk),
            # Whitespace-separated word count, not a model tokenizer count.
            'Token Count': len(chunk.split()),
        }
        for index, chunk in enumerate(chunks)
    ]
    # Passing the columns explicitly keeps the layout even when there are no chunks.
    return pd.DataFrame(records, columns=_CHUNK_COLUMNS)


def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Split ``text`` into chunks with the selected splitter and tabulate them.

    :param method: ``"RecursiveCharacterTextSplitter"`` or
        ``"MarkdownHeaderTextSplitter"``.
    :param text: The text to split. ``None``, empty, or whitespace-only text
        yields an empty frame.
    :param chunk_size: Maximum chunk length in characters. Only used by
        ``RecursiveCharacterTextSplitter``.
    :param chunk_overlap: Overlap between consecutive chunks in characters.
        Only used by ``RecursiveCharacterTextSplitter``.
    :param num_chunks: Maximum number of chunks to keep (converted with ``int``).
    :return: A DataFrame with the columns ``Chunk #``, ``Text Chunk``,
        ``Character Count`` and ``Token Count``.
    :raises ValueError: If ``method`` is not one of the supported splitter names.
    """
    num_chunks = int(num_chunks)

    # Nothing to split: return an empty frame with the standard columns.
    if text is None or not text.strip():
        return _chunks_to_frame([])

    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        chunks = text_splitter.split_text(text)[:num_chunks]
    elif method == "MarkdownHeaderTextSplitter":
        # Split on heading levels one to five. Other separators (tabs, tables,
        # code fences, horizontal rules, whitespace) were tried previously and
        # are not enabled.
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
            ("####", "Header 4"),
            ("#####", "Header 5"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        md_header_splits = markdown_splitter.split_text(text)[:num_chunks]
        # Keep only the text of each Document; header metadata is discarded.
        chunks = [doc.page_content for doc in md_header_splits]
    else:
        # Fail loudly: an unknown name used to produce a column-less frame that
        # only broke later with an unrelated KeyError.
        raise ValueError(
            f"Unsupported method {method!r}; expected "
            "'RecursiveCharacterTextSplitter' or 'MarkdownHeaderTextSplitter'."
        )

    return _chunks_to_frame(chunks)

def calculate_embeddings(df):
    """
    Add an ``Embeddings`` column holding one embedding per text chunk.

    Uses the notebook-level ``model`` to encode the ``Text Chunk`` column.
    The frame is modified in place and also returned; an empty frame is
    returned untouched. The elapsed time is printed in milliseconds.
    """
    if df.empty:
        return df

    started = time.perf_counter()

    # Encode all chunks in a single call and store each vector as a plain list.
    vectors = model.encode(df['Text Chunk'].tolist())
    df['Embeddings'] = vectors.tolist()

    finished = time.perf_counter()
    print(f'Embedding calculation returned in {(finished-started)*1000} ms')

    return df

def search_similar_chunks(query, df_with_embeddings):
    """
    Score every chunk against the query and return the chunks best-first.

    The query is encoded with the notebook-level ``model`` and compared to the
    ``Embeddings`` column using cosine similarity. The scores are written to a
    ``Similarity`` column on the input frame (inserted after the first column,
    or overwritten if the column already exists, so the function can be re-run
    on the same frame). The elapsed time is printed in milliseconds.

    :param query: Query text to compare against the chunks.
    :param df_with_embeddings: Frame with an ``Embeddings`` column. An empty
        frame is accepted and yields an empty result.
    :return: A new frame sorted by ``Similarity`` in descending order.
    """
    t_start = time.perf_counter()

    if df_with_embeddings.empty:
        # No chunks to score: skip encoding, an empty stack cannot be built.
        similarity_scores = np.empty(0, dtype=float)
    else:
        # Compute the query embedding
        query_embedding = model.encode([query])[0]

        # Calculate similarity scores
        chunk_embeddings = np.vstack(df_with_embeddings['Embeddings'].tolist())
        similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    if 'Similarity' in df_with_embeddings.columns:
        # DataFrame.insert refuses duplicate column names, so overwrite instead.
        df_with_embeddings['Similarity'] = similarity_scores
    else:
        # Place the scores right after the first column ('Chunk #').
        position = min(1, len(df_with_embeddings.columns))
        df_with_embeddings.insert(position, 'Similarity', similarity_scores)

    t_end = time.perf_counter()
    print(f'Similarity search returned in {(t_end-t_start)*1000} ms')

    # Return the dataframe sorted by similarity scores in descending order
    return df_with_embeddings.sort_values(by='Similarity', ascending=False)

def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Chunk ``text`` with the chosen splitter, then embed every chunk.

    Returns the frame produced by ``tokenize_text`` after it has been passed
    through ``calculate_embeddings``.
    """
    return calculate_embeddings(
        tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    )

def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    """
    Run the full pipeline: chunk the text, embed the chunks, optionally rank them.

    :param method: Splitter name forwarded to ``process_and_embed``.
    :param text: Text to chunk.
    :param chunk_size: Chunk size forwarded to ``process_and_embed``.
    :param chunk_overlap: Chunk overlap forwarded to ``process_and_embed``.
    :param num_chunks: Maximum number of chunks to keep.
    :param query: Query used to rank the chunks; a falsy value skips ranking.
    :return: A frame with the chunk columns plus ``Embeddings``, and with
        ``Similarity`` as the second column when a query is given. When no
        chunks were produced, an empty frame with those columns is returned.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']
    if query:
        # Similarity sits directly after 'Chunk #' in the displayed table.
        columns.insert(1, 'Similarity')

    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)

    # With no chunks there is no 'Embeddings' column to select or search, so
    # return an empty table of the expected shape instead of raising KeyError.
    if df_with_embeddings.empty:
        return pd.DataFrame(columns=columns)

    if query:
        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
    return df_with_embeddings[columns]


In [None]:
# Per-model names, keyed by the embedding model identifier.
# NOTE(review): 'embedding_attr' and 'index_name' look like a document field
# name and a vector index name for each model; confirm against the consumer.
model_mappings = {
    'BAAI/bge-small-en-v1.5': {
        'embedding_attr': 'plot_embedding_bge_small',
        'index_name': 'idx_plot_embedding_bge_small',
    },
    'sentence-transformers/all-mpnet-base-v2': {
        'embedding_attr': 'plot_embedding_mpnet_base_v2',
        'index_name': 'idx_plot_embedding_mpnet_base_v2',
    },
    'sentence-transformers/all-MiniLM-L6-v2': {
        'embedding_attr': 'plot_embedding_minilm_l6_v2',
        'index_name': 'idx_plot_embedding_minilm_l6_v2',
    },
    # Disabled entries:
    #   'sentence-transformers/all-MiniLM-L12-v2' -> plot_embedding_minilm_l12_v2 / idx_plot_embedding_minilm_l12_v2
    #   'BAAI/bge-large-en-v1.5' -> plot_embedding_bge_large / idx_plot_embedding_bge_large, embedding_length 1024
    #     (bge-large takes too long and consumes too much memory)
}

In [None]:
# Query whose similarity to each chunk is scored.
# Another query tried earlier: "What is a CodeComponent"
query = "SaaS users"

# Embedding model read by calculate_embeddings and search_similar_chunks.
# Alternatives tried: 'all-MiniLM-L6-v2', 'BAAI/bge-small-en-v1.5',
# 'sentence-transformers/all-mpnet-base-v2'.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# model repository run locally; confirm the source is trusted.
model = SentenceTransformer(
    'nomic-ai/nomic-embed-text-v1',
    trust_remote_code=True,
)

# Splitter selection; the alternative is "RecursiveCharacterTextSplitter".
method = "MarkdownHeaderTextSplitter"

chunk_size = 1000
chunk_overlap = 25
num_chunks = 50
text = str(md)

df_with_embeddings = update_output(
    method=method,
    text=text,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    num_chunks=num_chunks,
    query=query,
)
df_with_embeddings

Embedding calculation returned in 56478.096300998004 ms
Similarity search returned in 76.28776899946388 ms


Unnamed: 0,Chunk #,Similarity,Text Chunk,Character Count,Token Count,Embeddings
36,36,0.558414,:::caution SaaS users should only use this to ...,608,92,"[0.013529987074434757, -0.007926667109131813, ..."
38,38,0.259418,- **primaryEmail** (string) [required]\nThe em...,994,150,"[0.07691776752471924, -0.05859971046447754, -0..."
40,40,0.255744,This kind of entity describes the infrastructu...,289,42,"[0.0025577698834240437, -0.07189366221427917, ..."
34,34,0.243376,- **owner** (string array) [required]\nAn arra...,472,63,"[0.02094290778040886, -0.03774764761328697, 0...."
46,46,0.24287,- **profile** [optional]\nOptional profile inf...,799,128,"[0.067266084253788, -0.04007098823785782, 0.00..."
32,32,0.21854,"This kind of entity groups Code Components, Pl...",259,39,"[-0.005278993863612413, -0.03062102384865284, ..."
44,44,0.213254,A Person describes an individual such as an em...,190,31,"[-0.0006960183964110911, 0.01870129071176052, ..."
48,48,0.203391,This kind of entity groups platforms and compo...,163,23,"[0.005435842089354992, -0.030816255137324333, ..."
24,24,0.173958,This kind of entity is used to describe docume...,135,22,"[0.006209986750036478, -0.04820038750767708, 0..."
11,11,0.165489,A human readable description of the entity to ...,70,12,"[-0.01930837146937847, -0.01088925451040268, 0..."
