# Chunking, Embedding, and Search (OpenContext Markdown)
## Loading from a Hugging Face dataset
### Use this notebook to perform chunking, embedding, and search on your documents and get similarity scores for your query.

In [None]:
# Install the third-party packages imported by the cells below.
%pip install langchain-community
%pip install langchain
# Output of this install is redirected to /dev/null.
%pip install unstructured > /dev/null
# numpy is pinned to 1.24.4; the other packages are installed unpinned.
%pip install pandas sentence-transformers scikit-learn numpy==1.24.4
%pip install einops # For Nomic only


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.document_loaders import TextLoader

from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import time
import os

You need to load the Markdown directory from the root directory of the repo. Ask the instructor for details.

In [None]:
def read_markdown_files(directory_path):
    """
    Reads and returns the content of all Markdown files in the given directory path.

    Only regular files located directly inside ``directory_path`` whose names end
    with ``.md`` are read; sub-directories are not searched. Files are read in
    sorted file-name order so the result is identical on every run
    (``os.listdir`` on its own returns entries in arbitrary order). The path of
    each file is printed as it is read.

    :param directory_path: Path to the directory whose Markdown files are to be read.
    :return: A list of strings, one per Markdown file, each holding the full text of
        that file decoded as UTF-8. The list is empty (and a message is printed)
        when ``directory_path`` is not a directory.
    """
    markdown_contents = []

    # Bail out early, with a visible message, when the path is not a directory.
    if not os.path.isdir(directory_path):
        print(f"The path {directory_path} is not a valid directory.")
        return markdown_contents

    # Sort so that the order of the returned contents is reproducible.
    markdown_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.endswith('.md') and os.path.isfile(os.path.join(directory_path, name))
    )

    for file_name in markdown_files:
        file_path = os.path.join(directory_path, file_name)
        print(file_path)
        with open(file_path, 'r', encoding='utf-8') as md_file:
            markdown_contents.append(md_file.read())

    return markdown_contents

To run this exercise, unzip the dataset into a directory named `catalog-yaml-format` in the same folder as this notebook, instead of using the Google Drive dataset.

In [None]:
# Example usage
# Prefer a local copy of the dataset (a `catalog-yaml-format` directory in the
# current working directory) and fall back to the Google Drive location when no
# local copy exists.
local_directory_path = 'catalog-yaml-format'
directory_path = (
    local_directory_path
    if os.path.isdir(local_directory_path)
    else '/content/gdrive/MyDrive/GAI/catalog-yaml-format'
)
markdown_document = read_markdown_files(directory_path)

# Join with a blank line: if a file does not end with a newline, a plain ''.join
# would glue its last line onto the next file's first heading, and that heading
# would no longer start a line for the header-based splitter to detect.
md = '\n\n'.join(markdown_document)

In [None]:
# Split the combined Markdown on level-1 and level-2 headings only. "###" is not
# listed, so it is not used as a split point in this cell.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

# `md_header_splits` and `data` are two names for the same split result.
md_header_splits = markdown_splitter.split_text(md)
data = md_header_splits
print(data)



In [None]:
def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Splits the input text into chunks with the selected method and tabulates them.

    :param method: Either "RecursiveCharacterTextSplitter" or
        "MarkdownHeaderTextSplitter".
    :param text: The text to split. ``None``, empty, or whitespace-only text
        yields an empty table.
    :param chunk_size: Chunk size passed to RecursiveCharacterTextSplitter
        (measured with ``len``, i.e. in characters). Not used by the
        "MarkdownHeaderTextSplitter" method, whose chunks follow the headings
        and may therefore be longer than ``chunk_size``.
    :param chunk_overlap: Overlap passed to RecursiveCharacterTextSplitter.
        Not used by the "MarkdownHeaderTextSplitter" method.
    :param num_chunks: Maximum number of chunks to keep (converted with ``int``);
        chunks beyond this count are dropped.
    :return: A DataFrame with the columns 'Chunk #', 'Text Chunk',
        'Character Count' and 'Token Count', one row per chunk. 'Token Count' is
        the number of whitespace-separated words in the chunk, not a model
        tokenizer count.
    :raises ValueError: If ``method`` is not one of the two supported names.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']
    num_chunks = int(num_chunks)

    # Nothing to split: return an empty table that still has the expected columns.
    if not text or not text.strip():
        return pd.DataFrame(columns=columns)

    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        chunks = text_splitter.split_text(text)[:num_chunks]
    elif method == "MarkdownHeaderTextSplitter":
        headers_to_split_on = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
            ("####", "Header 4"),
            ("#####", "Header 5"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
        # Keep only the text of each split document.
        chunks = [doc.page_content for doc in markdown_splitter.split_text(text)[:num_chunks]]
    else:
        # Fail loudly instead of returning a column-less frame that breaks later.
        raise ValueError(
            f"Unknown chunking method {method!r}; expected "
            "'RecursiveCharacterTextSplitter' or 'MarkdownHeaderTextSplitter'."
        )

    rows = [
        {
            'Chunk #': i,
            'Text Chunk': chunk,
            'Character Count': len(chunk),
            'Token Count': len(chunk.split()),
        }
        for i, chunk in enumerate(chunks)
    ]
    # Passing `columns` keeps the schema stable even when no chunk was produced.
    return pd.DataFrame(rows, columns=columns)

def calculate_embeddings(df):
    """
    Calculates embeddings for each text chunk in the dataframe.

    Uses the module-level ``model`` object (its ``encode`` method is called once
    with the list of all chunk texts) and prints how long the encoding took.

    :param df: DataFrame with a 'Text Chunk' column. It is modified in place:
        an 'Embeddings' column is added to it (or overwritten).
    :return: The same DataFrame object, with an 'Embeddings' column holding one
        list of floats per row. For an empty frame the column is added empty and
        ``model`` is not called.
    """
    if df.empty:
        # Keep the schema consistent so callers can still select 'Embeddings'.
        df['Embeddings'] = pd.Series(dtype=object)
        return df

    t1a = time.perf_counter()

    chunks = df['Text Chunk'].tolist()
    embeddings = model.encode(chunks)
    # One Python list per row; np.asarray also accepts a plain list of vectors.
    df['Embeddings'] = np.asarray(embeddings).tolist()

    t1b = time.perf_counter()
    print (f'Embedding calculation returned in {(t1b-t1a)*1000} ms')

    return df

def search_similar_chunks(query, df_with_embeddings):
    """
    Search for chunks similar to the query embedding.

    The query is encoded with the module-level ``model`` and compared with every
    row's embedding using cosine similarity. The elapsed time is printed.

    :param query: The query text to embed and compare against the chunks.
    :param df_with_embeddings: DataFrame with an 'Embeddings' column holding one
        vector per row. It is not modified; an existing 'Similarity' column is
        replaced in the returned copy, so the same frame (or a previous result)
        can be searched repeatedly.
    :return: A new DataFrame with a 'Similarity' column inserted at position 1,
        sorted by similarity in descending order. An empty input yields an empty
        result with the 'Similarity' column added and ``model`` is not called.
    """
    # `drop` returns a new frame, so the caller's frame is left untouched and a
    # stale 'Similarity' column cannot make `insert` fail on a repeated search.
    result = df_with_embeddings.drop(columns=['Similarity'], errors='ignore')

    if result.empty:
        # Nothing to rank; np.vstack would raise on an empty sequence.
        result.insert(min(1, result.shape[1]), 'Similarity', pd.Series(dtype=float))
        return result

    t1a = time.perf_counter()

    # Compute the query embedding
    query_embedding = model.encode([query])[0]

    # Calculate similarity scores
    chunk_embeddings = np.vstack(result['Embeddings'].tolist())
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # Insert similarity scores into the dataframe after the first column
    result.insert(1, 'Similarity', similarity_scores)

    t1b = time.perf_counter()
    print (f'Similarity search returned in {(t1b-t1a)*1000} ms')
    # Return the dataframe sorted by similarity scores in descending order
    return result.sort_values(by='Similarity', ascending=False)

def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Chunk the text and attach an embedding to every chunk.

    Calls ``tokenize_text`` with the given arguments and passes the resulting
    table to ``calculate_embeddings``, returning whatever that call returns.
    """
    chunk_table = tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    return calculate_embeddings(chunk_table)

def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    """
    Chunk and embed the text, then optionally rank the chunks against a query.

    With a non-empty ``query`` the table comes from ``search_similar_chunks``
    and includes a 'Similarity' column right after 'Chunk #'. Without a query
    the embedded chunk table is returned with the base columns only.
    """
    base_columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']
    embedded = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)

    if not query:
        return embedded[base_columns]

    ranked = search_similar_chunks(query, embedded)
    # Same columns as above, with 'Similarity' placed second.
    return ranked[[base_columns[0], 'Similarity', *base_columns[1:]]]


In [None]:
# Settings per embedding model, keyed by the model name.
# NOTE(review): no cell visible in this notebook reads `model_mappings`;
# confirm whether it is still needed.
model_mappings = {
    'BAAI/bge-small-en-v1.5': {
        'embedding_attr': 'plot_embedding_bge_small',
        'index_name': 'idx_plot_embedding_bge_small',
    },
    'sentence-transformers/all-mpnet-base-v2': {
        'embedding_attr': 'plot_embedding_mpnet_base_v2',
        'index_name': 'idx_plot_embedding_mpnet_base_v2',
    },
    'sentence-transformers/all-MiniLM-L6-v2': {
        'embedding_attr': 'plot_embedding_minilm_l6_v2',
        'index_name': 'idx_plot_embedding_minilm_l6_v2',
    },
    # Disabled entries, kept for reference:
    # 'sentence-transformers/all-MiniLM-L12-v2' : {'embedding_attr' : 'plot_embedding_minilm_l12_v2', 'index_name' : 'idx_plot_embedding_minilm_l12_v2'},
    ## bge-large takes too long and consumes too much memory!
    # 'BAAI/bge-large-en-v1.5' : {'embedding_attr' : 'plot_embedding_bge_large', 'index_name' : 'idx_plot_embedding_bge_large', 'embedding_length' : 1024},
}

In [None]:
# --- Query -------------------------------------------------------------------
query = "SaaS users"
#query = "What is a CodeComponet"

# --- Chunking settings (leave exactly one `method` line active) ---------------
#method = "RecursiveCharacterTextSplitter"
method = "MarkdownHeaderTextSplitter"
chunk_size = 1000
chunk_overlap = 25
num_chunks = 50

# --- Sentence-transformer model used for the embeddings (one active line) -----
#model = SentenceTransformer('all-MiniLM-L6-v2')
model = SentenceTransformer('BAAI/bge-small-en-v1.5')
#model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
#model = SentenceTransformer('nomic-ai/nomic-embed-text-v1',trust_remote_code=True)

# --- Chunk, embed and rank the combined Markdown text -------------------------
text = str(md)
df_with_embeddings = update_output(method, text, chunk_size, chunk_overlap, num_chunks, query)
df_with_embeddings

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding calculation returned in 37312.063538000075 ms
Similarity search returned in 36.42664500000592 ms


Unnamed: 0,Chunk #,Similarity,Text Chunk,Character Count,Token Count,Embeddings
36,36,0.763518,:::caution SaaS users should only use this to ...,608,92,"[0.05116114392876625, -0.04744412377476692, -0..."
44,44,0.636932,A Person describes an individual such as an em...,190,31,"[0.021428724750876427, -0.004950426984578371, ..."
46,46,0.62011,- **profile** [optional]\nOptional profile inf...,799,128,"[-0.01046505942940712, 0.014629196375608444, 0..."
34,34,0.60727,- **owner** (string array) [required]\nAn arra...,472,63,"[-0.030688634142279625, -0.02943725883960724, ..."
38,38,0.603923,- **primaryEmail** (string) [required]\nThe em...,994,150,"[0.0024872722569853067, 0.009096251800656319, ..."
45,45,0.589819,:::tip A key that ends with a question mark is...,779,97,"[-0.015483852475881577, -0.015824923291802406,..."
37,37,0.587447,:::tip A key that ends with a question mark is...,805,96,"[-0.02477002702653408, -0.014937344007194042, ..."
41,41,0.584295,:::tip A key that ends with a question mark is...,909,110,"[-0.07807139307260513, -0.04872863367199898, -..."
30,30,0.58119,- **type** (string) [required]\nThe type of da...,1741,219,"[-0.060526374727487564, -0.04456181824207306, ..."
49,49,0.576347,```yaml\napiVersion: opencontext.com/v1alpha1\...,539,62,"[0.0013910040725022554, -0.07570458948612213, ..."
