In [None]:
!pip install langchain-community
!pip install langchain
%pip install --quiet pandas sentence-transformers scikit-learn numpy



In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth

import os

from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive

In [None]:
def load_secrets(secrets_name, project_id):
  """Read the latest version of a secret from Google Secret Manager.

  Authenticates the Colab user first, then requests the secret payload.

  :param secrets_name: Name of the secret to read.
  :param project_id: Google Cloud project that holds the secret.
  :return: The secret payload decoded as a UTF-8 string.
  """
  auth.authenticate_user()
  # "latest" always resolves to the most recent version of the secret.
  version_path = f"projects/{project_id}/secrets/{secrets_name}/versions/latest"
  sm_client = secretmanager.SecretManagerServiceClient()
  payload = sm_client.access_secret_version(request={"name": version_path}).payload
  return payload.data.decode('UTF-8')

In [None]:
# Google Cloud project whose Secret Manager is queried for the credentials below.
project_id = 'botchagalupep1'
# Fetch the OpenAI key and expose it through the OPENAI_API_KEY environment variable.
openai_api_key = load_secrets("openai_api_key",project_id)
os.environ['OPENAI_API_KEY'] = openai_api_key
# Alternative secret name for the same variable, currently disabled:
#MONGODB_ATLAS_CLUSTER_URI = load_secrets("mdb_uri",project_id)
MONGODB_ATLAS_CLUSTER_URI = load_secrets("MDB_CLUSTER0_URI",project_id)
# NOTE(review): langsmith_api_key is not referenced in the cells visible here - confirm it is still needed.
langsmith_api_key = load_secrets("langsmith_api_key",project_id)
# Leave these disabled: printing would write the secret values into the saved cell output.
#print(langsmith_api_key )
#print(MONGODB_ATLAS_CLUSTER_URI)

In [None]:
# Mount Google Drive at /content/gdrive; the Markdown directory read later lives under this path.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os

def read_markdown_files(directory_path):
    """
    Reads and returns the content of all Markdown files in the given directory path.

    Only regular files directly inside ``directory_path`` whose names end in
    ``.md`` are read; sub-directories are not searched. Files are read in
    sorted file-name order so the result is identical on every run
    (``os.listdir`` on its own returns entries in arbitrary order).

    :param directory_path: Path to the directory whose Markdown files are to be read.
    :return: A list of strings, one per Markdown file, each holding that file's
        full text decoded as UTF-8. The list is empty when the path is not a
        directory or contains no Markdown files.
    """
    # Check if the given path is a directory
    if not os.path.isdir(directory_path):
        print(f"The path {directory_path} is not a valid directory.")
        return []

    # Find all Markdown files in the directory, in a deterministic order
    markdown_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.endswith('.md') and os.path.isfile(os.path.join(directory_path, name))
    )

    markdown_contents = []
    for name in markdown_files:
        with open(os.path.join(directory_path, name), 'r', encoding='utf-8') as md_file:
            markdown_contents.append(md_file.read())

    return markdown_contents




In [None]:
# Example usage: read every Markdown file from the folder on the mounted Drive.
directory_path = '/content/gdrive/MyDrive/GAI/catalog-yaml-format'
markdown_document = read_markdown_files(directory_path)
# Fail here with a clear message rather than with an IndexError on the next line.
if not markdown_document:
    raise FileNotFoundError(f"No Markdown files found in {directory_path}")
md = markdown_document[0]  # only the first file is used by the cells below
# Show a summary and a short preview instead of dumping every file's full text.
print(f"Loaded {len(markdown_document)} Markdown file(s); preview of the first one:")
print(md[:500])



In [None]:
from langchain.document_loaders import TextLoader


In [None]:
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from langchain.text_splitter import MarkdownHeaderTextSplitter

# Split on level-1 and level-2 headings only; level 3 ("###") is intentionally not split on.
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2)]

markdown_splitter = MarkdownHeaderTextSplitter(
    strip_headers=False,
    headers_to_split_on=headers_to_split_on,
)
# `md_header_splits` and `data` are two names for the same list of chunks.
md_header_splits = markdown_splitter.split_text(md)
data = md_header_splits
md_header_splits



In [None]:
from langchain.document_loaders import TextLoader

In [None]:
# Install the `unstructured` package; pip's standard output is discarded.
%pip install unstructured > /dev/null


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import numpy as np

# Initialize the sentence transformer model for embeddings.
# Shared module-level instance: calculate_embeddings and search_similar_chunks
# both call model.encode on it.
model = SentenceTransformer('all-MiniLM-L6-v2')

def tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Tokenizes the input text based on the selected method and provided parameters.

    :param method: "RecursiveCharacterTextSplitter" or "MarkdownHeaderTextSplitter".
    :param text: The text to split into chunks.
    :param chunk_size: Maximum chunk length in characters (recursive splitter only).
    :param chunk_overlap: Characters shared by neighbouring chunks (recursive splitter only).
    :param num_chunks: Maximum number of chunks to keep; converted with ``int()``.
    :return: A DataFrame with the columns 'Chunk #', 'Text Chunk',
        'Character Count' and 'Token Count' (number of whitespace-separated
        words). It has those columns but no rows when ``text`` is blank or
        ``method`` is not one of the two supported names.
    """
    columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count']
    num_chunks = int(num_chunks)

    # Ensure text is provided
    if not text.strip():
        return pd.DataFrame(columns=columns)

    if method == "RecursiveCharacterTextSplitter":
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        chunks = text_splitter.split_text(text)
    elif method == "MarkdownHeaderTextSplitter":
        header_levels = [
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
        markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=header_levels)
        # split_text returns Document objects here; keep only their text so both
        # branches produce plain strings that can be measured and embedded.
        chunks = [doc.page_content for doc in markdown_splitter.split_text(text)]
    else:
        # Unknown method: no chunks, but still return the standard columns.
        chunks = []

    rows = [
        {
            'Chunk #': i,
            'Text Chunk': chunk,
            'Character Count': len(chunk),
            'Token Count': len(chunk.split()),
        }
        for i, chunk in enumerate(chunks[:num_chunks])
    ]
    return pd.DataFrame(rows, columns=columns)

def calculate_embeddings(df, encoder=None):
    """
    Calculates embeddings for each text chunk in the dataframe.

    :param df: DataFrame with a 'Text Chunk' column of strings. It is modified
        in place: an 'Embeddings' column is added (or overwritten).
    :param encoder: Optional object exposing ``encode(list_of_str)`` that
        returns one vector per input string. Defaults to the module-level
        ``model``.
    :return: The same DataFrame, with one embedding (a plain Python list) per
        row. An empty DataFrame is returned unchanged.
    """
    if df.empty:
        return df

    if encoder is None:
        encoder = model

    chunks = df['Text Chunk'].tolist()
    # Encode the whole list, not just chunks[0]: encoding a single string gives
    # one flat vector, which cannot be spread across the rows as per-chunk embeddings.
    embeddings = np.asarray(encoder.encode(chunks))
    # Wrap in a Series so each row receives its own list rather than pandas
    # trying to interpret the nested list as a 2-D block.
    df['Embeddings'] = pd.Series(embeddings.tolist(), index=df.index)
    return df

def search_similar_chunks(query, df_with_embeddings):
    """
    Search for chunks similar to the query embedding.

    :param query: Query string; it is embedded with the module-level ``model``.
    :param df_with_embeddings: DataFrame with an 'Embeddings' column holding one
        vector per row. It is not modified.
    :return: A new DataFrame with a 'Similarity' column (cosine similarity
        between the query and each row's embedding) inserted at position 1,
        sorted by similarity in descending order. An input without rows yields
        an empty result that still carries the 'Similarity' column.
    """
    # Work on a separate frame and discard any earlier 'Similarity' column, so
    # calling this twice on the same data does not raise "already exists".
    result = df_with_embeddings.drop(columns=['Similarity'], errors='ignore')

    if result.empty:
        # Nothing to compare against; np.vstack would raise on an empty column.
        result.insert(min(1, result.shape[1]), 'Similarity', np.empty(0, dtype=float))
        return result

    # Compute the query embedding
    query_embedding = model.encode([query])[0]

    # Calculate similarity scores
    chunk_embeddings = np.vstack(result['Embeddings'])
    similarity_scores = cosine_similarity([query_embedding], chunk_embeddings)[0]

    # Insert similarity scores into the dataframe after 'Chunk #'
    result.insert(1, 'Similarity', similarity_scores)

    # Return the dataframe sorted by similarity scores in descending order
    return result.sort_values(by='Similarity', ascending=False)

def process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks):
    """
    Run tokenize_text on the input and pass the resulting DataFrame to
    calculate_embeddings.

    All arguments are forwarded unchanged to tokenize_text.

    :return: Whatever calculate_embeddings returns for the tokenized DataFrame.
    """
    return calculate_embeddings(
        tokenize_text(method, text, chunk_size, chunk_overlap, num_chunks)
    )

def update_output(method, text, chunk_size, chunk_overlap, num_chunks, query):
    """
    Chunk and embed the text, optionally rank the chunks against a query, and
    return the result with a fixed column order.

    :param method: Splitter name, forwarded to process_and_embed.
    :param text: Text to split, forwarded to process_and_embed.
    :param chunk_size: Forwarded to process_and_embed.
    :param chunk_overlap: Forwarded to process_and_embed.
    :param num_chunks: Forwarded to process_and_embed.
    :param query: Optional query string. When truthy and there is at least one
        chunk, the chunks are ranked with search_similar_chunks and a
        'Similarity' column is included after 'Chunk #'.
    :return: DataFrame with the columns 'Chunk #', ['Similarity',] 'Text Chunk',
        'Character Count', 'Token Count', 'Embeddings'. A column that the
        earlier steps did not produce is present but filled with NaN.
    """
    base_columns = ['Chunk #', 'Text Chunk', 'Character Count', 'Token Count', 'Embeddings']
    df_with_embeddings = process_and_embed(method, text, chunk_size, chunk_overlap, num_chunks)

    # Skip ranking when there are no chunks: there is nothing to compare the query with.
    if query and not df_with_embeddings.empty:
        df_with_embeddings = search_similar_chunks(query, df_with_embeddings)
        # Update the headers to reflect the new column order after similarity search
        ranked_columns = [base_columns[0], 'Similarity'] + base_columns[1:]
        return df_with_embeddings.reindex(columns=ranked_columns)

    # reindex (rather than df[[...]]) tolerates a missing column, e.g. no
    # 'Embeddings' when the input text was blank, instead of raising KeyError.
    return df_with_embeddings.reindex(columns=base_columns)


In [None]:
from builtins import len

# Print the first 20 chunks with their sizes.
for i, chunk in enumerate(data[:20]):
    chunk_text = chunk.page_content
    print("Chunk Number:", i, "\n")
    print("Page Content:", chunk_text, "\n")
    # Measure the chunk's own text. str(chunk) is the Document's string form,
    # which wraps the text in extra characters and so inflates both counts.
    print("Character Count:", len(chunk_text), "\n")
    print("Token Count:", len(chunk_text.split()), "\n")

Chunk Number: 0 

Page Content: ---
sidebar_position: 3
--- 

Character Count: 44 

Token Count: 2 

Chunk Number: 1 

Page Content: # Entity Reference  
Entities commonly have a need to reference other entities. For example, a CodeComponent entity may want to declare who its owner is by mentioning a Team or Person entity, and a Person entity may want to declare what Team entities it is a member of. This describes how to write those references in your yaml entity declaration files.  
Each entity in OpenContext is uniquely identified by the triplet of its [kind, namespace, and name](common). But that's a lot to type out manually, and in a lot of circumstances, both the kind and the namespace are fixed, or possible to deduce, or could have sane default values.  
Each reference can be expressed in one of two ways: as a compact string, or as a compound reference structure. 

Character Count: 823 

Token Count: 128 

Chunk Number: 2 

Page Content: ## **String References**  
This is the mos

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter



In [None]:
# Splitter to use: "RecursiveCharacterTextSplitter" or "MarkdownHeaderTextSplitter".
method = "MarkdownHeaderTextSplitter"
chunk_size = 1000
chunk_overlap = 25
num_chunks = 20
# Rebuild plain Markdown from the chunk contents. str(data) would instead give
# the Python repr of a list of Document objects, which is not document text.
text = "\n\n".join(chunk.page_content for chunk in data)
query = "What is a CodeComponent"

update_output(method, text, chunk_size, chunk_overlap, num_chunks, query)

 **** got here 2 *******
 **** got here 2 *******


TypeError: 'Document' object is not subscriptable