In [1]:
from dotenv import load_dotenv
load_dotenv()

True

# Development

## Ingestion

In [1]:
import os
import json
import fitz
import pdf2md
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Load PDF

In [2]:
# Convert the source PDF to markdown text. The document handle is only needed
# for the conversion, so it is opened in a `with` block and closed right after
# instead of staying open for the rest of the kernel session.
pdf_path = "pdfs/BC_via_Search_VPT.pdf"
with fitz.open(pdf_path) as pdf_doc:
    markdown_text = pdf2md.to_markdown(pdf_doc)

In [3]:
# First pass: cut the markdown wherever a level 1-3 heading starts.
# `strip_headers=False` keeps the heading line inside the text of each section.
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in range(1, 4)]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)
md_header_splits = markdown_splitter.split_text(markdown_text)
len(md_header_splits)

10

In [4]:
# Second pass: split each header section into token-bounded chunks.
# Sizes are counted in tokens of the `cl100k_base` tiktoken encoding.
chunk_size, chunk_overlap = 300, 30
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name='cl100k_base',
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)

# Apply the splitter to the header-level documents and show how many chunks result
splits = text_splitter.split_documents(md_header_splits)
len(splits)

20

In [35]:
splits[0]

Document(page_content='## Behavioral Cloning via Search in Video PreTraining Latent Space  \n### Amogh Raut\nIndian Institute of Technology BHU', metadata={'Header 2': 'Behavioral Cloning via Search in Video PreTraining Latent Space', 'Header 3': 'Amogh Raut'})

### Embed PDF Nodes

In [5]:
from openai import OpenAI

oai_client = OpenAI()

In [6]:
# Sample Embedding
response = oai_client.embeddings.create(
    input="This is a sample text",
    model="text-embedding-3-small"
)

response.data[0].embedding

[0.035431310534477234,
 0.01364643219858408,
 0.0161751639097929,
 0.010260256938636303,
 -0.035925429314374924,
 -0.05821896344423294,
 -0.011989676393568516,
 -0.013762695714831352,
 0.01108136773109436,
 -0.0038039973005652428,
 0.028978684917092323,
 -0.006681519560515881,
 -0.037698451429605484,
 -0.027002204209566116,
 0.06196846067905426,
 0.06115461885929108,
 -0.06574703007936478,
 0.010267523117363453,
 -0.01083430740982294,
 0.042784977704286575,
 0.014198684133589268,
 0.005289990454912186,
 0.02123262733221054,
 0.0278015173971653,
 0.029298409819602966,
 -0.011837081052362919,
 -0.012963383458554745,
 -0.004875801969319582,
 0.06545636802911758,
 -0.034849993884563446,
 0.027409126982092857,
 -0.04107009246945381,
 0.00948274414986372,
 -0.029385607689619064,
 -0.014772734604775906,
 0.06714218854904175,
 0.050196778029203415,
 0.010122193954885006,
 -0.004181853961199522,
 -0.03572196885943413,
 0.027394594624638557,
 -0.029400140047073364,
 0.045052118599414825,
 0.0246

In [7]:
# Embed each split with its own API request and keep the vector next to the
# chunk text; a chunk's id is its position in `splits`.
#
# Note:
# Embedding one chunk per request keeps this example simple. In practice,
# batch the embeddings and index them in bulk rather than looping like this.
chunks = []

for i, chunk in enumerate(tqdm(splits)):
    response = oai_client.embeddings.create(
        model="text-embedding-3-small",
        input=chunk.page_content,
    )
    chunks.append({
        "id": i,
        "embedding": response.data[0].embedding,
        "metadata": {"text": chunk.page_content},
    })

100%|██████████| 20/20 [00:07<00:00,  2.74it/s]


### Index the Embeddings to a Vector Index

In [8]:
from upstash_vector import Index, Vector

index = Index.from_env()

In [37]:
# Derive a document id from the PDF file name. `splitext` removes only the final
# extension, so a name containing extra dots (e.g. "paper.v2.pdf") is not cut
# off at its first dot.
doc_id = os.path.splitext(os.path.basename(pdf_path))[0]

# Convert the chunks to vector objects. Ids take the form "<doc_id>_<chunk id>",
# and `doc_id` is added to a copy of each chunk's metadata so that `chunks`
# is not mutated by this cell.
vectors = [
    Vector(
        id=f"{doc_id}_{chunk['id']}",
        vector=chunk["embedding"],
        metadata={**chunk["metadata"], "doc_id": doc_id},
    )
    for chunk in chunks
]

In [39]:
vectors[0]

Vector(id='BC_via_Search_VPT_0', vector=[0.008760029450058937, 0.017546890303492546, 0.03463766723871231, 0.011181446723639965, 0.0016165139386430383, -0.041398853063583374, 0.05221138894557953, 0.039789047092199326, -0.011946105398237705, 0.037508487701416016, 0.032732728868722916, -0.0927785113453865, -0.01921035721898079, 0.00027542782481759787, 0.01270405575633049, -0.0275813490152359, 0.02953994832932949, -0.009712498635053635, -0.0043095857836306095, 0.04628193378448486, 0.01844569854438305, -0.017761530354619026, 0.016124894842505455, 0.005946222227066755, 0.01856643334031105, -0.062031205743551254, 0.021557990461587906, -0.004789174068719149, 0.0023140967823565006, 0.023194627836346626, 0.00469862250611186, -0.040754932910203934, -0.00792829692363739, -0.00943749025464058, -0.004940093494951725, 0.014354106970131397, -0.012073547579348087, -0.010872900485992432, -0.03871584311127663, 0.033805932849645615, -0.010658259503543377, -0.08816373348236084, 0.023100722581148148, 0.0346

In [40]:
# Upsert the vectors to the index
index.upsert(vectors)

'Success'

## Retrieval

In [41]:
query = "What is the VPT model"
TOP_K = 3

In [42]:
# Get the embeddings for the query
query_vector = oai_client.embeddings.create(
    input=query,
    model="text-embedding-3-small"
).data[0].embedding

In [43]:
# Execute the query
query_result = index.query(
    vector=query_vector,
    include_metadata=True,
    include_vectors=False,
    top_k=TOP_K
)

In [44]:
# Show score, id and metadata of every match, followed by a blank separator line
for hit in query_result:
    print(
        f"Score: {hit.score}",
        f"ID: {hit.id}",
        f"Metadata: {hit.metadata}",
        "",
        sep="\n",
    )

Score: 0.7776315
ID: BC_via_Search_VPT_9
Metadata: {'text': '**VPT**  \n**current frame embedding**  \n**1x1024**  \n|MLP with 8641-\u200b dim output (combinations of keyboard actions)|MLP with 121-\u200b dim output (11x11 clusters of mouse actions)|\n|---|---|\n|probabilistic prediction||  \n**loss** **loss**  \n**one hot**  \n**one hot**  \n**encoded label**  \n**encoded label**  \nFigure 2: VPT architecture.  \ng p\ndataset of _situation_ points. Once the reference _situation_ has\nbeen selected, we copy its corresponding actions. After each\ntime-step we update the current and reference _situations_ , by\nupdating the queue of embedding vectors of images for the\ncurrent _situation_ , while shifting to the next time-step in the\nrecorded trajectory from the dataset for the reference _situa-_\n_tion_ . To assess the similarity, we compute the L1 distance\nbetween the current _situation_ and the reference _situation_ .\nIn most cases, the reference and the current _situations_ will\n

In [45]:
def build_result_str(metadata):
    '''
    Render the metadata dictionary of one query result as text for the LLM context.

    Every entry other than 'text' becomes a "key: value" line; the chunk text
    stored under 'text' follows after a blank line.
    '''
    body = metadata['text']

    header_lines = []
    for key, value in metadata.items():
        if key == 'text':
            continue
        header_lines.append(f"{key}: {value}")

    header = "\n".join(header_lines)
    return f"{header}\n\n{body}"

In [46]:
print(build_result_str(query_result[0].metadata))

doc_id: BC_via_Search_VPT

**VPT**  
**current frame embedding**  
**1x1024**  
|MLP with 8641-​ dim output (combinations of keyboard actions)|MLP with 121-​ dim output (11x11 clusters of mouse actions)|
|---|---|
|probabilistic prediction||  
**loss** **loss**  
**one hot**  
**one hot**  
**encoded label**  
**encoded label**  
Figure 2: VPT architecture.  
g p
dataset of _situation_ points. Once the reference _situation_ has
been selected, we copy its corresponding actions. After each
time-step we update the current and reference _situations_ , by
updating the queue of embedding vectors of images for the
current _situation_ , while shifting to the next time-step in the
recorded trajectory from the dataset for the reference _situa-_
_tion_ . To assess the similarity, we compute the L1 distance
between the current _situation_ and the reference _situation_ .
In most cases, the reference and the current _situations_ will
evolve differently over time, thus, their L1 distance will di-
ver

In [47]:
# Template that wraps the rendered retrieval results before they go to the LLM
context_prompt = """Retrieved context to answer the query is as follows:
{context_str}
"""

def build_context_prompt(retrieval_results):
    '''
    Format a sequence of query results as a single context prompt.

    Each result's metadata is rendered with `build_result_str`; the rendered
    blocks are joined with a dashed divider and inserted into `context_prompt`.
    '''
    divider = "\n\n---------------------\n\n"
    rendered = (build_result_str(hit.metadata) for hit in retrieval_results)
    return context_prompt.format(context_str=divider.join(rendered))

In [48]:
print(build_context_prompt(query_result))

Retrieved context to answer the query is as follows:
doc_id: BC_via_Search_VPT

**VPT**  
**current frame embedding**  
**1x1024**  
|MLP with 8641-​ dim output (combinations of keyboard actions)|MLP with 121-​ dim output (11x11 clusters of mouse actions)|
|---|---|
|probabilistic prediction||  
**loss** **loss**  
**one hot**  
**one hot**  
**encoded label**  
**encoded label**  
Figure 2: VPT architecture.  
g p
dataset of _situation_ points. Once the reference _situation_ has
been selected, we copy its corresponding actions. After each
time-step we update the current and reference _situations_ , by
updating the queue of embedding vectors of images for the
current _situation_ , while shifting to the next time-step in the
recorded trajectory from the dataset for the reference _situa-_
_tion_ . To assess the similarity, we compute the L1 distance
between the current _situation_ and the reference _situation_ .
In most cases, the reference and the current _situations_ will
evolve differ

In [49]:
def context_retrieval(search_query: str, top_k: int = 3) -> str:
    '''
    This function lets you semantically retrieve relevant context chunks from a given document based on a query.

    Arguments:
        search_query (str): The query to search for in the document. Based on the original user query, write a
                            good search query which is more logically sound to retrieve the relevant information
                            from the document.
        top_k (int): Maximum number of chunks to request from the vector index. Defaults to 3.

    Returns:
        str: The retrieved context chunks from the document based on the search query formatted as a string.
    '''
    # Get the embeddings for the search query
    query_vector = oai_client.embeddings.create(
        input=search_query,
        model="text-embedding-3-small"
    ).data[0].embedding

    # Execute the query; vectors are not requested because only the metadata
    # (chunk text and document id) is used to build the context prompt
    query_result = index.query(
        vector=query_vector,
        include_metadata=True,
        include_vectors=False,
        top_k=top_k
    )

    return build_context_prompt(query_result)

In [51]:
print(context_retrieval(query))

Retrieved context to answer the query is as follows:
doc_id: BC_via_Search_VPT

**VPT**  
**current frame embedding**  
**1x1024**  
|MLP with 8641-​ dim output (combinations of keyboard actions)|MLP with 121-​ dim output (11x11 clusters of mouse actions)|
|---|---|
|probabilistic prediction||  
**loss** **loss**  
**one hot**  
**one hot**  
**encoded label**  
**encoded label**  
Figure 2: VPT architecture.  
g p
dataset of _situation_ points. Once the reference _situation_ has
been selected, we copy its corresponding actions. After each
time-step we update the current and reference _situations_ , by
updating the queue of embedding vectors of images for the
current _situation_ , while shifting to the next time-step in the
recorded trajectory from the dataset for the reference _situa-_
_tion_ . To assess the similarity, we compute the L1 distance
between the current _situation_ and the reference _situation_ .
In most cases, the reference and the current _situations_ will
evolve differ

## Response Generation

In [52]:
system_prompt = """You are a Q&A bot. You are here to answer questions based on the context retrieved
from a vector index of the chunks of a document. You are prohibited from using prior knowledge and you
can only use the context given. If you need more information, please ask the user. If you cannot answer 
the question from the context, you can tell the user that you cannot answer the question. You can also 
ask for more information from the user."""

system_message = {
    'role': 'system',
    'content': system_prompt
}

In [53]:
# A mapping of the tool name to the function that should be called
available_functions = {
    "context_retrieval": context_retrieval,
}

# Here we have only one function, but you can have multiple as well

In [54]:
# Define a JSON schema for the tools that the LLM can use
# Here we define a schema for the context_retrieval function
# The "name" below is the key looked up in `available_functions`, and the property
# name `search_query` must match the keyword argument of `context_retrieval`,
# because the parsed tool-call arguments are passed to it as keyword arguments.
tools_schema = [
    {
        "type": "function",
        "function": {
            "name": "context_retrieval",
            "description": "This function let's you semantically retrieve relevant context chunks from a given document based on a query. Based on the original user query, write a good search query which is more logically sound to retrieve the relevant information from the document. You might even have to break down the user query into multiple search queries and call this function multiple times separately. This function finally returns the retrieved context chunks from the document based on the search query formatted as a string.",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_query": {
                        "type": "string",
                        "description": "The sub-query to search for in the document."
                    }
                },
                # The model must always supply a search query
                "required": ["search_query"],
            },
        },
    }
]

In [55]:
def conversation_turn(user_message, messages, tools, model='gpt-3.5-turbo', temperature=0.2, max_tokens=512, verbose=True, **kwargs):
    '''
    Run one user turn of the chat, executing any tool calls the model requests.

    The user message is appended to `messages` and the model is queried with the
    given tools. If the model requests tool calls, each one is executed through
    `available_functions`, its result is appended as a tool message, and the
    model is queried a second time (without tools) for the final answer.

    Arguments:
        user_message (str): The text of the user's message.
        messages (list): Conversation history; it is extended in place.
        tools (list): Tool schemas sent with the first request.
        model (str): Name of the chat model.
        temperature (float): Sampling temperature sent with every request.
        max_tokens (int): Completion token limit sent with every request.
        verbose (bool): Whether to print the user message, tool calls and response.
        **kwargs: Extra keyword arguments forwarded to both requests.

    Returns:
        tuple: The final assistant message and the updated `messages` list.
    '''
    # Add user message to messages list
    messages.append({
        'role': 'user',
        'content': user_message
    })

    if verbose:
        print("\n<< User Message >>")
        print(user_message)

    # Options shared by both requests. `temperature` and `max_tokens` are part of
    # the signature, so they must be forwarded explicitly to have any effect.
    request_options = {
        'model': model,
        'temperature': temperature,
        'max_tokens': max_tokens,
        **kwargs,
    }

    # Send the conversation and available tools/functions to the model
    response = oai_client.chat.completions.create(
        messages=messages,
        tools=tools,
        tool_choice="auto",  # auto is default, but we'll be explicit
        **request_options
    )
    response_message = response.choices[0].message
    tool_calls = response_message.tool_calls

    # Add the response to the messages list
    messages.append(response_message)

    # No tool call requested: the first response is already the answer
    if not tool_calls:
        if verbose:
            print("\n<< Response >>")
            print(response_message.content)

        return response_message, messages

    # Answer every requested tool call; each call gets exactly one tool message,
    # even when it fails, so the follow-up request sees a reply for each call id
    for tool_call in tool_calls:
        messages.append(
            {
                "tool_call_id": tool_call.id,
                "role": "tool",
                "name": tool_call.function.name,
                "content": _run_tool_call(tool_call, verbose),
            }
        )

    # Get a new response from the model based on the function response
    second_response = oai_client.chat.completions.create(
        messages=messages,
        **request_options
    )
    second_response_message = second_response.choices[0].message
    messages.append(second_response_message)

    if verbose:
        print("\n<< Response >>")
        print(second_response_message.content)

    return second_response_message, messages


def _run_tool_call(tool_call, verbose):
    '''Execute one requested tool call and return the text to send back to the model.'''
    function_name = tool_call.function.name
    function_to_call = available_functions.get(function_name)

    # The model can name a function that was never registered; report that back
    # to it instead of raising a KeyError in the middle of the turn
    if function_to_call is None:
        error = f"Error: unknown function `{function_name}`."
        if verbose:
            print(f"\n<< {error} >>")
        return error

    # The arguments arrive as model-generated JSON text and may be malformed
    try:
        function_args = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError as exc:
        function_args = None
        detail = f"invalid JSON ({exc})"
    else:
        detail = "expected a JSON object"

    if not isinstance(function_args, dict):
        error = f"Error: bad arguments for `{function_name}`: {detail}."
        if verbose:
            print(f"\n<< {error} >>")
        return error

    if verbose:
        print(f"\n<< Calling Function `{function_name}` with Args: {function_args} >>")

    # Call the function
    return function_to_call(**function_args)

In [64]:
messages = [
    system_message
]

In [65]:
response, messages = conversation_turn(
    "how does the BC via search work here?",
    messages,
    tools_schema
)


<< User Message >>
how does the BC via search work here?

<< Calling Function `context_retrieval` with Args: {'search_query': 'BC via search'} >>

<< Response >>
In the context provided, the search-based behavioral cloning (BC) aims to replicate an expert's behavior accurately by copying solutions from previous experiences. By utilizing a feature divergence criterion, the agent can quickly recognize when the copied actions are not feasible due to physical constraints, prompting a new search to address the issue. This method involves predicting discrete actions and mouse control based on latent space features, ensuring fidelity in reproducing the expert's behaviors.


In [68]:
response, messages = conversation_turn(
    "explain how the VPT model is used here",
    messages,
    tools_schema
)


<< User Message >>
explain how the VPT model is used here

<< Calling Function `context_retrieval` with Args: {'search_query': 'VPT model'} >>

<< Response >>
The Video PreTraining (VPT) model is utilized in the process discussed to encode a "situation" into a latent space. The model employs the IMPALA convolutional neural network (CNN) as the backbone for encoding individual images, resulting in a 1024-dimensional vector for each image. These vectors are then processed through four transformer blocks, with a memory stack storing the last 128 embeddings for each transformer block. The output of the last transformer block consists of 129 embedding vectors, each 1024-dimensional. The architecture discards 128 output embedding vectors of the last transformer block and focuses on the current frame's embedding vector.

In this approach, two Multi-Layer Perceptron (MLP) output heads utilize the current frame's embedding vector to predict actions. The first output head predicts discrete acti