In [47]:
from dotenv import load_dotenv
load_dotenv()

True

# Development

## Ingestion

In [48]:
import os
import json
import pdf2md
from langchain_text_splitters import MarkdownHeaderTextSplitter
from tqdm.auto import tqdm

### Load PDF

In [6]:
pdf_path = "pdfs/Mamba.pdf"
markdown_text = pdf2md.to_markdown(pdf_path)

In [14]:
# Split the document based on markdown headers
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
md_header_splits = markdown_splitter.split_text(markdown_text)
md_header_splits

[Document(page_content='# Mamba: Linear-Time Sequence Modeling with Selective State Spaces  \n### Albert Gu *\n1 and Tri Dao * 2  \n1 Machine Learning Department, Carnegie Mellon University\n2 Department of Computer Science, Princeton University\n`agu@cs.cmu.edu` , `tri@tridao.me`  \nAbstract  \nFoundation models, now powering most of the exciting applications in deep learning, are almost universally\nbased on the Transformer architecture and its core attention module. Many subquadratic-time architectures\nsuch as linear attention, gated convolution and recurrent models, and structured state space models (SSMs)\nhave been developed to address Transformers’ computational ineﬃciency on long sequences, but they have not\nperformed as well as attention on important modalities such as language. We identify that a key weakness of\nsuch models is their inability to perform content-based reasoning, and make several improvements. First, simply\nletting the SSM parameters be functions of the inp

### Embed PDF Nodes

In [15]:
from openai import OpenAI

oai_client = OpenAI()

In [None]:
# Sample Embedding
response = oai_client.embeddings.create(
    input="This is a sample text",
    model="text-embedding-3-small"
)

response.data[0].embedding

In [22]:
chunks = []

for i, chunk in tqdm(enumerate(md_header_splits), total=len(md_header_splits)):
    response = oai_client.embeddings.create(
        input=chunk.page_content,
        model="text-embedding-3-small"
    )

    _chunk = {
        "id": i,
        "embedding": response.data[0].embedding,
        "metadata": {
            "text": chunk.page_content
        }
    }
    chunks.append(_chunk)

# Note:
# This is a simple example of how to index the nodes using openai embeddings one by one.
# In practice, you should batch the embeddings and index them in bulk and not use a for loop.

100%|██████████| 40/40 [00:16<00:00,  2.47it/s]


In [None]:
chunks[0]

### Index the Embeddings to a Vector Index

In [24]:
from upstash_vector import Index, Vector

index = Index.from_env()

In [25]:
# Convert the chunks to vector objects
vectors = []

for chunk in chunks:
    vector = Vector(
        id=chunk['id'],
        vector=chunk['embedding'],
        metadata=chunk['metadata']
    )
    vectors.append(vector)

In [26]:
# Upsert the vectors to the index
index.upsert(vectors)

'Success'

## Retrieval

In [30]:
query = "What is the Jamba language model?"
TOP_K = 3

In [31]:
# Get the embeddings for the query
query_vector = oai_client.embeddings.create(
    input=query,
    model="text-embedding-3-small"
).data[0].embedding

In [32]:
# Execute the query
query_result = index.query(
    vector=query_vector,
    include_metadata=True,
    include_vectors=False,
    top_k=TOP_K
)

In [33]:
for result in query_result:
    print("Score:", result.score)
    print("ID:", result.id)
    print("Metadata:", result.metadata)
    print()

Score: 0.7320111
ID: 38
Metadata: {'text': '### E.4 ### Audio Details  \n**E.4.1** **YouTubeMix Audio Pretraining**  \n**Model.**\nWe use a model with 3 blocks per stage ( 3 × 5 = 15 total Mamba blocks), pooling factor 푝= 16 , and\nouter dimension 퐷= 64 , for about 3.5M parameters.  \n**Dataset.** The data is mu-law encoded at 8 bits, so the model is modeling discrete tokens with a vocab size of\n256 .  \nThe dataset consists of clips of up to 1 minute long, or length 960000 , which is subsampled and divided into\nsegments of any desired sequence length. Since the architecture involves two stages of pooling by a factor of 16 ,  \n34  \n-----  \nTable 14: YouTubeMix length scaling sequence lengths and batch sizes.  \nSequence length Batch size Tokens / batch  \nAudio Waveforms - SSM Parameterization  \nAudio Waveforms - SSM Parameterization  \n468 × 2048 = 958464 1 958464\n234 × 2048 = 479232 2 958464\n117 × 2048 = 239616 4 958464\n59 × 2048 = 120832 8 966656\n30 × 2048 = 61440 16 98304

In [34]:
def build_result_str(metadata):
    '''
    Build a string from the metadata dictionary of a query result for adding to the context of the LLM.
    '''
    text = metadata['text']
    _meta = {
        k: v for k, v in metadata.items() if k != 'text'
    }
    
    meta_str = "\n".join([f"{k}: {v}" for k, v in _meta.items()])
    return f"{meta_str}\n\n{text}"

In [35]:
print(build_result_str(query_result[1].metadata))



# Mamba: Linear-Time Sequence Modeling with Selective State Spaces  
### Albert Gu *
1 and Tri Dao * 2  
1 Machine Learning Department, Carnegie Mellon University
2 Department of Computer Science, Princeton University
`agu@cs.cmu.edu` , `tri@tridao.me`  
Abstract  
Foundation models, now powering most of the exciting applications in deep learning, are almost universally
based on the Transformer architecture and its core attention module. Many subquadratic-time architectures
such as linear attention, gated convolution and recurrent models, and structured state space models (SSMs)
have been developed to address Transformers’ computational ineﬃciency on long sequences, but they have not
performed as well as attention on important modalities such as language. We identify that a key weakness of
such models is their inability to perform content-based reasoning, and make several improvements. First, simply
letting the SSM parameters be functions of the input addresses their weakness with di

In [36]:
context_prompt = """Retrieved context to answer the query is as follows:
{context_str}
"""

def build_context_prompt(retrieval_results):
    context_str = "\n\n---------------------\n\n".join([build_result_str(r.metadata) for r in retrieval_results])
    return context_prompt.format(
        context_str=context_str
    )

In [37]:
print(build_context_prompt(query_result))

Retrieved context to answer the query is as follows:


### E.4 ### Audio Details  
**E.4.1** **YouTubeMix Audio Pretraining**  
**Model.**
We use a model with 3 blocks per stage ( 3 × 5 = 15 total Mamba blocks), pooling factor 푝= 16 , and
outer dimension 퐷= 64 , for about 3.5M parameters.  
**Dataset.** The data is mu-law encoded at 8 bits, so the model is modeling discrete tokens with a vocab size of
256 .  
The dataset consists of clips of up to 1 minute long, or length 960000 , which is subsampled and divided into
segments of any desired sequence length. Since the architecture involves two stages of pooling by a factor of 16 ,  
34  
-----  
Table 14: YouTubeMix length scaling sequence lengths and batch sizes.  
Sequence length Batch size Tokens / batch  
Audio Waveforms - SSM Parameterization  
Audio Waveforms - SSM Parameterization  
468 × 2048 = 958464 1 958464
234 × 2048 = 479232 2 958464
117 × 2048 = 239616 4 958464
59 × 2048 = 120832 8 966656
30 × 2048 = 61440 16 983040
15 × 2

In [38]:
def context_retrieval(search_query: str) -> str:
    '''
    This function let's you semantically retrieve relevant context chunks from a given document based on a query.

    Arguments:
        query (str): The query to search for in the document. Based on the original user query, write a good search query
                     which is more logically sound to retrieve the relevant information from the document.

    Returns:
        str: The retrieved context chunks from the document based on the search query formatted as a string.
    '''
    # Get the embeddings for the search query
    query_vector = oai_client.embeddings.create(
        input=search_query,
        model="text-embedding-3-small"
    ).data[0].embedding

    # Execute the query
    query_result = index.query(
        vector=query_vector,
        include_metadata=True,
        include_vectors=False,
        top_k=3
    )

    return build_context_prompt(query_result)

In [39]:
print(context_retrieval("on what hardware was mamba trained on?"))

Retrieved context to answer the query is as follows:


### E.5 ### Efciency Benchmark  
**Scan Operation.** We compare the core operation of selective SSMs, which is the parallel scan (Section 3.3 ),
against convolution and attention, measured on an A100 80GB PCIe GPU. Note that these do not include the cost
of other operations outside of this core operation, such as computing the convolutional kernel in global-convolution
models, or computing the QKV projections in attention.  
As a baseline, we implement a standard parallel scan in PyTorch with no kernel fusion. This requires materializing
the parameters **_A_** , **_B_** , **_C_** in HBM.  
Our scan implementation fuses the discretization step and the parallel scan, avoiding the cost of materializing all
the large parameters in HBM.  
For convolution, we use the standard implementation in PyTorch, which separately performs FFTs on the inputs
and the ﬁlters, multiply them in frequency domain, then performs an inverse FFT to obtain th

## Response Generation

In [40]:
system_prompt = """You are a Q&A bot. You are here to answer questions based on the context retrieved
from a vector index of the chunks of a document. You are prohibited from using prior knowledge and you
can only use the context given. If you need more information, please ask the user. If you cannot answer 
the question from the context, you can tell the user that you cannot answer the question. You can also 
ask for more information from the user."""

system_message = {
    'role': 'system',
    'content': system_prompt
}

In [41]:
# A mapping of the tool name to the function that should be called
available_functions = {
    "context_retrieval": context_retrieval,
}

# Here we have only one function, but you can have multiple as well

In [42]:
# Define a JSON schema for the tools that the LLM can use
# Here we define a schema for the context_retrieval function
tools_schema = [
    {
        "type": "function",
        "function": {
            "name": "context_retrieval",
            "description": "This function let's you semantically retrieve relevant context chunks from a given document based on a query. Based on the original user query, write a good search query which is more logically sound to retrieve the relevant information from the document. You might even have to break down the user query into multiple search queries and call this function multiple times separately. This function finally returns the retrieved context chunks from the document based on the search query formatted as a string.",
            "parameters": {
                "type": "object",
                "properties": {
                    "search_query": {
                        "type": "string",
                        "description": "The sub-query to search for in the document."
                    }
                },
                "required": ["search_query"],
            },
        },
    }
]

In [43]:
def conversation_turn(user_message, messages, tools, model='gpt-3.5-turbo', temperature=0.2, max_tokens=512, verbose=True, **kwargs):

    # Add user message to messages list
    messages.append({
        'role': 'user',
        'content': user_message
    })

    if verbose:
        print("\n<< User Message >>")
        print(user_message)
    
    # Send the conversation and available tools/functions to the model
    response = oai_client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools,
        tool_choice="auto",  # auto is default, but we'll be explicit
        **kwargs
    )
    response_message = response.choices[0].message
    tool_calls = response_message.tool_calls

    # Add the response to the messages list
    messages.append(response_message)

    # Check if the model wanted to call a function
    if tool_calls:

        # Call each of the functions
        for tool_call in tool_calls:
            function_name = tool_call.function.name
            function_to_call = available_functions[function_name]
            function_args = json.loads(tool_call.function.arguments)

            if verbose:
                print(f"\n<< Calling Function `{function_name}` with Args: {function_args} >>")

            # Call the function
            function_response = function_to_call(**function_args)

            # if verbose:
            #     print("<< Function Response >>")
            #     print(function_response)

            # Add the function response to the messages list
            messages.append(
                {
                    "tool_call_id": tool_call.id,
                    "role": "tool",
                    "name": function_name,
                    "content": function_response,
                }
            )

        # Get a new response from the model based on the function response
        second_response = oai_client.chat.completions.create(
            model=model,
            messages=messages,
            **kwargs
        )
        second_response_message = second_response.choices[0].message
        messages.append(second_response_message)

        if verbose:
            print("\n<< Response >>")
            print(second_response_message.content)

        return second_response_message, messages

    if verbose:
        print("\n<< Response >>")
        print(response_message.content)

    return response_message, messages

In [44]:
messages = [
    system_message
]

In [45]:
response, messages = conversation_turn(
    "hardware of jamba and what is the conclusion?",
    messages,
    tools_schema
)


<< User Message >>
hardware of jamba and what is the conclusion?



<< Calling Function `context_retrieval` with Args: {'search_query': 'hardware of jamba'} >>

<< Response >>
The document provides information about benchmarking the speed and memory of a Selective State Space Model (SSM) called Mamba. The hardware used for the benchmarks included an A100 80GB PCIe GPU. The speed benchmark compared the efficiency of the SSM scan operation with convolution and attention implementations. It mentions that the Mamba SSM scan was faster than the best attention implementation known (FlashAttention-2) beyond a sequence length of 2K. Additionally, the end-to-end inference throughput of Mamba was measured against a standard Transformer (GPT3 architecture) at different sizes.

However, the document does not explicitly provide a conclusion about the hardware of the Mamba system.


In [46]:
response, messages = conversation_turn(
    "I was talking about JAMBA with a J, anything on that you have?",
    messages,
    tools_schema
)


<< User Message >>
I was talking about JAMBA with a J, anything on that you have?

<< Calling Function `context_retrieval` with Args: {'search_query': 'hardware of jamba'} >>

<< Response >>
The information retrieved from the document does not contain details about any hardware related to Jamba specifically. The document primarily discusses the benchmarks, efficiency, speed, and memory of a Selective State Space Model (SSM) called Mamba. It compares the performance of Mamba against attention and convolution implementations on an A100 80GB PCIe GPU. Unfortunately, there is no mention of hardware related to Jamba in the provided context.
