In [1]:
import dotenv
%load_ext dotenv
%dotenv

In [2]:
import nest_asyncio
nest_asyncio.apply()

In [3]:
from llama_index.core import SimpleDirectoryReader

# load lora_paper.pdf documents
documents = SimpleDirectoryReader(input_files=["./datasets/lora_paper.pdf"]).load_data()

from llama_index.core.node_parser import SentenceSplitter

# chunk_size of 1024 is a good default value
splitter = SentenceSplitter(chunk_size=1024)
# Create nodes from documents
nodes = splitter.get_nodes_from_documents(documents)

In [4]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# LLM model
Settings.llm = OpenAI(model="gpt-3.5-turbo")
# embedding model
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

In [6]:
from llama_index.core import SummaryIndex, VectorStoreIndex

# summary index
summary_index = SummaryIndex(nodes)
# vector store index
vector_index = VectorStoreIndex(nodes)

# summary query engine
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True,
)

# vector query engine
vector_query_engine = vector_index.as_query_engine()

In [7]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

In [8]:
from llama_index.core.tools import QueryEngineTool

In [10]:
summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=(
        "Useful for summarization questions related to the LoRA paper."
    ),
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific context from the the LoRA paper."
    ),
)

#### Agent Worker

In [12]:
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner


agent_worker = FunctionCallingAgentWorker.from_tools(
    tools=[summary_tool, vector_tool],
    llm=llm,
    verbose=True,
)

agent = AgentRunner(agent_worker)

In [13]:
response = agent.query(
    "Explain to me what is the LoRA and why it's being used. Are esiting solutions not enough?",
)

print(str(response))

Added user message to memory: Explain to me what is the LoRA and why it's being used. Are esiting solutions not enough?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input": "LoRA and its use cases"}
=== Function Output ===
LoRA can be applied to all weight matrices and training all biases, allowing for the recovery of the expressiveness of full fine-tuning by setting the LoRA rank to the rank of the pre-trained weight matrices. It converges to training the original model as the number of trainable parameters increases. Additionally, LoRA can be used in downstream tasks without additional inference latency by computing and storing specific values, enabling quick switching between tasks with minimal memory overhead.
=== LLM Response ===
LoRA, which stands for Low-Rank Adaptation, is a technique that can be applied to all weight matrices and training all biases. It allows for the recovery of the expressiveness of full fine-tuning by setting the LoRA rank to th

In [14]:
response = agent.chat(
    "Explain to me what is the LoRA and why it's being used. Are esiting solutions not enough?",
)

print(str(response))

Added user message to memory: Explain to me what is the LoRA and why it's being used. Are esiting solutions not enough?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input": "LoRA and its use cases"}
=== Function Output ===
LoRA can be applied to all weight matrices and training all biases, allowing for the recovery of the expressiveness of full fine-tuning by setting the LoRA rank to the rank of the pre-trained weight matrices. It converges to training the original model as the number of trainable parameters increases. When deployed in production, LoRA does not introduce additional inference latency, as computations can be performed as usual. Switching to another downstream task involves quick operations with minimal memory overhead. Additionally, LoRA has been evaluated against various models such as DeBERTa XXL and GPT-2 medium/large, showcasing its competitive performance in both NLU and NLG tasks.
=== Calling Function ===
Calling function: query_engine_

#### Lower Level Understanding

In [15]:
agent_worker = FunctionCallingAgentWorker.from_tools(
    tools=[summary_tool, vector_tool],
    llm=llm,
    verbose=True,
)

agent = AgentRunner(agent_worker)

In [36]:
task = agent.create_task(
    "Explain to me what is the LoRA and why it's being used."
    "Are existing solutions not enough?"
)

In [37]:
step_output =  agent.run_step(
    task.task_id, input="What is the LoRA paper about?"
)

Added user message to memory: What is the LoRA paper about?
=== Calling Function ===
Calling function: query_engine_tool with args: {"input": "summary of the LoRA paper"}
=== Function Output ===
LoRA is a method that allows for efficient adaptation of pre-trained models to new tasks without requiring full-rank gradient updates. By setting the LoRA rank appropriately, it can approximate the expressiveness of full fine-tuning while avoiding additional inference latency. The paper demonstrates that LoRA can perform competitively with a small rank, suggesting that a low-rank adaptation matrix may be sufficient for effective adaptation. However, the effectiveness of a small rank may vary depending on the task or dataset, as highlighted by the example of different languages requiring different adaptation approaches.


In [38]:
step_output = agent.run_step(task.task_id)


=== LLM Response ===
The LoRA paper introduces a method for efficiently adapting pre-trained models to new tasks without the need for full-rank gradient updates. By setting the LoRA rank appropriately, it can approximate the expressiveness of full fine-tuning while minimizing additional inference latency. The paper shows that LoRA can achieve competitive performance even with a small rank, indicating that a low-rank adaptation matrix may be effective for adaptation. However, the effectiveness of a small rank may vary based on the task or dataset, as demonstrated by the need for different adaptation approaches for different languages.


In [39]:
completed_steps = agent.get_completed_steps(task.task_id)
print(task.task_id)

if len(completed_steps) > 0:
    print(completed_steps[0].output.sources[0].raw_output)

e31bf7cf-7ba4-4aec-a289-c7eefec4451d
LoRA is a method that allows for efficient adaptation of pre-trained models to new tasks without requiring full-rank gradient updates. By setting the LoRA rank appropriately, it can approximate the expressiveness of full fine-tuning while avoiding additional inference latency. The paper demonstrates that LoRA can perform competitively with a small rank, suggesting that a low-rank adaptation matrix may be sufficient for effective adaptation. However, the effectiveness of a small rank may vary depending on the task or dataset, as highlighted by the example of different languages requiring different adaptation approaches.


In [40]:
print(step_output.is_last)

True


In [41]:
upcoming_steps = agent.get_upcoming_steps(task.task_id)

if len(upcoming_steps) > 0:
    print(upcoming_steps[0].input)

In [42]:
step_output = agent.run_step(task.task_id)
print(step_output.is_last)

IndexError: pop from an empty deque

In [34]:
response = agent.finalize_response(task.task_id)
print(str(response))

assistant: LoRA is a method that allows for efficient adaptation of pre-trained models to new tasks without requiring full-rank updates to weight matrices. It enables quick adaptation to downstream tasks with minimal additional inference latency and memory overhead. Existing solutions in NLP have limitations in handling tasks with limited training samples efficiently, and LoRA addresses this by providing a more computationally efficient adaptation method. While few-shot learning and prompt engineering can be beneficial, they may not always offer optimal performance compared to fine-tuning with a larger set of training examples. Therefore, LoRA's approach fills a gap in the existing solutions by providing a more efficient way to adapt pre-trained models to new tasks.


IndexError: pop from an empty deque