# Lab 7: Llama Stack

<p style="background-color:#fff6e4; padding:15px; border-width:3px; border-color:#f5ecda; border-style:solid; border-radius:6px"> ⏳ <b>Note <code>(Kernel Starting)</code>:</b> This notebook takes about 30 seconds to be ready to use. You may start and watch the video while you wait.</p>

In [1]:
import warnings
# Install an 'ignore' filter so Python warnings are not shown in cell output.
warnings.filterwarnings('ignore')

In [2]:
from dotenv import load_dotenv
import os
# Load environment variables via python-dotenv; this is expected to provide
# 'TOGETHER_API_KEY', which is read later with os.getenv. The return value is
# assigned to `_` so it is not echoed as the cell's output.
_ = load_dotenv()

In [3]:
#!pip install llama-stack==0.1.0 llama-stack-client==0.1.0

<p style="background-color:#fff6ff; padding:15px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px"> 💻 &nbsp; <b>Access <code>requirements.txt</code> and <code>utils.py</code> files:</b> 1) click on the <em>"File"</em> option on the top menu of the notebook and then 2) click on <em>"Open"</em>. For more help, please see the <em>"Appendix - Tips and Help"</em> Lesson.</p>

In [4]:
!llama stack build --list-templates

+------------------------------+-----------------------------------------------------------------------------+
[1m[97m| Template Name                | Description                                                                 |[0m
+------------------------------+-----------------------------------------------------------------------------+
| bedrock                      | Use AWS Bedrock for running LLM inference and safety                        |
+------------------------------+-----------------------------------------------------------------------------+
| cerebras                     | Use Cerebras for running LLM inference                                      |
+------------------------------+-----------------------------------------------------------------------------+
| experimental-post-training   | Experimental template for post training                                     |
+------------------------------+-----------------------------------------------------------

In [5]:
!llama stack list-apis

+-------------------+
[1m[97m| API               |[0m
+-------------------+
| inference         |
+-------------------+
| safety            |
+-------------------+
| agents            |
+-------------------+
| vector_io         |
+-------------------+
| datasetio         |
+-------------------+
| scoring           |
+-------------------+
| eval              |
+-------------------+
| post_training     |
+-------------------+
| tool_runtime      |
+-------------------+
| telemetry         |
+-------------------+
| models            |
+-------------------+
| shields           |
+-------------------+
| vector_dbs        |
+-------------------+
| datasets          |
+-------------------+
| scoring_functions |
+-------------------+
| eval_tasks        |
+-------------------+
| tool_groups       |
+-------------------+
| inspect           |
+-------------------+


In [6]:
!llama-stack-client configure --endpoint https://llama-stack.together.ai

Done! You can now use the Llama Stack Client CLI with endpoint https://llama-stack.together.ai


In [7]:
!llama-stack-client models list

╭──────────────────────────────────────────────────────────────────────────────╮
│ [1;31mFailed to list models[0m                                                        │
│                                                                              │
│ [33mError Type:[0m APIStatusError                                                   │
│ [33mDetails:[0m Error code: 426 - {'error': {'message': 'Client version 0.1.0 is    │
│ not compatible with server version 0.2.6. Please update your client.'}}      │
╰──────────────────────────────────────────────────────────────────────────────╯


In [8]:
import os
from llama_stack_client import LlamaStackClient

# Create a client for the hosted Llama Stack endpoint. A plain string literal
# is used because the URL contains no placeholders to interpolate.
client = LlamaStackClient(base_url="https://llama-stack.together.ai")

# Ask the server for its registered models and print the result.
models = client.models.list()
print(models)

APIStatusError: Error code: 426 - {'error': {'message': 'Client version 0.1.0 is not compatible with server version 0.2.6. Please update your client.'}}

# Llama Stack Inference

In [9]:
# Base URL handed to every LlamaStackClient created in the cells below.
LLAMA_STACK_API_TOGETHER_URL="https://llama-stack.together.ai"
# Model identifier used for the text-only inference and agent examples.
LLAMA31_8B_INSTRUCT = "meta-llama/Llama-3.1-8B-Instruct"

from llama_stack_client import LlamaStackClient
import json

def run_main():
    """Request one chat completion from the Llama Stack endpoint and print it.

    A client is created for ``LLAMA_STACK_API_TOGETHER_URL`` and a two-message
    conversation is sent to ``LLAMA31_8B_INSTRUCT``. The value of the
    ``TOGETHER_API_KEY`` environment variable is JSON-encoded and passed as
    ``x_llama_stack_provider_data``. The content of the returned completion
    message is printed; nothing is returned.
    """
    stack_client = LlamaStackClient(base_url=LLAMA_STACK_API_TOGETHER_URL)

    # The opening question is sent under the "system" role and the follow-up
    # under the "user" role.
    conversation = [
        {"role": "system", "content": "Who wrote the book Innovator's Dilemma? How about Charlotte's Web?"},
        {"role": "user", "content": "which book was published first?"},
    ]

    # Provider data is a JSON string carrying the key read from the environment.
    provider_data = json.dumps({"together_api_key": os.getenv('TOGETHER_API_KEY')})

    completion = stack_client.inference.chat_completion(
        model_id=LLAMA31_8B_INSTRUCT,
        messages=conversation,
        x_llama_stack_provider_data=provider_data,
    )

    print(completion.completion_message.content)
    
run_main()

APIStatusError: Error code: 426 - {'error': {'message': 'Client version 0.1.0 is not compatible with server version 0.2.6. Please update your client.'}}

# Llama Stack Agent

In [10]:
import os
from llama_stack_client import LlamaStackClient
from llama_stack_client.lib.agents.agent import Agent
from llama_stack_client.lib.agents.event_logger import EventLogger
from llama_stack_client.types.agent_create_params import AgentConfig

In [11]:
async def run_main():
    """Run a two-prompt conversation through a Llama Stack agent.

    An agent is configured with ``LLAMA31_8B_INSTRUCT`` and session
    persistence disabled, and a session named "test-session" is created.
    Each prompt is echoed, sent as a user turn in that session, and the
    events produced by ``EventLogger().log(...)`` for the turn are printed.
    """
    stack_client = LlamaStackClient(base_url=LLAMA_STACK_API_TOGETHER_URL)

    config = AgentConfig(
        model=LLAMA31_8B_INSTRUCT,
        instructions="You are a helpful assistant",
        enable_session_persistence=False,
    )
    assistant = Agent(stack_client, config)
    session = assistant.create_session("test-session")

    # Both prompts go to the same session, one turn each, in this order.
    for question in (
        "Who wrote the book Charlotte's Web?",
        "Three best quotes?",
    ):
        print(f"User> {question}")
        user_message = {"role": "user", "content": question}
        turn = assistant.create_turn(messages=[user_message], session_id=session)

        # A new EventLogger is created for every turn.
        for event in EventLogger().log(turn):
            event.print()
        
await run_main()

APIStatusError: Error code: 426 - {'error': {'message': 'Client version 0.1.0 is not compatible with server version 0.2.6. Please update your client.'}}

# Llama Stack with Llama 3.2 vision model

In [None]:
from PIL import Image
import matplotlib.pyplot as plt

def display_image(path):
    """Show the image stored at ``path`` in a matplotlib figure with axes hidden.

    Args:
        path: Location of the image file, passed to ``Image.open``.

    The image is opened in a ``with`` block so the file handle held by the
    image object is released once the figure has been shown.
    """
    with Image.open(path) as img:
        plt.imshow(img)
        plt.axis('off')
        plt.show()

display_image("./content/Llama_Repo.jpeg")

In [None]:
# Model identifier used by the image examples in the cells below.
LLAMA32_11B_INSTRUCT = "meta-llama/Llama-3.2-11B-Vision-Instruct"

import base64

def encode_image(image_path):
    """Return the file at ``image_path`` encoded as a base64 ``data:`` URL.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``. The MIME type
        is guessed from the file name (for example ``image/jpeg`` for a
        ``.jpeg`` file); when no image type can be guessed, ``image/png`` is
        used.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Imported locally so this function does not rely on a later cell.
    import mimetypes

    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        # Unknown or non-image extension: fall back to the previous fixed label.
        mime_type = "image/png"

    with open(image_path, "rb") as image_file:
        base64_string = base64.b64encode(image_file.read()).decode("utf-8")

    return f"data:{mime_type};base64,{base64_string}"

async def run_main(image_path, prompt):
    """Ask the vision model a question about a local image through an agent.

    Args:
        image_path: Path of the image file; it is embedded in the request as
            the data URL returned by ``encode_image``.
        prompt: Text question sent in the same user message as the image.

    An agent is configured with ``LLAMA32_11B_INSTRUCT`` and session
    persistence disabled, a session named "test-session" is created, and a
    single user turn containing the image and the prompt is sent. The events
    produced by ``EventLogger().log(response)`` are printed; nothing is
    returned.
    """
    # Encode the image once and reuse the result in the message below rather
    # than reading and encoding the file a second time.
    base64_image = encode_image(image_path)

    client = LlamaStackClient(
        base_url=LLAMA_STACK_API_TOGETHER_URL,
    )

    agent_config = AgentConfig(
        model=LLAMA32_11B_INSTRUCT,
        instructions="You are a helpful assistant",
        enable_session_persistence=False,
    )

    agent = Agent(client, agent_config)
    session_id = agent.create_session("test-session")

    response = agent.create_turn(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": {
                        "url": {
                            "uri": base64_image
                        }
                    }
                },
                {
                    "type": "text",
                    "text": prompt,
                }
            ]
        }],
        session_id=session_id,
    )

    for log in EventLogger().log(response):
        log.print()

In [None]:
await run_main("./content/Llama_Repo.jpeg",
         "How many different colors are those llamas? What are those colors?")

In [None]:
import mimetypes
from termcolor import cprint
from llama_stack_client.lib.inference.event_logger import EventLogger

async def run_main(image_path: str, prompt):
    """Send an image and a question straight to the inference API and print the answer.

    Args:
        image_path: Path of the image file; it is embedded in the request as
            the data URL returned by ``encode_image``.
        prompt: Text question sent in the same user message as the image.

    A non-streaming chat completion is requested from
    ``LLAMA32_11B_INSTRUCT``. The reply text is printed lower-cased and with
    surrounding whitespace stripped; nothing is returned.
    """
    stack_client = LlamaStackClient(base_url=LLAMA_STACK_API_TOGETHER_URL)

    # The user message has two content parts: the encoded image, then the text.
    image_part = {
        "type": "image",
        "image": {"url": {"uri": encode_image(image_path)}},
    }
    text_part = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": [image_part, text_part]}

    cprint("User> Sending image for analysis...", "green")
    completion = stack_client.inference.chat_completion(
        model_id=LLAMA32_11B_INSTRUCT,
        messages=[user_message],
        stream=False,
    )

    reply_text = completion.completion_message.content
    print(reply_text.lower().strip())

In [None]:
await run_main("./content/Llama_Repo.jpeg",
     "How many different colors are those llamas?\
     What are those colors?")