In [None]:
!pip install streamlit pandas seaborn matplotlib pyngrok -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Page configuration
st.set_page_config(page_title="Continuous Batching Comparison Dashboard", page_icon="📈", layout="wide")

# Title and description
st.title("Continuous Batching Comparison Dashboard")
st.write("This dashboard visualizes latency and throughput across different batch sizes for distilgpt2 and Llama-3-8B-Instruct-GPTQ-4-Bit. Select a model to view line charts comparing batch sizes. Lower latency and higher throughput are better.")

# Benchmark results, embedded as CSV so the app has no external data dependency
csv_data = """Model,Batch Size,Quantized,Hardware,Latency (s),Throughput (tokens/s),Notes
distilgpt2,1,No,GPU,0.36,140.71,"VLLM v0 engine (fallback on T4), XFormers backend"
distilgpt2,5,No,GPU,0.64,350.42,"VLLM v0 engine (T4 GPU), dynamic batching, 5 prompts processed in parallel"
distilgpt2,1,No,GPU,0.56,36.07,"VLLM v0 engine, XFormers backend"
distilgpt2,2,No,GPU,0.44,44.54,"VLLM v0 engine, optimal latency"
distilgpt2,4,No,GPU,0.49,92.36,"VLLM v0 engine, incomplete outputs observed"
distilgpt2,8,No,GPU,0.53,174.88,"VLLM v0 engine, only 5 prompts processed"
distilgpt2,16,No,GPU,0.70,208.97,"VLLM v0 engine, only 5 prompts processed, max tokens=64"
Llama-3-8B-Instruct-GPTQ-4-Bit,1,Yes (4-bit),GPU,6.92,21.67,"VLLM engine, T4 GPU"
Llama-3-8B-Instruct-GPTQ-4-Bit,2,Yes (4-bit),GPU,6.69,44.53,"VLLM engine, T4 GPU"
Llama-3-8B-Instruct-GPTQ-4-Bit,4,Yes (4-bit),GPU,7.67,92.72,"VLLM engine, T4 GPU"
Llama-3-8B-Instruct-GPTQ-4-Bit,8,Yes (4-bit),GPU,10.32,144.33,"VLLM engine, T4 GPU"
Llama-3-8B-Instruct-GPTQ-4-Bit,16,Yes (4-bit),GPU,18.68,159.81,"VLLM engine, T4 GPU, 7.4x throughput improvement"
"""
df = pd.read_csv(io.StringIO(csv_data))


def render_metric_chart(data, model_name, column, metric, y_label, better):
    """Render one "<metric> vs Batch Size" line chart into the Streamlit page.

    Args:
        data: DataFrame for a single model, containing "Batch Size" and `column`.
        model_name: Model name interpolated into the section header and plot title.
        column: Name of the DataFrame column plotted on the y axis.
        metric: Short metric name used in the header and title (e.g. "Latency").
        y_label: Label for the y axis.
        better: "Lower" or "Higher"; tells the reader which direction is preferable.
    """
    st.subheader(f"{metric} vs Batch Size for {model_name} ({better} is Better)")
    fig, ax = plt.subplots(figsize=(10, 6))
    # NOTE: distilgpt2 has two rows for Batch Size 1. seaborn aggregates repeated
    # x values (mean line plus error band), while the annotations below label
    # every raw row individually.
    sns.lineplot(data=data, x="Batch Size", y=column, marker="o", ax=ax)
    ax.set_xlabel("Batch Size")
    ax.set_ylabel(y_label)
    ax.set_title(f"{metric} Across Batch Sizes for {model_name}")
    # Label each measurement with its value, slightly above the marker.
    for batch_size, value in zip(data["Batch Size"], data[column]):
        ax.annotate(f"{value:.2f}", (batch_size, value), textcoords="offset points", xytext=(0, 10), ha="center")
    ax.grid(True)
    fig.tight_layout()
    st.pyplot(fig)
    # Streamlit re-executes this script on every interaction; without an explicit
    # close, each rerun would leave another figure alive in pyplot's registry.
    plt.close(fig)


# Sidebar model selection
st.sidebar.header("Select Model")
model = st.sidebar.radio("Choose a model to visualize:", options=df["Model"].unique(), index=0)

# Filter data by selected model
filtered_df = df[df["Model"] == model].sort_values("Batch Size")

# Latency and throughput line charts
render_metric_chart(filtered_df, model, "Latency (s)", "Latency", "Latency (seconds)", "Lower")
render_metric_chart(filtered_df, model, "Throughput (tokens/s)", "Throughput", "Throughput (tokens/second)", "Higher")

# Raw data display
st.subheader(f"Raw Data for {model}")
st.write(filtered_df[["Batch Size", "Quantized", "Hardware", "Latency (s)", "Throughput (tokens/s)", "Notes"]])

# Acknowledgements
st.subheader("Acknowledgements")
st.write("Built with [Streamlit](https://streamlit.io) and visualized using [Seaborn](https://seaborn.pydata.org). Data sourced from continuous batching experiments.")

Writing app.py


In [None]:
import os
import subprocess
from getpass import getpass

from pyngrok import ngrok

STREAMLIT_PORT = 8501

# Never hardcode the ngrok authtoken in the notebook: read it from the
# NGROK_AUTHTOKEN environment variable, or prompt for it without echoing.
# SECURITY: a token that was previously committed in this cell should be
# revoked in the ngrok dashboard and replaced.
ngrok_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("Ngrok authtoken: ")
ngrok.set_auth_token(ngrok_token)

# Terminate any existing Ngrok tunnels
ngrok.kill()

# If this cell was run before, stop the earlier Streamlit server so the new
# one can bind the same port instead of failing with "port already in use".
previous_process = globals().get("streamlit_process")
if previous_process is not None and previous_process.poll() is None:
    previous_process.terminate()
    previous_process.wait()

# Start Streamlit in the background; keep the handle so it can be stopped later
# with streamlit_process.terminate().
streamlit_process = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", str(STREAMLIT_PORT)]
)

# Create a public URL with Ngrok
tunnel = ngrok.connect(STREAMLIT_PORT, bind_tls=True)
public_url = tunnel.public_url  # plain URL string rather than the NgrokTunnel repr
print(f"Your Streamlit app is live at: {public_url}")

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Your Streamlit app is live at: NgrokTunnel: "https://4794-34-142-245-120.ngrok-free.app" -> "http://localhost:8501"


In [None]:
from pyngrok import ngrok

# Stop the ngrok agent process, which closes every tunnel it was serving.
ngrok.kill()

shutdown_message = "All Ngrok tunnels have been terminated."
print(shutdown_message)

All Ngrok tunnels have been terminated.
