In [1]:
!pip install streamlit pandas seaborn matplotlib -q
!npm install localtunnel

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m53.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K
added 22 packages in 3s

3 packages are looking for funding
  run `npm fund` for details

In [2]:
%%writefile app.py
import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Page setup: the same title is used for the browser tab and the page heading.
DASHBOARD_TITLE = "Inference Optimization Dashboard"
DASHBOARD_INTRO = "This dashboard visualizes performance metrics (latency, throughput, memory usage) for various models, comparing CPU/GPU and quantization configurations. Use the sidebar to filter models or quantization types."

# Page-level settings: tab title, tab icon and a wide layout.
st.set_page_config(page_title=DASHBOARD_TITLE, page_icon="📊", layout="wide")

# Heading followed by the introductory description.
st.title(DASHBOARD_TITLE)
st.write(DASHBOARD_INTRO)

# Data preparation
# One tuple per benchmark run; the fields follow the order of COLUMN_NAMES.
# A throughput of None means no throughput value was recorded for that run
# (pandas stores it as NaN).
COLUMN_NAMES = [
    "Model",
    "Prompt",
    "Batch Size",
    "Quantized",
    "Tensor Parallel",
    "Latency (s)",
    "Throughput (tokens/s)",
    "Notes",
    "Hardware",
    "Memory (MB)",
]

BENCHMARK_ROWS = [
    ("Whisper-base", "Transcription", 1, "No", "No", 0.2498, None, "GPU (T4), peak memory 330.93 MB", "GPU", 330.93),
    ("Whisper-base", "Transcription", 1, "No", "No", 5.9084, None, "CPU, memory 0.46 MB", "CPU", 0.46),
    ("Whisper-medium", "Transcription", 1, "No (FP32)", "No", 1.3950, None, "GPU (T4), peak memory 4103.10 MB", "GPU", 4103.10),
    ("Whisper-medium", "Transcription", 1, "No (FP32)", "No", 45.3059, None, "CPU, memory 5.41 MB", "CPU", 5.41),
    ("Whisper-medium", "Transcription", 1, "Yes (INT8)", "No", 29.2023, None, "CPU, memory 8463.5625 MB", "CPU", 8463.5625),
    # NOTE(review): latency, throughput, memory and notes of the next row are
    # identical to the Gemma 2-2B row further down — confirm this is not a
    # copy-paste slip in the measurements.
    ("Whisper-medium", "Transcription", 1, "Yes (4-bit)", "No", 2.7448, 24.44, "GPU (T4), peak memory 4060.91 MB, bitsandbytes", "GPU", 4060.91),
    # The note says 2.1 GB; the memory column carries it as 2100 MB.
    ("Whisper-medium", "Transcription", 1, "Yes (FP16)", "No", 1.0, None, "GPU (T4), peak memory 2.1 GB", "GPU", 2100),
    ("Gemma 2-2B", "Transcription", 1, "Yes (4-bit)", "No", 2.7448, 24.44, "GPU (T4), peak memory 4060.91 MB, bitsandbytes", "GPU", 4060.91),
    ("Mistral 7B", "What is the meaning of life?", 1, "Yes (4-bit)", "No", 217.40, 0.23, "GPU (T4), memory 3945.98 MB", "GPU", 3945.98),
    ("Paligemma", "Describe the image.", 1, "Yes (4-bit)", "No", 2.10, 0.95, "GPU (T4), memory 3380.03 MB", "GPU", 3380.03),
    ("MiniCPM-V 2.6", "What is the meaning of life?", 1, "No (FP16)", "No", 1.82, 16.48, "GPU (T4), memory 8972.4 MB", "GPU", 8972.4),
    ("MiniCPM-V 2.6", "What is the meaning of life?", 1, "Yes (4-bit)", "No", 2.05, 14.63, "GPU (T4), memory 3245.7 MB, 63.8% memory reduction", "GPU", 3245.7),
]

df = pd.DataFrame(BENCHMARK_ROWS, columns=COLUMN_NAMES)

# Sidebar filters: every model and quantization label is offered and
# pre-selected, so the unfiltered dashboard shows all rows.
st.sidebar.header("Filter Options")
model_choices = df["Model"].unique()
quantization_choices = df["Quantized"].unique()
selected_models = st.sidebar.multiselect("Select Models", options=model_choices, default=model_choices)
quantization_filter = st.sidebar.multiselect("Select Quantization Types", options=quantization_choices, default=quantization_choices)

# Keep only the rows that match both sidebar selections.
model_mask = df["Model"].isin(selected_models)
quantization_mask = df["Quantized"].isin(quantization_filter)
filtered_df = df[model_mask & quantization_mask]

# Latency comparison: one horizontal bar per run, grouped by model and
# coloured by the run's "Notes" text.
st.subheader("Latency Comparison (Lower is Better)")
if filtered_df.empty:
    # Nothing matches the sidebar selection: show a message, as the throughput
    # section does, instead of drawing an empty chart.
    st.write("No latency data available for selected filters.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=filtered_df, x="Latency (s)", y="Model", hue="Notes", ax=ax)
    ax.set_xlabel("Latency (seconds)")
    ax.set_ylabel("Model")
    ax.set_title("Latency by Model and Configuration")
    fig.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure it creates until it is closed; close this one
    # after rendering so figures do not accumulate each time the script runs.
    plt.close(fig)

# Throughput comparison: only runs that recorded a throughput value are drawn.
st.subheader("Throughput Comparison (Higher is Better)")
throughput_df = filtered_df[filtered_df["Throughput (tokens/s)"].notna()]
if throughput_df.empty:
    st.write("No throughput data available for selected filters.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=throughput_df, x="Throughput (tokens/s)", y="Model", hue="Notes", ax=ax)
    ax.set_xlabel("Throughput (tokens/second)")
    ax.set_ylabel("Model")
    ax.set_title("Throughput by Model and Configuration")
    fig.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure it creates until it is closed; close this one
    # after rendering so figures do not accumulate each time the script runs.
    plt.close(fig)

# Latency vs memory scatter plot: colour encodes the model, marker style the
# hardware, and marker size the memory value.
st.subheader("Latency vs Memory Usage")
if filtered_df.empty:
    # Nothing matches the sidebar selection: show a message, as the throughput
    # section does, instead of drawing an empty chart.
    st.write("No latency or memory data available for selected filters.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=filtered_df, x="Latency (s)", y="Memory (MB)", hue="Model", style="Hardware", size="Memory (MB)", ax=ax)
    ax.set_xlabel("Latency (seconds)")
    ax.set_ylabel("Memory Usage (MB)")
    ax.set_title("Latency vs Memory Usage by Model and Hardware")
    fig.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure it creates until it is closed; close this one
    # after rendering so figures do not accumulate each time the script runs.
    plt.close(fig)

# Footer text, written as Markdown so the two library names render as links.
ACKNOWLEDGEMENTS_MD = "Built with [Streamlit](https://streamlit.io) and visualized using [Seaborn](https://seaborn.pydata.org). Data sourced from model inference experiments."

# Table of the rows that passed the sidebar filters.
st.subheader("Raw Data")
st.write(filtered_df)

# Credits at the bottom of the page.
st.subheader("Acknowledgements")
st.write(ACKNOWLEDGEMENTS_MD)

Writing app.py


In [3]:
!pip install streamlit pandas seaborn matplotlib pyngrok -q

In [4]:
import getpass
import os
import subprocess

from pyngrok import ngrok

STREAMLIT_PORT = 8501

# Take the ngrok authtoken from the NGROK_AUTHTOKEN environment variable, or
# prompt for it without echoing. Never paste the token into the notebook: it
# is saved with the file and with the cell output.
# NOTE: any token that was previously pasted into this notebook should be
# revoked in the ngrok dashboard.
ngrok_authtoken = os.environ.get("NGROK_AUTHTOKEN") or getpass.getpass("ngrok authtoken: ")
ngrok.set_auth_token(ngrok_authtoken)

# Terminate any existing ngrok tunnels
ngrok.kill()

# Start Streamlit in the background; the handle is kept so the server can be
# stopped later with streamlit_process.terminate().
streamlit_process = subprocess.Popen(
    ["streamlit", "run", "app.py", "--server.port", str(STREAMLIT_PORT)]
)

# Create a public URL with ngrok that forwards to the local Streamlit port
public_url = ngrok.connect(STREAMLIT_PORT, bind_tls=True)
print(f"Your Streamlit app is live at: {public_url}")

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
Your Streamlit app is live at: NgrokTunnel: "https://c346-34-16-234-84.ngrok-free.app" -> "http://localhost:8501"


In [5]:
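# Optional cleanup: uncomment and run this cell to terminate all ngrok tunnels
# when you are finished with the dashboard.
# from pyngrok import ngrok
# ngrok.kill()
# print("All Ngrok tunnels have been terminated.")

[REDACTED: an ngrok authtoken was pasted here. Revoke it in the ngrok dashboard and do not store tokens in the notebook.]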