In [1]:
import json
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import ast

from utils import get_df_for_benchmark, performance_score, shorten_gpu_name

# Hex colour passed as the only entry of `color_discrete_sequence` in the plotly express charts below.
salad_green = "#53a626"

In [2]:
benchmark_id = "asr-benchmark-0"

# Local CSV cache of the raw benchmark results. When the file is missing the
# data is fetched with `get_df_for_benchmark` and written here for next time.
raw_cache_path = "./data-1.csv"

try:
    print("Loading data from data-1.csv")
    df = pd.read_csv(raw_cache_path)
    # NOTE: `timestamp` is left as read from the CSV; it is not parsed with
    # pd.to_datetime here.
except FileNotFoundError:
    # No cache yet: fetch through the project helper, then cache the result.
    print("Cached CSV not found. Loading data from DynamoDB")
    df = get_df_for_benchmark(benchmark_id)
    df.to_csv(raw_cache_path, index=False)

# Any other failure (for example an unreadable CSV) is deliberately allowed to
# propagate: swallowing it would leave `df` undefined and make the call below
# fail with an unrelated NameError.
df.info()

Loading data from data-1.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6211843 entries, 0 to 6211842
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   word_count                int64  
 1   audio_url                 object 
 2   gpu-name                  object 
 3   salad-machine-id          object 
 4   salad-container-group-id  object 
 5   processing-time           float64
 6   audio-length              float64
 7   realtime-factor           float64
 8   model-id                  object 
 9   timestamp                 object 
dtypes: float64(3), int64(1), object(6)
memory usage: 473.9+ MB


In [3]:
# Preview the first five rows of the loaded benchmark results.
df.head(5)

Unnamed: 0,word_count,audio_url,gpu-name,salad-machine-id,salad-container-group-id,processing-time,audio-length,realtime-factor,model-id,timestamp
0,5,https://salad-benchmark-assets.download/cv-cor...,NVIDIA GeForce RTX 3060,7bdde317-315a-3754-a2ca-7f4fcb039a66,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.522115,2.208,4.228951,distil-whisper/distil-large-v2,2024-01-19 15:41:04.384
1,6,https://salad-benchmark-assets.download/cv-cor...,NVIDIA GeForce RTX 3090,c1f1bfa1-4176-5656-8666-900b737ce222,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.297142,3.072,10.338493,distil-whisper/distil-large-v2,2024-01-19 15:41:04.384
2,12,https://salad-benchmark-assets.download/cv-cor...,NVIDIA GeForce RTX 3090,c1f1bfa1-4176-5656-8666-900b737ce222,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.367148,6.84,18.630111,distil-whisper/distil-large-v2,2024-01-19 15:41:04.384
3,6,https://salad-benchmark-assets.download/cv-cor...,NVIDIA GeForce RTX 3060,7bdde317-315a-3754-a2ca-7f4fcb039a66,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.501609,2.592,5.167367,distil-whisper/distil-large-v2,2024-01-19 15:41:04.384
4,10,https://salad-benchmark-assets.download/cv-cor...,NVIDIA GeForce RTX 3090,c1f1bfa1-4176-5656-8666-900b737ce222,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.361526,5.472,15.135854,distil-whisper/distil-large-v2,2024-01-19 15:41:04.384


In [4]:
# Coerce the measurement columns to numeric dtypes, one column at a time.
for numeric_column in ("processing-time", "audio-length", "realtime-factor", "word_count"):
    df[numeric_column] = pd.to_numeric(df[numeric_column])

# Replace every GPU name with whatever `shorten_gpu_name` returns for it.
df["gpu-name"] = df["gpu-name"].apply(shorten_gpu_name)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6211843 entries, 0 to 6211842
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   word_count                int64  
 1   audio_url                 object 
 2   gpu-name                  object 
 3   salad-machine-id          object 
 4   salad-container-group-id  object 
 5   processing-time           float64
 6   audio-length              float64
 7   realtime-factor           float64
 8   model-id                  object 
 9   timestamp                 object 
dtypes: float64(3), int64(1), object(6)
memory usage: 473.9+ MB


In [6]:
# Order the rows by the `gpu-name` column (ascending), then peek at the top.
df = df.sort_values("gpu-name", ascending=True)

df.head()

Unnamed: 0,word_count,audio_url,gpu-name,salad-machine-id,salad-container-group-id,processing-time,audio-length,realtime-factor,model-id,timestamp
1015677,5,https://salad-benchmark-assets.download/cv-cor...,CUDA is not available,b280852f-ca32-d059-a47c-fe5d9e0cc245,5b91a69e-c1af-4eba-858c-d7804ebf07c1,24.699671,2.616,0.105912,distil-whisper/distil-large-v2,2024-01-19 17:34:40.128
990538,8,https://salad-benchmark-assets.download/cv-cor...,CUDA is not available,b280852f-ca32-d059-a47c-fe5d9e0cc245,5b91a69e-c1af-4eba-858c-d7804ebf07c1,25.421842,5.16,0.202975,distil-whisper/distil-large-v2,2024-01-19 17:32:29.056
719292,6,https://salad-benchmark-assets.download/cv-cor...,CUDA is not available,b280852f-ca32-d059-a47c-fe5d9e0cc245,5b91a69e-c1af-4eba-858c-d7804ebf07c1,26.311758,3.096,0.117666,distil-whisper/distil-large-v2,2024-01-19 17:06:16.192
866654,9,https://salad-benchmark-assets.download/cv-cor...,CUDA is not available,b280852f-ca32-d059-a47c-fe5d9e0cc245,5b91a69e-c1af-4eba-858c-d7804ebf07c1,26.304561,3.888,0.147807,distil-whisper/distil-large-v2,2024-01-19 17:21:33.696
1181843,22,https://salad-benchmark-assets.download/cv-cor...,CUDA is not available,c6803b68-1e41-fa53-8c3a-f94ee2f0c099,5b91a69e-c1af-4eba-858c-d7804ebf07c1,23.00583,13.488,0.586286,distil-whisper/distil-large-v2,2024-01-19 17:52:08.704


In [7]:
# Collect the distinct machine ids whose GPU name is "CUDA is not available".
no_cuda_mask = df["gpu-name"] == "CUDA is not available"
bad_nodes = df.loc[no_cuda_mask, "salad-machine-id"].unique()

print(f"Found {len(bad_nodes)} nodes with no CUDA available:")
print(bad_nodes)

Found 4 nodes with no CUDA available:
['b280852f-ca32-d059-a47c-fe5d9e0cc245'
 'c6803b68-1e41-fa53-8c3a-f94ee2f0c099'
 '0e903c8f-881b-cf5f-8268-8ba4c9d2a5d1'
 '962697ea-8198-fa5f-8de4-3b30190c2f95']


In [8]:
# Count the rows (jobs) whose GPU name is "CUDA is not available".
# Summing the boolean mask counts the True entries without building a filtered frame.
num_bad_jobs = int((df["gpu-name"] == "CUDA is not available").sum())
print(f"Found {num_bad_jobs} jobs with no CUDA available")

Found 446 jobs with no CUDA available


In [10]:
# Cache of the benchmark rows with the "CUDA is not available" jobs removed.
data2 = "./data-2.csv"

try:
    print(f"Loading data from {data2}")
    df = pd.read_csv(data2)
except FileNotFoundError:
    # No cache yet: derive the filtered frame from the `df` already in memory.
    print("Cached CSV not found. Removing rows with no CUDA available.")

    df = df[df["gpu-name"] != "CUDA is not available"]
    # Renumber the rows 0..n-1 now that some have been dropped.
    df = df.reset_index(drop=True)
    df.to_csv(data2, index=False)

# Any other failure is deliberately allowed to propagate: swallowing it would
# leave the unfiltered `df` in place and the cells below would silently
# analyse the rows this step is meant to exclude.
df.info()


Loading data from ./data-2.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6211397 entries, 0 to 6211396
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   word_count                int64  
 1   audio_url                 object 
 2   gpu-name                  object 
 3   salad-machine-id          object 
 4   salad-container-group-id  object 
 5   processing-time           float64
 6   audio-length              float64
 7   realtime-factor           float64
 8   model-id                  object 
 9   timestamp                 object 
dtypes: float64(3), int64(1), object(6)
memory usage: 473.9+ MB


In [11]:
# Preview the first rows of `df` (DataFrame.head defaults to five rows).
df.head()

Unnamed: 0,word_count,audio_url,gpu-name,salad-machine-id,salad-container-group-id,processing-time,audio-length,realtime-factor,model-id,timestamp
0,8,https://salad-benchmark-assets.download/cv-cor...,RTX 2080,f40a40ae-7cab-765c-b7eb-936fcccc16a1,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.424131,3.768,8.884056,distil-whisper/distil-large-v2,2024-01-19 16:59:42.976
1,5,https://salad-benchmark-assets.download/cv-cor...,RTX 2080,9f001f54-13e0-085f-a0a3-56dc10c75988,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.373203,3.336,8.938835,distil-whisper/distil-large-v2,2024-01-19 16:24:45.824
2,15,https://salad-benchmark-assets.download/cv-cor...,RTX 2080,f40a40ae-7cab-765c-b7eb-936fcccc16a1,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.444092,5.208,11.72729,distil-whisper/distil-large-v2,2024-01-19 17:12:49.408
3,8,https://salad-benchmark-assets.download/cv-cor...,RTX 2080,f40a40ae-7cab-765c-b7eb-936fcccc16a1,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.393454,3.456,8.783741,distil-whisper/distil-large-v2,2024-01-19 16:59:42.976
4,12,https://salad-benchmark-assets.download/cv-cor...,RTX 2080,251e26e1-526d-9f57-9baa-3964d9249517,5b91a69e-c1af-4eba-858c-d7804ebf07c1,0.459638,6.672,14.515771,distil-whisper/distil-large-v2,2024-01-20 01:44:00.256


In [13]:
# Count how many rows (inferences) each GPU name has.
gpu_type_counts = df["gpu-name"].value_counts()

# Turn the Series into a two-column frame and order it by GPU name.
gpu_type_counts_df = gpu_type_counts.reset_index()
gpu_type_counts_df.columns = ['gpu-name', 'count']
gpu_type_counts_df = gpu_type_counts_df.sort_values(by='gpu-name')

# Build the bar chart.
whisper_fig = px.bar(gpu_type_counts_df, x='gpu-name', y='count', title="Number of Inferences Completed by GPU", color_discrete_sequence=[salad_green])

# Apply the title, axis labels and font sizes BEFORE displaying, so the figure
# shown in the notebook matches the image written to disk.
whisper_fig.update_layout(
    title_text="Number of Inferences Completed by GPU",
    title_font=dict(size=32),
    xaxis_title="GPU",
    xaxis_title_font=dict(size=24),
    yaxis_title="Number of Inferences",
    yaxis_title_font=dict(size=24),
    xaxis=dict(tickfont=dict(size=18)),
    yaxis=dict(tickfont=dict(size=18)),
)

whisper_fig.show()

# Persist both the rendered chart and the underlying counts.
whisper_fig.write_image("images/gpu_type_counts.jpg", width=1920, height=768)
gpu_type_counts_df.to_csv("gpu_type_counts.csv", index=False)

In [19]:
# Split the results by `model-id`, giving each subset a fresh 0..n-1 index.
whisper_df = df.loc[df['model-id'] == 'openai/whisper-large-v3'].reset_index(drop=True)
distil_df = df.loc[df['model-id'] == 'distil-whisper/distil-large-v2'].reset_index(drop=True)

In [20]:
# Print the index range, column dtypes and memory usage of `whisper_df`.
whisper_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2364838 entries, 0 to 2364837
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   word_count                int64  
 1   audio_url                 object 
 2   gpu-name                  object 
 3   salad-machine-id          object 
 4   salad-container-group-id  object 
 5   processing-time           float64
 6   audio-length              float64
 7   realtime-factor           float64
 8   model-id                  object 
 9   timestamp                 object 
dtypes: float64(3), int64(1), object(6)
memory usage: 180.4+ MB


In [21]:
# Print the index range, column dtypes and memory usage of `distil_df`.
distil_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3846559 entries, 0 to 3846558
Data columns (total 10 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   word_count                int64  
 1   audio_url                 object 
 2   gpu-name                  object 
 3   salad-machine-id          object 
 4   salad-container-group-id  object 
 5   processing-time           float64
 6   audio-length              float64
 7   realtime-factor           float64
 8   model-id                  object 
 9   timestamp                 object 
dtypes: float64(3), int64(1), object(6)
memory usage: 293.5+ MB


In [23]:
# Distinct machine ids overall and per model subset.
machine_id_column = "salad-machine-id"
unique_machines = pd.unique(df[machine_id_column])
whisper_unique_machines = pd.unique(whisper_df[machine_id_column])
distil_unique_machines = pd.unique(distil_df[machine_id_column])

# One print call, one line per argument.
print(
    f"Found {len(unique_machines)} unique machines total:",
    f"  {len(whisper_unique_machines)} unique machines for Whisper Large v3",
    f"  {len(distil_unique_machines)} unique machines for Distil Whisper Large v2",
    sep="\n",
)

Found 376 unique machines total:
  192 unique machines for Whisper Large v3
  197 unique machines for Distil Whisper Large v2


In [25]:
# Rows (inferences) per GPU name, separately for each model subset.
whisper_num_inferences_per_gpu = whisper_df["gpu-name"].value_counts()
distil_num_inferences_per_gpu = distil_df["gpu-name"].value_counts()

# Column labels given to both count frames.
count_columns = ['gpu-name', 'count']

whisper_num_inferences_per_gpu_df = (
    whisper_num_inferences_per_gpu
    .reset_index()
    .set_axis(count_columns, axis=1)
    .sort_values(by='gpu-name')
)

distil_num_inferences_per_gpu_df = (
    distil_num_inferences_per_gpu
    .reset_index()
    .set_axis(count_columns, axis=1)
    .sort_values(by='gpu-name')
)

# One bar trace per model, added in the same order as before.
whisper_fig = go.Figure()
for model_label, counts_df in (
    ('Whisper Large v3', whisper_num_inferences_per_gpu_df),
    ('Distil Whisper Large v2', distil_num_inferences_per_gpu_df),
):
    whisper_fig.add_trace(go.Bar(name=model_label, x=counts_df["gpu-name"], y=counts_df["count"]))

# Place the two models' bars side by side for each GPU.
whisper_fig.update_layout(barmode='group')

whisper_fig.show()

In [27]:
# Sorted list of the distinct GPU names present in `df`.
all_gpus = df["gpu-name"].drop_duplicates().tolist()
all_gpus.sort()

num_unique_gpus = len(all_gpus)
print(f"Found {num_unique_gpus} unique GPUs:")
for gpu in all_gpus:
    print(f"  {gpu}")

Found 23 unique GPUs:
  RTX 2080
  RTX 2080 Ti
  RTX 3060
  RTX 3060 Ti
  RTX 3070
  RTX 3070 Laptop
  RTX 3070 Ti
  RTX 3070 Ti Laptop
  RTX 3080
  RTX 3080 Laptop
  RTX 3080 Ti
  RTX 3080 Ti Laptop
  RTX 3090
  RTX 3090 Ti
  RTX 4060
  RTX 4060 Laptop
  RTX 4060 Ti
  RTX 4070
  RTX 4070 Laptop
  RTX 4070 Ti
  RTX 4080
  RTX 4090
  RTX 4090 Laptop


In [30]:
def _plot_realtime_factor(model_gpu_df, model_label, file_stem, gpu):
    """Scatter realtime factor against audio length for one model on one GPU.

    Writes the figure to ``images/{file_stem}_{gpu}.jpg`` and prints the
    correlation between the ``audio-length`` and ``realtime-factor`` columns
    (``Series.corr`` with its default method).

    Args:
        model_gpu_df: Rows for a single model restricted to a single GPU name.
        model_label: Human-readable model name used in the title and the printed line.
        file_stem: File-name prefix for the saved image.
        gpu: GPU name; used in the title, the file name and the printed line.

    Returns:
        A ``(figure, correlation)`` tuple.
    """
    title = f"{model_label} on {gpu}"
    fig = px.scatter(model_gpu_df, x="audio-length", y="realtime-factor", title=title, color_discrete_sequence=[salad_green])
    fig.update_layout(
        title_text=title,
        title_font=dict(size=32),
        xaxis_title="Audio Length (seconds)",
        xaxis_title_font=dict(size=24),
        yaxis_title="Realtime Factor",
        yaxis_title_font=dict(size=24),
        xaxis=dict(tickfont=dict(size=18)),
        yaxis=dict(tickfont=dict(size=18)),
    )
    fig.write_image(f"images/{file_stem}_{gpu}.jpg", width=1920, height=768)
    correlation = model_gpu_df["audio-length"].corr(model_gpu_df["realtime-factor"])
    print(f"{model_label} on {gpu} correlation between audio length and realtime factor: {correlation}")
    return fig, correlation


for gpu in all_gpus:
    whisper_gpu_df = whisper_df[whisper_df["gpu-name"] == gpu]
    distil_gpu_df = distil_df[distil_df["gpu-name"] == gpu]

    # Within each GPU type, plot the relationship between audio length and
    # realtime factor, once per model (Whisper first, then Distil Whisper).
    whisper_fig, whisper_correlation = _plot_realtime_factor(
        whisper_gpu_df, "Whisper Large v3", "whisper_large_v3", gpu
    )
    distil_fig, distil_correlation = _plot_realtime_factor(
        distil_gpu_df, "Distil Whisper Large v2", "distil_whisper_large_v2", gpu
    )

Whisper Large v3 on RTX 2080 correlation between audio length and realtime factor: 0.09724764815617529
Distil Whisper Large v2 on RTX 2080 correlation between audio length and realtime factor: 0.39617412574865984
Whisper Large v3 on RTX 2080 Ti correlation between audio length and realtime factor: 0.5315534189050348
Distil Whisper Large v2 on RTX 2080 Ti correlation between audio length and realtime factor: 0.5402173667360685
Whisper Large v3 on RTX 3060 correlation between audio length and realtime factor: 0.36960255261667346
Distil Whisper Large v2 on RTX 3060 correlation between audio length and realtime factor: 0.39566006473919585
Whisper Large v3 on RTX 3060 Ti correlation between audio length and realtime factor: 0.26862158476218406
Distil Whisper Large v2 on RTX 3060 Ti correlation between audio length and realtime factor: 0.4049652822839436
Whisper Large v3 on RTX 3070 correlation between audio length and realtime factor: 0.28658466822253603
Distil Whisper Large v2 on RTX 3070 