In [None]:
import mlflow
import pandas as pd
from scipy.stats import ttest_ind

## I. Comparing Baseline training configuration to Experiment: batch_size_64

In the baseline experiment, only ~25% of the GPU memory is utilized. If we increase the batch size to use the GPU memory more fully,
does training run faster?

**Notes**

- 3 jobs of the same experiment were run concurrently on the same compute (gpu-cluster-64-cores)
- Each training job runs for 10 epochs

In [None]:
# Names of the two MLflow experiments being compared: the reference
# configuration and the batch_size_64 variant.
baseline, experiment = "baseline_training_runtime", "batch_size_64"

In [None]:
# Look up the experiment by name; as the last expression in the cell, the
# returned value is displayed as the cell output.
mlflow.get_experiment_by_name(experiment)

In [None]:
# Query MLflow for the runs (and their metrics) of each experiment, tagging
# every row with the name of the experiment it came from.
# NOTE: the status filter (filter_string="attributes.status = 'Finished'")
# is left disabled, so no status filtering is requested from MLflow.
baseline_df, experiment_df = (
    mlflow.search_runs(experiment_names=[name]).assign(experiment_name=name)
    for name in (baseline, experiment)
)

# Stack both result sets into a single frame with a fresh 0..n-1 index.
df = pd.concat([baseline_df, experiment_df], ignore_index=True)

In [None]:
# Calculate the minutes elapsed for each job.
#
# The duration is converted with `.dt.total_seconds()` rather than by casting
# the timedelta to `int` and dividing by a nanosecond constant: the cast
# assumes nanosecond storage resolution and a 64-bit default integer, neither
# of which is guaranteed. `pd.to_timedelta` is a no-op for a timedelta column
# and normalises an object-dtype column of timedeltas.
df = df.assign(
    duration=lambda runs: runs["end_time"] - runs["start_time"],
    minutes=lambda runs: pd.to_timedelta(runs["duration"]).dt.total_seconds() / 60,
)

# Fail fast with a clear message when a run has no usable duration (a missing
# start or end time), instead of letting NaN flow into the statistics below.
missing_duration = df["minutes"].isna()
if missing_duration.any():
    raise ValueError(
        f"{int(missing_duration.sum())} run(s) have a missing start_time or end_time; "
        "filter out unfinished runs before comparing runtimes"
    )

df[["experiment_name", "minutes"]]

In [None]:
# One-sided t-test without the equal-variance assumption: is the baseline
# sample's mean duration 'greater' than the experiment sample's mean?
# Only a single comparison is made, so multiple-comparison corrections are
# ignored for now. Should more experiments be compared, see Dunnett's test:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html
sample_a = df.loc[df["experiment_name"] == baseline, "minutes"]
sample_b = df.loc[df["experiment_name"] == experiment, "minutes"]
stats = ttest_ind(sample_a, sample_b, alternative="greater", equal_var=False)

In [None]:
# Show the t-test result, then its confidence interval, one per line.
print(stats, stats.confidence_interval(), sep="\n")
# Author's reading of the recorded output: the confidence interval does not
# include zero, so we can conclude that the sample mean of the baseline is
# greater than the sample mean of the experiment.

In [None]:
# Per-experiment summary of the runtime in minutes: one row per experiment
# with the mean, variance, standard deviation, minimum and maximum.
summary_funcs = ["mean", "var", "std", "min", "max"]
stats_df = df.groupby("experiment_name")["minutes"].agg(summary_funcs).reset_index()
stats_df

In [None]:
# Pull each experiment's mean runtime (minutes) out of the summary table.
baseline_mean = stats_df.loc[stats_df["experiment_name"] == baseline, "mean"].iloc[0]
experiment_mean = stats_df.loc[stats_df["experiment_name"] == experiment, "mean"].iloc[0]

# Positive values mean the experiment finished sooner than the baseline.
diff = baseline_mean - experiment_mean
diff_percent = diff / baseline_mean * 100

# Choose the wording from the sign of the difference so that a slower
# experiment is not reported as e.g. "-5.00% faster".
direction = "faster" if diff >= 0 else "slower"
print(f"{abs(diff_percent):.2f}% {direction}")
print(f"{abs(diff):.2f} minutes {direction} on average")