# Comparing Baseline to DDP

In [None]:
import json
import math

import mlflow
import pandas as pd
from scipy.stats import ttest_ind

In [None]:
# MLflow experiment names of the two groups of runs compared in this notebook.
# Alternative baseline (currently disabled):
# baseline = "371-baseline-reruns"
baseline = "346-larger-dataset-reruns"
experiment = "DDP"

## Query for Runs

We've tagged each job we want to compare with `"comparison": "DDP"` to make searching simpler.


In [None]:
# Query MLflow for the runs (jobs) in each group. Both groups are selected by
# the same tag, so the filter is defined once to keep the two queries in sync.
comparison_filter = "tags.comparison='DDP'"

baseline_df = mlflow.search_runs(
    experiment_names=[baseline],
    filter_string=comparison_filter,
).assign(experiment_name=baseline)

experiment_df = mlflow.search_runs(
    experiment_names=[experiment],
    filter_string=comparison_filter,
).assign(experiment_name=experiment)

# ignore_index gives the combined frame unique row labels; a plain concat
# keeps each input's own labels, which then repeat across the two groups.
df = pd.concat([baseline_df, experiment_df], ignore_index=True)

### Load Metrics Files

We need to download the `overall_metrics.json` artifact of each run and load it into the dataframe.

In [None]:
# Download each run's `overall_metrics.json` artifact and collect the parsed
# contents as one single-row frame per run, labelled by its run_id.
metrics_list = []
for run_id in df.run_id:
    artifact_path = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path="overall_metrics.json")

    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(artifact_path, encoding="utf-8") as f:
        metrics = json.load(f)

    # Built after the file is closed, and without reusing the `metrics_df`
    # name that holds the combined result below.
    metrics_list.append(pd.DataFrame(metrics, index=[run_id]))

# One row per run, indexed by run_id.
metrics_df = pd.concat(metrics_list, axis=0)

In [None]:
# Display the combined metrics table (one row per run, indexed by run_id).
metrics_df

In [None]:
# Attach each run's metrics (metrics_df is indexed by run_id) and derive the
# run duration, both as a timedelta and as fractional minutes.
# `minutes` uses total_seconds() instead of casting the timedelta to int and
# dividing by 1e9: the cast assumes nanosecond resolution and cannot represent
# a missing end time (NaT), whereas total_seconds() yields NaN for it.
df = df.merge(metrics_df, how="left", left_on="run_id", right_index=True).assign(
    duration=lambda runs: runs.end_time - runs.start_time,
    minutes=lambda runs: (runs.end_time - runs.start_time).dt.total_seconds() / 60,
)

## Comparisons

We will be comparing the following metrics between the baseline and the experiment.

In [None]:
# Columns under comparison: the group label, the run duration and the
# quality metrics loaded from each run's metrics file.
comparison_columns = [
    "experiment_name",
    "duration",
    "f1_score",
    "precision",
    "recall",
    "false_positive_rate",
    "false_negative_rate",
    "signal2noise_ratio",
]
df.loc[:, comparison_columns]

### I. Training Duration

In [None]:
# One-sided t-test without the equal-variance assumption on run time in minutes.
# Alternative hypothesis: the baseline mean duration is greater than the
# experiment mean duration.
# No multiple-comparison correction is applied to this test; Dunnett's test is
# one option should a corrected comparison be needed:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html
sample_a = df.loc[df["experiment_name"] == baseline, "minutes"]
sample_b = df.loc[df["experiment_name"] == experiment, "minutes"]
stats_duration = ttest_ind(sample_a, sample_b, alternative="greater", equal_var=False)

In [None]:
print(stats_duration)
print(stats_duration.confidence_interval())
# Interpretation as recorded by the author (re-check against the printed
# interval after any re-run): the confidence interval does not include zero,
# so we conclude that the baseline mean duration is greater than the
# experiment mean duration.
# avg_difference is baseline minus experiment, in minutes.
avg_difference = sample_a.mean() - sample_b.mean()
avg_percent = avg_difference / sample_a.mean() * 100
# The unit belongs to the absolute difference, not to the percentage.
print(f"The experiment is {avg_difference:0.2f} minutes ({avg_percent:0.2f}%) faster on average.")

### II. F1 Score

In [None]:
# Two-sided t-test without the equal-variance assumption on F1 score.
# Alternative hypothesis: the baseline and experiment mean F1 scores differ.
# No multiple-comparison correction is applied to this test; Dunnett's test is
# one option should a corrected comparison be needed:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html
sample_a = df.loc[df["experiment_name"] == baseline, "f1_score"]
sample_b = df.loc[df["experiment_name"] == experiment, "f1_score"]
stats_f1_score = ttest_ind(sample_a, sample_b, alternative="two-sided", equal_var=False)

In [None]:
# Print the test result, then its confidence interval, one per line.
# Interpretation as recorded by the author (re-check after any re-run): the
# interval includes zero, so we fail to reject the null hypothesis; there is
# no evidence that the baseline and experiment mean F1 scores differ.
print(stats_f1_score, stats_f1_score.confidence_interval(), sep="\n")

### III. Precision

In [None]:
# Two-sided t-test without the equal-variance assumption on precision.
# Alternative hypothesis: the baseline and experiment mean precision differ.
# No multiple-comparison correction is applied to this test; Dunnett's test is
# one option should a corrected comparison be needed:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html
sample_a = df.loc[df["experiment_name"] == baseline, "precision"]
sample_b = df.loc[df["experiment_name"] == experiment, "precision"]
stats_precision = ttest_ind(sample_a, sample_b, alternative="two-sided", equal_var=False)

In [None]:
# Print the test result, then its confidence interval, one per line.
# Interpretation as recorded by the author (re-check after any re-run): the
# interval includes zero, so we fail to reject the null hypothesis; there is
# no evidence that the baseline and experiment mean precision differ.
print(stats_precision, stats_precision.confidence_interval(), sep="\n")

### IV. Recall

In [None]:
# Two-sided t-test without the equal-variance assumption on recall.
# Alternative hypothesis: the baseline and experiment mean recall differ.
# No multiple-comparison correction is applied to this test; Dunnett's test is
# one option should a corrected comparison be needed:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html
sample_a = df.loc[df["experiment_name"] == baseline, "recall"]
sample_b = df.loc[df["experiment_name"] == experiment, "recall"]
stats_recall = ttest_ind(sample_a, sample_b, alternative="two-sided", equal_var=False)

In [None]:
# Print the test result, then its confidence interval, one per line.
# Interpretation as recorded by the author (re-check after any re-run): the
# interval includes zero, so we fail to reject the null hypothesis; there is
# no evidence that the baseline and experiment mean recall differ.
print(stats_recall, stats_recall.confidence_interval(), sep="\n")

### V. False Positive Rate

In [None]:
# Two-sided t-test without the equal-variance assumption on the false
# positive rate (FPR).
# Alternative hypothesis: the baseline and experiment mean FPR differ.
# No multiple-comparison correction is applied to this test; Dunnett's test is
# one option should a corrected comparison be needed:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html
sample_a = df.loc[df["experiment_name"] == baseline, "false_positive_rate"]
sample_b = df.loc[df["experiment_name"] == experiment, "false_positive_rate"]
stats_fpr = ttest_ind(sample_a, sample_b, alternative="two-sided", equal_var=False)

In [None]:
# Print the test result, then its confidence interval, one per line.
# Interpretation as recorded by the author (re-check after any re-run): the
# interval includes zero, so we fail to reject the null hypothesis; there is
# no evidence that the baseline and experiment mean FPR differ.
print(stats_fpr, stats_fpr.confidence_interval(), sep="\n")

### VI. False Negative Rate

In [None]:
# Two-sided t-test without the equal-variance assumption on the false
# negative rate (FNR).
# Alternative hypothesis: the baseline and experiment mean FNR differ.
# No multiple-comparison correction is applied to this test; Dunnett's test is
# one option should a corrected comparison be needed:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html
sample_a = df.loc[df["experiment_name"] == baseline, "false_negative_rate"]
sample_b = df.loc[df["experiment_name"] == experiment, "false_negative_rate"]
stats_fnr = ttest_ind(sample_a, sample_b, alternative="two-sided", equal_var=False)

In [None]:
# Print the test result, then its confidence interval, one per line.
# Interpretation as recorded by the author (re-check after any re-run): the
# interval includes zero, so we fail to reject the null hypothesis; there is
# no evidence that the baseline and experiment mean FNR differ.
print(stats_fnr, stats_fnr.confidence_interval(), sep="\n")

### VII. Signal2Noise Ratio

In [None]:
# Two-sided t-test without the equal-variance assumption on the
# signal-to-noise ratio (SNR).
# Alternative hypothesis: the baseline and experiment mean SNR differ.
# No multiple-comparison correction is applied to this test; Dunnett's test is
# one option should a corrected comparison be needed:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.dunnett.html
sample_a = df.loc[df["experiment_name"] == baseline, "signal2noise_ratio"]
sample_b = df.loc[df["experiment_name"] == experiment, "signal2noise_ratio"]
stats_snr = ttest_ind(sample_a, sample_b, alternative="two-sided", equal_var=False)

In [None]:
print(stats_snr)
print(stats_snr.confidence_interval())
# Interpretation as recorded by the author (re-check against the printed
# interval after any re-run): the confidence interval does not include zero,
# so we reject the null hypothesis and conclude that the baseline and
# experiment mean SNR differ.
avg_difference = sample_a.mean() - sample_b.mean()
# avg_difference is baseline minus experiment, so a positive value means the
# experiment's SNR is the smaller one. Derive the wording from the sign and
# report the magnitude, instead of always claiming "larger" with a signed value.
direction = "smaller" if avg_difference > 0 else "larger"
print(f"The experiment has a {direction} SNR by {abs(avg_difference):0.3f} on average.")

In [None]:
!conda install tabulate -y

In [None]:
# Per-run report: one row per run (keyed by run_id) with its group label,
# duration in minutes and the metrics loaded from the run's metrics file.
report_columns = [
    "experiment_name",
    "minutes",
    "total_samples",
    "total_pixels",
    "total_positives",
    "total_negatives",
    "mean_squared_error",
    "true_positives",
    "true_negatives",
    "false_positives",
    "false_negatives",
    "true_positive_rate",
    "true_negative_rate",
    "false_positive_rate",
    "false_negative_rate",
    "recall",
    "precision",
    "f1_score",
    "signal2noise_ratio",
]
report_df = df.set_index("run_id")[report_columns]
print(report_df.to_markdown())

In [None]:
# Summary of the hypothesis tests: one row per metric with the kind of test,
# the bounds of its confidence interval and the conclusion drawn.
# NOTE(review): the conclusion strings are literals, not computed from the
# test results; they need updating by hand whenever the runs change.
summary_rows = [
    ("duration", "one-sided t-test", stats_duration, "The experiment is 162.04 minutes (23.68%) faster on average"),
    ("f1_score", "two-sided t-test", stats_f1_score, "no difference"),
    ("precision", "two-sided t-test", stats_precision, "no difference"),
    ("recall", "two-sided t-test", stats_recall, "no difference"),
    ("false_positive_rate", "two-sided t-test", stats_fpr, "no difference"),
    ("false_negative_rate", "two-sided t-test", stats_fnr, "no difference"),
    ("signal2noise_ratio", "two-sided t-test", stats_snr, "The experiment has a larger SNR by 0.504 on average"),
]
# Each interval is computed once and both of its bounds are read from it.
summary_intervals = [result.confidence_interval() for _, _, result, _ in summary_rows]

test_df = pd.DataFrame(
    {
        "metric": [metric for metric, _, _, _ in summary_rows],
        "test": [test_kind for _, test_kind, _, _ in summary_rows],
        "CI_lower": [interval.low for interval in summary_intervals],
        "CI_upper": [interval.high for interval in summary_intervals],
        "conclusion": [conclusion for _, _, _, conclusion in summary_rows],
    }
).set_index("metric")

In [None]:
# Render the test summary as a markdown table (to_markdown needs `tabulate`).
print(test_df.to_markdown())