In [56]:
import os
import cv2
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.stats import ks_2samp
import ot  # Optimal Transport for EMD
import timeit
from scipy import stats

In [50]:
def emd_distance(hist1, hist2):
    """
    Earth Mover's Distance between two histograms that share the same bins.

    Each histogram is normalised to unit total mass, bin ``i`` is placed at
    position ``i``, and the ground cost between bins ``i`` and ``j`` is
    ``|i - j|`` rescaled so that the largest cost equals 1. The transport
    cost itself is computed by ``ot.emd2``.

    Parameters
    ----------
    hist1, hist2 : array-like, 1-D
        Bin masses (counts or probabilities). Both must have the same length.

    Returns
    -------
    float
        The optimal transport cost under the rescaled ground cost. Returns
        0.0 for single-bin histograms, where all mass sits at one location.

    Raises
    ------
    ValueError
        If the two histograms do not have the same shape.
    """
    hist1 = np.asarray(hist1, dtype=float)
    hist2 = np.asarray(hist2, dtype=float)
    if hist1.shape != hist2.shape:
        # The cost matrix below is built from hist1's bins only, so a
        # mismatch cannot be transported meaningfully.
        raise ValueError(
            f"histograms must have the same shape, got {hist1.shape} and {hist2.shape}"
        )

    hist1 = hist1 / np.sum(hist1)
    hist2 = hist2 / np.sum(hist2)

    # Assume bin centers are just their indices (0, 1, 2, ..., len(hist1)-1)
    bin_centers = np.arange(len(hist1))

    # Compute cost matrix using absolute differences
    cost_matrix = np.abs(bin_centers[:, None] - bin_centers)

    max_cost = np.max(cost_matrix)
    if max_cost == 0:
        # Single bin: every pairwise cost is zero, so normalising would be
        # 0/0 and poison the cost matrix with NaN. No mass has to move.
        return 0.0

    # Normalize cost matrix so max distance is 1
    cost_matrix = cost_matrix / max_cost

    return ot.emd2(hist1, hist2, cost_matrix)


def wasserstein_from_freqs(fq, fr):
    """
    Computes the 1D Wasserstein distance between two histograms.

    Bin ``i`` of each histogram is treated as a point mass located at
    position ``i`` and weighted by the corresponding entry of ``fq`` / ``fr``.

    Parameters
    ----------
    fq, fr : array-like, 1-D
        Non-negative bin frequencies (counts or probabilities). They do not
        need to sum to 1: ``scipy.stats.wasserstein_distance`` normalises the
        weights itself, as long as each sum is positive and finite. The two
        histograms may have different numbers of bins.

    Returns
    -------
    float
        The first Wasserstein distance, in units of bin index.
    """
    bins_q = np.arange(len(fq))  # Assume bins are indexed as 0, 1, ..., N-1
    # Reuse the same support when both histograms have equally many bins;
    # otherwise give `fr` its own 0..M-1 support so lengths may differ.
    bins_r = bins_q if len(fr) == len(fq) else np.arange(len(fr))
    return stats.wasserstein_distance(bins_q, bins_r, fq, fr)



def DS_from_freqs(fq, fr):
    """
    Absolute difference of the statistic S between two histograms.

    For a histogram with bins indexed 0..N-1, S is
    ``(N - 1 - mean_index) / (N - 1)``, where ``mean_index`` is the
    frequency-weighted average bin index. The two histograms may have
    different lengths; each is scaled by its own last bin index.

    Note: a single-bin histogram has N - 1 == 0, so S is 0/0 for it.
    """
    n_q, n_r = len(fq), len(fr)
    top_q, top_r = n_q - 1, n_r - 1

    # Frequency-weighted mean bin index of each histogram.
    centroid_q = np.dot(np.arange(n_q), fq) / np.sum(fq)
    centroid_r = np.dot(np.arange(n_r), fr) / np.sum(fr)

    return np.abs((top_q - centroid_q) / top_q - (top_r - centroid_r) / top_r)


def DS_from_data(X1, X2):
    """
    Absolute difference of the statistic S between two raw samples.

    For a sample X, S is ``(max(X) - mean(X)) / (max(X) - min(X))``.

    Note: a constant sample has max == min, so S is 0/0 for it.
    """
    lo_a, hi_a = np.min(X1), np.max(X1)
    lo_b, hi_b = np.min(X2), np.max(X2)

    s_a = (hi_a - np.mean(X1)) / (hi_a - lo_a)
    s_b = (hi_b - np.mean(X2)) / (hi_b - lo_b)

    return np.abs(s_a - s_b)




In [55]:
# Benchmark: time DS_from_freqs against wasserstein_from_freqs on random histograms.
#
# Both lists below hold base-10 EXPONENTS, not the values themselves:
#   num_bins -> each histogram has 10**nb bins
#   MaxVals  -> bin counts are drawn from [1, 10**mx)
num_bins = [1, 2, 3] 
MaxVals = [5]

# Number of timed repetitions per (max_val, size) combination.
iterations = 1000
for mx in MaxVals:
    
    max_val = 10**mx

    #print("Comparing compute times for histograms with a max_val of", max_val, '.\n')

    for nb in num_bins:
        
        size = 10**nb
        
        # Per-iteration wall-clock times in milliseconds, plus their ratio.
        ds_freq_times = []
        wd_freq_times = []
        times_slower_freq = []

        for iteration in range(iterations):
            
            # Generate random histograms (frequency distributions)
            # Fresh integer counts are drawn on every iteration; no seed is set here.
            fq = np.random.randint(1, max_val, size)
            fr = np.random.randint(1, max_val, size)
            
            # Measure the time for |Δ𝒮| from freqs
            # (single call per iteration; seconds are converted to milliseconds)
            start_time1 = timeit.default_timer()
            dat = DS_from_freqs(fq, fr)
            end_time1 = timeit.default_timer()
            ds_freq_times.append((end_time1 - start_time1)*1000)
            
            # Measure the time for wasserstein from freqs
            start_time3 = timeit.default_timer()
            dat = wasserstein_from_freqs(fq, fr)
            end_time3 = timeit.default_timer()
            wd_freq_times.append((end_time3 - start_time3)*1000)
            
            
            # How many times slower the Wasserstein call was on this same pair of histograms.
            times_slower_freq.append(wd_freq_times[-1] / ds_freq_times[-1])
            

        # Report mean and standard deviation of computation times
        ds_freq_mean = np.nanmean(ds_freq_times)
        ds_freq_std = np.nanstd(ds_freq_times)

        wd_freq_mean = np.nanmean(wd_freq_times)
        wd_freq_std = np.nanstd(wd_freq_times)
        
        times_slower_freq_mean = np.nanmean(times_slower_freq)
        times_slower_freq_std = np.nanstd(times_slower_freq)

        # Output columns, in order:
        #   max_val, size,
        #   DS mean (ms), DS std (ms),
        #   Wasserstein mean (ms), Wasserstein std (ms),
        #   mean of per-iteration ratio (Wasserstein / DS), std of that ratio
        print(max_val, size, f"{ds_freq_mean:.3f}", f"{ds_freq_std:.3f}", f"{wd_freq_mean:.3f}", f"{wd_freq_std:.3f}", f"{times_slower_freq_mean:.3f}", f"{times_slower_freq_std:.3f}")
        print('\n')
        
        
    #print("\n")

100000 10 0.024 0.015 0.094 0.045 4.013 1.217


100000 100 0.021 0.006 0.095 0.036 4.512 0.968


100000 1000 0.026 0.012 0.242 0.076 9.550 1.954




In [None]:
10 10 0.026 0.013 0.105 0.044 4.134 1.153
10 10 0.067 0.065 0.072 0.167 1.020 1.326
10 100 0.028 0.011 0.133 0.054 4.856 1.465
10 100 0.070 0.027 0.155 0.067 2.267 0.739
10 1000 0.035 0.009 0.298 0.044 8.699 1.269
10 1000 0.113 0.022 1.183 0.160 10.648 1.148
10 10000 0.090 0.217 3.842 0.521 47.042 6.561
10 10000 0.510 0.065 18.427 1.440 36.363 2.826


100 10 0.025 0.014 0.097 0.021 4.016 0.581
100 10 0.061 0.014 0.123 0.049 2.041 0.393
100 100 0.030 0.013 0.127 0.032 4.428 0.854
100 100 0.092 0.024 1.084 1.256 12.021 10.429
100 1000 0.044 0.009 0.313 0.033 7.248 0.807
100 1000 0.312 0.040 13.111 1.353 42.163 3.507
100 10000 0.091 0.018 7.895 0.761 87.998 9.556
100 10000 2.771 0.282 205.516 16.458 74.385 3.294

1000 10 0.029 0.018 0.108 0.055 3.943 1.338
1000 10 0.086 0.037 0.991 0.307 12.114 3.390
1000 100 0.038 0.016 0.140 0.036 3.882 0.796
1000 100 0.264 0.068 11.489 1.408 44.720 6.776
1000 1000 0.059 0.026 0.335 0.056 5.999 1.311
1000 1000 2.277 0.290 160.341 17.167 70.849 6.389
1000 10000 0.141 0.051 57.470 7.641 429.787 83.778
1000 10000 25.811 4.607 2328.363 429.389 90.805 8.012

In [49]:
# Benchmark: time DS_from_data against scipy's wasserstein_distance on raw samples.
#
# samp_size -> number of values in each sample
# MaxVals   -> upper bound passed to np.random.uniform (used directly, not as an exponent)
samp_size = [10**2, 10**3, 10**4, 10**5, 10**6] 
MaxVals =   [1]

# Number of timed repetitions per (max_val, size) combination.
iterations = 10**4
for max_val in MaxVals:
    
    for size in samp_size:
        
        # Per-iteration wall-clock times in milliseconds, plus their ratio.
        ds_data_times = []
        wd_data_times = []
        times_slower_data = []
        
        for iteration in range(iterations):
            
            # Generate random samples (NOT DISTRIBUTIONS)
            # Fresh uniform draws on every iteration; no seed is set here.
            X1 = np.random.uniform(0, max_val, size)
            X2 = np.random.uniform(0, max_val, size)
            
            # Measure the time for |Δ𝒮| from data
            # (single call per iteration; seconds are converted to milliseconds)
            start_time2 = timeit.default_timer()
            dat = DS_from_data(X1, X2)
            end_time2 = timeit.default_timer()
            ds_data_times.append((end_time2 - start_time2)*1000)
            
            # Measure the time for wasserstein from data
            start_time4 = timeit.default_timer()
            dat = stats.wasserstein_distance(X1, X2)
            end_time4 = timeit.default_timer()
            wd_data_times.append((end_time4 - start_time4)*1000)
            
            # How many times slower the Wasserstein call was on this same pair of samples.
            times_slower_data.append(wd_data_times[-1] / ds_data_times[-1])
            

        # Report mean and standard deviation of computation times
        ds_data_mean = np.nanmean(ds_data_times)
        ds_data_std = np.nanstd(ds_data_times)

        wd_data_mean = np.nanmean(wd_data_times)
        wd_data_std = np.nanstd(wd_data_times)

        times_slower_data_mean = np.nanmean(times_slower_data)
        times_slower_data_std = np.nanstd(times_slower_data)

        # Output columns, in order:
        #   max_val, size,
        #   DS mean (ms), DS std (ms),
        #   Wasserstein mean (ms), Wasserstein std (ms),
        #   mean of per-iteration ratio (Wasserstein / DS), std of that ratio
        print(max_val, size, f"{ds_data_mean:.3f}", f"{ds_data_std:.3f}", f"{wd_data_mean:.3f}", f"{wd_data_std:.3f}", f"{times_slower_data_mean:.3f}", f"{times_slower_data_std:.3f}")
        #print('\n')
            
    # Blank separator after all sample sizes for this max_val have been reported.
    print("\n")


1 100 0.040 0.015 0.059 0.021 1.496 0.291
1 1000 0.047 0.016 0.350 0.051 7.635 0.866
1 10000 0.078 0.035 4.214 0.537 57.032 9.900
1 100000 0.298 0.070 53.615 3.655 184.255 22.003
1 1000000 2.704 0.455 738.961 64.907 276.610 27.097




In [None]:
1 100 0.040 0.015 0.059 0.021 1.496 0.291

1 1000 0.047 0.016 0.350 0.051 7.635 0.866

1 10000 0.078 0.035 4.214 0.537 57.032 9.900

1 100000 0.298 0.070 53.615 3.655 184.255 22.003

1 1000000 2.704 0.455 738.961 64.907 276.610 27.097
