In [2]:
import os
import cv2
import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from scipy.stats import ks_2samp
import ot  # Optimal Transport for EMD
import timeit
from scipy import stats

In [20]:
def DS_from_data(X1, X2, min1, max1, min2, max2):
    """Return the signed difference ``S1 - S2`` of two samples' scaled scores.

    Each sample's score is ``(max - mean) / (max - min)``, where ``mean`` is
    ``np.mean`` of the sample and ``min``/``max`` are the bounds supplied by
    the caller (they are not derived from the data).

    Parameters
    ----------
    X1, X2 : array-like
        The two samples; anything ``np.mean`` accepts.
    min1, max1 : number
        Bounds used to scale the score of ``X1``.
    min2, max2 : number
        Bounds used to scale the score of ``X2``.

    Returns
    -------
    The score of ``X1`` minus the score of ``X2``. The value is signed; no
    absolute value is taken here.

    Notes
    -----
    There is no guard for ``max == min``; the caller must pass distinct bounds.
    """
    mean_a, mean_b = np.mean(X1), np.mean(X2)

    # Width of each caller-supplied range, used as the normaliser.
    span_a = max1 - min1
    span_b = max2 - min2

    score_a = (max1 - mean_a) / span_a
    score_b = (max2 - mean_b) / span_b
    return score_a - score_b


# Comparing samples from continuous distributions

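WD = Wasserstein distance (computed with `scipy.stats.wasserstein_distance`); |Δ𝒮| = the quantity computed by `DS_from_data`. All reported times are in milliseconds.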

In [33]:
# Benchmark: wall-clock cost of DS_from_data versus
# scipy.stats.wasserstein_distance on pairs of samples drawn with
# np.random.uniform(0, max_val, size). All times are stored in milliseconds.
samp_size = [10**2, 10**3, 10**4, 10**5, 10**6]
MaxVals = [1]

iterations = 10**4
for max_val in MaxVals:

    # Bounds handed to DS_from_data. They depend only on max_val, so they are
    # set once per max_val rather than on every iteration.
    min_, max_ = 0, max_val

    for size in samp_size:

        ds_times = []      # per-iteration DS_from_data time (ms)
        wd_times = []      # per-iteration Wasserstein time (ms)
        times_slower = []  # per-iteration ratio wd_time / ds_time

        for iteration in range(iterations):

            # Generate random samples (outside both timed regions)
            x1 = np.random.uniform(0, max_val, size)
            x2 = np.random.uniform(0, max_val, size)

            # Measure the time for |Δ𝒮| computed directly from the samples
            start_time1 = timeit.default_timer()
            dat = DS_from_data(x1, x2, min_, max_, min_, max_)
            end_time1 = timeit.default_timer()
            ds_times.append((end_time1 - start_time1)*1000)

            # Measure the time for the Wasserstein distance on the same samples
            start_time2 = timeit.default_timer()
            dat = stats.wasserstein_distance(x1, x2)
            end_time2 = timeit.default_timer()
            wd_times.append((end_time2 - start_time2)*1000)

            # If the timer reports zero elapsed time for DS_from_data, record
            # NaN instead of raising ZeroDivisionError; the nan-aware
            # reductions below then skip that iteration.
            if ds_times[-1] > 0:
                times_slower.append(wd_times[-1] / ds_times[-1])
            else:
                times_slower.append(np.nan)

        # Report mean and standard deviation of computation times
        ds_mean = np.nanmean(ds_times)
        ds_std = np.nanstd(ds_times)

        wd_mean = np.nanmean(wd_times)
        wd_std = np.nanstd(wd_times)

        # Mean of the per-iteration ratios (not the ratio of the two means)
        times_slower_mean = np.nanmean(times_slower)
        times_slower_std = np.nanstd(times_slower)

        print('Max val:', max_val, 'Sample size:', size)
        print(f"avg time for |Δ𝒮| {ds_mean:.3f} ± {ds_std:.3f}") 
        print(f"avg time for WD {wd_mean:.3f} ± {wd_std:.3f}")
        print(f"WD is {times_slower_mean:.3f} ± {times_slower_std:.3f} times slower than |Δ𝒮|\n")

    print("\n")


Max val: 1 Sample size: 100
avg time for |Δ𝒮| 0.020 ± 0.007
avg time for WD 0.058 ± 0.017
WD is 2.887 ± 0.544 times slower than |Δ𝒮|

Max val: 1 Sample size: 1000
avg time for |Δ𝒮| 0.027 ± 0.013
avg time for WD 0.370 ± 0.121
WD is 14.331 ± 4.526 times slower than |Δ𝒮|

Max val: 1 Sample size: 10000
avg time for |Δ𝒮| 0.044 ± 0.030
avg time for WD 4.198 ± 0.561
WD is 106.279 ± 22.595 times slower than |Δ𝒮|

Max val: 1 Sample size: 100000
avg time for |Δ𝒮| 0.156 ± 0.040
avg time for WD 52.224 ± 3.303
WD is 343.921 ± 42.615 times slower than |Δ𝒮|

Max val: 1 Sample size: 1000000
avg time for |Δ𝒮| 0.948 ± 0.043
avg time for WD 665.836 ± 7.714
WD is 703.622 ± 24.952 times slower than |Δ𝒮|





# Comparing samples from discrete distributions

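WD = Wasserstein distance (computed with `scipy.stats.wasserstein_distance`); |Δ𝒮| = the quantity computed by `DS_from_data`. All reported times are in milliseconds.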

In [35]:
# Benchmark: wall-clock cost of DS_from_data versus
# scipy.stats.wasserstein_distance on pairs of integer samples drawn with
# np.random.randint(1, max_val, size). All times are stored in milliseconds.
samp_size = [10**2, 10**3, 10**4, 10**5, 10**6]
MaxVals = [10, 100, 1000]

iterations = 10**2
for max_val in MaxVals:

    # Bounds handed to DS_from_data. They depend only on max_val, so they are
    # set once per max_val rather than on every iteration.
    min_, max_ = 0, max_val

    for size in samp_size:

        ds_times = []      # per-iteration DS_from_data time (ms)
        wd_times = []      # per-iteration Wasserstein time (ms)
        times_slower = []  # per-iteration ratio wd_time / ds_time

        for iteration in range(iterations):

            # Generate random samples (outside both timed regions).
            # NOTE(review): randint's upper bound is exclusive, so values lie
            # in 1..max_val-1 and never reach the 0 / max_val bounds passed to
            # DS_from_data below -- confirm this is the intended support.
            x1 = np.random.randint(1, max_val, size)
            x2 = np.random.randint(1, max_val, size)

            # Measure the time for |Δ𝒮| computed directly from the samples
            start_time1 = timeit.default_timer()
            dat = DS_from_data(x1, x2, min_, max_, min_, max_)
            end_time1 = timeit.default_timer()
            ds_times.append((end_time1 - start_time1)*1000)

            # Measure the time for the Wasserstein distance on the same samples
            start_time2 = timeit.default_timer()
            dat = stats.wasserstein_distance(x1, x2)
            end_time2 = timeit.default_timer()
            wd_times.append((end_time2 - start_time2)*1000)

            # If the timer reports zero elapsed time for DS_from_data, record
            # NaN instead of raising ZeroDivisionError; the nan-aware
            # reductions below then skip that iteration.
            if ds_times[-1] > 0:
                times_slower.append(wd_times[-1] / ds_times[-1])
            else:
                times_slower.append(np.nan)

        # Report mean and standard deviation of computation times
        ds_mean = np.nanmean(ds_times)
        ds_std = np.nanstd(ds_times)

        wd_mean = np.nanmean(wd_times)
        wd_std = np.nanstd(wd_times)

        # Mean of the per-iteration ratios (not the ratio of the two means)
        times_slower_mean = np.nanmean(times_slower)
        times_slower_std = np.nanstd(times_slower)

        print('Max val:', max_val, 'Sample size:', size)
        print(f"avg time for |Δ𝒮| {ds_mean:.3f} ± {ds_std:.3f}") 
        print(f"avg time for WD {wd_mean:.3f} ± {wd_std:.3f}")
        print(f"WD is {times_slower_mean:.3f} ± {times_slower_std:.3f} times slower than |Δ𝒮|\n")

    print("\n")


Max val: 10 Sample size: 100
avg time for |Δ𝒮| 0.038 ± 0.022
avg time for WD 0.113 ± 0.124
WD is 2.787 ± 1.046 times slower than |Δ𝒮|

Max val: 10 Sample size: 1000
avg time for |Δ𝒮| 0.033 ± 0.020
avg time for WD 0.326 ± 0.097
WD is 10.607 ± 2.149 times slower than |Δ𝒮|

Max val: 10 Sample size: 10000
avg time for |Δ𝒮| 0.056 ± 0.037
avg time for WD 3.134 ± 0.800
WD is 61.321 ± 11.051 times slower than |Δ𝒮|

Max val: 10 Sample size: 100000
avg time for |Δ𝒮| 0.250 ± 0.051
avg time for WD 38.178 ± 2.049
WD is 156.703 ± 20.004 times slower than |Δ𝒮|

Max val: 10 Sample size: 1000000
avg time for |Δ𝒮| 1.833 ± 0.107
avg time for WD 458.888 ± 12.020
WD is 250.906 ± 9.921 times slower than |Δ𝒮|



Max val: 100 Sample size: 100
avg time for |Δ𝒮| 0.022 ± 0.004
avg time for WD 0.080 ± 0.070
WD is 3.367 ± 1.807 times slower than |Δ𝒮|

Max val: 100 Sample size: 1000
avg time for |Δ𝒮| 0.027 ± 0.007
avg time for WD 0.333 ± 0.077
WD is 12.172 ± 0.519 times slower than |Δ𝒮|

Max val: 100 Sample size: 1