# Parallelization with Dask

In [1]:
import numpy as np
import ipyparallel as ipp
import itertools
from distributed import progress
import pandas as pd
from typing import NamedTuple

import smpsite.smpsite as smp

## 1. Dask Setup

In [2]:
rc = ipp.Cluster(n=3).start_and_connect_sync()

Starting 3 engines with <class 'ipyparallel.cluster.launcher.LocalEngineSetLauncher'>


  0%|          | 0/3 [00:00<?, ?engine/s]

In [3]:
dask_client = rc.become_dask()
dask_client

0,1
Connection method: Direct,
Dashboard: /user/facusapienza21/proxy/8787/status,

0,1
Comm: tcp://192.168.17.58:45047,Workers: 0
Dashboard: /user/facusapienza21/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [4]:
# Check to see if the threads are ready
dview = rc[:]
len(dview)

3

## 2. Macro function definition

In [56]:
def ipp_simulate_estimations(n,
                             k,
                             kappa_within_site, 
                             site_lat,
                             site_long,
                             outlier_rate, 
                             secular_method,
                             kappa_secular, 
                             ignore_outliers, 
                             seed):
    
    class Params(NamedTuple):
        """
        Macro to encapsulate all the parameters in the sampling model.
        """

        # Number of sites
        n : int
        # Number of samples per site
        k : int

        # Concentration parameter within site
        kappa_within_site : float    

        # Latitude and longitude of site
        site_lat  : float 
        site_long : float

        # Proportion of outliers to be sampled from uniform distribution
        outlier_rate : float

        # Method to sample secular variation. Options are ("tk03", "G", "Fisher")
        secular_method : str 
        kappa_secular : float    # Just needed for Fisher sampler
        
    params = Params(n=n,
                    k=k, 
                    kappa_within_site=kappa_within_site,
                    site_lat=site_lat, 
                    site_long=site_long, 
                    outlier_rate=outlier_rate, 
                    secular_method=secular_method,
                    kappa_secular=kappa_secular)
    
    df_tot = smp.simulate_estimations(params, 
                                      n_iters=1000,
                                      ignore_outliers=ignore_outliers, 
                                      seed=seed)
    
    stats = dict(df_tot['error_angle'].describe(percentiles=[.05, .25, .50, .75, .95]))
    df = pd.DataFrame.from_dict({'error_angle_mean': [stats['mean']], 
                                 'error_angle_median': [stats['50%']], 
                                 'error_angle_95': [stats['95%']],
                                 'error_angle_std': [stats['std']]})
    
    df['n_tot'] = n * k 
    df['n'] = n
    df['k'] = k
    df['kappa_within_site'] = kappa_within_site
    df['site_lat'] = site_lat
    df['site_long'] = site_long
    df['outlier_rate'] = outlier_rate
    df['secular_method'] = secular_method
    df['kappa_secular'] = kappa_secular
    df['ignore_outliers'] = ignore_outliers
    
    return df

## 3. Parameter space exploration

In [64]:
min_n, max_n = 1, 300

params_iter = {'n': np.arange(1, 41, 1),
               'k': np.arange(1, 21, 1), 
               'kappa_within_site': 10 ** np.linspace(1,3,1),
               'site_lat': [30.0],
               'site_long': [0.0], 
               'outlier_rate': [0.0],
               'secular_method': ["G"], 
               'kappa_secular': [None],
               'ignore_outliers': [False]}

params_iter_mesh = np.meshgrid(*[params_iter[key] for key in params_iter.keys()])

for i, key in enumerate(params_iter.keys()):
    params_iter[key] = params_iter_mesh[i].ravel()
    
all_n_tot = params_iter['n'] * params_iter['k']
valid_index = (min_n <= all_n_tot) & (all_n_tot <= max_n)

n_tasks = np.sum(valid_index)
print("Total number of simulations: ", n_tasks)

indices = np.arange(n_tasks)
np.random.shuffle(indices)

for key in params_iter.keys():
    params_iter[key] = params_iter[key][valid_index]
    # Shuffle
    params_iter[key] = params_iter[key][indices]

params_iter["seed"] = np.random.randint(0, 2**32-1, n_tasks)

Total number of simulations:  577


## 4. Run Simulation

In [58]:
task = dask_client.map(ipp_simulate_estimations, 
                       params_iter['n'],
                       params_iter['k'],
                       params_iter['kappa_within_site'], 
                       params_iter['site_lat'], 
                       params_iter['site_long'], 
                       params_iter['outlier_rate'],
                       params_iter['secular_method'], 
                       params_iter['kappa_secular'], 
                       params_iter['ignore_outliers'], 
                       params_iter['seed'])

res = dask_client.submit(pd.concat, task)

progress(res)

VBox()

In [59]:
df_all = res.result()
# df_all.to_csv('outputs/run_compared_4000samples.csv')

In [60]:
df_all

Unnamed: 0,error_angle_mean,error_angle_median,error_angle_95,error_angle_std,n_tot,n,k,kappa_within_site,site_lat,site_long,outlier_rate,secular_method,kappa_secular,ignore_outliers
0,14.122582,13.329890,27.742470,7.425255,13,1,13,10.0,30.0,0.0,0.0,G,,False
0,10.395443,9.806610,20.917339,5.723314,9,9,1,10.0,30.0,0.0,0.0,G,,False
0,17.280755,15.689302,35.645657,9.583187,3,3,1,10.0,30.0,0.0,0.0,G,,False
0,8.188252,7.621620,16.800650,4.485665,18,6,3,10.0,30.0,0.0,0.0,G,,False
0,12.578811,11.730220,24.931701,6.797665,10,2,5,10.0,30.0,0.0,0.0,G,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,10.417143,9.369512,21.432937,5.884800,10,5,2,10.0,30.0,0.0,0.0,G,,False
0,13.880164,12.629885,27.816486,7.689912,18,1,18,10.0,30.0,0.0,0.0,G,,False
0,13.250358,12.346505,26.099292,7.064937,20,1,20,10.0,30.0,0.0,0.0,G,,False
0,13.879134,13.229521,26.992426,7.295405,15,1,15,10.0,30.0,0.0,0.0,G,,False


In [55]:
all_n_tot.shape

(8000,)

In [49]:
valid_index

array([False, False, False, ..., False, False, False])

In [8]:
min_n_tot = 50
max_n_tot = 50

all_n = np.arange(1, 41, 1)
all_k = np.arange(1, 20, 1)
all_kappa_within_site = 10 ** np.linspace(1,3,10)
all_site_lat = np.arange(0, 81, 10)
all_site_long = [0]
all_outlier_rate = [0.0]
all_secular_method = ["Fisher"]
all_kappa_secular = 10 ** np.linspace(1,3,10)
all_ignore_outliers = [True]


# all_angular_within_site = np.arange(1, 20, 10)
all_kappa_within_site = 10 ** np.linspace(1,3,10)
all_site_lat = np.arange(0, 91, 10)
all_outlier_rate = [0.00, 0.05, 0.10, 0.20]#, 0.05, 0.25]
all_n = [50,10] #np.arange(1,21)
all_k = [1,5] #np.arange(1,41)
all_site_lon = [0]
all_ignore_outliers = [True]

In [24]:
a = [1,2,3]
b = [1.1, 1.2,1.3]

c = [a,b]
c_ = np.meshgrid(*c)
for l in c:
    print(l)
    l = c[1]
a

[1, 2, 3]
[1.1, 1.2, 1.3]


[1, 2, 3]

In [None]:
all_kappa_within_site, all_site_lat, all_outlier_rate, all_n, \
all_k, all_site_lon, all_ignore_outliers \
= np.meshgrid(all_kappa_within_site, all_site_lat, \
                       all_outlier_rate, all_n, all_k, \
                         all_site_lon, all_ignore_outliers)

all_kappa_within_site = all_kappa_within_site.ravel()
all_site_lat = all_site_lat.ravel()
all_outlier_rate = all_outlier_rate.ravel()
all_n = all_n.ravel()
all_k = all_k.ravel()
all_site_lon = all_site_lon.ravel()
all_ignore_outliers = all_ignore_outliers.ravel()

all_N = all_n * all_k 
valid_index = (min_N <= all_N) & (all_N <= max_N) 

all_kappa_within_site = all_kappa_within_site[valid_index]
all_site_lat = all_site_lat[valid_index]
all_outlier_rate = all_outlier_rate[valid_index]
all_n = all_n[valid_index]
all_k = all_k[valid_index]
all_site_lon = all_site_lon[valid_index]
all_ignore_outliers = all_ignore_outliers[valid_index]


n_tasks = len(all_n)
all_seed = np.random.randint(0, 2**32-1, n_tasks)

In [5]:
def ipp_simulate_estimations(kappa_within_site, 
                             site_lat,
                             outlier_rate,
                             n,
                             k,
                             site_lon,
                             ignore_outliers,
                             seed):
    
    # If we don't include the definition of the class inside the code, this returns this weird message error... why? 
    class Params(NamedTuple):

        kappa_within_site : float    
        site_lat : float
        outlier_rate : float
        n : int
        k : int    
        site_lon : float

    params = Params(kappa_within_site=kappa_within_site, 
                    site_lat=site_lat,
                    outlier_rate=outlier_rate,
                    n=n,
                    k=k,
                    site_lon=site_lon)
    
    df_tot = smp.simulate_estimations(params, 
                                      n_iters=4000, 
                                      ignore_outliers=ignore_outliers,
                                      seed=seed)
    
    stats = dict(df_tot['error_angle'].describe(percentiles=[.05, .25, .50, .75, .95]))
    df = pd.DataFrame.from_dict({'error_angle_mean': [stats['mean']], 
                                 'error_angle_median': [stats['50%']], 
                                 'error_angle_95': [stats['95%']]
                                 'error_angle_std': [stats['std']]})
    
    df['n_tot'] = n * k 
    df['n'] = n
    df['k'] = k
    df['kappa_within_site'] = kappa_within_site
    df['site_lat'] = site_lat
    df['outlier_rate'] = outlier_rate
    df['ignore_outliers'] = ignore_outliers
    df["site_long"] = site_lon
    
    return df

We need to set the structure for the parallel simulations with different parameters. 

In [6]:
min_N = 50
max_N = 50

# all_angular_within_site = np.arange(1, 20, 10)
all_kappa_within_site = 10 ** np.linspace(1,3,10)
all_site_lat = np.arange(0, 91, 10)
all_outlier_rate = [0.00, 0.05, 0.10, 0.20]#, 0.05, 0.25]
all_n = [50,10] #np.arange(1,21)
all_k = [1,5] #np.arange(1,41)
all_site_lon = [0]
all_ignore_outliers = [True]

all_kappa_within_site, all_site_lat, all_outlier_rate, all_n, \
all_k, all_site_lon, all_ignore_outliers \
= np.meshgrid(all_kappa_within_site, all_site_lat, \
                       all_outlier_rate, all_n, all_k, \
                         all_site_lon, all_ignore_outliers)

all_kappa_within_site = all_kappa_within_site.ravel()
all_site_lat = all_site_lat.ravel()
all_outlier_rate = all_outlier_rate.ravel()
all_n = all_n.ravel()
all_k = all_k.ravel()
all_site_lon = all_site_lon.ravel()
all_ignore_outliers = all_ignore_outliers.ravel()

all_N = all_n * all_k 
valid_index = (min_N <= all_N) & (all_N <= max_N) 

all_kappa_within_site = all_kappa_within_site[valid_index]
all_site_lat = all_site_lat[valid_index]
all_outlier_rate = all_outlier_rate[valid_index]
all_n = all_n[valid_index]
all_k = all_k[valid_index]
all_site_lon = all_site_lon[valid_index]
all_ignore_outliers = all_ignore_outliers[valid_index]


n_tasks = len(all_n)
all_seed = np.random.randint(0, 2**32-1, n_tasks)

In [7]:
all_ignore_outliers = all_k == 5

In [8]:
len(all_kappa_within_site), len(all_site_lat), len(all_outlier_rate), \
len(all_n), len(all_k), len(all_site_lon), len(all_ignore_outliers)

(800, 800, 800, 800, 800, 800, 800)

In [9]:
task = dask_client.map(ipp_simulate_estimations, all_kappa_within_site, \
                       all_site_lat, all_outlier_rate, all_n, \
                       all_k, all_site_lon, all_ignore_outliers, all_seed)

res = dask_client.submit(pd.concat, task)

progress(res)

VBox()

In [None]:
df_all = res.result()
df_all.to_csv('outputs/run_compared_4000samples.csv')