In [1]:
%load_ext autotime
import pandas as pd
import numpy as np
import concurrent.futures

In [38]:
# Load the people table (the 'synth_people' file) into the module-level DataFrame
# that get_max_age() samples from. The path is relative to the notebook's working
# directory.
population = pd.read_csv('dat/2005_2009_ver2_42065_synth_people.txt')

time: 166 ms


In [39]:
# Preview the first five rows to check the available columns.
population.head(n=5)

Unnamed: 0,p_id,hh_id,serialno,stcotrbg,age,sex,race,sporder,relate,school_id,workplace_id
0,416175660,261526469,2005000002176,420659503002,86,2,1,1,0,,
1,416175661,261526469,2005000002176,420659503002,92,1,1,2,1,,
2,416175676,261533970,2005000002176,420659508002,86,2,1,1,0,,
3,416175677,261533970,2005000002176,420659508002,92,1,1,2,1,,
4,416175678,261526897,2005000002176,420659503003,86,2,1,1,0,,


time: 116 ms


# Task: randomly sample the population and summarize the age distribution

This is an example function to execute on the population DataFrame: it draws a random sample of rows (with replacement) and returns the maximum age in that sample.

In [20]:
def get_max_age(sample_size):
    """Return the maximum age found in a random sample of ``population``.

    ``sample_size`` row positions are drawn uniformly, with replacement, from
    the module-level ``population`` DataFrame, and the largest value of its
    ``'age'`` column among those rows is returned.

    Parameters
    ----------
    sample_size : int
        Number of rows to draw.

    Returns
    -------
    The maximum ``'age'`` in the sample (``nan`` when ``sample_size`` is 0).
    """
    # Use a private generator seeded from OS entropy instead of NumPy's global
    # one: worker processes created by fork inherit a copy of the parent's
    # global RNG state, so they would otherwise draw identical "random" samples.
    rng = np.random.RandomState()
    row_positions = rng.randint(0, len(population), sample_size)
    # randint yields positions, not labels, so index positionally; this matches
    # the previous .loc lookup for a default 0..n-1 index and stays correct for
    # any other index.
    return population['age'].iloc[row_positions].max()

time: 3.3 ms


In [30]:
# Here we use a process pool and the map function
def test_process_pool(num_samples, sample_size):
    """Timing helper: run ``get_max_age`` on a process pool and discard the results.

    Submits ``num_samples`` calls of ``get_max_age(sample_size)`` to a
    default-sized ``ProcessPoolExecutor`` via ``map`` and waits for all of
    them to finish. Returns ``None``.
    """
    with concurrent.futures.ProcessPoolExecutor() as pool:
        per_task_sizes = [sample_size] * num_samples
        # Walking the result iterator blocks until every task has finished and
        # re-raises any exception a task raised; the values are not kept.
        for _max_age in pool.map(get_max_age, per_task_sizes):
            pass

time: 7.18 ms


In [29]:
# Note the consistent syntax
def test_threading_pool(num_samples, sample_size):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        samples = list(range(num_samples))
        for sample_number, max_age in zip(
            samples,executor.map(get_max_age, [sample_size]*num_samples)):
            pass

time: 7.45 ms


## Testing performance of the process pool compared to threading

Note that for a small workload (10 samples of 10 rows each), the process pool has a significant amount of overhead relative to the thread pool.

In [31]:
test_process_pool(num_samples=10, sample_size=10)

time: 75.1 ms


In [32]:
test_threading_pool(num_samples=10, sample_size=10)

time: 28.8 ms


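But when the amount of work increases (100 samples of 1,000 rows each), the independent execution in the process pool wins out: 1.79 s versus 4.92 s for the thread pool in the runs below. This is presumably because threads in a single CPython process contend for the global interpreter lock, while worker processes do not.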

In [36]:
test_process_pool(num_samples=100, sample_size=1000)

time: 1.79 s


In [35]:
test_threading_pool(num_samples=100, sample_size=1000)

time: 4.92 s
