In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 
from tqdm import tqdm_notebook, tnrange

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's 'notebook' plotting context for the figures in this notebook.
sns.set_context('notebook')
# Metapack's Jupyter initialisation hook.
# NOTE(review): its side effects are not visible from this notebook - confirm
# whether it must run before open_package().
mp.jupyter.init()


In [2]:
# Open the Metapack package through the Jupyter helper.
# NOTE(review): how open_package() locates the package is not visible here - confirm.
pkg = mp.jupyter.open_package()
# Alternative kept for reference: open the source package instead.
#pkg = mp.jupyter.open_source_package()
# Bare expression: display the package as the cell output.
pkg

In [3]:
# Load the 'rasp_tracts_sd' resource as a DataFrame and report its row count.
rasp_resource = pkg.resource('rasp_tracts_sd')
rasp = rasp_resource.dataframe()
len(rasp)

370520

In [4]:
# Preview the first rows to check the column layout
# (value_est / value_margin are used to build the replicates later on).
rasp.head()

Unnamed: 0,geoid,col_name,value_est,value_margin,sex,raceeth,age_range,pov,age_min,age_max,overlaping
0,14000US06073000100,B17001A_001,2455,214,both,white,00-120,all,0,120,1
1,14000US06073000100,B17001A_002,102,70,both,white,00-120,below,0,120,1
2,14000US06073000100,B17001A_003,12,16,male,white,00-120,below,0,120,1
3,14000US06073000100,B17001A_004,0,12,male,white,00-004,below,0,4,1
4,14000US06073000100,B17001A_005,0,12,male,white,05-005,below,5,5,1


In [13]:
def make_replicates(rasp, N=40, seed=None):
    """Draw ``N`` normally distributed replicates of every estimate in ``rasp``.

    Each row's replicates are sampled from a normal distribution centred on
    ``value_est`` with standard deviation ``value_margin / 1.645``.

    Rows whose estimate is 0 still receive non-zero (and possibly negative)
    draws: the samples are deliberately left unrounded and unclipped. If that
    is not wanted, zero estimates can be masked back to 0 by the caller.

    Parameters
    ----------
    rasp : pandas.DataFrame
        Must provide ``value_est`` and ``value_margin`` columns.
    N : int, default 40
        Number of replicate columns to generate.
    seed : int or None, default None
        When given, draws come from a private ``np.random.RandomState(seed)``
        so the result is reproducible and the global generator is untouched.
        When None, the global ``np.random`` state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``rasp`` with columns ``estimate``, ``stddev`` and the
        float replicate columns ``rep01`` ... ``rep<N>``.

    Raises
    ------
    ValueError
        Raised by NumPy if any derived standard deviation is negative.
    """
    try:
        # tqdm.auto selects the notebook or console progress bar as appropriate;
        # the older ``tnrange`` helper is deprecated.
        from tqdm.auto import trange
    except ImportError:
        # Progress reporting is optional; fall back to a plain range.
        trange = None

    # 1.645 is the z-score of a two-sided 90% interval.
    # NOTE(review): presumably value_margin is a 90% margin of error - confirm
    # against the data source.
    reps = pd.DataFrame({'estimate': rasp.value_est,
                         'stddev': rasp.value_margin / 1.645})

    rng = np.random if seed is None else np.random.RandomState(seed)
    loc = reps['estimate'].values
    scale = reps['stddev'].values

    # Fill one preallocated matrix and attach it in a single concat instead of
    # inserting N columns one by one, which fragments the frame for large N.
    draws = np.empty((len(reps), N), dtype=float)
    steps = trange(N, desc='Make replicate') if trange is not None else range(N)
    for i in steps:
        draws[:, i] = rng.normal(loc, scale)

    names = ['rep{:02}'.format(i + 1) for i in range(N)]
    rep_frame = pd.DataFrame(draws, index=reps.index, columns=names)

    return pd.concat([reps, rep_frame], axis=1)

In [14]:
# Seed NumPy's global generator so the replicates, and every number derived
# from them in the cells below, are reproducible under Restart & Run All.
# The seed value itself is arbitrary.
np.random.seed(20190101)
reps = make_replicates(rasp)

# Keep only the replicate columns ('rep01', 'rep02', ...). This is selected
# before the summary columns are added, so they are not included.
reps_only = reps[[c for c in reps.columns if c.startswith('rep')]]

# Row-wise mean and (sample) standard deviation of the replicates, left unrounded.
reps['reps_mean'] = reps_only.mean(axis=1)
reps['reps_std'] = reps_only.std(axis=1)

HBox(children=(IntProgress(value=0, description='Make replicate', max=40, style=ProgressStyle(description_widt…




In [8]:
# Total of the last replicate column, as a sanity check on the draws.
# Read from the replicate frame rather than the name `n`, which only exists as
# a local inside make_replicates and is undefined on a fresh kernel.
reps_only.iloc[:, -1].sum()

35540912.20214871

In [9]:
# The replicate mean and standard deviation should be close to the source
# estimate and stddev; report the total and per-row average difference of each.
t = reps.loc[:, ['estimate', 'reps_mean', 'stddev', 'reps_std']].assign(
    est_diff=lambda d: d.estimate - d.reps_mean,
    std_diff=lambda d: d.stddev - d.reps_std,
)

est_total = t.est_diff.sum()
std_total = t.std_diff.sum()
est_total, est_total / len(t), std_total, std_total / len(t)

(-1324.051558357416,
 -0.003573495515376811,
 26005.15955129841,
 0.07018557581587609)

In [10]:
# Repeat the comparison on integer-rounded values: the replicate mean and
# standard deviation should still track the source estimate and stddev.
summary_cols = ['estimate', 'reps_mean', 'stddev', 'reps_std']
t = (
    reps[summary_cols]
    .round(0)
    .astype(int)
    .assign(
        reps_mean=reps_only.mean(axis=1).round(0).astype(int),
        reps_std=reps_only.std(axis=1).round(0).astype(int),
    )
)
t = t.assign(est_diff=t.estimate - t.reps_mean,
             std_diff=t.stddev - t.reps_std)

est_total = t.est_diff.sum()
std_total = t.std_diff.sum()
est_total, est_total / len(t), std_total, std_total / len(t)

(-1508, -0.0040699557378818954, -40180, -0.10844218935550037)

In [11]:
# Sanity check: after joining rasp and reps on their index, every replicate
# row's estimate must equal the value_est it was generated from.
m = rasp.join(reps)
estimates_match = m.value_est == m.estimate
assert estimates_match.all()