In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

# Base URL of the course data folder; data file names are appended to it when loading.
path = 'https://github.com/oregon-data-science/DSCI101/raw/main/data/'

# Bare string as the last expression of the cell, so it is shown as the cell's output.
"imports complete" 

## Estimating the median total compensation of San Francisco government employees

In [None]:
# Read san_francisco_2015.csv from the course data location into a Table.
sf_csv_url = path + 'san_francisco_2015.csv'
sf = Table.read_table(sf_csv_url)
sf

In [None]:
# Explore the data: show the five rows with the highest total compensation.
highest_paid_first = sf.sort('Total Compensation', descending=True)
highest_paid_first.show(5)

In [None]:
# The other end of the table: the five rows with the lowest total compensation.
lowest_paid_first = sf.sort('Total Compensation', descending=False)
lowest_paid_first.show(5)

In [None]:
## Drop bad data: keep only rows whose total compensation is above a rough
## minimum-wage salary for SF (2015).
## NOTE(review): the factors look like ~$10/hour * 20 hours/week * 52 weeks -- confirm.
min_salary = 10 * 20 * 52
above_min_salary = are.above(min_salary)
sf = sf.where('Total Compensation', above_min_salary)

In [None]:
## Reminder: the median is the 50th percentile.
## Here it is computed over every row of sf, i.e. the whole population.
population_compensation = sf.column('Total Compensation')
pop_median = percentile(50, population_compensation)
pop_median

In [None]:
## Assume we only have a random sample of 300 public employees,
## drawn from the population without replacement.
our_sample = sf.sample(k=300, with_replacement=False)
our_sample.show(5)

In [None]:
# Median total compensation within our sample (displayed, not stored).
our_sample_compensation = our_sample.column('Total Compensation')
percentile(50, our_sample_compensation)

In [None]:
## Real population: histogram of total compensation for every row of sf.
## Bin edges run from 0 up to (but not including) 700,000 in steps of 25,000.
sf_bins = np.arange(0, 700_000, 25_000)
sf.hist('Total Compensation', bins=sf_bins)
plots.title('Population Distribution');

In [None]:
## Our sample: same histogram as for the population, drawn with the same
## bin edges (sf_bins) so the two plots can be compared directly.
our_sample.hist('Total Compensation', bins=sf_bins)
plots.title('Sample Distribution');

In [None]:
## back to slides

# Variability of the Estimate

In [None]:
## Wrap "draw a sample, then take its median" in a function, in preparation
## for simulating the sampling process many times.

def generate_sample_median(samp_size):
    """Return the median 'Total Compensation' of a fresh random sample of sf.

    Draws `samp_size` rows from the `sf` table without replacement and
    returns the 50th percentile of their 'Total Compensation' column.
    """
    sampled_rows = sf.sample(samp_size, with_replacement=False)
    compensation = sampled_rows.column('Total Compensation')
    return percentile(50, compensation)

In [None]:
# Median of one freshly drawn random sample of 300 rows.
# Note: generate_sample_median draws its own sample, so this value does not
# describe the `our_sample` table created earlier.
sample_median = generate_sample_median(300)
sample_median

In [None]:
# Estimation error of that sample median: positive when it overshoots the
# population median, negative when it undershoots.
error = sample_median - pop_median
error

# Quantifying Uncertainty

In [None]:
# Simulate the sampling process 1000 times: each repetition draws a fresh
# sample of 300 rows and records its median.
# The medians are collected in a list and converted once, instead of growing
# the array with np.append (which copies the whole array on every call).
sample_medians = np.array(
    [generate_sample_median(300) for _ in range(1000)],
    dtype=float,  # same float array that make_array() + np.append produced
)

In [None]:
# Empirical distribution of the simulated sample medians.
# Bin edges: 90,000 through 125,000 in steps of 2,500.
med_bins = np.arange(90_000, 125_001, 2_500)
sample_medians_table = Table().with_column('Sample Medians', sample_medians)
sample_medians_table.hist(bins=med_bins)

## Red dot on the horizontal axis marks the real (population) median.
plots.scatter(pop_median, 0, color="red");

In [None]:
# Distribution of the estimation errors (sample median minus population median).
# Bin edges: -15,000 through 12,500 in steps of 2,500.
err_bins = np.arange(-15_000, 12_501, 2_500)
median_errors = sample_medians - pop_median
errors_table = Table().with_column('Errors', median_errors)
errors_table.hist(bins=err_bins)

## Red dot at 0 marks zero error.
plots.scatter(0, 0, color="red");

In [None]:
## back to slides

# Bootstrap

In [None]:
# Take a bootstrap (re)sample of size 300, WITH replacement, from our sample.
boot_sample = our_sample.sample(300, with_replacement=True)
boot_sample.hist('Total Compensation', bins=sf_bins)
plots.title('Bootstrap sample');

# The bootstrap resamples `our_sample`, so the sample median reported next to it
# must be the median of `our_sample` itself. The `sample_median` variable holds
# the median of a separate random draw made by generate_sample_median and does
# not describe `our_sample`.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

print("Population Median =       ", pop_median)
print("Our Sample Median =       ", our_sample_median)
print("Bootstrap Sample Median = ", 
      percentile(50,boot_sample.column('Total Compensation')))

In [None]:
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    Resamples the rows of `our_sample` with replacement (no size is given,
    so Table.sample draws as many rows as `our_sample` has) and returns the
    50th percentile of the resample's 'Total Compensation' column.
    """
    resample = our_sample.sample(with_replacement=True)
    resampled_compensation = resample.column('Total Compensation')
    return percentile(50, resampled_compensation)

In [None]:
# Repeat the bootstrap 1000 times, recording the median of each resample.
# The medians are collected in a list and converted once, instead of growing
# the array with np.append (which copies the whole array on every call).
bootstrap_medians = np.array(
    [one_bootstrap_median() for _ in range(1000)],
    dtype=float,  # same float array that make_array() + np.append produced
)

In [None]:
# Empirical distribution of the bootstrap medians.
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# The bootstrap medians were resampled from `our_sample`, so the blue reference
# point must be the median of `our_sample`. The `sample_median` variable belongs
# to a different random draw and would put the dot in an unrelated place.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# Red dot: population median. Blue dot: median of our sample.
plots.scatter(pop_median, 0, color="red");
plots.scatter(our_sample_median, 0, color="blue");

## Confidence Intervals

In [None]:
# Make an interval based on the middle 95% of the bootstrap medians:
# cut off 2.5% in each tail.

left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# The bootstrap medians were resampled from `our_sample`, so the blue reference
# point must be the median of `our_sample`. The `sample_median` variable belongs
# to a different random draw and would put the dot in an unrelated place.
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# Gold bar: the interval [left, right], drawn beneath the dots (lower zorder).
# Red dot: population median. Blue dot: median of our sample.
plots.plot([left, right], [0,0], color="gold",lw=3, zorder=1);
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);