In [None]:
from datascience import *
%matplotlib inline
path_data = '../../../assets/data/'
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np
import warnings
# NumPy 2.0 removed the top-level `np.VisibleDeprecationWarning` alias; the class
# now lives in `np.exceptions`. Look it up wherever this NumPy release provides it
# so the setup cell runs on both older and newer versions.
_visible_deprecation = getattr(np, 'VisibleDeprecationWarning', None)
if _visible_deprecation is None:
    _visible_deprecation = np.exceptions.VisibleDeprecationWarning
# Silence that warning category for the rest of the notebook.
warnings.simplefilter(action='ignore', category=_visible_deprecation)

# Lecture 23 #

## Percentiles ##

The pth percentile is the first value on the sorted list that is at least as large as p% of the elements.

In [None]:
# Manually compute the 55th percentile of this array.

x = make_array(43, 20, 51, 7, 28, 34)

In [None]:
# Step 1. Sort the data into numerical order.

np.sort(x)

In [None]:
# Step 2. Figure out where the 55th percentile would be.
# Each of the 6 sorted values sits k/6 of the way through the list (k = 1..6);
# the 55th percentile is the first value whose fraction is at least 0.55.

np.arange(1, 7)/6

In [None]:
# Pull out the value at the 55th percentile.
# Index 3 is the fourth sorted value; 4/6 is the first fraction that is at least 0.55.

np.sort(x).item(3)

In [None]:
# Alternatively: One line of code

percentile(55, x)

### Discussion Question:

Which are True, when s = [1, 5, 7, 3, 9]?

* percentile(10, s) == 0

* percentile(39, s) == percentile(40, s)

* percentile(40, s) == percentile(41, s)

* percentile(50, s) == 5



In [None]:
#Create your array.

s = 

In [None]:
# Find the percentile for each value. Try using code instead of hard coding like above. 

np.arange(

In [None]:
#Before running, what is your guess? Why?

percentile(10, s) == 0

In [None]:
#Before running, what is your guess? Why?

percentile(39, s) == percentile(40, s)

In [None]:
#Before running, what is your guess? Why?

percentile(40, s) == percentile(41, s)

In [None]:
#Before running, what is your guess? Why?

percentile(50, s) == 5

## Total Compensation in Population 

The table being imported is public data about the employees of the city and county of San Francisco in 2019. 
There are no personal identifiers, just information about each individual position. 

In [None]:
sf = Table.read_table('san_francisco_2019.csv')
sf.show(3)

In [None]:
# Find who made the most money



In [None]:
# Find who made the least money



In [None]:
# Focus our research on employees who worked at least half time for the minimum wage of 2019
# for the full year, excluding two weeks for vacation.
# $15/hr, 20 hr/wk, 50 weeks

min_salary = 15 * 20 * 50

#Create a table that only shows employees who are above the minimum salary.

sf = 
sf

In [None]:
# How many employees are left in our data with the restriction?




In [None]:
# Display a histogram of the cleaned up data using the given bins.

sf_bins = np.arange(0, 726000, 25000)



#disregard the error that comes up. 

## Parameter: Median Total Compensation 

Median is a common measure used when talking about income because it is a middle value.

If we used the average (mean) instead, it would be affected by the employees with very large values, which would skew the picture of incomes for city/county jobs.

What is the percentile of the median?

In [None]:
# Since we do have the entire population (restricted) we can get the population median. 

pop_median = percentile(50, sf.column('Total Compensation'))
pop_median

## Estimating the Parameter (Pretend it is Unknown) 

If we did not have access to the entire population, we would have to estimate the population median.

Usually, you will not have the entire population. So we will use this scenario to see how well our sample does at estimating the parameter.

In [None]:
# Draw a random sample of 400 rows from the restricted table, without replacement.

our_sample = sf.sample(400, with_replacement=False)


In [None]:
# Display a histogram of your sample data using the sf_bins.




In [None]:
#Find the median of your sample. 

percentile(50, our_sample.column('Total Compensation') )

#How close is it to the population parameter?
#How close was your neighbor's value to the parameter?

But in the real world we won't be able to keep going back to the population. 
Why?

How can we generate a new random sample *without going back to the population*?

# Bootstrap

Creating a sample from an existing sample.

Sample randomly
 - from the original sample
 - with replacement (allows for repeats of the same record)
 - the same number of times as the original sample size

### Example Bootstrap

In [None]:
#start with a "data set"

original = make_array(1,2,3,4,5)
original

In [None]:
# Use the existing "data set" to create a new "data set".
# Create a table
table = Table().with_columns('Original', original)

# Resample the table's rows. Despite its name, `new_column` is a whole table;
# its first column (index 0) is what gets added as the new column below.
new_column = table.sample()
# The result is displayed but not assigned, so `table` itself is unchanged.
table.with_columns(
    'Bootstrap 1', new_column.column(0)
)

In [None]:
# Use a loop to add four bootstrap columns, labelled 'Bootstrap 1' through 'Bootstrap 4'.
for i in np.arange(1,5):
    # Each pass resamples the rows of the current table and keeps the first column (index 0).
    new_column = table.sample()
    # `table` is reassigned, so the next pass resamples the table that already has the new column.
    table = table.with_columns(
            f'Bootstrap {i}', new_column.column(0)
    )
table

In [None]:
# Default behavior of tbl.sample:
# at random with replacement,
# the same number of times as rows of tbl

bootstrap_sample = our_sample.sample()


In [None]:
# Create a histogram of the bootstrap samples using the sf_bins.



## Bootstrap Sample Median
This is one estimate of the population median.

In [None]:
# Find the median income from the bootstrap sample



In [None]:
#Create a function that finds a bootstrap sample, then returns the median of that sample.

def one_bootstrap_median():
    """Draw one bootstrap resample of `our_sample` and return its median total compensation.

    Reads the notebook-level table `our_sample`. The median is computed as the
    50th percentile of the resample's 'Total Compensation' column.
    """
    compensation = our_sample.sample().column('Total Compensation')
    return percentile(50, compensation)

In [None]:
# Test the function

one_bootstrap_median()


In [None]:
# Generate the medians of 1000 bootstrap samples

num_repetitions = 1000

# Start from an empty array and grow it by one median per repetition.
bstrap_medians = make_array()

for i in np.arange(num_repetitions):
    # np.append returns a new array rather than modifying in place, so assign the result back.
    bstrap_medians = np.append(bstrap_medians, one_bootstrap_median())
    

In [None]:
# Create a table that stores the medians under the column Bootstrap Sample Median

resampled_medians = 


In [None]:
# Bin edges from 120000 up to (but not including) 160000, in steps of 2000.
median_bins=np.arange(120000, 160000, 2000)

# Display a histogram of the resampled medians using the given bins.



# Plotting the original parameter; do not change.
# The green dot marks pop_median at y = 0.
# parameter_green is reused by the later interval-plot cell.
parameter_green = '#32CD32'
plots.ylim(-0.000005, 0.00014)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2)
plots.title('Bootstrap Medians and the Parameter (Green Dot)');

## Percentile Method: Middle 95% of the Bootstrap Estimates

The middle 95% of our bootstrap estimates lies between two values. What are those values?

In [None]:
# The middle 95% leaves 5% over, split between the two tails (2.5% each).

# Lower end of the interval: the 2.5th percentile of the bootstrap medians.
left = percentile(2.5, bstrap_medians)

# Upper end: the lower tail plus the middle 95%, i.e. the 97.5th percentile.
right = percentile(97.5, bstrap_medians)

# Display those values.
make_array(left, right)

In [None]:
# Histogram of the resampled-medians table, using the bins defined in the earlier plotting cell.
resampled_medians.hist(bins = median_bins)

# Plotting parameters and the 95%, do not change.
# Yellow segment: the interval from `left` to `right`, drawn along y = 0.
# Green dot: the population median; its higher zorder draws it on top of the segment.
# parameter_green is defined in the earlier plotting cell, so run that cell first.
plots.ylim(-0.000005, 0.00014)
plots.plot(make_array(left, right), make_array(0, 0), color='yellow', lw=3, zorder=1)
plots.scatter(pop_median, 0, color=parameter_green, s=40, zorder=2);

***Did the Bootstrap Method give a good estimate of the population parameter?***