In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Comparing Two Samples

In [None]:
# Load the births data from 'baby.csv' into a Table.
births = Table.read_table('baby.csv')

In [None]:
# Display the loaded table to inspect its columns.
births

In [None]:
# Keep only the two columns this comparison needs: the group label
# ('Maternal Smoker') and the numerical variable ('Birth Weight').
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')

In [None]:
# Number of rows in each 'Maternal Smoker' group.
smoking_and_birthweight.group('Maternal Smoker')

In [None]:
# Histogram of 'Birth Weight', drawn separately for each 'Maternal Smoker' group.
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

# Test Statistic

[Question] What values of our statistic are in favor of the alternative: positive or negative?

In [None]:
# Average birth weight within each 'Maternal Smoker' group.
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table

In [None]:
# Array of the group averages (the second column of means_table).
means = means_table.column(1)
# Observed test statistic: second group's mean minus first group's mean.
# NOTE(review): presumably the rows are ordered non-smokers, then smokers —
# confirm against the displayed means_table.
observed_difference = means.item(1) - means.item(0)
observed_difference

In [None]:
def difference_of_means(table, label, group_label):
    """Compute the difference between the mean values of two groups.

    Takes: a table, the column label of the numerical variable,
    and the column label of the variable that assigns each row to a group
    Returns: mean of the second group minus mean of the first group,
    in the order the groups appear after grouping"""

    # Keep just the numerical and group columns, average the numerical
    # column within each group, and pull the averages out as an array.
    group_means = (
        table.select(label, group_label)
        .group(group_label, np.average)
        .column(1)
    )

    first_mean, second_mean = group_means.item(0), group_means.item(1)
    return second_mean - first_mean

In [None]:
# Same statistic as observed_difference, now computed through the reusable function.
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

# Random Permutation (Shuffling)

In [None]:
# Small four-row example table used to demonstrate shuffling.
staff = Table().with_columns(
    'Names', make_array('Jim', 'Pam', 'Dwight', 'Michael'),
    'Ages', make_array(29, 28, 34, 41)
)

In [None]:
# Random sample of rows using the default settings.
# NOTE(review): presumably the default samples with replacement, so a row
# can appear more than once — confirm in the datascience documentation.
staff.sample()

In [None]:
# Sampling without replacement: the same rows in a random order (a shuffle).
staff.sample(with_replacement = False)

In [None]:
# Attach a shuffled copy of the first column ('Names') next to the original rows.
staff.with_column('Shuffled', staff.sample(with_replacement = False).column(0))

# Simulation Under Null Hypothesis

In [None]:
# Reminder of the two-column table whose labels are about to be shuffled.
smoking_and_birthweight

In [None]:
# Shuffle the rows and keep only the 'Maternal Smoker' column:
# the same labels, in a random order.
shuffled_labels = smoking_and_birthweight.sample(with_replacement=False
                                                ).column('Maternal Smoker')

In [None]:
# Add the shuffled labels as a new column next to the original labels and weights.
original_and_shuffled = smoking_and_birthweight.with_column(
    'Shuffled Label', shuffled_labels
)

In [None]:
# Display the table with both the original and the shuffled labels.
original_and_shuffled

In [None]:
# Statistic computed with the shuffled labels: one simulated value.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')

In [None]:
# Statistic computed with the original labels: the observed value.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')

# Permutation Test

In [None]:
def one_simulated_difference(table, label, group_label):
    """Simulate one value of the test statistic with shuffled group labels.

    Takes: a table, column label of numerical variable,
    column label of group-label variable
    Returns: Difference of means of the two groups after the group labels
    have been shuffled among the rows"""

    # shuffle the rows (sampling without replacement) and keep only the labels
    permuted_rows = table.sample(with_replacement=False)
    permuted_labels = permuted_rows.column(group_label)

    # pair each original numerical value with a shuffled label
    values_only = table.select(label)
    relabeled = values_only.with_column('Shuffled Label', permuted_labels)

    return difference_of_means(relabeled, label, 'Shuffled Label')

In [None]:
# One simulated difference for the births data (labels are shuffled inside the call).
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Collect 2500 simulated differences of means, one per shuffle of the labels.
differences = make_array()

for i in np.arange(2500):
    new_difference = one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    # np.append returns a new array, so the result is assigned back
    differences = np.append(differences, new_difference)

In [None]:
# Histogram of the simulated differences, with the observed difference
# printed for comparison.
Table().with_column('Difference Between Group Means', differences).hist()
print('Observed Difference:', observed_difference)
# trailing semicolon keeps the title call's return value out of the cell output
plots.title('Prediction Under the Null Hypothesis');

# Randomized Controlled Experiment

In [None]:
# Load the experiment data from 'bta.csv' and display it.
botox = Table.read_table('bta.csv')
botox.show()

In [None]:
# Cross-tabulation of 'Group' against 'Result'.
botox.pivot('Result', 'Group')

In [None]:
# Average of the remaining column(s) within each 'Group'.
botox.group('Group', np.average)

# Testing the Hypothesis

In [None]:
# Observed test statistic: difference between the two groups' mean 'Result'.
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff

In [None]:
# One simulated difference for the botox data, using shuffled 'Group' labels.
one_simulated_difference(botox, 'Result', 'Group')

In [None]:
# Collect 10000 simulated differences, one per shuffle of the 'Group' labels.
simulated_diffs = make_array()

for i in np.arange(10000):
    sim_diff = one_simulated_difference(botox, 'Result', 'Group')
    # np.append returns a new array, so the result is assigned back
    simulated_diffs = np.append(simulated_diffs, sim_diff)

In [None]:
# Histogram of the simulated differences; col_name is both the column label
# and the column passed to hist.
col_name = 'Distances between groups'
Table().with_column(col_name, simulated_diffs).hist(col_name)

In [None]:
# p-value: proportion of simulated differences that are at least as large
# as the observed difference
sum(simulated_diffs >= observed_diff)/len(simulated_diffs)

## Example: Benford's Law

In [None]:
# Possible first digits: 1 through 9.
digits = np.arange(1, 10)
# Benford model: probability log10(1 + 1/d) for each first digit d.
benford_model = np.log10(1 + 1/digits)

In [None]:
# Table of the model probabilities, drawn as a horizontal bar chart by first digit.
benford = Table().with_columns(
    'First digit', digits,
    'Benford model prob', benford_model)
benford.barh('First digit')

In [None]:
# You don't have to understand how this function works, since it uses Python features from beyond Math 121.
def first_digit(num):
    """Return the leading non-zero digit of a number.

    Takes: a number (or its string form), e.g. 32, -42 or 0.05
    Returns: the first significant digit as an int; 0 when the value is zero

    The sign, any leading zeros and the decimal point are ignored, so
    negative values and fractions below 1 give their leading significant
    digit instead of raising an error or returning 0.
    """
    # Strip everything that can precede the first significant digit.
    significant = str(num).lstrip('+-0.')
    # Nothing is left when the value is zero; keep returning 0 in that case.
    return int(significant[0]) if significant else 0

In [None]:
# Example: the first digit of 32 is 3.
first_digit(32)

In [None]:
# Example: the first digit of 17719087 is 1.
first_digit(17719087)

In [None]:
# County populations from the census data
counties = Table.read_table('counties.csv')
# Keep the rows where SUMLEV is 50, take the columns at indices 5, 6 and 9,
# and relabel them 'State', 'County' and 'Population'.
# NOTE(review): SUMLEV 50 presumably marks county-level rows — confirm against the data dictionary.
counties = counties.where('SUMLEV', 50).select(5,6,9).relabeled(0,'State').relabeled(1,'County').relabeled(2,'Population')
counties.show(3)

In [None]:
# Apply first_digit to every value in 'Population' and store the results
# as a new 'First digit' column.
first_digits = counties.apply(first_digit, 'Population')
counties = counties.with_column('First digit', first_digits)
counties.show(3)

In [None]:
# Number of rows in counties; used as the sample size in the simulation below.
num_counties = counties.num_rows

In [None]:
# Count the counties for each first digit and convert the counts to proportions.
by_digit = counties.group('First digit')
proportions = by_digit.column('count')/num_counties
# NOTE(review): attaching benford_model row-for-row assumes by_digit has exactly
# one row per digit 1-9, in increasing order — confirm.
by_digit = by_digit.with_columns(
    'Observed proportion', proportions,
    'Benford predicted proportion', benford_model
)
# Bar chart comparing the observed and model proportions (raw counts dropped).
by_digit.drop('count').barh('First digit')

Null hypothesis: ____

Alternative hypothesis: ____

Test statistic: ___

Fill in the blank with "Bigger" or "Smaller":

___ values of the test statistic favor the alternative

In [None]:
# Observed total variation distance from the Benford model:
# half the sum of the absolute differences between the proportions.
observed_tvd = sum(abs(proportions - benford_model))/2
observed_tvd

In [None]:
# One simulated set of first-digit proportions: a sample of size num_counties
# drawn according to the Benford model.
sample_proportions(num_counties, benford_model)

In [None]:
# TVD between one simulated set of proportions and the Benford model.
simulated_frequencies = sample_proportions(num_counties, benford_model)
tvd_from_sim = sum(abs(simulated_frequencies - benford_model))/2
tvd_from_sim

In [None]:
def simulate_county_first_digits():
    """Simulate one value of the TVD statistic under the Benford model.

    Takes: nothing (reads num_counties and benford_model from the notebook)
    Returns: total variation distance between one simulated set of
    first-digit proportions and benford_model
    """
    sampled = sample_proportions(num_counties, benford_model)
    # half the sum of absolute differences between sampled and model proportions
    gaps = abs(sampled - benford_model)
    return sum(gaps) / 2

In [None]:
# Collect 10000 simulated TVDs under the Benford model.
simulated_tvds = make_array()

for i in np.arange(10000):
    # np.append returns a new array, so the result is assigned back
    simulated_tvds = np.append(simulated_tvds, simulate_county_first_digits())

In [None]:
# Histogram of the simulated TVDs (column index 0 is the only column).
Table().with_column("TVD Predicted by Benford's Law", simulated_tvds).hist(0)

In [None]:
# p-value: proportion of simulated TVDs at least as large as the observed TVD.
# Divide by the actual number of simulations so the result stays correct
# if the repetition count in the simulation loop is changed.
np.count_nonzero(simulated_tvds >= observed_tvd) / len(simulated_tvds)

Are the data consistent with the null hypothesis?