In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

path = 'https://github.com/oregon-data-science/DSCI101/raw/main/data/'

"imports complete" 

# Comparing Two Samples

In [None]:
## baby birth weights + info about their mothers from a hospital in the US, 
## including smoker/non-smoker 
births = Table.read_table(path + 'baby.csv')

In [None]:
births

In [None]:
## subset to the data of interest
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')

In [None]:
##  Count up the number of smoker/non smokers
smoking_and_birthweight.group('Maternal Smoker')

In [None]:
smoking_and_birthweight.group('Maternal Smoker', np.average)

In [None]:
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

In [None]:
## Is there a difference? 

In [None]:
## back to slides

# Test Statistic

[Question] What values of our statistic are in favor of the alternative: positive or negative?

In [None]:
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table

In [None]:
# column(1) of means_table holds the per-group averages produced by
# group('Maternal Smoker', np.average) in the previous cell
means = means_table.column(1)
# second row's mean minus first row's mean, in the row order of means_table
observed_difference = means.item(1) - means.item(0)
observed_difference

In [None]:
## Let's make a function so we are ready to repeat this calculation in our simulation

def difference_of_means(table, label, group_label):
    """Return the difference between the two group means of a numerical column.

    Takes:
        table: table containing both columns
        label: column label of the numerical variable
        group_label: column label of the group-label variable; it must
            split the table into exactly two groups
    Returns:
        (mean of the second group) - (mean of the first group), with the
        groups taken in the row order of the grouped table.
    Raises:
        ValueError: if grouping by `group_label` does not yield exactly
            two groups.
    """

    # table with the two relevant columns
    reduced = table.select(label, group_label)

    # table containing group means
    means_table = reduced.group(group_label, np.average)
    # array of group means
    means = means_table.column(1)

    # A difference of two means only makes sense for two groups. Without this
    # check a single group fails with an opaque IndexError from item(1), and
    # any groups beyond the second are silently ignored.
    if len(means) != 2:
        raise ValueError(
            "difference_of_means needs exactly two groups in column "
            + repr(group_label) + ", found " + str(len(means))
        )

    return means.item(1) - means.item(0)

In [None]:
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
## back to slides

# Random Permutation (Shuffling)

In [None]:
## mini example 
letters = Table().with_column('Letter', make_array('a', 'b', 'c', 'd', 'e'))

In [None]:
letters.sample()

In [None]:
letters.sample(with_replacement = False)

In [None]:
letters.with_column('Shuffled', letters.sample(with_replacement = False).column(0))

# Simulation Under Null Hypothesis

In [None]:
## A reminder what our real data looks like
smoking_and_birthweight

In [None]:
## shuffle the labels once
shuffled_labels = smoking_and_birthweight.sample(with_replacement=False
                                                ).column('Maternal Smoker')

In [None]:
original_and_shuffled = smoking_and_birthweight.with_column(
    'Shuffled Label', shuffled_labels
)

In [None]:
original_and_shuffled

In [None]:
## simulated difference in means with labels shuffled
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')

In [None]:
## reminder of the true difference in means
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')


# Permutation Test

In [None]:
## package everything into a function

def one_simulated_difference(table, label, group_label):
    """Run one shuffle of the group labels and recompute the statistic.

    Takes: the table, the column label of the numerical variable, and the
    column label of the group-label variable
    Returns: difference of means of the two groups once the group labels
    have been shuffled
    """

    # permute the rows (sampling without replacement) and read the group
    # column off the permuted table: the same labels in shuffled order
    permuted_rows = table.sample(with_replacement=False)
    permuted_labels = permuted_rows.column(group_label)

    # pair the numerical values, in their original order, with the shuffled labels
    values_only = table.select(label)
    relabeled = values_only.with_column('Shuffled Label', permuted_labels)

    return difference_of_means(relabeled, label, 'Shuffled Label')

In [None]:
## a single trial
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
## what do we do when we want to simulate something over and over again?? For loop!
num_simulations = 2500
# one slot per simulation, allocated up front: np.append would build a
# brand-new copy of the whole array on every pass through the loop
differences = np.zeros(num_simulations)

for i in np.arange(num_simulations):
    # each pass shuffles the labels once and stores the resulting difference
    differences[i] = one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
## make a histogram to visualize the empirical null distribution

Table().with_column('Difference Between Group Means', differences).hist()
print('Observed Difference:', observed_difference)
plots.title('Prediction Under the Null Hypothesis');

In [None]:
## empirical p-value: proportion of simulated differences that are
## less than or equal to the observed difference
np.count_nonzero(differences <= observed_difference) / len(differences)

In [None]:
## association or causation? 

# Randomized Control Experiment

In [None]:
## Treatment for muscle pain, patients received a botox injection 
## 0 stayed the same, 1 got better 
botox = Table.read_table(path + 'bta.csv')
botox.show()

In [None]:
## helpful visualization
botox.pivot('Result', 'Group')

In [None]:
## proportions in each group 
botox.group('Group', np.average)

In [None]:
## back to slides

In [None]:
## the "full data" 
observed_outcomes = Table.read_table(path + 'observed_outcomes.csv')
observed_outcomes.show()

In [None]:
## back to slides

# Testing the Hypothesis

In [None]:
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff

In [None]:
one_simulated_difference(botox, 'Result', 'Group')

In [None]:
## number of label shuffles used to build the empirical null distribution
num_botox_simulations = 10000
# one slot per simulation, allocated up front: np.append would build a
# brand-new copy of the whole array on every pass through the loop
simulated_diffs = np.zeros(num_botox_simulations)

for i in np.arange(num_botox_simulations):
    # each pass shuffles the group labels once and stores the resulting difference
    simulated_diffs[i] = one_simulated_difference(botox, 'Result', 'Group')

In [None]:
col_name = 'Distances between groups'
Table().with_column(col_name, simulated_diffs).hist(col_name, left_end=observed_diff)
plots.scatter(observed_diff, 0, color="red", s=40, zorder=3);

In [None]:
## empirical p-value: proportion of simulated differences that are
## greater than or equal to the observed difference
np.count_nonzero(simulated_diffs >= observed_diff) / len(simulated_diffs)

In [None]:
## association or causation?