In [1]:
import random
import math
import sys

In [2]:
def bootstrap(x):
    """Return a bootstrap resample of ``x``.

    Draws ``len(x)`` elements from ``x`` uniformly at random *with
    replacement* (one ``random.choice`` call per draw) and returns them as a
    new list. ``x`` itself is not modified. An empty ``x`` yields ``[]``.
    """
    return [random.choice(x) for _ in range(len(x))]

In [3]:
# difference of the two group means: mean(grpB) - mean(grpA)
def meandiff(grpA, grpB):
    """Return the mean of ``grpB`` minus the mean of ``grpA`` as a float.

    Raises ZeroDivisionError if either group is empty.
    """
    mean_b = sum(grpB) / float(len(grpB))
    mean_a = sum(grpA) / float(len(grpA))
    return mean_b - mean_a

In [15]:
def diff2meanconf(diet_1, diet_2, grpA, grpB, conf_interval, num_resamples=10000):
    """Print a bootstrap percentile confidence interval for mean(grpB) - mean(grpA).

    Both groups are resampled with replacement ``num_resamples`` times; the
    difference of means is computed for every pair of resamples, the results
    are sorted, and the values at the lower/upper tail positions are reported.

    Parameters
    ----------
    diet_1, diet_2 : labels used only in the printed header line.
    grpA, grpB     : sequences of numbers (the two observed samples).
    conf_interval  : confidence level as a fraction in [0, 1], e.g. 0.9.
    num_resamples  : number of bootstrap iterations (default 10000).

    Raises
    ------
    ValueError if ``conf_interval`` is outside [0, 1] or ``num_resamples`` < 1.

    Returns nothing; results are written to stdout.
    """
    # Out-of-range levels would otherwise yield negative or crossed indices
    # that silently pick values from the wrong end of the sorted list.
    if not 0 <= conf_interval <= 1:
        raise ValueError("conf_interval must be between 0 and 1, got %r" % (conf_interval,))
    if num_resamples < 1:
        raise ValueError("num_resamples must be at least 1, got %r" % (num_resamples,))

    observed_mean_diff = meandiff(grpA, grpB)

    # One statistic per iteration: resample grpA, then grpB, then take the
    # difference of the resampled means. sorted() gives the ordered
    # bootstrap distribution.
    out = sorted(
        meandiff(bootstrap(grpA), bootstrap(grpB))
        for _ in range(num_resamples)
    )

    tails = (1 - conf_interval) / 2

    # In case the bounds are not integers we shrink the range (ceil the lower
    # index, floor the upper one) so the stated confidence level is kept.
    # round() strips floating-point noise first (e.g. 250.00000000000023 would
    # otherwise be ceil'ed to 251), and both indices are clamped to the last
    # valid position so conf_interval == 1.0 no longer indexes past the end.
    last_index = num_resamples - 1
    lower_bound = min(int(math.ceil(round(num_resamples * tails, 9))), last_index)
    upper_bound = min(int(math.floor(round(num_resamples * (1 - tails), 9))), last_index)

    ######################################
    #
    # Output
    #
    ######################################

    # print observed value and then confidence interval
    print("**********Diet: ", diet_1, " vs Diet: ", diet_2,"**********")
    print ("Observed difference between the means: %.2f" % observed_mean_diff)
    print ("We have", conf_interval * 100, "% confidence that the true difference between the means", end=" ")
    print ("is between: %.2f" % out[lower_bound], "and %.2f" % out[upper_bound])

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# NOTE(review): with no category/module filter this silences every warning
# raised after this point, not just a specific noisy one.
warnings.filterwarnings("ignore")

In [17]:
# Load the chick weight table and discard the 'Unnamed: 0' column.
chickweight = (
    pd.read_csv('chick_weight.csv')
    .drop(['Unnamed: 0'], axis=1)
)

In [18]:
def get_final_weights_for_diet(diet):
    """Return the last recorded weight of every chick on the given diet.

    Filters the module-level ``chickweight`` frame to rows whose ``Diet``
    equals ``diet`` and, for each distinct ``Chick`` (in order of first
    appearance), takes the ``weight`` from that chick's last row in the frame.
    """
    diet_rows = chickweight[chickweight['Diet'] == diet]
    # "last" means last in row order of the frame.
    # NOTE(review): presumably rows are ordered by time within a chick - confirm.
    return [
        diet_rows[diet_rows['Chick'] == chick_id]['weight'].tolist()[-1]
        for chick_id in diet_rows.Chick.unique()
    ]

In [19]:
# Map each diet number (1 through 4) to the list of final weights of its chicks.
all_final_weights = {
    diet: get_final_weights_for_diet(diet)
    for diet in range(1, 5)
}

In [20]:
# Every unordered pair of the four diets, as [lower, higher] lists.
combinations = [
    [first, second]
    for first in range(1, 5)
    for second in range(first + 1, 5)
]

In [None]:
# Seed the stdlib RNG used by bootstrap() so the printed intervals are the
# same on every Restart & Run All instead of changing from run to run.
random.seed(42)

# Report a 90% bootstrap confidence interval for each pair of diets.
for diet_1, diet_2 in combinations:
    diff2meanconf(diet_1, diet_2, all_final_weights[diet_1], all_final_weights[diet_2], conf_interval = 0.9)

**********Diet:  1  vs Diet:  2 **********
Observed difference between the means: 58.40
We have 90.0 % confidence that the true difference between the means is between: 11.05 and 104.35
**********Diet:  1  vs Diet:  3 **********
Observed difference between the means: 114.00
We have 90.0 % confidence that the true difference between the means is between: 70.30 and 156.45
**********Diet:  1  vs Diet:  4 **********
Observed difference between the means: 73.00
We have 90.0 % confidence that the true difference between the means is between: 37.10 and 108.00
**********Diet:  2  vs Diet:  3 **********
Observed difference between the means: 55.60
We have 90.0 % confidence that the true difference between the means is between: 2.90 and 108.00
