# Exercise 9.1 
As sample size increases, the power of a hypothesis test increases, which means it is more likely to be positive if the effect is real. Conversely, as sample size decreases, the test is less likely to be positive even if the effect is real.
To investigate this behavior, run the tests in this chapter with different subsets of the NSFG data. You can use thinkstats2.SampleRows to select a random subset of the rows in a DataFrame.
What happens to the p-values of these tests as sample size decreases? What is the smallest sample size that yields a positive test?

In [8]:
import nsfg
import hypothesis
import thinkstats2

def RunTests(live, iters=1000):
    """Run the four Chapter 9 hypothesis tests on `live` and print the p-values.

    live: DataFrame of live births; the columns read here are birthord,
          prglngth, totalwgt_lb and agepreg
    iters: value passed to each test's PValue call

    Prints one tab-separated row: the number of rows in `live` followed by
    the four p-values, each formatted with two decimals.
    """
    firsts = live[live.birthord == 1]
    others = live[live.birthord != 1]

    # Pregnancy lengths of first babies vs. others (used by test1 and test4).
    preg_lengths = firsts.prglngth.values, others.prglngth.values

    # Birth weights of first babies vs. others, missing values removed.
    birth_weights = (firsts.totalwgt_lb.dropna().values,
                     others.totalwgt_lb.dropna().values)

    # Mother's age paired with birth weight, keeping only complete rows.
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    age_vs_weight = complete.agepreg.values, complete.totalwgt_lb.values

    # Build and evaluate one test at a time, in a fixed order:
    # test1 difference in mean pregnancy length, test2 difference in mean
    # birth weight, test3 age/weight correlation, test4 chi-square test.
    p_length = hypothesis.DiffMeansPermute(preg_lengths).PValue(iters)
    p_weight = hypothesis.DiffMeansPermute(birth_weights).PValue(iters)
    p_corr = hypothesis.CorrelationPermute(age_vs_weight).PValue(iters)
    p_chi2 = hypothesis.PregLengthTest(preg_lengths).PValue(iters)

    row = (len(live), p_length, p_weight, p_corr, p_chi2)
    print('%d\t%0.2f\t%0.2f\t%0.2f\t%0.2f' % row)


def main():
    """Run RunTests on eight random samples of the live births, halving the size each time.

    Calls thinkstats2.RandomSeed(18) first (presumably so the run is
    repeatable), prints a tab-separated header, then lets RunTests print
    one row per sample size.
    """
    thinkstats2.RandomSeed(18)

    pregnancies = nsfg.ReadFemPreg()
    live = pregnancies[pregnancies.outcome == 1]
    total = len(live)

    print('n\ttest1\ttest2\ttest3\ttest4')
    for halvings in range(8):
        # total >> k equals floor-halving the full size k times.
        RunTests(thinkstats2.SampleRows(live, total >> halvings))

# Run the experiment only when this code is executed as the main module.
if __name__ == '__main__':
    main()

n	test1	test2	test3	test4
9148	0.16	0.00	0.00	0.00
4574	0.10	0.01	0.00	0.00
2287	0.25	0.06	0.00	0.00
1143	0.24	0.03	0.39	0.03
571	0.81	0.00	0.04	0.04
285	0.57	0.41	0.48	0.83
142	0.45	0.08	0.60	0.04
71	1.00	0.81	0.38	0.69


My observations:

Details of the tests: 
test1: difference in mean pregnancy length
test2: difference in mean birth weight
test3: correlation of mother's age and birth weight
test4: chi-square test of pregnancy length


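Except for test1 (pregnancy length), the p-values of all the tests are very low (< 0.05, often close to 0) for the largest samples, so for those tests we can reject the null hypothesis; for test1 the null hypothesis cannot be rejected at any sample size. As the sample size decreases, the p-values generally rise and become erratic. At the smallest sample size, n = 71, none of the tests is positive (all p-values are above 0.05). The smallest sample size that yields a positive test in this run is n = 142 (test4, p = 0.04), although the all-negative results at n = 285 suggest that this may be a chance result.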

# Exercise 10.1 
Using the data from the BRFSS, compute the linear least squares fit for log(weight) versus height. How would you best present the estimated parameters for a model like this where one of the variables is log-transformed? If you were trying to guess someone’s weight, how much would it help to know their height?
Like the NSFG, the BRFSS oversamples some groups and provides a sampling weight for each respondent. In the BRFSS data, the variable name for these weights is finalwt. Use resampling, with and without weights, to estimate the mean height of respondents in the BRFSS, the standard error of the mean, and a 90% confidence interval. How much does correct weighting affect the estimates?


In [None]:
import brfss
from thinkstats2 import Mean, MeanVar, Var, Std, Cov
import thinkplot
import numpy as np

def LeastSquares(xs, ys):
    """Fit the line ys ~ inter + slope * xs by linear least squares.

    xs: sequence of x values
    ys: sequence of y values

    returns: tuple (inter, slope)
    """
    x_bar, x_var = MeanVar(xs)
    y_bar = Mean(ys)

    # Slope is Cov(x, y) / Var(x); the intercept makes the line pass
    # through the point of means (x_bar, y_bar).
    gradient = Cov(xs, ys, x_bar, y_bar) / x_var
    return y_bar - gradient * x_bar, gradient

# Read the BRFSS data and keep only rows where both htm3 and wtkg2 are present.
df = brfss.ReadBrfss(nrows=None)
df = df.dropna(subset=['htm3', 'wtkg2'])
# htm3 and wtkg2 are plotted in later cells as height (cm) and weight (kg).
heights, weights = df.htm3, df.wtkg2
# Base-10 log of the weights; this is the dependent variable of the fit.
log_weights = np.log10(weights)

In [None]:
# Computing the linear least squares fit for log(weight) vs height:
# heights are the x values and log10 weights the y values.
inter, slope = thinkstats2.LeastSquares(heights, log_weights)
# Bare expression so the notebook displays the fitted (intercept, slope) pair.
inter, slope

In [None]:
# Scatter plot of the data with the fitted line overlaid:
# x-axis is height, y-axis is log10 weight.
# Low alpha and small marker size are passed to the scatter call for the many points.
thinkplot.Scatter(heights, log_weights, alpha=0.02, s=2)
# Points of the fitted line for the given intercept and slope.
fxs, fys = thinkstats2.FitLine(heights, inter, slope)
thinkplot.Plot(fxs, fys, color='red')
thinkplot.Config(xlabel='Height (cm)', ylabel='log10 weight (kg)', legend=False)

In [None]:
# Same plot but applying the inverse transform to show weights on a linear (not log) scale.
thinkplot.Scatter(heights, weights, alpha=0.02, s=2)
fxs, fys = thinkstats2.FitLine(heights, inter, slope)
# fys are log10 weights, so 10**fys converts the fitted line back to kg.
thinkplot.Plot(fxs, 10**fys, color='red')
thinkplot.Config(xlabel='Height (cm)', ylabel='Weight (kg)', legend=False)

Plot percentiles of the residuals:

In [None]:
# Calculate the residuals of the log10(weight) vs height fit
# (presumably actual log10 weight minus the fitted value -- confirm in thinkstats2).
res = thinkstats2.Residuals(heights, log_weights, inter, slope)
# Store them as a new column so they can be grouped by height later.
df['residual'] = res
# Bare expression so the notebook displays the DataFrame.
df

In [None]:
# Bin the respondents by height: bin edges every 5 cm from 130 to 205 cm.
bins = np.arange(130, 210, 5)
indices = np.digitize(df.htm3, bins)
groups = df.groupby(indices)

# The [1:-1] slices drop the first and last groups, which are expected to be
# the ones holding heights below the first edge and at or above the last edge.
means = [group.htm3.mean() for i, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for i, group in groups][1:-1]

# One line per percentile of the residuals, plotted against mean bin height.
thinkplot.PrePlot(3)
for percent in [75, 50, 25]:
    ys = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(means, ys, label=label)

# The residuals come from the fit to log10(weight), so the y-axis is in
# log10 units, not plain kg.  The legend is shown so the percentile labels
# set above are actually visible.
thinkplot.Config(xlabel='height (cm)', ylabel='residual log10 weight (kg)', legend=True)

The lines are flat over most of the range, indicating that the relationship is linear.
The lines are mostly parallel, indicating that the variance of the residuals is the same over the range.

In [None]:
# Computing the correlation between height and log10 weight.
corr = thinkstats2.Corr(heights, log_weights)
# Bare expression so the notebook displays the value.
corr

In [None]:
# Computing the coefficient of determination from the log10 weights
# and the residuals of the fit.
r2 = thinkstats2.CoefDetermination(log_weights, res)
r2

In [None]:
# Confirming that R^2 = rho^2: the difference should be (numerically) zero.
corr**2 - r2

In [None]:
# Computing Std(ys), which is the RMSE of predictions that don't use height.
std_ys = thinkstats2.Std(log_weights)  # standard deviation of the log10 weights
std_ys

In [None]:
# Compute Std(res), the RMSE of predictions that do use height.
std_res = thinkstats2.Std(res)  # standard deviation of the residuals
std_res

In [None]:
# Fraction by which knowing height reduces the RMSE of the prediction
# (in log10 weight units): 1 - Std(res) / Std(ys).
1 - std_res / std_ys

In [None]:
# Using resampling to compute sampling distributions for inter and slope.
t = []  # collects one (intercept, slope) pair per resample
for _ in range(100):
    # Draw a resample of the rows, then refit log10(weight) vs height on it.
    sample = thinkstats2.ResampleRows(df)
    estimates = thinkstats2.LeastSquares(sample.htm3, np.log10(sample.wtkg2))  # (intercept, slope)
    t.append(estimates)  # add the pair to the list

# Transpose the list of pairs into a tuple of intercepts and a tuple of slopes.
inters, slopes = zip(*t)
print(inters)
print(slopes)

In [None]:
# Plotting the sampling distribution of slope.
cdf = thinkstats2.Cdf(slopes)  # CDF of the 100 resampled slopes
thinkplot.Cdf(cdf)  # plot the CDF

In [None]:
# Computing the p-value of the slope.
# cdf[0] presumably evaluates the CDF at 0, i.e. the fraction of resampled
# slopes that are <= 0 -- confirm against thinkstats2.Cdf.__getitem__.
pvalue = cdf[0]
print("p-value is", pvalue)

In [None]:
# Computing the 90% confidence interval of slope.
ci = cdf.Percentile(5), cdf.Percentile(95)  # 5th and 95th percentiles bound the central 90%
ci

In [None]:
# Computing the mean of the sampling distribution.
mean = thinkstats2.Mean(slopes)  # mean of the resampled slopes
mean

In [None]:
# Computing the standard deviation of the sampling distribution, which is the standard error.
stderr = thinkstats2.Std(slopes)  # standard error of the slope estimate
stderr

In [None]:
def Summarize(estimates, actual=None):
    """Print the mean, standard error and 90% confidence interval of `estimates`.

    estimates: sequence of estimated values
    actual: forwarded to Std as `mu`; defaults to None
    """
    center = Mean(estimates)
    spread = Std(estimates, mu=actual)
    interval = thinkstats2.Cdf(estimates).ConfidenceInterval(90)
    print('mean, SE, CI', center, spread, interval)
    
def ResampleRowsWeighted(df, column='finalwgt'):
    """Resample the rows of `df` using the sampling weights in `column`.

    df: DataFrame
    column: name of the column holding the sampling weights

    returns: DataFrame with len(df) rows, selected with df.loc
    """
    # Map each index label to its weight; the Cdf built from this mapping is
    # then sampled for index labels (presumably in proportion to the weights).
    index_to_weight = dict(df[column])
    chosen = thinkstats2.Cdf(index_to_weight).Sample(len(df))
    return df.loc[chosen]

In [None]:
# Resampling rows without weights, compute mean height, and summarize results.

# 100 resamples; each contributes the mean of htm3 in that resample.
estimates_unweighted = [thinkstats2.ResampleRows(df).htm3.mean() for _ in range(100)]
Summarize(estimates_unweighted)

In [None]:
# Resampling rows with weights.

# Same as the unweighted case, but rows are drawn using the 'finalwt' column
# (the BRFSS weight column, overriding the function's 'finalwgt' default).
estimates_weighted = [ResampleRowsWeighted(df, 'finalwt').htm3.mean() for _ in range(100)]
Summarize(estimates_weighted)

The estimated mean height is almost 2 cm taller if we take into account the sampling weights, and this difference is much bigger than the sampling error.