# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [88]:
from __future__ import print_function, division
import math

import nsfg

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [89]:
preg = nsfg.ReadFemPreg()
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [90]:
preg.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

Select a single column name.

In [91]:
preg.columns[1]

'pregordr'

Select a column and check what type it is.

In [92]:
pregordr = preg['pregordr']
type(pregordr)

pandas.core.series.Series

Print a column.

In [93]:
pregordr

0        1
1        2
2        1
3        2
4        3
        ..
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

Select a single element from a column.

In [94]:
pregordr[0]

1

Select a slice from a column.

In [95]:
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

Select a column using dot notation.

In [96]:
pregordr = preg.pregordr

Count the number of times each value occurs.

In [97]:
preg.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [98]:
preg.birthwgt_lb.value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [99]:
caseid = 10229
preg_map = nsfg.MakePregMap(preg)
indices = preg_map[caseid]
preg.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1], dtype=int64)

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [100]:
preg['birthord'].value_counts().sort_index()

1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

We can also use `isnull` to count the number of nans.

In [101]:
preg.birthord.isnull().sum()

4445

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [102]:
preg['prglngth']

0        39
1        39
2        39
3        39
4        39
         ..
13588    39
13589     6
13590     5
13591    39
13592    39
Name: prglngth, Length: 13593, dtype: int64

To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [103]:
preg.totalwgt_lb.mean()

7.265628457623368

Create a new column named <tt>totalwgt_kg</tt> that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [104]:
preg['totalwgt_kg'] = preg['totalwgt_lb']*0.453592

In [105]:
preg['totalwgt_kg']

0        3.997279
1        3.572037
2        4.139027
3        3.175144
4        2.806601
           ...   
13588    2.806601
13589         NaN
13590         NaN
13591    3.401940
13592    3.401940
Name: totalwgt_kg, Length: 13593, dtype: float64

`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [106]:
resp = nsfg.ReadFemResp()

MemoryError: Unable to allocate 145. MiB for an array with shape (2491, 7643) and data type float64

`DataFrame` provides a method `head` that displays the first five rows:

In [None]:
resp.head()

Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [None]:
## the youngest is 15 and oldest is 44
resp['age_r'].value_counts().sort_index()

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [None]:
resp[resp.caseid==2298]

And we can get the corresponding rows from `preg` like this:

In [None]:
preg[preg.caseid==2298]

How old is the respondent with `caseid` 1?

In [None]:
## the person with caseid 1 is 44 years old.
one_persons_data = resp[resp.caseid==1]
one_persons_data["age_r"]

What are the pregnancy lengths for the respondent with `caseid` 2298?


In [None]:
one_persons_data = preg[preg.caseid==2298]
one_persons_data["prglngth"]

What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [None]:
## It turns out the person with caseid 5012 only had the one child. The baby was 6.0 lbs.
one_persons_data = preg[preg.caseid == 5012]
one_persons_data["birthwgt_lb"]

In [None]:
## testing chapter 2 ideas.
## It seemed to work but we may need to remove Null values. Pressing on!
hist = {}
for x in preg.birthwgt_lb:
    hist[x] = hist.get(x,0)+1

## I used this to check my work but it spams nans so I commented it out
# hist

In [None]:
## kind of an elegant way of doing this. Seems to get the same results.
from collections import Counter
counter = Counter(preg.birthwgt_lb)

## Since this was practice I commented this out so I would not spam nans :)
#counter

In [None]:
## looks like thinkstats2 ignored the null values.
import thinkstats2
hist = thinkstats2.Hist(preg.birthwgt_lb)

## Same with this one I commented it out so my code would be more readable.
# hist

In [None]:
## this looks like it checks the frequencies based on the weight I put in.
## So in this case I tried 6 lbs which was the same weight that caseid = 5012 had
## It turns out that there were 2223 babies that also had this weight.
hist.Freq(6)



In [None]:
## Lets check something impossible and see what we get.
## Confirmed there were no babies that weighed 100 lbs.
hist.Freq(100)


In [None]:
## Since this is a dictionary this gives me the values stored in the dictionary.
hist.Values()


In [None]:
## Very cool this is starting to look like a histogram.
for val in sorted(hist.Values()):
    print(val,hist.Freq(val))



In [None]:
## Awesome we got a histogram that looks fairly normal.
import thinkplot
thinkplot.Hist(hist)
thinkplot.Show(xlabel = 'Weight in lbs', ylabel='Frequency')



In [None]:
## following the book I already imported the data and made a column with weights in lb
## this must be used to ensure that we are not building bias into our histogram since babies that die are most likely underweight. :(
live=preg[preg.prgoutcome==1]



In [None]:
## Interesting it looks very similar to what I got before but I did not filter for live births.
## I changed my variable to hist2 just to be sure
hist_birthwgt_lb = thinkstats2.Hist(live.birthwgt_lb,label= 'birthwgt_lb')
thinkplot.Hist(hist_birthwgt_lb)
thinkplot.Show(xlabel='Weight in lbs', ylabel = 'Frequency')

In [None]:
## The author brings up a good point outliers would be easy to miss on a frequency graph.
## here is the original with with outliers still in the histogram
hist_prgLngth = thinkstats2.Hist(live.prglngth,label="Weeks until birth")
thinkplot.Hist(hist_prgLngth)
thinkplot.Show(xlabel = 'Weeks', ylabel = 'frequency')

In [None]:
## Checking for outliers
for weeks, freq in hist_prgLngth.Smallest(10):
    print(weeks, freq)


In [None]:
## now we are making a list of the babies who were either first borns or not
firsts = live[live.birthord == 1]
others = live[live.birthord != 1]

## building two histograms one will be a histogram for first borns the other will contain all others.
first_hist = thinkstats2.Hist(firsts.prglngth)
other_hist = thinkstats2.Hist(others.prglngth)

## plotting on the same axis so we can compare them.
## I made an observation here that the histograms are hard to read due to a difference in sample size.
## The book confirms that this is an issue that we should be concerned with.
## Maybe we will use density functions to solve this?
width = 0.45
thinkplot.PrePlot(2)
thinkplot.Hist(first_hist, align = 'right', width=width)
thinkplot.Hist(other_hist,align='left', width=width)
thinkplot.Show(xlabel='Weeks', ylabel = 'Frequency')

In [None]:
## Computing the mean, standard deviation and variance
mean = live.prglngth.mean()
var = live.prglngth.var()
std = live.prglngth.std()



In [None]:
## We need to find the effect size using Cohens effect size
def CohenEffectSize(group1,group2):
    diff = group1.mean()-group2.mean()

    var1 = group1.var()
    var2 = group2.var()

    n1,n2 = len(group1),len(group2)

    pooled_var =(n1*var1+n2*var2)/(n1+n2)
    d = diff/math.sqrt(pooled_var)
    return d

In [None]:
## testing it out
## It worked! The book said that he got .029 which is not much of a difference!
effect_size = CohenEffectSize(firsts['prglngth'],others['prglngth'])
print(effect_size)

## This is very interesting since the effect size is in standard deviations it is saying that the difference is slightly larger than the mean.
# But there is not a noticeable difference in fact I would say that first born babies are born very close to the average time.
## According to the the wiki and confirmed on other websites we make decisions on effect size based on these metrics.
## very small (0.01) small (0.2) medium (0.5) large (0.8) Very Large (1.20) Huge (2.0)
## as we can see since our effect size was 0.029 we would fall between very small and small.
# So I would say that there is a small chance that the baby would be born later than a normal
# If the baby was their first baby there may be a small chance according to those metrics that the baby would slightly late.