# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [1]:
from __future__ import print_function, division

import nsfg

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [3]:
preg = nsfg.ReadFemPreg()
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [4]:
preg.columns

Index([         u'caseid',        u'pregordr',       u'howpreg_n',
             u'howpreg_p',        u'moscurrp',        u'nowprgdk',
              u'pregend1',        u'pregend2',        u'nbrnaliv',
              u'multbrth',
       ...
            u'laborfor_i',      u'religion_i',         u'metro_i',
               u'basewgt', u'adj_mod_basewgt',        u'finalwgt',
                u'secu_p',            u'sest',         u'cmintvw',
           u'totalwgt_lb'],
      dtype='object', length=244)

Select a single column name.

In [5]:
preg.columns[1]

u'pregordr'

Select a column and check what type it is.

In [6]:
pregordr = preg['pregordr']
type(pregordr)

pandas.core.series.Series

Print a column.

In [7]:
pregordr

0        1
1        2
2        1
3        2
4        3
5        1
6        2
7        3
8        1
9        2
10       1
11       1
12       2
13       3
14       1
15       2
16       3
17       1
18       2
19       1
20       2
21       1
22       2
23       1
24       2
25       3
26       1
27       1
28       2
29       3
        ..
13563    2
13564    3
13565    1
13566    1
13567    1
13568    2
13569    1
13570    2
13571    3
13572    4
13573    1
13574    2
13575    1
13576    1
13577    2
13578    1
13579    2
13580    1
13581    2
13582    3
13583    1
13584    2
13585    1
13586    2
13587    3
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, dtype: int64

Select a single element from a column.

In [7]:
pregordr[0]

1

In [8]:
preg.pregordr[0]

1

In [9]:
preg['pregordr'][0]

1

Select a slice from a column.

In [10]:
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

Select a column using dot notation.

In [11]:
pregordr = preg.pregordr

Count the number of times each value occurs.

In [12]:
preg.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [30]:
preg.birthwgt_lb.value_counts().sort_index()  #sort_values() to sort on the vaules

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

In [27]:
preg.birthwgt_lb.value_counts()  #default sort of value_counts() is decreasing values

7.0     3049
6.0     2223
8.0     1889
5.0      697
9.0      623
4.0      229
10.0     132
3.0       98
2.0       53
1.0       40
11.0      26
12.0      10
0.0        8
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

In [21]:
type(preg.birthwgt_lb)

pandas.core.series.Series

In [23]:
import pandas as pd

In [26]:
help(pd.core.series.Series.value_counts)

Help on method value_counts in module pandas.core.base:

value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True) unbound pandas.core.series.Series method
    Returns object containing counts of unique values.
    
    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.
    
    Parameters
    ----------
    normalize : boolean, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        a convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN.
    
    Returns
    -------
    counts : Series



Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [31]:
caseid = 10229
preg_map = nsfg.MakePregMap(preg)
indices = preg_map[caseid]
preg.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1])

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [35]:
# Solution goes here
preg.birthord.value_counts().sort_index()

1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

We can also use `isnull` to count the number of nans.

In [36]:
preg.birthord.isnull().sum()

4445

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [41]:
# Solution goes here
preg.prglngth.value_counts().sort_index().head()

0     15
1      9
2     78
3    151
4    412
Name: prglngth, dtype: int64

In [49]:
test = preg.prglngth.value_counts().sort_index()[0:14]

In [50]:
sum(test)

3522

In [53]:
test2 = preg.prglngth.value_counts().sort_index()[14:27]
print(test2, sum(test2))

14     29
15     39
16     44
17    253
18     17
19     34
20     18
21     37
22    147
23     12
24     31
25     15
26    117
Name: prglngth, dtype: int64 793


In [54]:
test3 = preg.prglngth.value_counts().sort_index()[27:51]
print(test2, sum(test3))

14     29
15     39
16     44
17    253
18     17
19     34
20     18
21     37
22    147
23     12
24     31
25     15
26    117
Name: prglngth, dtype: int64 9278


To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [60]:
print(preg.totalwgt_lb.mean())
print(preg.totalwgt_lb.mean()/2.20462)

7.26562845762
3.29563755097


Create a new column named <tt>totalwgt_kg</tt> that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [64]:
# Solution goes here
preg['totalwgt_kg'] = preg.totalwgt_lb / 2.20462

print(preg.totalwgt_kg.mean())

3.29563755097


`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [65]:
resp = nsfg.ReadFemResp()

`DataFrame` provides a method `head` that displays the first five rows:

In [66]:
resp.head()

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [70]:
# Solution goes here
resp.age_r.value_counts().sort_index()

15    217
16    223
17    234
18    235
19    241
20    258
21    267
22    287
23    282
24    269
25    267
26    260
27    255
28    252
29    262
30    292
31    278
32    273
33    257
34    255
35    262
36    266
37    271
38    256
39    215
40    256
41    250
42    215
43    253
44    235
Name: age_r, dtype: int64

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [71]:
resp[resp.caseid==2298]

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667


And we can get the corresponding rows from `preg` like this:

In [72]:
preg[preg.caseid==2298]

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
2610,2298,1,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.118451
2611,2298,2,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,5.5,2.494761
2612,2298,3,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,4.1875,1.89942
2613,2298,4,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.118451


How old is the respondent with `caseid` 1?

In [76]:
# Solution goes here
resp[resp.caseid==1]

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
1069,1,1,5,4,5,5.0,44,44,695,44,...,0,3410.389399,3869.349602,6448.271112,2,9,1231,1219,19:56:43,67.563833


What are the pregnancy lengths for the respondent with `caseid` 2298?

In [79]:
# Solution goes here
preg[preg.caseid==2298]['prglngth']

2610    40
2611    36
2612    30
2613    40
Name: prglngth, dtype: int64

What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [81]:
# Solution goes here
preg[preg.caseid==5012]['birthwgt_lb']

5515    6.0
Name: birthwgt_lb, dtype: float64

In [None]:
#Ex 1.2

In [84]:
from __future__ import print_function

import numpy as np
import sys

import nsfg
import thinkstats2


In [90]:
def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Reads the NSFG respondent data.

    dct_file: string file name
    dat_file: string file name

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
    CleanFemResp(df)
    return df

In [91]:
def CleanFemResp(df):
    """Recodes variables from the respondent frame.

    df: DataFrame
    """
    pass


def ReadFemPreg(dct_file='2002FemPreg.dct',
                dat_file='2002FemPreg.dat.gz'):
    """Reads the NSFG pregnancy data.

    dct_file: string file name
    dat_file: string file name

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip')
    CleanFemPreg(df)
    return df

In [92]:
resp = ReadFemResp()
print(resp['pregnum'].value_counts().sort_index())
sum(resp['pregnum'].value_counts().sort_index())

0     2610
1     1267
2     1432
3     1110
4      611
5      305
6      150
7       80
8       40
9       21
10       9
11       3
12       2
14       2
19       1
Name: pregnum, dtype: int64


7643

In [93]:
type(resp['pregnum'].value_counts().sort_index())

pandas.core.series.Series

In [95]:
resp['pregnum'].value_counts().sort_index().index[1]

1

In [104]:
resp['pregnum'].value_counts().sort_index()[1]

1267

In [105]:
num_preg = resp['pregnum'].value_counts().sort_index()

In [None]:
#matching number of pregnancies from respondent file with preg file (number of records = 13593)

In [136]:
# complex syntax -see better one below
accu = 0
for i in range(len(num_preg.index)):
    print("i=", i,"index=", num_preg.index[i], num_preg[num_preg.index[i]], num_preg.index[i]*num_preg[num_preg.index[i]])
    accu += num_preg.index[i]*num_preg[num_preg.index[i]]
print(accu)

i= 0 index= 0 2610 0
i= 1 index= 1 1267 1267
i= 2 index= 2 1432 2864
i= 3 index= 3 1110 3330
i= 4 index= 4 611 2444
i= 5 index= 5 305 1525
i= 6 index= 6 150 900
i= 7 index= 7 80 560
i= 8 index= 8 40 320
i= 9 index= 9 21 189
i= 10 index= 10 9 90
i= 11 index= 11 3 33
i= 12 index= 12 2 24
i= 13 index= 14 2 28
i= 14 index= 19 1 19
13593


In [112]:
len(num_preg.index)

15

In [120]:
num_preg.index[13]

14

In [133]:
num_preg[14]

2

In [142]:
#Much better syntax

In [141]:
for index, pregnum in resp.pregnum.value_counts().iteritems():
    print(index, pregnum)

0 2610
2 1432
1 1267
3 1110
4 611
5 305
6 150
7 80
8 40
9 21
10 9
11 3
14 2
12 2
19 1


In [143]:
for index, pregnum in resp.pregnum.iteritems():
        caseid = resp.caseid[index]
        indices = preg_map[caseid]
        print(caseid, indices)

2298 [2610, 2611, 2612, 2613]
5012 [5515]
11586 [12524]
6794 []
616 []
845 [936, 937, 938, 939, 940, 941, 942, 943]
10333 []
855 []
8656 [9412, 9413, 9414]
3566 []
5917 [6517, 6518]
9200 [9953]
6320 [6980, 6981]
11700 [12657, 12658]
7354 [8020, 8021, 8022, 8023, 8024, 8025, 8026]
3697 []
4881 [5371, 5372, 5373]
5862 [6464, 6465, 6466, 6467]
8542 [9282, 9283]
2054 [2305, 2306, 2307]
3719 [4178]
11740 [12720, 12721, 12722, 12723, 12724]
11343 [12273, 12274, 12275]
7075 [7721, 7722, 7723]
5422 [5925, 5926]
2178 [2456, 2457]
8358 []
5083 [5591, 5592]
1545 [1726, 1727]
5656 [6209, 6210]
9334 [10108, 10109, 10110, 10111]
5507 [6034, 6035, 6036]
611 []
4260 [4768, 4769]
11767 [12741, 12742, 12743]
5573 [6098, 6099, 6100, 6101, 6102]
11901 [12901]
8975 [9722]
5267 [5735, 5736, 5737]
910 [1034]
4463 [5000, 5001, 5002, 5003]
8954 [9714]
1814 [2074, 2075, 2076]
7011 [7678, 7679, 7680, 7681]
4057 [4552, 4553]
7081 [7736, 7737, 7738, 7739]
5499 [6023]
6551 [7231, 7232, 7233, 7234, 7235]
9242 []
114

In [145]:
resp.head()

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


In [149]:
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
0,1,1,,,,,6.0,,1.0,,...,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125,3.997288
1,1,2,,,,,6.0,,1.0,,...,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875,3.572044
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125,4.139035
3,2,2,,,,,6.0,,1.0,,...,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0,3.17515
4,2,3,,,,,6.0,,1.0,,...,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875,2.806606
