# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [2]:
from __future__ import print_function, division

import nsfg

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [3]:
# Read the NSFG data into a DataFrame; later cells reuse `preg`.
preg = nsfg.ReadFemPreg()
# head() as the last expression displays the first rows of the frame.
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [102]:
# NOTE(review): the saved output below lists 'totalwgt_kg', which is only
# created by a later cell (preg['totalwgt_kg'] = ...), so this cell was last
# run out of order. On a fresh Restart & Run All it will show one column fewer.
preg.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt', 'finalwgt',
       'secu_p', 'sest', 'cmintvw', 'totalwgt_lb', 'totalwgt_kg'],
      dtype='object', length=245)

In [106]:
# Print every column label followed by its position, so that a column can
# later be picked out by number with preg.columns[i].
# enumerate() supplies the running position instead of a hand-kept counter.
for count, column_name in enumerate(preg.columns):
    print(column_name)
    print(count)

caseid
0
pregordr
1
howpreg_n
2
howpreg_p
3
moscurrp
4
nowprgdk
5
pregend1
6
pregend2
7
nbrnaliv
8
multbrth
9
cmotpreg
10
prgoutcome
11
cmprgend
12
flgdkmo1
13
cmprgbeg
14
ageatend
15
hpageend
16
gestasun_m
17
gestasun_w
18
wksgest
19
mosgest
20
dk1gest
21
dk2gest
22
dk3gest
23
bpa_bdscheck1
24
bpa_bdscheck2
25
bpa_bdscheck3
26
babysex
27
birthwgt_lb
28
birthwgt_oz
29
lobthwgt
30
babysex2
31
birthwgt_lb2
32
birthwgt_oz2
33
lobthwgt2
34
babysex3
35
birthwgt_lb3
36
birthwgt_oz3
37
lobthwgt3
38
cmbabdob
39
kidage
40
hpagelb
41
birthplc
42
paybirth1
43
paybirth2
44
paybirth3
45
knewpreg
46
trimestr
47
ltrimest
48
priorsmk
49
postsmks
50
npostsmk
51
getprena
52
bgnprena
53
pnctrim
54
lpnctri
55
workpreg
56
workborn
57
didwork
58
matweeks
59
weeksdk
60
matleave
61
matchfound
62
livehere
63
alivenow
64
cmkidied
65
cmkidlft
66
lastage
67
wherenow
68
legagree
69
parenend
70
anynurse
71
fedsolid
72
frsteatd_n
73
frsteatd_p
74
frsteatd
75
quitnurs
76
ageqtnur_n
77
ageqtnur_p
78
ageqtnur
79
matchf

Select a single column name.

In [7]:
# Positional lookup into the column Index; position 10 is 'cmotpreg'.
preg.columns[10]

'cmotpreg'

Select a column and check what type it is.

In [11]:
# Bracket lookup with a single column label returns that column.
pregordr = preg['pregordr']
# Confirm the type of the result (a pandas Series).
type(pregordr)

pandas.core.series.Series

Print a column.

In [12]:
# A bare name as the last line of a cell displays the object: index labels,
# values, then the name, length and dtype.
pregordr

0        1
1        2
2        1
3        2
4        3
        ..
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

Select a single element from a column.

In [13]:
# The index of this Series runs 0..N-1, so key 0 is the first row.
pregordr[0]

1

Select a slice from a column.

In [14]:
# The stop of the slice is excluded: this returns rows 2, 3 and 4.
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

Select a column using dot notation.

In [15]:
# Attribute access reads an existing column; this rebinds `pregordr` to the
# same column that preg['pregordr'] selects.
pregordr = preg.pregordr

Count the number of times each value occurs.

In [19]:
# Tally each outcome code and order the table by the code itself, so it reads
# 1..6 deterministically instead of relying on whatever order
# value_counts(sort=False) happens to produce.
preg.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [20]:
# Tally the whole-pound part of the birth weight, ordered by weight.
preg['birthwgt_lb'].value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

In [21]:
# Build the caseid -> list-of-row-indices lookup once; later cells reuse it.
preg_map = nsfg.MakePregMap(preg)

# Pull the pregnancy outcomes for a single respondent.
caseid = 10229
indices = preg_map[caseid]
preg['outcome'][indices].values

array([4, 4, 4, 4, 4, 4, 1])

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [28]:
# Tally birth order, sorted by the birth-order value so the rows can be read
# off against the codebook. value_counts() alone sorts by frequency, which
# only happens to coincide with birth order for this data.
preg.birthord.value_counts().sort_index()


1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

We can also use `isnull` to count the number of NaNs.

In [50]:
# isnull() flags each NaN as True; summing the booleans counts the NaNs.
print(preg.birthord.isnull().sum())

# Pitfall, kept as a worked example: Series.sum() adds up the birth-order
# VALUES (one 3 and one 5 give 8, not 2), so it is not a row count, and
# subtracting the NaN count from it is not meaningful.
print('birthord.sum(): %s' % preg.birthord.sum())
print('birthord.sum()-birthord.isnull().sum(): %s' % ((preg.birthord.sum())-(preg.birthord.isnull().sum())))

# Correct non-NaN count: notnull() is the complement of isnull().
# (preg.birthord.count() returns the same number directly.)
print(preg.birthord.notnull().sum())

# Cross-check: value_counts() leaves NaN out by default, so the total of its
# counts is also the number of non-NaN rows.
print(preg.birthord.value_counts().sum())

4445
birthord.sum(): 16701.0
birthord.sum()-birthord.isnull().sum(): 12256.0
9148
9148


Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [51]:
# Bind the column to a name first; the tally works the same on the named
# Series as it does on preg.prglngth directly.
prglngth = preg['prglngth']
prglngth.value_counts().sort_index()

0       15
1        9
2       78
3      151
4      412
5      181
6      543
7      175
8      409
9      594
10     137
11     202
12     170
13     446
14      29
15      39
16      44
17     253
18      17
19      34
20      18
21      37
22     147
23      12
24      31
25      15
26     117
27       8
28      38
29      23
30     198
31      29
32     122
33      50
34      60
35     357
36     329
37     457
38     609
39    4744
40    1120
41     591
42     328
43     148
44      46
45      10
46       1
47       1
48       7
50       2
Name: prglngth, dtype: int64

To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [52]:
# Only the last expression of a cell is displayed automatically, so the mean
# is printed explicitly and the sorted tally is left as the final expression.
mean_lb = preg['totalwgt_lb'].mean()
print(mean_lb)

preg['totalwgt_lb'].value_counts().sort_index()

7.265628457623368


0.1250     1
0.3125     1
0.4375     1
0.5625     1
0.6250     2
          ..
13.0000    1
13.5000    1
13.7500    1
14.0000    3
15.4375    1
Name: totalwgt_lb, Length: 184, dtype: int64

Create a new column named <tt>totalwgt_kg</tt> that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [53]:
# A new column has to be created with bracket (dictionary-style) assignment:
# frame['new_label'] = values.
KG_PER_LB = 0.45359237  # 1 lb = 0.45359237 kg
preg['totalwgt_kg'] = preg['totalwgt_lb'] * KG_PER_LB

# Mean birth weight in kilograms.
print(preg['totalwgt_kg'].mean())

preg['totalwgt_kg'].value_counts().sort_index()

3.2956336316328243


0.056699    1
0.141748    1
0.198447    1
0.255146    1
0.283495    2
           ..
5.896701    1
6.123497    1
6.236895    1
6.350293    3
7.002332    1
Name: totalwgt_kg, Length: 184, dtype: int64

`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [38]:
# Read the female respondents file into a DataFrame; later cells reuse `resp`.
resp = nsfg.ReadFemResp()

`DataFrame` provides a method `head` that displays the first five rows:

In [39]:
# Display the first five rows of the respondent frame.
resp.head()

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [41]:
# value_counts() on its own orders by frequency; sorting by the index (which
# here is the age) puts the youngest respondents first and the oldest last.
resp.age_r.value_counts().sort_index()
#youngest is 15, oldest is 44

15    217
16    223
17    234
18    235
19    241
20    258
21    267
22    287
23    282
24    269
25    267
26    260
27    255
28    252
29    262
30    292
31    278
32    273
33    257
34    255
35    262
36    266
37    271
38    256
39    215
40    256
41    250
42    215
43    253
44    235
Name: age_r, dtype: int64

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [54]:
# Boolean mask on caseid keeps only the matching respondent row.
resp.loc[resp['caseid'] == 2298]

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667


And we can get the corresponding rows from `preg` like this:

In [22]:
# Same mask idea on the pregnancy frame: one row per pregnancy of caseid 2298.
preg.loc[preg['caseid'] == 2298]

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
2610,2298,1,,,,,6.0,,1.0,,...,0,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875
2611,2298,2,,,,,6.0,,1.0,,...,0,0,0,3247.916977,5123.759559,5556.717241,2,18,,5.5
2612,2298,3,,,,,6.0,,1.0,,...,0,0,0,3247.916977,5123.759559,5556.717241,2,18,,4.1875
2613,2298,4,,,,,6.0,,1.0,,...,0,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875


How old is the respondent with `caseid` 1?

In [70]:
# Build the row mask once and use it for both lookups.
is_case_1 = resp['caseid'] == 1

# Age of the respondent, selected from the age_r column alone.
print(resp['age_r'][is_case_1])

# Full respondent row, to confirm the age_r value shown above.
resp[is_case_1]

1069    44
Name: age_r, dtype: int64


Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
1069,1,1,5,4,5,5.0,44,44,695,44,...,0,3410.389399,3869.349602,6448.271112,2,9,1231,1219,19:56:43,67.563833


What are the pregnancy lengths for the respondent with `caseid` 2298?

In [73]:
# NOTE: this cell looks at caseid 1 (a trial run); the lookup for caseid 2298
# that the question asks about is in the next cell.
is_case_1 = preg['caseid'] == 1

# Pregnancy lengths for this respondent.
print(preg['prglngth'][is_case_1])

# prglngth falls in the elided part of the table below, so also print a
# column that is visible there (pregend1) to confirm the same rows are selected.
print(preg['pregend1'][is_case_1])

# Full rows for this respondent, for comparison with the printed values.
preg[is_case_1]

0    39
1    39
Name: prglngth, dtype: int64
0    6.0
1    6.0
Name: pregend1, dtype: float64


Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
0,1,1,,,,,6.0,,1.0,,...,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125,3.997283
1,1,2,,,,,6.0,,1.0,,...,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875,3.57204


In [107]:
# Row mask for the respondent the question asks about.
is_case_2298 = preg['caseid'] == 2298

# Pregnancy lengths for this respondent.
print(preg['prglngth'][is_case_2298])

# prglngth falls in the elided part of the table below, so also print a
# column that is visible there (pregend1) to confirm the same rows are selected.
print(preg['pregend1'][is_case_2298])

# Full rows for this respondent, for comparison with the printed values.
preg[is_case_2298]

2610    40
2611    36
2612    30
2613    40
Name: prglngth, dtype: int64
2610    6.0
2611    6.0
2612    6.0
2613    6.0
Name: pregend1, dtype: float64


Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
2610,2298,1,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.118448
2611,2298,2,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,5.5,2.494758
2612,2298,3,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,4.1875,1.899418
2613,2298,4,,,,,6.0,,1.0,,...,0,0,3247.916977,5123.759559,5556.717241,2,18,,6.875,3.118448


What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [95]:
# Trial run on the respondent used earlier (caseid 10229) before answering
# the question for caseid 5012 in the next cell. Two ways to get the weights.
caseid = 10229

# Way 1: boolean mask on the caseid column.
print('Values from looking up column: \n%s\n' % preg.totalwgt_lb[preg.caseid==caseid])

# Way 2: row indices from preg_map. It was built earlier in the notebook with
# nsfg.MakePregMap(preg) and is reused here, so that earlier cell must have
# been run first.
indices = preg_map[caseid]
print('Values from array: \n%s\n' % preg.totalwgt_lb[indices].values)

# Full rows for this respondent, to check both results against the table.
preg[preg.caseid==caseid]


Values from looking up column: 
11093       NaN
11094       NaN
11095       NaN
11096       NaN
11097       NaN
11098       NaN
11099    7.6875
Name: totalwgt_lb, dtype: float64

Values from array: 
[   nan    nan    nan    nan    nan    nan 7.6875]



Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
11093,10229,1,,,,,1.0,,,,...,0,0,1914.323805,2021.999794,3369.662656,2,65,,,
11094,10229,2,,,,,1.0,,,,...,0,0,1914.323805,2021.999794,3369.662656,2,65,,,
11095,10229,3,,,,,1.0,,,,...,0,0,1914.323805,2021.999794,3369.662656,2,65,,,
11096,10229,4,,,,,1.0,,,,...,0,0,1914.323805,2021.999794,3369.662656,2,65,,,
11097,10229,5,,,,,1.0,,,,...,0,0,1914.323805,2021.999794,3369.662656,2,65,,,
11098,10229,6,,,,,1.0,,,,...,0,0,1914.323805,2021.999794,3369.662656,2,65,,,
11099,10229,7,,,,,6.0,,1.0,,...,0,0,1914.323805,2021.999794,3369.662656,2,65,,7.6875,3.486991


In [96]:
# Respondent the question asks about.
caseid = 5012

# Way 1: boolean mask on the caseid column.
print('Values from looking up column: \n%s\n' % preg.totalwgt_lb[preg.caseid==caseid])

# Way 2: row indices from preg_map (built earlier in the notebook). `indices`
# must be looked up again after changing caseid, otherwise it would still
# hold the rows of the previous respondent (10229).
indices = preg_map[caseid]

# Array of totalwgt_lb values for the rows that belong to this caseid.
print('Values from array: \n%s\n' % preg.totalwgt_lb[indices].values)

# Full rows for this respondent, to check the values above. Only the last
# expression of a cell is displayed, so the frame is selected exactly once.
preg[preg.caseid==caseid]

Values from looking up column: 
5515    6.0
Name: totalwgt_lb, dtype: float64

Values from array: 
[6.]



Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb,totalwgt_kg
5515,5012,1,,,,,6.0,,1.0,,...,0,0,2335.279149,2846.79949,4744.19135,2,18,,6.0,2.721554
