This header imports all of the modules and libraries you will need in this exercise:

In [40]:
""" This file contains code for use with "Think Stats", by Allen B. Downey, available from greenteapress.com
    Copyright 2014 Allen B. Downey
    License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function

import numpy as np
import sys

import nsfg
import thinkstats2

Write a function `ReadFemResp` that reads the NSFG respondent data file (2002FemResp) and returns it as a pandas DataFrame.

In [41]:
def ReadFemResp(dct_file='data/2002FemResp.dct',
                dat_file='data/2002FemResp.dat.gz'):
    """Loads the NSFG 2002FemResp respondent data.

    dct_file: string file name, parsed with thinkstats2.ReadStataDct
    dat_file: string file name of the gzip-compressed data, read with
              the parsed dictionary's ReadFixedWidth method

    returns: DataFrame
    """
    # The parsed dictionary object knows how to read the matching data file.
    dictionary = thinkstats2.ReadStataDct(dct_file)
    return dictionary.ReadFixedWidth(dat_file, compression='gzip')

In [42]:
# Load the respondent data using ReadFemResp's default file paths.
resp = ReadFemResp()
# Last expression of the cell: displays the first rows as a preview.
resp.head()

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


The variable *pregnum* is a recode that indicates how many times each respondent has been pregnant. Print the value counts for this variable and compare them to the published results in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=FEM&section=R&subSec=7869&srtLabel=606835)

In [47]:
# Sanity check: the respondent file is expected to contain 7643 rows.
assert len(resp) == 7643

# Tally respondents by pregnum; the result is indexed by the pregnum value.
pregnum_counts = resp.pregnum.value_counts()

# Expected frequencies for 0 through 6 pregnancies (label-based lookups),
# to be compared with the published codebook.
assert pregnum_counts[0] == 2610
assert pregnum_counts[1] == 1267
assert pregnum_counts[2] == 1432
assert pregnum_counts[3] == 1110
assert pregnum_counts[4] == 611
assert pregnum_counts[5] == 305
assert pregnum_counts[6] == 150

# Respondents with 7 or more pregnancies are checked as a single total.
# Select them by index label: value_counts() orders its result by frequency,
# so a positional slice such as [7:] would only pick out "pregnum >= 7" when
# the counts happen to fall in the same order as the labels.
assert pregnum_counts[pregnum_counts.index >= 7].sum() == 158

Cross-validate the respondent and pregnancy files by comparing *pregnum* for each respondent with the number of records in the pregnancy file. You can use nsfg.MakePregMap to make a dictionary that maps from each caseid to a list of indices into the pregnancy DataFrame.

In [54]:
def ValidatePregnum(resp):
    """Cross-validates pregnum against the pregnancy file.

    For every respondent, compares the pregnum value with the number of
    pregnancy-file indices that nsfg.MakePregMap lists for that caseid.
    The first mismatch is printed as (caseid, number of records, pregnum).

    resp: respondent DataFrame with caseid and pregnum columns

    returns: True if every respondent matches, False at the first mismatch
    """
    # read the pregnancy DataFrame
    preg = nsfg.ReadFemPreg()

    # make the map from caseid to list of pregnancy indices
    preg_map = nsfg.MakePregMap(preg)

    # Walk the caseid and pregnum columns in lockstep. This replaces
    # Series.iteritems(), which was removed in pandas 2.0, and avoids a
    # per-row label lookup into resp.caseid.
    for caseid, pregnum in zip(resp.caseid, resp.pregnum):
        # NOTE(review): respondents with no pregnancies need preg_map to
        # yield an empty list here (presumably a defaultdict) -- confirm
        # against nsfg.MakePregMap.
        indices = preg_map[caseid]

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if len(indices) != pregnum:
            print(caseid, len(indices), pregnum)
            return False

    return True

In [53]:
# Run the cross-validation once: each call to ValidatePregnum re-reads the
# pregnancy file, so the result is kept and reused below.
pregnum_ok = ValidatePregnum(resp)
assert pregnum_ok

# Last expression of the cell: displays the validation result.
pregnum_ok

True