In [1]:
import numpy as np
from collections import defaultdict

import nsfg
import thinkstats2 as util

### Read data from Stata format

In [2]:
dct = util.ReadStataDct('data/2002FemPreg.dct')
df = dct.ReadFixedWidth('data/2002FemPreg.dat.gz', compression='gzip')

### Data Cleaning

In [3]:
df.agepreg /= 100.0

na_vals = [97, 98, 99]
df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)
df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)

df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0
### Note: dot notation does not work when adding a new column
# df.totalwgt_lb = df.birthwgt_lb + df.birthwgt_oz / 16.0 (X)

# df.loc[row_indexer_condition, column_name]
df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan

### How to use DataFarme of Pandas

In [4]:
df.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

In [5]:
df.columns[1]

'pregordr'

In [6]:
#pregordr = df.pregordr
pregordr = df['pregordr']
type(pregordr)

pandas.core.series.Series

In [7]:
pregordr[0]

1

In [8]:
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

### Validation

In [9]:
''' 
Expected Result:

value label Total
1 LIVE BIRTH 9148
2 INDUCED ABORTION 1862
3 STILLBIRTH 120
4 MISCARRIAGE 1921
5 ECTOPIC PREGNANCY 190
6 CURRENT PREGNANCY 352
'''

df.outcome.value_counts(sort=False).sort_index(0)

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

In [10]:
'''
Expected Result:
value label Total
. INAPPLICABLE 4449
0-5 UNDER 6 POUNDS 1125
6 6 POUNDS 2223
7 7 POUNDS 3049
8 8 POUNDS 1889
9-95 9 POUNDS OR MORE 799
'''

df.birthwgt_lb.value_counts(sort=False).sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

### Interpretation

In [11]:
preg_map = defaultdict(list)

for index, caseid in df.caseid.iteritems():
    preg_map[caseid].append(index)

In [12]:
indices = preg_map[10229]

# Series.values: numpy array
df.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1])

### Exercise 1

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [13]:
'''
Expected result:
value	label	 	Total
.	INAPPLICABLE	 	4445
1	1ST BIRTH	 	4413
2	2ND BIRTH	 	2874
3	3RD BIRTH	 	1234
4	4TH BIRTH	 	421
5	5TH BIRTH	 	126
6	6TH BIRTH	 	50
7	7TH BIRTH	 	20
8	8TH BIRTH	 	7
9	9TH BIRTH	 	2
10	10TH BIRTH	 	1
 	Total	 	13593
'''

df.birthord.value_counts()

1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
9.0        2
10.0       1
Name: birthord, dtype: int64

In [14]:
df.birthord.isnull().sum()

4445

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [15]:
'''
Expected result:
value	label	 	Total
0-13	13 WEEKS OR LESS	 	3522
14-26	14-26 WEEKS	 	793
27-50	27 WEEKS OR LONGER	 	9278
 	Total	 	13593
'''

df.prglngth.value_counts().sort_index()

0       15
1        9
2       78
3      151
4      412
5      181
6      543
7      175
8      409
9      594
10     137
11     202
12     170
13     446
14      29
15      39
16      44
17     253
18      17
19      34
20      18
21      37
22     147
23      12
24      31
25      15
26     117
27       8
28      38
29      23
30     198
31      29
32     122
33      50
34      60
35     357
36     329
37     457
38     609
39    4744
40    1120
41     591
42     328
43     148
44      46
45      10
46       1
47       1
48       7
50       2
Name: prglngth, dtype: int64

Create a new column named <tt>totalwgt_kg</tt> that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [16]:
lb_to_kg_ratio = 0.45359237

df['totalwgt_kg'] = df['totalwgt_lb'] * lb_to_kg_ratio
df.totalwgt_kg.mean()

3.29784711480321

In [20]:
resp = nsfg.ReadFemResp(dct_file='data/2002FemResp.dct', dat_file='data/2002FemResp.dat.gz')

In [21]:
resp.head()

Unnamed: 0,caseid,rscrinf,rdormres,rostscrn,rscreenhisp,rscreenrace,age_a,age_r,cmbirth,agescrn,...,pubassis_i,basewgt,adj_mod_basewgt,finalwgt,secu_r,sest,cmintvw,cmlstyr,screentime,intvlngth
0,2298,1,5,5,1,5.0,27,27,902,27,...,0,3247.916977,5123.759559,5556.717241,2,18,1234,1222,18:26:36,110.492667
1,5012,1,5,1,5,5.0,42,42,718,42,...,0,2335.279149,2846.79949,4744.19135,2,18,1233,1221,16:30:59,64.294
2,11586,1,5,1,5,5.0,43,43,708,43,...,0,2335.279149,2846.79949,4744.19135,2,18,1234,1222,18:19:09,75.149167
3,6794,5,5,4,1,5.0,15,15,1042,15,...,0,3783.152221,5071.464231,5923.977368,2,18,1234,1222,15:54:43,28.642833
4,616,1,5,4,1,5.0,20,20,991,20,...,0,5341.329968,6437.335772,7229.128072,2,18,1233,1221,14:19:44,69.502667


Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [23]:
resp.age_r.value_counts().sort_index()

# Answer: 15(youngeset), 44(oldest)

15    217
16    223
17    234
18    235
19    241
20    258
21    267
22    287
23    282
24    269
25    267
26    260
27    255
28    252
29    262
30    292
31    278
32    273
33    257
34    255
35    262
36    266
37    271
38    256
39    215
40    256
41    250
42    215
43    253
44    235
Name: age_r, dtype: int64

How old is the respondent with `caseid` 1?

In [30]:
resp[resp.caseid == 1].age_r

# Answer: 44

1069    44
Name: age_r, dtype: int64

What are the pregnancy lengths for the respondent with `caseid` 2298?

In [34]:
df[df.caseid == 2298].prglngth.values

# Answer: 40, 36, 30, 40

array([40, 36, 30, 40])

What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [37]:
df[df.caseid == 5012].birthwgt_lb

# Answer: 6.0

5515    6.0
Name: birthwgt_lb, dtype: float64