# Data Preprocessing

This notebook conducts data preprocessing, including data cleaning, feature engineering, and subsampling.

In [1]:
import pandas as pd

In [2]:
birth = pd.read_csv('US_births(2018).csv')

  birth = pd.read_csv('US_births(2018).csv')


In [3]:
birth.shape

(3801534, 55)

## Data Cleaning

In [4]:
clean_birth = birth.dropna()

# remove missing values in the response/outcome variable
clean_birth = clean_birth[clean_birth['DBWT'] != 9999] 

# remove missing values in the features
clean_birth = clean_birth[
    (clean_birth['PRECARE'] != 99) & 
    (clean_birth['CIG_0'] != 99) & 
    (clean_birth['BMI'] != 99.9) & 
    (clean_birth['PREVIS'] != 99) & 
    (clean_birth['MRAVE6'] != 9) & 
    (clean_birth['PAY_REC'] != 9) &
    (clean_birth['FRACE6'] != 9) &
    (clean_birth['MEDUC'] != 9) & 
    (clean_birth['FEDUC'] != 9) & 
    (clean_birth['NO_RISKS'] != 9) & 
    (clean_birth['ATTEND'] != 9) &
    (clean_birth['BFACIL'] != 9) &
    (clean_birth['FAGECOMB'] != 99) &
    (clean_birth['RF_CESAR'] != 'U') &
    (clean_birth['LD_INDL'] != 'U') &
    (clean_birth['MBSTATE_REC'] != 3) &
    (clean_birth['M_Ht_In'] != 99) &
    (clean_birth['NO_INFEC'] != 9) &
    (clean_birth['NO_MMORB'] != 9) &
    (clean_birth['PRIORLIVE'] != 99) &
    (clean_birth['PRIORTERM'] != 99) &
    (clean_birth['RDMETH_REC'] != 9) &
    (clean_birth['DLMP_YY'] != 9999) &
    (clean_birth['DLMP_MM'] != 99) &
    (clean_birth['PWgt_R'] != 999) &
    (clean_birth['WTGAIN'] != 99) &
    (clean_birth['ILLB_R'] != 909)
] 

In [5]:
clean_birth.shape

(2783789, 55)

## Feature engineering

In [6]:
# estimate pregnancy length
clean_birth['PREG_LEN'] = 12*(2018 - clean_birth['DLMP_YY']) + (clean_birth['DOB_MM'] - clean_birth['DLMP_MM']) 

# recode PRECARE
clean_birth['PRECARE'][(clean_birth['PRECARE'] < 4) & (clean_birth['PRECARE'] > 0)] = 1 
clean_birth['PRECARE'][(clean_birth['PRECARE'] < 7) & (clean_birth['PRECARE'] > 3)] = 2
clean_birth['PRECARE'][(clean_birth['PRECARE'] > 6)] = 3

# compute percentage weight gain
clean_birth['WTGAIN_PER'] = clean_birth['WTGAIN'] / clean_birth['PWgt_R'] 

# binarize CIG_0
clean_birth['CIG'] = clean_birth['CIG_0'] > 0 

# binarize PRIORDEAD
clean_birth['PRIORDEAD'] = clean_birth['PRIORDEAD'] > 0

# binarize PRIORTERM
clean_birth['PRIORTERM'] = clean_birth['PRIORTERM'] > 0

# binarize PRIORLIVE
clean_birth['PRIORLIVE'] = clean_birth['PRIORLIVE'] > 0

# compute first time live birth
clean_birth['FIRST_BIRTH'] = clean_birth['ILLB_R'] == 888

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_birth['PRECARE'][(clean_birth['PRECARE'] < 4) & (clean_birth['PRECARE'] > 0)] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_birth['PRECARE'][(clean_birth['PRECARE'] < 7) & (clean_birth['PRECARE'] > 3)] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_birth['PRECARE'][(clean_birth['PRECARE'] > 6)] = 3


In [7]:
# drop columns where >99% entries are the same
clean_birth = clean_birth.drop(['DOB_YY', 'IMP_SEX', 'IP_GON', 'MAGE_IMPFLG', 'MAR_IMP', 'MM_AICU', 'MTRAN'], axis=1)

# drop redundant columns due to feature engineering
clean_birth = clean_birth.drop(['WTGAIN', 'PWgt_R', 'DWgt_R', 'DOB_MM', 
                                   'DOB_WK', 'DOB_TT', 'DOB_MM', 'DLMP_YY',
                                   'DLMP_MM', 'PAY', 'MHISPX', 'MRACE15',
                                   'MRACE31', 'MRACEIMP', 'FHISPX', 'FRACE15',
                                   'FRACE31', 'RF_CESARN', 'ILOP_R', 'ILP_R', 'ILLB_R','CIG_0'], axis=1)

In [8]:
clean_birth.shape

(2783789, 31)

## Subsampling

In [9]:
n = 10000
sub_clean_birth = clean_birth.sample(n, random_state=102)

In [10]:
sub_clean_birth

Unnamed: 0,ATTEND,BFACIL,BMI,DBWT,DMAR,FAGECOMB,FEDUC,FRACE6,LD_INDL,MAGER,...,PRIORLIVE,PRIORTERM,RDMETH_REC,RESTATUS,RF_CESAR,SEX,PREG_LEN,WTGAIN_PER,CIG,FIRST_BIRTH
2780164,1,1,31.4,3670,1,29,6,1,N,32,...,False,False,1,2,N,M,9,0.000000,False,True
1306496,2,1,27.6,3494,1,34,4,1,Y,33,...,True,False,1,1,N,F,9,0.120482,False,False
3027006,1,1,27.1,3374,2,43,2,1,N,29,...,True,True,1,1,N,M,10,0.061350,True,False
903600,1,1,26.8,3520,1,30,3,1,Y,28,...,False,True,1,1,N,M,9,0.301282,False,True
2813944,1,1,21.3,3140,1,30,5,1,N,30,...,False,False,1,3,N,M,9,0.208333,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1541267,1,1,35.9,3062,1,30,4,1,Y,30,...,True,False,1,1,N,M,9,0.173684,False,False
1786746,1,1,22.5,3855,1,30,3,1,Y,23,...,False,False,3,1,N,M,9,0.263514,False,True
1928116,1,1,20.4,2710,1,39,2,1,Y,32,...,True,True,1,2,N,M,9,0.388889,True,False
60244,1,1,24.4,3118,1,35,2,1,Y,34,...,False,False,3,2,N,M,10,0.147887,False,True


In [11]:
sub_clean_birth.to_csv('subsampled_clean_data.csv')