Stakeholder: PRAMS Pregnancy Monitoring Network

Business Problem: The APGAR score is a metric doctors use to assess the overall health of a newborn infant. But what if there were a way to prepare for a low APGAR score before the infant is even born? PRAMS has asked me to use natality data to build a model that alerts medical staff when a newborn is likely to need emergency medical intervention.

In [1]:
import numpy as np
import pandas as pd
import xlrd
import os
import seaborn as sns
import xgboost
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier,\
ExtraTreesClassifier, VotingClassifier, StackingRegressor
from sklearn.metrics import r2_score, accuracy_score, precision_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier

In [38]:
# Load the natality CSV into the working DataFrame `df`.
# The relative path is named once so it is easy to spot and change.
NATALITY_CSV = "data/nat2019us.csv"
df = pd.read_csv(NATALITY_CSV)

In [4]:
# Display the column index of the freshly loaded frame (names, dtype, length).
df.columns

Index(['dob_yy', 'dob_mm', 'dob_tt', 'dob_wk', 'bfacil', 'f_facility',
       'bfacil3', 'mage_impflg', 'mage_repflg', 'mager',
       ...
       'f_ca_cleftlp', 'f_ca_cleft', 'f_ca_downs', 'f_ca_chrom', 'f_ca_hypos',
       'no_congen', 'itran', 'ilive', 'bfed', 'f_bfed'],
      dtype='object', length=226)

In [4]:
# Look up the name of the column at position 23 (zero-based).
df.columns[23]

'mar_p'

In [8]:
# Count how many rows carry each distinct value of `mm_aicu`.
df['mm_aicu'].value_counts()

N    3746724
Y       6745
U       4113
Name: mm_aicu, dtype: int64

In [9]:
# Count how many rows carry each distinct value of `apgar5r`.
df['apgar5r'].value_counts()

4    3170200
3     495375
2      55585
1      21749
5      14673
Name: apgar5r, dtype: int64

In [4]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,dob_yy,dob_mm,dob_tt,dob_wk,bfacil,f_facility,bfacil3,mage_impflg,mage_repflg,mager,...,f_ca_cleftlp,f_ca_cleft,f_ca_downs,f_ca_chrom,f_ca_hypos,no_congen,itran,ilive,bfed,f_bfed
0,2019,1,1135,3,1,1,1,,,29,...,1,1,1,1,1,1,N,Y,Y,1
1,2019,1,1305,3,1,1,1,,,40,...,1,1,1,1,1,1,N,Y,Y,1
2,2019,1,800,3,1,1,1,,,30,...,1,1,1,1,1,1,N,Y,Y,1
3,2019,1,130,4,1,1,1,,,25,...,1,1,1,1,1,1,N,Y,Y,1
4,2019,1,1426,4,1,1,1,,,38,...,1,1,1,1,1,1,N,Y,Y,1


In [20]:
# Show the column names from position 215 through the end of the index.
df.columns[215:]

Index(['f_ca_limb', 'f_ca_cleftlp', 'f_ca_cleft', 'f_ca_downs', 'f_ca_chrom',
       'f_ca_hypos', 'no_congen', 'itran', 'ilive', 'bfed', 'f_bfed'],
      dtype='object')

In [13]:
# NOTE(review): `removelist` is not assigned in any cell visible before this
# one, so this line depends on kernel state from elsewhere. Confirm where it is
# defined; on a fresh kernel with Run All an undefined name raises NameError.
removelist

[]

In [8]:
# Count how many rows carry each distinct value of `combgest`.
df['combgest'].value_counts()

39    1149203
40     689406
38     617932
37     374377
41     280391
36     171646
35      91927
42      91029
34      61717
43      45456
33      34924
44      24530
32      23384
31      16185
45      13048
30      12646
29       9369
46       7999
28       7778
27       5634
47       5493
26       4865
25       4242
24       3467
23       2611
99       2539
22       1830
21       1385
20       1081
19        699
18        493
17        296
Name: combgest, dtype: int64

In [39]:
# Column labels to discard from `df`, in their original order.
# Kept as one whitespace-separated text block so entries can be added or
# removed without managing quotes and commas; .split() yields a list of str.
remove = """
    dob_yy f_facility bfacil3 mage_impflg mage_repflg mager14 mager9 mrace31 mrace6 mrace15
    mraceimp mhispx f_mhisp mracehisp mar_p mar_imp f_mar_p f_meduc fagerpt_flg fagerec11
    frace31 frace15 fhispx f_fhisp lbo_rec tbo_rec illb_r illb_r11 ilop_r
    ilop_r11 ilp_r11 f_mpcb precare5 previs_rec f_tpcv f_wic cig0_r cig1_r cig2_r
    cig3_r f_cigs_0 f_cigs_1 f_cigs_2 f_cigs_3 cig_rec f_tobaco f_m_ht bmi_r pwgt_r
    f_pwgt f_dwgt wtgain_rec f_wtgain f_rf_pdiab f_rf_gdiab f_rf_phyper f_rf_ghyper
    f_rf_eclamp f_rf_ppb f_rf_inf_drg f_rf_inf_art rf_cesar f_rf_cesar f_rf_ncesar
    no_risks f_ip_gonor f_ip_syph f_ip_chlam f_ip_hepatb f_ip_hepatc no_infec ob_ecvs ob_ecvf
    f_ob_succ f_ob_fail ld_indl ld_augm ld_anes f_ld_indl f_ld_augm f_ld_ster f_ld_antb
    f_ld_chor f_ld_anes no_lbrdlv me_pres me_rout me_trial f_me_pres f_me_rout f_me_trial
    rdmeth_rec dmeth_rec f_dmeth_rec mm_mtr mm_plac mm_rupt mm_uhyst mm_aicu f_mm_mtr
    f_mm_rupt f_mm_uhyst f_mm_aicu no_mmorb mtran pay f_pay f_pay_rec apgar5r
    f_apgar5 apgar10 apgar10r imp_plur setorder_r imp_sex dlmp_mm dlmp_yy compgst_imp
    obgest_flg gestrec10 gestrec3 lmpused oegest_comb oegest_r10 dbwt bwtr12 bwtr4
    ab_aven1 ab_aven6 ab_nicu ab_surf ab_anti ab_seiz f_ab_vent f_ab_vent6
    f_ab_surfac f_ab_antibio f_ab_seiz no_abnorm ca_anen ca_mnsb ca_cchd ca_cdh ca_omph
    ca_gast f_ca_anen f_ca_menin f_ca_heart f_ca_hernia f_ca_ompha f_ca_gastro ca_limb
    ca_cleft ca_clpal ca_disor ca_hypo f_ca_limb f_ca_cleftlp f_ca_cleft f_ca_downs
    f_ca_chrom f_ca_hypos no_congen itran ilive bfed f_bfed f_mm_ f_ab_nicu fracehisp
    oegest_r3
""".split()

In [40]:
# Drop every column named in `remove`, rebinding `df` rather than mutating it
# in place. With errors="ignore" the cell can be re-run safely: once the labels
# are gone (or have been relabelled by a later cell) a second execution is a
# no-op instead of a KeyError.
# NOTE: a misspelled entry in `remove` is skipped silently for the same reason,
# so check the column index displayed below against the expected survivors.
df = df.drop(columns=remove, errors="ignore")
df.columns

Index(['dob_mm', 'dob_tt', 'dob_wk', 'bfacil', 'mager', 'mbstate_rec',
       'restatus', 'mbrace', 'mhisp_r', 'dmar', 'meduc', 'fagecomb', 'frace6',
       'fhisp_r', 'feduc', 'priorlive', 'priordead', 'priorterm', 'ilp_r',
       'precare', 'previs', 'wic', 'cig_0', 'cig_1', 'cig_2', 'cig_3',
       'm_ht_in', 'bmi', 'dwgt_r', 'wtgain', 'rf_pdiab', 'rf_gdiab',
       'rf_phype', 'rf_ghype', 'rf_ehype', 'rf_ppterm', 'rf_inftr', 'rf_fedrg',
       'rf_artec', 'rf_cesarn', 'ip_gon', 'ip_syph', 'ip_chlam', 'ip_hepatb',
       'ip_hepatc', 'ld_ster', 'ld_antb', 'ld_chor', 'attend', 'pay_rec',
       'apgar5', 'dplural', 'sex', 'combgest', 'ca_downs'],
      dtype='object')

In [41]:
# Human-readable replacement labels, listed in the same order as the columns
# that remain in `df` after the drop (one new name per surviving column).
# Stored as a whitespace-separated block; .split() yields a list of str.
rename = """
    birth_month birth_time birth_week birth_place mother_age mother_nativity mother_residence
    mother_race mother_hispanic marital_status mother_education father_age father_race
    father_hispanic father_education living_children deceased_children terminations
    months_last_pregnancy first_prenatal_care_month prenatal_visits food_assistance prepregnancy_daily_cig
    first_tri_daily_cig second_tri_daily_cig third_tri_daily_cig height bmi delivery_weight
    weight_gain diabetes gestational_diabetes hypertension gestational_hypertension eclampsia
    previous_premature_birth infertility_treatment fertility_drugs assisted_reproduction
    previous_cesareans gonorrhea syphilis chlamydia hepatitis_b hepatitis_c steroids antibiotics
    chorioamnionitis attendant payment_method apgar_score plural_pregnancy infant_sex weeks_gestation
    downs_syndrome
""".split()

In [42]:
# Relabel the columns positionally: the i-th entry of `rename` becomes the name
# of the i-th column, so `rename` must stay in the same order (and have the same
# length) as the columns currently in `df`.
df.columns = rename
# Display the new column index to confirm the relabelling.
df.columns

Index(['birth_month', 'birth_time', 'birth_week', 'birth_place', 'mother_age',
       'mother_nativity', 'mother_residence', 'mother_race', 'mother_hispanic',
       'marital_status', 'mother_education', 'father_age', 'father_race',
       'father_hispanic', 'father_education', 'living_children',
       'deceased_children', 'terminations', 'months_last_pregnancy',
       'first_prenatal_care_month', 'prenatal_visits', 'food_assistance',
       'prepregnancy_daily_cig', 'first_tri_daily_cig', 'second_tri_daily_cig',
       'third_tri_daily_cig', 'height', 'bmi', 'delivery_weight',
       'weight_gain', 'diabetes', 'gestational_diabetes', 'hypertension',
       'gestational_hypertension', 'eclampsia', 'previous_premature_birth',
       'infertility_treatment', 'fertility_drugs', 'assisted_reproduction',
       'previous_cesareans', 'gonorrhea', 'syphilis', 'chlamydia',
       'hepatitis_b', 'hepatitis_c', 'steroids', 'antibiotics',
       'chorioamnionitis', 'attendant', 'payment_method'