# Predicting the Hyper-local Prevalence of Chronic Kidney Disease

Using Census and CDC data, we may be able to better identify neighborhoods where aggressive public health campaigns and healthcare initiatives can positively affect the early detection and treatment of CKD.

## 1.0 Load Libraries

In [1]:
# import libraries
import os
import sys
from dotenv import load_dotenv, find_dotenv

%load_ext autoreload
%autoreload 2
%load_ext watermark

In [2]:
# project root: the directory one level above the notebook's working directory
PROJ_ROOT = os.pardir

# make the project's packages importable from this notebook
sys.path.append(PROJ_ROOT)

# read variables from the discovered .env file into the process environment
load_dotenv(dotenv_path=find_dotenv(), verbose=True)

# Census Bureau API key (None when CENSUS_API_KEY is not set)
API_KEY = os.getenv("CENSUS_API_KEY")

In [3]:
# import project modules
# NOTE(review): the star import is the only visible source for the helpers used
# below (load_and_split_dataset, xgb_model, get_county_code, get_county_data,
# predict_county_ckd) and presumably also for `pd` -- confirm, then consider
# importing these names explicitly so their origin is clear.
from predict_ckd.data import *

In [4]:
# record author, date/time, interpreter version and key package versions for reproducibility
%watermark -a "E. Chris Lynch" -d -t -v -p numpy,pandas,matplotlib,requests,sklearn

E. Chris Lynch 2019-03-19 23:58:44 

CPython 3.6.6
IPython 7.1.1

numpy 1.15.4
pandas 0.24.1
matplotlib 3.0.3
requests 2.20.0
sklearn 0.20.0


## 1.1 Load Data
The `load_and_split_dataset` function loads the 2016 data that was collected from the Census Bureau API and cleaned, and splits it into training and test sets for fitting and evaluating a predictive model.

In [5]:
# fetch the prepared dataset as train/test features (X) and labels (y)
(X_train, X_test,
 y_train, y_test) = load_and_split_dataset()

## 2.0 Train the Model
We will use the data to create a model using stochastic XGBoost. This model will be saved in order to predict CKD on 2017 and future data.

In [6]:
# model data with XGBoost
params = {'n_estimators':700, 'max_depth': 5, 'eta':0.1,
          'colsample_bytree':0.1, 'random_state':22}

# Bind the fitted model to its own name: assigning the result back to
# `xgb_model` would overwrite the imported training function, so this cell
# would fail the second time it is run in the same kernel.
ckd_model = xgb_model(params=params, features_train=X_train,
                      labels_train=y_train, features_test=X_test,
                      labels_test=y_test)

# persist the trained model so the prediction step can load it from disk
ckd_model.save_model('data/xgb.model')

R2: 0.8577
MSE: 0.1276
Adjusted R2: 0.8526


## 3.0 Make Predictions
We will now make predictions using 2017 data for Contra Costa County in California. If you don't know the FIPS code for your county, you can look it up with the `get_county_code` function. You must have a valid Census Bureau API key (read from the `CENSUS_API_KEY` environment variable) to run the following code.

In [9]:
# look up the county code for the example county and display it
code = get_county_code(
    state='CA',
    county='Contra Costa County',
)
code

192    013
Name: county_code, dtype: object

In [10]:
# download 2017 tract-level Census data for the county (code '013' from the lookup above)
contra_costa = get_county_data(
    state='CA',
    county='013',
    key=API_KEY,
    year='2017',
)

In [19]:
# run the saved model on the county data and preview the first rows
cc_preds = predict_county_ckd(
    model='data/xgb.model',
    data=contra_costa,
)
cc_preds.head()

Unnamed: 0,S0101_C02_002E,state,county,tract,S0101_C02_003E,S0101_C02_004E,S0101_C02_005E,S0101_C02_006E,S0101_C02_007E,S0101_C02_008E,...,S2701_C05_002E,S2701_C05_005E,S2701_C05_011E,S2703_C03_003E,S2703_C03_004E,S2703_C03_005E,S2704_C03_003E,S2704_C03_004E,S2704_C03_005E,ckd_prediction
0,0.0,6,13,351102,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11.5,1.85,66.65,34.5,42.5,0.0,3.9,99.1,5.480069
1,5.1,6,13,339002,2.9,4.5,4.8,8.1,15.9,7.7,...,0.0,4.8,4.6,62.7,70.5,31.1,0.0,0.6,92.6,3.700408
2,8.6,6,13,355115,11.1,9.5,6.1,4.0,5.7,8.6,...,0.0,8.2,1.4,75.7,79.3,11.1,0.0,0.6,92.4,2.949047
3,5.5,6,13,355116,12.4,13.8,8.3,2.1,1.9,3.5,...,0.0,0.0,0.4,87.1,86.0,33.3,0.0,0.8,89.8,3.426589
4,5.6,6,13,355117,13.4,15.0,6.8,3.7,1.4,1.5,...,0.0,0.0,0.5,91.5,89.4,22.5,0.0,1.1,87.6,2.829141


In [20]:
# Attach the predictions to the county data, aligned on the row index.
# The prediction column is taken from `cc_preds` (produced by the prediction
# cell above) rather than from `ckd_preds`, which no cell in this notebook
# defines and which therefore fails on a fresh kernel.
# Any existing `ckd_prediction` column is dropped first so the join cannot
# collide if the prediction step already added it to `contra_costa`.
contra_costa_full = (
    contra_costa
    .drop(columns='ckd_prediction', errors='ignore')
    .join(cc_preds['ckd_prediction'], how='inner')
)
contra_costa_full.head()

Unnamed: 0,S0101_C02_002E,state,county,tract,S0101_C02_003E,S0101_C02_004E,S0101_C02_005E,S0101_C02_006E,S0101_C02_007E,S0101_C02_008E,...,S2701_C05_002E,S2701_C05_005E,S2701_C05_011E,S2703_C03_003E,S2703_C03_004E,S2703_C03_005E,S2704_C03_003E,S2704_C03_004E,S2704_C03_005E,ckd_prediction
0,0.0,6,13,351102,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11.5,1.85,66.65,34.5,42.5,0.0,3.9,99.1,5.480069
1,5.1,6,13,339002,2.9,4.5,4.8,8.1,15.9,7.7,...,0.0,4.8,4.6,62.7,70.5,31.1,0.0,0.6,92.6,3.700408
2,8.6,6,13,355115,11.1,9.5,6.1,4.0,5.7,8.6,...,0.0,8.2,1.4,75.7,79.3,11.1,0.0,0.6,92.4,2.949047
3,5.5,6,13,355116,12.4,13.8,8.3,2.1,1.9,3.5,...,0.0,0.0,0.4,87.1,86.0,33.3,0.0,0.8,89.8,3.426589
4,5.6,6,13,355117,13.4,15.0,6.8,3.7,1.4,1.5,...,0.0,0.0,0.5,91.5,89.4,22.5,0.0,1.1,87.6,2.829141


In [21]:
# write the county data with its CKD predictions to disk (row index included)
contra_costa_full.to_csv(path_or_buf='data/contra_costa.csv')