In [1]:
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display 

from publicdata.chis.prepare import *

%matplotlib inline
sns.set_context('notebook')


In [2]:
pkg = mp.jupyter.open_package()
#pkg = mp.jupyter.open_source_package()
pkg

In [3]:
columns = pkg.reference('adult_2017').row_generator.columns

def find_var(*vals):

    
    result = []
    
    for c in columns:
        for v in vals:
            if v in c['description'].lower() or v in c['name'].lower():
                result.append((c['name'], c['description']))
            
    return result
    

diabet_vars = find_var('diabet')+find_var('general health')

food_vars = find_var('eat')+find_var('food')+find_var('hungry')+\
    find_var('drink')+find_var('soda')+find_var('bmi')+find_var('obe')+find_var('fresh')+\
    find_var('veg')+find_var('fruit')+find_var('fries')+find_var('beans')

demo_vars = find_var('gender')+find_var('hisp')+find_var('race')+find_var('income')+find_var('pov')+find_var('urban')

subset_vars = list(set(diabet_vars+food_vars+demo_vars))
subset_vars

[('ae5', '# TIMES ATE COOKED DRIED BEANS IN PAST MONTH'),
 ('dmc9', 'HOW LONG AGO TREATED UNFAIRLY W/ MEDICAL CARE DUE TO RACE/ETH'),
 ('dmc6b_p1', 'MAIN RSN TREATED UNFAIRLY GET MED. CARE (PUF 1 YR RECODE)'),
 ('whobmi', 'BODY MASS INDEX: WHO DEFINITION'),
 ('ur_tract6', 'RURAL AND URBAN - CLARITAS (BY CENSUS TRACT) (6 LVLS)'),
 ('am5', "HOW OFTEN HUNGRY BUT DIDN'T EAT B/C OF MONEY IN PAST 12 MOS"),
 ('ae3', '# TIMES ATE FRNCH FRIES, HME FRIES, HSH BRWNS IN PAST MO'),
 ('ae_soda', '# OF TIMES DRINKING SODA PER WEEK'),
 ('ovrwt', 'OVERWEIGHT OR OBESE'),
 ('srsex', 'SELF-REPORTED GENDER'),
 ('ab25', 'CURRENTLY TAKING DIABETIC PILLS TO LOWER BLOOD SUGAR'),
 ('povgwd_p1', 'FAMILY POVERTY THRESHOLD LEVEL (PUF 1 YR RECODE)'),
 ('ur_omb', 'RURAL AND URBAN - OMB'),
 ('ur_bg6', 'RURAL AND URBAN - CLARITAS (BY BLOCK GROUP)(6 LVLS)'),
 ('al6', 'RECEIVING SSI (SUPPLEMENTAL SECURITY INCOME)'),
 ('srh', 'SELF-REPORTED LATINO/HISPANIC'),
 ('ab22', 'DOCTOR EVER TOLD HAVE DIABETES'),
 ('povll_aca',
  

In [4]:
df = pkg.reference('adult_2017').dataframe()

In [41]:
dfr = df[[c for c in df.columns if 'raked' in c]].copy()
dfv = df[[c for c in df.columns if 'raked' not in c]].copy()

dfv = convert_to_numbers(dfv) 


In [5]:
dfs = df[[e[0] for e in subset_vars]]

In [6]:
d = pd.get_dummies(dfs)

In [7]:
corrs = d.corr()

In [8]:
len(corrs)

2986

In [9]:
t = corrs.stack().to_frame()
t.columns= ['correl']
top_cor = t[t.correl<1.0].sort_values('correl',ascending=False)

In [10]:
t = top_cor.reset_index()
t2 = t[ (~t.level_0.str.contains('INAPPLICABLE')) & (~t.level_1.str.contains('INAPPLICABLE'))]
                                                     

In [11]:
t2[(t2.correl < .8) & (t2.correl > .7)]

Unnamed: 0,level_0,level_1,correl
210,ak32_YES,ak33_p1_1.0,0.799663
211,ak33_p1_1.0,ak32_YES,0.799663
212,ur_tract6_SUBURBAN,ur_bg6_SUBURBAN,0.787830
213,ur_bg6_SUBURBAN,ur_tract6_SUBURBAN,0.787830
214,ab25_YES,ab112_YES,0.785694
215,ab112_YES,ab25_YES,0.785694
216,ae_soda_2.0,aesoda_p1_2-3 TIMES,0.777431
217,aesoda_p1_2-3 TIMES,ae_soda_2.0,0.777431
218,fslevcb_FOOD SECURITY,am5_NO,0.777065
219,fslev_FOOD SECURITY,am5_NO,0.777065


In [12]:
find_var('bmi','prepovll', 'ae7')

[('ae7', '# OF TIMES ATE VEGETABLES IN PAST MO'),
 ('whobmi', 'BODY MASS INDEX: WHO DEFINITION'),
 ('rbmi', 'BMI DESCRIPTIVE'),
 ('bmi_p', 'BODY MASS INDEX (PUF RECODE)')]

In [13]:
x = pd.DataFrame({ 'ae7' : dfs.ae7.astype(float),
                    'bmi' : dfs.bmi_p.astype(float)})
x.corr()

Unnamed: 0,ae7,bmi
ae7,1.0,-0.075463
bmi,-0.075463,1.0


In [33]:
dfn[['ae7','bmi_p','ac11','povll2_p1v2']].corr()



Unnamed: 0,ae7,bmi_p,ac11,povll2_p1v2
ae7,1.0,-0.075463,-0.078796,0.138781
bmi_p,-0.075463,1.0,0.047354,-0.10928
ac11,-0.078796,0.047354,1.0,-0.144427
povll2_p1v2,0.138781,-0.10928,-0.144427,1.0


In [49]:
t = dfv.select_dtypes('float64').corr().stack().to_frame().sort_values(0, ascending = False)
t[t[0]< .3]

Unnamed: 0,Unnamed: 1,0
ac46,ac11,0.185315
ac11,ac46,0.185315
ac46,ae_soda,0.185200
ae_soda,ac46,0.185200
povll2_p1v2,heighm_p,0.183933
hghtm_p,povll2_p1v2,0.183933
povll2_p1v2,hghtm_p,0.183933
heighm_p,povll2_p1v2,0.183933
povll2_p1v2,hghti_p,0.183496
hghti_p,povll2_p1v2,0.183496


In [53]:
(dfv.ab1.value_counts()/21153*100).round(2)

VERY GOOD    32.27
GOOD         30.18
EXCELLENT    17.38
FAIR         15.09
POOR          5.07
Name: ab1, dtype: float64