/
cdc_dataset.py
144 lines (125 loc) · 5.64 KB
/
cdc_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import os
import sys
import requests
import pandas as pd
import xport, csv
def default_preprocessing(df):
    """Return ``df`` unchanged.

    This is the no-op preprocessing hook used as the default value of the
    ``custom_preprocessing`` argument of :class:`CDCDataset`.

    Args:
        df: The object to "preprocess".

    Returns:
        The very same object that was passed in; nothing is copied or modified.
    """
    return df
class CDCDataset:
    """
    Download the CDC NHANES 2013-2014 questionnaire files and convert them to CSV.

    The CDC (Centers for Disease Control and Prevention) questionnaire datasets [#]_ are surveys
    conducted by the organization involving thousands of civilians about various facets of daily
    life. There are 44 questionnaires that collect data about income, occupation, health, early
    childhood and many other behavioral and lifestyle aspects of people living in the US. These
    questionnaires are thus a rich source of information indicative of the quality of life of many
    civilians. More information about each questionnaire and the type of answers is available in
    the following reference.

    This class handles the 40 questionnaires listed in :meth:`_cdc_files_info`. On construction,
    every ``.XPT`` file that is not yet present in ``dirpath`` is downloaded, and every file that
    has no CSV counterpart in ``<dirpath>/csv`` is converted with the ``xport`` package.

    Args:
        custom_preprocessing: Callable taking and returning a DataFrame. It is accepted for
            API compatibility but is currently NOT applied to any data.
        dirpath: Directory holding the downloaded ``.XPT`` files. When falsy, defaults to
            ``../data/cdc_data`` relative to this module's directory.

    References:
        .. [#] `NHANES 2013-2014 Questionnaire Data
           <https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&CycleBeginYear=2013>`_
    """

    # Seconds to wait for the server to connect / send data before giving up,
    # so a stalled connection cannot hang construction forever.
    _REQUEST_TIMEOUT = 60

    def __init__(self, custom_preprocessing=default_preprocessing, dirpath=None):
        self._cdcfileinfo, self._cdcweb, self._cdcfiles = self._cdc_files_info()
        self._dirpath = dirpath
        if not self._dirpath:
            self._dirpath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                         '..', 'data', 'cdc_data')
        self._csv_path = os.path.join(self._dirpath, 'csv')

        # makedirs (not mkdir) so that missing parent directories, e.g. '../data',
        # are created as well; exist_ok avoids a check-then-create race.
        os.makedirs(self._dirpath, exist_ok=True)

        for f in self._cdcfiles:
            filename = os.path.join(self._dirpath, f)
            if os.path.exists(filename):
                continue
            try:
                print("Downloading file {}".format(f))
                # URLs always use '/', so do not build them with os.path.join
                # (which would insert '\\' on Windows).
                url = self._cdcweb.rstrip('/') + '/' + f
                response = requests.get(url, allow_redirects=True,
                                        timeout=self._REQUEST_TIMEOUT)
                # Without this check an HTTP error page would be saved as if it
                # were the dataset and never be re-downloaded.
                response.raise_for_status()
                with open(filename, 'wb') as out_file:
                    out_file.write(response.content)
            except IOError as err:
                # requests' exceptions derive from IOError, so network and HTTP
                # failures end up here together with local file errors.
                print("IOError: {}".format(err))
                sys.exit(1)

        self._convert_xpt_to_csv()
        # NOTE: custom_preprocessing is intentionally not applied here. The hook
        # that used to sit at this point was disabled and referred to a
        # DataFrame that is never loaded during construction.

    def _cdc_files_info(self):
        """Return the static description of the questionnaire files.

        Returns:
            A 3-tuple ``(cdcfileinfo, cdcweb, cdcfiles)`` where ``cdcfileinfo`` is the
            list of human-readable questionnaire names, ``cdcweb`` is the base URL the
            files are downloaded from, and ``cdcfiles`` is the list of ``.XPT`` file
            names. ``cdcfileinfo`` and ``cdcfiles`` are index-aligned.
        """
        # List of files (i.e. questionnaires) in the CDC dataset. The following 4 files
        # were ignored due to processing issues.
        #   RXQ_RX_H: Prescription Medications
        #   SMQ_H: Smoking - Cigarette Use
        #   PUQMEC_H: Pesticide Use
        #   RXQ_DRUG.xpt: Prescription Medications - Drug Information
        # If the errors can be fixed, they can be added to this list.
        cdcfileinfo = [
            'Acculturation', 'Alcohol Use', 'Blood Pressure & Cholesterol', 'Cardiovascular Health',
            'Cognitive Functioning', 'Consumer Behavior', 'Creatine Kinase', 'Current Health Status',
            'Dermatology', 'Diabetes', 'Diet Behavior & Nutrition', 'Disability', 'Drug Use', 'Early Childhood',
            'Food Security', 'Health Insurance', 'Hepatitis', 'Hospital Utilization & Access to Care',
            'Housing Characteristics', 'Immunization', 'Income', 'Kidney Conditions - Urology',
            'Medical Conditions', 'Mental Health - Depression Screener',
            'Occupation', 'Oral Health', 'Osteoporosis', 'Physical Activity',
            'Physical Functioning', 'Preventive Aspirin Use', 'Reproductive Health',
            'Sexual Behavior', 'Sleep Disorders', 'Smoking - Household Smokers', 'Smoking - Recent Tobacco Use',
            'Smoking - Secondhand Smoke Exposure', 'Taste & Smell', 'Volatile Toxicant (Subsample)',
            'Weight History', 'Weight History - Youth',
        ]
        cdcweb = 'https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/'
        cdcfiles = [
            "ACQ_H.XPT",
            "ALQ_H.XPT",
            "BPQ_H.XPT",
            "CDQ_H.XPT",
            "CFQ_H.XPT",
            "CBQ_H.XPT",
            "CKQ_H.XPT",
            "HSQ_H.XPT",
            "DEQ_H.XPT",
            "DIQ_H.XPT",
            "DBQ_H.XPT",
            "DLQ_H.XPT",
            "DUQ_H.XPT",
            "ECQ_H.XPT",
            "FSQ_H.XPT",
            "HIQ_H.XPT",
            "HEQ_H.XPT",
            "HUQ_H.XPT",
            "HOQ_H.XPT",
            "IMQ_H.XPT",
            "INQ_H.XPT",
            "KIQ_U_H.XPT",
            "MCQ_H.XPT",
            "DPQ_H.XPT",
            "OCQ_H.XPT",
            "OHQ_H.XPT",
            "OSQ_H.XPT",
            "PAQ_H.XPT",
            "PFQ_H.XPT",
            "RXQASA_H.XPT",
            "RHQ_H.XPT",
            "SXQ_H.XPT",
            "SLQ_H.XPT",
            "SMQFAM_H.XPT",
            "SMQRTU_H.XPT",
            "SMQSHS_H.XPT",
            "CSQ_H.XPT",
            "VTQ_H.XPT",
            "WHQ_H.XPT",
            "WHQMEC_H.XPT",
        ]
        return cdcfileinfo, cdcweb, cdcfiles

    def _convert_xpt_to_csv(self):
        """Convert every downloaded ``.XPT`` file to a CSV file in ``self._csv_path``.

        Files whose CSV counterpart already exists are skipped. If a conversion
        fails, the incomplete CSV is removed before the error propagates so that a
        later run does not mistake a truncated file for a finished one.
        """
        os.makedirs(self._csv_path, exist_ok=True)

        # The two lists are index-aligned: name i describes file i.
        for f, finfo in zip(self._cdcfiles, self._cdcfileinfo):
            xptfile = os.path.join(self._dirpath, f)
            csvfile = os.path.splitext(os.path.join(self._csv_path, f))[0] + ".csv"
            if os.path.exists(csvfile):
                continue

            print("converting ", finfo, ": ", xptfile, " to ", csvfile)
            converted = False
            try:
                # newline='' lets the csv module control line endings itself.
                with open(xptfile, 'rb') as in_xpt, \
                        open(csvfile, 'w', newline='') as out_csv:
                    reader = xport.Reader(in_xpt)
                    writer = csv.writer(out_csv, delimiter=',', quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                    writer.writerow(reader.fields)  # header row
                    for row in reader:
                        writer.writerow(row)
                converted = True
            finally:
                # Both files are closed by now, so removal is safe on every platform.
                if not converted and os.path.exists(csvfile):
                    os.remove(csvfile)

    def get_csv_file(self, filename):
        """Load one converted questionnaire.

        Args:
            filename: Name of a CSV file inside the ``csv`` sub-directory, e.g. one
                of the names returned by :meth:`get_csv_file_names`.

        Returns:
            The file contents as a ``pandas.DataFrame``.
        """
        return pd.read_csv(os.path.join(self._csv_path, filename))

    def get_csv_file_names(self):
        """Return the CSV file names corresponding to the known ``.XPT`` files.

        Returns:
            A list of names such as ``"ACQ_H.csv"``, in the same order as the
            questionnaire list.
        """
        return [os.path.splitext(x)[0] + ".csv" for x in self._cdcfiles]