# Main Notebook of Milestone 1 Presentation

In [11]:
__author__ = "Josh Fisher, Nick Bermudez, Zhou Jiang"

In [12]:
import os
import json
import pandas as pd
from visualizations import Visualizations

## Data Source

We were interested in two of the CDC's National Health and Nutrition Examination Survey (NHANES) datasets. Specifically, the demographics dataset [(link)](https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Demographics&Cycle=2017-2020) and the total nutrient intakes dataset [(link)](https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Dietary&Cycle=2017-2020).

The demographics survey asks questions regarding demographic factors such as pregnancy status, ratio of family income to poverty, and "other selected demographic information, such as gender, age, race/Hispanic origin, education, marital status, country of birth, and years of U.S. residence." [source](https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_DEMO.htm)

"The objective of the dietary interview component is to obtain detailed dietary intake information from NHANES participants. The dietary intake data are used to estimate the types and amounts of foods and beverages (including all types of water) consumed during the 24-hour period prior to the interview (midnight to midnight), and to estimate intakes of energy, nutrients, and other food components from those foods and beverages." [source](https://wwwn.cdc.gov/Nchs/Nhanes/2015-2016/DR1TOT_I.htm)

We were able to scrape all available years for both datasets but opted to use only the most recent report, as we were interested in current health trends. You can check out the web-scraping script at [Scripts/webscrape.py](webscrape.py).

The format of each dataset is .xpt and the variable names are encoded. Data is stored in Data/Final/*.xpt, and data dictionaries mapping encoded_var: var can be found in the same directory.

In [13]:
class NHANESDataFrame:
    """
    Cleaning functions and variable dictionaries specific to the NHANES datasets.

    An instance wraps the path of one raw file readable by ``pd.read_sas``.
    The ``clean_*`` methods load that file, keep only the columns of interest
    and rename the encoded NHANES variable names to readable ones.
    """

    def __init__(self, path):
        """
        :param path:
            location of the raw file handed to ``pd.read_sas``
        """
        self.path = path

    def clean_demographics_df(self) -> pd.DataFrame:
        """
        Load the demographics file and return it cleaned and renamed.

        Only the columns listed in the demographics dictionary are kept and
        they are renamed to their readable names. Rows in which every column
        other than ``id`` is NaN are then dropped.

        :return:
            cleaned, renamed demographics dataframe
        """
        demo_d = self.get_demographics_dict()
        df = pd.read_sas(self.path)

        # filter, only include cols in demo_d.keys()
        df = df[list(demo_d)]

        # rename to demo_d.values()
        df = df.rename(columns=demo_d)

        # Removing rows where all values (apart from the id) are NaN
        return self._drop_rows_without_values(df)

    def clean_total_nutrients_df(self) -> pd.DataFrame:
        """
        Load the total nutrients file and return it cleaned and renamed.

        The kept columns are ``SEQN`` (renamed to ``id``) plus every variable
        of the macronutrient, essential vitamin and other nutrient dictionaries.

        :return:
            cleaned, renamed total nutrients dataframe
        """
        # later dictionaries win on duplicate keys, same as successive update() calls
        d = {
            'SEQN': 'id',
            **self.get_macronutrient_dict(),
            **self.get_essential_vitamins_dict(),
            **self.get_other_nutrients_dict(),
        }

        df = pd.read_sas(self.path)

        # filter
        df = df[list(d)]
        # rename
        return df.rename(columns=d)

    @staticmethod
    def read_combine_clean_demographics_total_nutrients(demographics_path, total_nutrients_path):
        """
        Read demographics and total nutrients, clean both, and merge the two dataframes.

        :param demographics_path:
            path of the raw demographics file
        :param total_nutrients_path:
            path of the raw total nutrients file
        :return:
            the two cleaned dataframes merged on their shared ``id`` column
            (``DataFrame.merge`` defaults, i.e. only ids present in both are kept)
        """
        demo_df = NHANESDataFrame(demographics_path).clean_demographics_df()
        nuts_df = NHANESDataFrame(total_nutrients_path).clean_total_nutrients_df()
        return demo_df.merge(nuts_df, on='id')

    @staticmethod
    def _drop_rows_without_values(df, id_col='id'):
        """
        Drop the rows of ``df`` whose columns other than ``id_col`` are all NaN.

        :param df:
            dataframe to filter
        :param id_col:
            name of the identifier column that is ignored when looking for NaN
        :return:
            filtered dataframe; ``df`` itself when it has no column besides ``id_col``
        """
        value_cols = [col for col in df.columns if col != id_col]
        if not value_cols:
            # nothing to inspect: an empty subset would otherwise discard every row
            return df
        return df.dropna(how='all', subset=value_cols)

    @staticmethod
    def get_essential_vitamins_dict() -> dict:
        """
        https://www.nia.nih.gov/health/vitamins-and-supplements/vitamins-and-minerals-older-adults#:~:text=There%20are%2013%20essential%20vitamins,keep%20the%20body%20working%20properly.
        There are 13 essential vitamins —
        vitamins A, C, D, E, K,
        and the B vitamins (thiamine, riboflavin, niacin, pantothenic acid, biotin, B6, B12, and folate).

        no pantothenic acid in NHANES
        no biotin in NHANES

        :return:
            encoded_var: var
        """
        d = {
            "DR1TVARA": 'vitaminA', "DR1TVC": "vitaminC", "DR1TVD": "vitaminD", "DR1TATOC": 'vitaminE', "DR1TVK": "vitaminK",
            "DR1TVB1": 'thiamine', "DR1TVB2": "riboflavin", "DR1TNIAC": 'niacin', "DR1TVB6": "vitaminB6",
            "DR1TVB12": "vitaminB12", "DR1TFOLA": "folate"
        }
        return d

    @staticmethod
    def get_macronutrient_dict() -> dict:
        """
        Common macronutrients
        :return:
            encoded_var: var
        """
        d = {"DR1TKCAL": "kcals", "DR1TPROT": "protein", "DR1TCARB": "carbs", "DR1TSUGR": "sugar",
             "DR1TTFAT": "total_fat", "DR1TSFAT": "sat_fat", "DR1TMFAT": "mono_fat", "DR1TPFAT": "poly_fat",
             "DR1TCHOL": "cholesterol"}
        return d

    @staticmethod
    def get_other_nutrients_dict() -> dict:
        """
        Remaining nutrients of interest (minerals, caffeine, theobromine, alcohol)
        :return:
            encoded_var: var
        """
        d = {"DR1TCALC": "calcium", "DR1TPHOS": "phosphorus", "DR1TMAGN": "magnesium",
             "DR1TIRON": "iron", "DR1TZINC": "zinc", "DR1TCOPP": "copper", "DR1TSODI": "sodium",
             "DR1TPOTA": "potassium", "DR1TSELE": "selenium", "DR1TCAFF": "caffeine",
             "DR1TTHEO": "theobromine", "DR1TALCO": "alcohol"}
        return d

    @staticmethod
    def get_demographics_dict() -> dict:
        """
        Load the demographics data dictionary from ``../Data/Final/demographics.json``.

        The path is relative to the current working directory.

        :return:
            encoded_var: var
        """
        with open('../Data/Final/demographics.json', encoding='utf-8') as f:
            return json.load(f)



Going from the raw XPTs to a cleaned csv uses a lot of RAM (the concatenated dfs are about 1.5 GB each) and I don't trust the Jupyter server enough, so here are some convenience functions that read in the pre-cleaned dataframes.

In [15]:
def get_demographics_df():
    """ Read the already-cleaned demographics csv and hand back its dataframe """
    return pd.read_csv("../Data/Final/demographics_clean.csv")

def get_total_nutrients_df():
    """ Read the already-cleaned total nutrients csv and hand back its dataframe """
    return pd.read_csv("../Data/Final/total_nutrients_clean.csv")

def get_combined_df(year=2020, path='../Data/Final/nhanes.csv'):
    """
    Convenient function to get the cleaned and combined demographics and total nutrient file

    :param year:
        value of the ``year`` column to keep (2020 by default);
        pass ``None`` to get every row of the file
    :param path:
        csv file to read; the default is relative to the current working directory
    :return:
        dataframe holding only the rows whose ``year`` equals ``year``
    """
    df = pd.read_csv(path)
    if year is None:
        return df
    return df.loc[df['year'] == year]



In [16]:
# TODO: Add Nick EDA

In [17]:
# TODO: Add Zhou EDA

In [None]:

# Variables of interest for the ridge plot: caffeine and vitamins D and C first,
# then the "other nutrients" that are not explicitly excluded below.
# (named ridge_vars rather than vars so the builtin vars() is not shadowed)
excluded = {'alcohol', 'copper', 'phosphorus', 'potassium', 'caffeine', 'theobromine'}
ridge_vars = [
    'caffeine', 'vitaminD', 'vitaminC',
    *(name for name in NHANESDataFrame.get_other_nutrients_dict().values() if name not in excluded),
]

# df was never assigned in this notebook, so load the combined dataset explicitly
df = get_combined_df()

# NOTE(review): ridge_vars is not passed to the plot - confirm whether
# Visualizations.ridgeplot is meant to receive the variable list.
Visualizations.ridgeplot(df)
