## Steps for Calculating Healthy Eating Index Scores - HEI-2020 & HEI–2015 Components & Scoring Standards

In [12]:
# DISCLAIMER:
# -----------
# 
# This code is provided "as-is" without any warranties or guarantees. 
# The author is not responsible for the results you obtain by using it. 
# Please ensure you understand the code before using it in critical applications.
#
# If you have any questions or need clarification, you can reach out via email 
# at abdelhaq.fste@gmail.com or contact me using my Slack handle: https://sanjosechapte-jto9479.slack.com/archives/D07T2ATT0P5
#
# Use at your own risk.
#
# Author: Abdelhaq KHARROU

<h3>References (and attribution) used in both this codebook and README.md:</h3>
<ul>
    <li><a href="https://epi.grants.cancer.gov/hei/hei-scoring-method.html"> HEI Scoring Algorithm</a></li>
    <li><a href="https://epi.grants.cancer.gov/hei/developing.html#2015">HEI-2020 & HEI–20151 Components & Scoring Standards</a></li>
    <li><a href="https://epi.grants.cancer.gov/hei/calculating-hei-scores.html">Steps for Calculating Healthy Eating Index Scores & Dietary  Constituents for HEI-2020</a> </li>
    <li><a href="https://github.com/AnnieKLamar/foodframe/tree/master">Github repo to calculate HEI, some corrections have been made to the code</a></li>
    <li><a href="https://www.ars.usda.gov/northeast-area/beltsville-md-bhnrc/beltsville-human-nutrition-research-center/food-surveys-research-group/docs/fped-databases/">FPED for Use with WWEIA, NHANES 2017-March 2020 Prepandemic</a></li>
    <li><a href="https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DR1IFF.htm">Dietary Interview - Individual Foods, First Day (P_DR1IFF)</a></li>
<li><a href="https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2017/DataFiles/P_DR2IFF.htm">Dietary Interview - Individual Foods, Second Day (P_DR2IFF)</a></li>

</ul>

<h3>To reproduce the results of this notebook, you may need to use the <strong>nhanes.yml</strong> conda environment acompagnied with this note book</h3>

In [13]:
# All imports go here
import pandas as pd
import numpy as np
import re # Importing re package for using regular expressions
from nutrition import get_nutrition_df
from hei import get_hei_df
from nutrition import get_nutrition_df
from IPython.display import display, Markdown

In [14]:
df_diet1 = pd.read_csv('data/mice_imputation_dr1.csv')
df_diet2 = pd.read_csv('data/mice_imputation_dr2.csv')

df_diet1.head()

Unnamed: 0,SEQN,DRIFDCD,DRIKCAL,DRISODI,DRISFAT,DRIMFAT,DRIPFAT
0,109263.0,28320300.0,114.0,649.0,1.472,2.105,0.948
1,109263.0,55501000.0,58.0,1.0,0.029,0.033,0.028
2,109263.0,94000010.0,5.397605e-79,5.0,5.397605e-79,5.397605e-79,5.397605e-79
3,109263.0,94000100.0,5.397605e-79,4.0,5.397605e-79,5.397605e-79,5.397605e-79
4,109263.0,11710801.0,129.0,47.0,1.635,2.521,1.391


### Calculate Healthy Eating Index (HEI)

In [15]:
out_vars = ['SEQN', 'DRIFDCD', 'DRIKCAL', 'DRISODI', 'DRISFAT', 'DRIMFAT','DRIPFAT']


df_diet1 = df_diet1[out_vars]
df_diet2 = df_diet2[out_vars]

df_diet1 = df_diet1.rename(
    columns={
        'SEQN': 'sequence_number',
        'DRIFDCD': 'food_code',
        'DRISODI' : 'sodium_mg',
        'DRIKCAL': 'energy_kcal',
        'DRISFAT': 'saturated_fats_g',
        'DRIMFAT': 'monounsaturated_fats_g',
        'DRIPFAT': 'polyunsaturated_fats_g',
    }
)


df_diet2 = df_diet2.rename(
    columns={
        'SEQN': 'sequence_number',
        'DRIFDCD': 'food_code',
        'DRISODI' : 'sodium_mg',
        'DRIKCAL': 'energy_kcal',
        'DRISFAT': 'saturated_fats_g',
        'DRIMFAT': 'monounsaturated_fats_g',
        'DRIPFAT': 'polyunsaturated_fats_g',

    }
)

df1_hei_nutrition = get_nutrition_df(df_diet1, 'data/')
df2_hei_nutrition = get_nutrition_df(df_diet2, 'data/')
#print(df1_hei_nutrition.isna().sum())
df1_hei_nutrition[df1_hei_nutrition.isnull().any(axis=1)][['sequence_number','total_fruits_cup','V_TOTAL (cup eq)','F_OTHER (cup eq)','added_sugars_tsp','total_vegetables_cup','total_vegetables_cup']]
df1_hei_components =  get_hei_df(df1_hei_nutrition)
df2_hei_components = get_hei_df(df2_hei_nutrition)

In [16]:
## Instantiate the FoodFrame class to calculate HEI
## DONT USE directly from the repo, many adaptation and corrections made to it so that it works as expected
## https://github.com/AnnieKLamar/foodframe/tree/master

hei_score_variables = ['sequence_number', 'whole_grains', 'total_fruits', 'whole_fruits', 'total_vegetables',
                       'greens_and_beans', 'whole_grains', 'dairy', 'total_protein_foods', 
                       'seafood_and_plant_proteins', 'refined_grains', 'added_sugars', 'sodium',
                       'saturated_fats', 'fatty_acids','hei_score']




In [17]:
df1_hei_components[hei_score_variables]

Unnamed: 0,sequence_number,whole_grains,total_fruits,whole_fruits,total_vegetables,greens_and_beans,whole_grains.1,dairy,total_protein_foods,seafood_and_plant_proteins,refined_grains,added_sugars,sodium,saturated_fats,fatty_acids,hei_score
0,109263.0,7.370423,1.783167,0.000000,1.069900,0.000000,7.370423,4.883134,3.537803,5.000000,10.0,10.0,10.0,10.0,1.046253,64.690680
1,109264.0,10.000000,0.000000,0.000000,5.000000,5.000000,10.000000,10.000000,5.000000,3.107075,10.0,10.0,10.0,10.0,10.000000,88.107075
2,109265.0,0.000000,4.770249,4.348390,5.000000,2.725857,0.000000,10.000000,5.000000,0.129803,10.0,10.0,10.0,10.0,0.209394,72.183693
3,109266.0,0.000000,1.141048,2.282097,5.000000,1.619552,0.000000,10.000000,0.895171,2.429329,10.0,10.0,10.0,10.0,6.269708,69.636904
4,109269.0,0.000000,0.149880,0.000000,0.363346,0.000000,0.000000,4.365738,0.239808,0.000000,10.0,10.0,10.0,10.0,10.000000,55.118772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12627,124817.0,0.000000,3.923519,5.000000,0.000000,0.000000,0.000000,0.340067,5.000000,0.000000,10.0,10.0,10.0,10.0,0.515955,54.779541
12628,124818.0,0.000000,0.436272,0.872544,0.070509,0.000000,0.000000,2.187575,0.103413,0.000000,10.0,10.0,10.0,10.0,4.004607,47.674919
12629,124819.0,0.228702,2.144082,4.288165,3.196632,0.000000,0.228702,10.000000,3.956547,1.500858,10.0,10.0,10.0,10.0,0.000000,65.314985
12630,124820.0,10.000000,4.723837,5.000000,0.943824,0.000000,10.000000,9.136213,1.179402,0.000000,10.0,10.0,10.0,10.0,4.531629,75.514904


In [18]:
df1_hei_components.isnull().sum()

sequence_number           0
energy_kcal               0
sodium_mg                 0
saturated_fats_g          0
monounsaturated_fats_g    0
                         ..
sodium                    0
added_sugars              0
saturated_fats            0
fatty_acids               0
hei_score                 0
Length: 63, dtype: int64

In [19]:
df2_hei_components[hei_score_variables]

Unnamed: 0,sequence_number,whole_grains,total_fruits,whole_fruits,total_vegetables,greens_and_beans,whole_grains.1,dairy,total_protein_foods,seafood_and_plant_proteins,refined_grains,added_sugars,sodium,saturated_fats,fatty_acids,hei_score
0,109263.0,0.000000,5.000000,0.000000,0.120356,0.000000,0.000000,10.000000,5.000000,0.000000,10.0,10.0,10.0,10.0,0.840939,60.961295
1,109264.0,7.315390,0.000000,0.000000,3.670243,5.000000,7.315390,3.304666,5.000000,0.000000,10.0,10.0,10.0,10.0,2.106046,66.396345
2,109265.0,9.714163,3.264023,3.304320,2.373835,0.000000,9.714163,4.562813,3.146357,0.000000,10.0,10.0,10.0,10.0,1.004608,67.370119
3,109266.0,10.000000,1.845992,3.691983,4.579018,0.922996,10.000000,10.000000,2.415612,1.450422,10.0,10.0,10.0,10.0,2.165886,77.071908
4,109269.0,0.000000,0.073790,0.000000,5.000000,0.000000,0.000000,10.000000,5.000000,0.000000,10.0,10.0,10.0,10.0,7.768825,67.842615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10825,124815.0,5.513784,2.088555,0.000000,0.000000,0.000000,5.513784,0.000000,5.000000,5.000000,10.0,10.0,10.0,10.0,10.000000,67.602339
10826,124816.0,0.000000,5.000000,5.000000,5.000000,2.217742,0.000000,9.925558,5.000000,2.192540,10.0,10.0,10.0,10.0,5.900282,80.236122
10827,124818.0,0.000000,0.384484,0.000000,5.000000,0.000000,0.000000,7.992008,3.219412,2.328264,10.0,10.0,10.0,10.0,0.000000,58.924168
10828,124819.0,0.000000,4.429134,4.483815,5.000000,0.000000,0.000000,1.749781,5.000000,0.000000,10.0,10.0,10.0,10.0,8.183399,68.846128


In [20]:
df2_hei_components.isnull().sum()

sequence_number           0
energy_kcal               0
sodium_mg                 0
saturated_fats_g          0
monounsaturated_fats_g    0
                         ..
sodium                    0
added_sugars              0
saturated_fats            0
fatty_acids               0
hei_score                 0
Length: 63, dtype: int64

### Final HEI using Day 1 recall and Day 2 recall

In [21]:
# Merge datasets on SEQN
merged_df = pd.merge(df1_hei_components, df2_hei_components,how='outer', on='sequence_number', suffixes=('_df1', '_df2'))

display(Markdown('***Counting NaN values before making replacement***'))
print(merged_df.isnull().sum())

***Counting NaN values before making replacement***

sequence_number                  0
energy_kcal_df1                  2
sodium_mg_df1                    2
saturated_fats_g_df1             2
monounsaturated_fats_g_df1       2
                              ... 
sodium_df2                    1804
added_sugars_df2              1804
saturated_fats_df2            1804
fatty_acids_df2               1804
hei_score_df2                 1804
Length: 125, dtype: int64


In [22]:
# Create subset with no missing values for hei_score1 and hei_score2
merged_df_subset = merged_df.dropna(axis=0, subset=['hei_score_df1', 'hei_score_df2'])
merged_df_subset = merged_df_subset.loc[:,['hei_score_df1', 'hei_score_df2']]
display(Markdown('***Counting NaN values before making replacement for merged_df***'))
print(merged_df_subset.isnull().sum())

***Counting NaN values before making replacement for merged_df***

hei_score_df1    0
hei_score_df2    0
dtype: int64


In [23]:
# Calculate the mean of corresponding columns
for col in df1_hei_components.columns:
    if col != 'sequence_number':
        merged_df[f'{col}_df1'] = merged_df[f'{col}_df1'].fillna(merged_df[f'{col}_df2'])
        merged_df[f'{col}_df2'] = merged_df[f'{col}_df2'].fillna(merged_df[f'{col}_df1'])


display(Markdown('***Counting NaN values after making replacement***'))
print(merged_df.isnull().sum())

# Create another dataframe to avoid memory silicing, while dropping values from df_merged
mean_df = pd.DataFrame()
for col in df1_hei_components.columns:
    if col != 'sequence_number':  # Avoid calculating mean for the SEQN column
        mean_df[col] = (merged_df[f'{col}_df1'] + merged_df[f'{col}_df2']) / 2

mean_df['sequence_number'] = merged_df['sequence_number']

# Show the resulting dataframe
display(mean_df[['sequence_number', 'hei_score']].head(60))
mean_df[['sequence_number', 'hei_score']].to_csv('data/hei_index_dietary_dataset.csv', index=False)

***Counting NaN values after making replacement***

sequence_number               0
energy_kcal_df1               0
sodium_mg_df1                 0
saturated_fats_g_df1          0
monounsaturated_fats_g_df1    0
                             ..
sodium_df2                    0
added_sugars_df2              0
saturated_fats_df2            0
fatty_acids_df2               0
hei_score_df2                 0
Length: 125, dtype: int64


Unnamed: 0,sequence_number,hei_score
0,109263.0,62.825988
1,109264.0,77.25171
2,109265.0,69.776906
3,109266.0,73.354406
4,109269.0,61.480694
5,109270.0,65.22631
6,109271.0,62.23961
7,109272.0,40.0
8,109273.0,52.813817
9,109274.0,58.554643
