# Feature extraction
## Modality 5: Nutritional Knowledge
Each modality combines features from the different CSV files that make up the database. The features of this modality should include the following:
- CS_ETIQUETADO_FRONTAL
    - "P3": "nut_cont_knowledge"
    - "P4": "nut_inf_decision"
    - "P5_1": "read_dng"
    - "P5_2": "read_nf_table"
    - "P5_3": "read_ing_list"
    - "P5_4": "read_none"
    - "P5_5": "read_not_know"
    - "P7": "buy_choice"
    - "P8": "compare_choice"
    - "P10": "time_choice"
    - "P11_1": "use_dng"
    - "P11_2": "use_nut_seal"
    - "P11_3": "use_legends"
    - "P11_4": "use_nut_inf"
    - "P11_5": "use_ing_list"
    - "P11_6": "use_none"
    - "P11_7": "use_not_know"
    - "P12": "unhealthy_product"
    - "P13": "sodium_level"
    - "P14": "calories"
    - "P17_1": "product_A"
    - "P17_2": "product_B"
    - "P17_3": "product_C"
    - "P18_1": "energy_importance"
    - "P18_2": "sodium_importance"
    - "P18_3": "sugars_importance"
    - "P18_4": "fats_importance"
    - "P18_5": "sat_fats_importance"

## Setup

### Libraries

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np

### Paths

In [2]:
# Directory holding the source CSVs (CS_ETIQUETADO_FRONTAL.csv, CS_RESIDENTES.csv)
input_path = "../../0_source_csv/ensanut/"
# Directory that sample_dataset.csv is loaded from
input_path_sample = "../output"

# Directory the extracted feature table is written to
output_path = "../output/"

# Python file with the global configuration, executed in the next cell
glob_conf_path = "../../config/global_config_paper.py"

### Set local variables

In [3]:
# Execute the global configuration file so that the names it defines become
# available in this namespace. The context manager guarantees the file handle
# is closed; the bare open(...).read() form left it open until it happened to
# be garbage collected.
# NOTE(review): exec runs arbitrary code -- glob_conf_path must only ever
# point at a trusted, version-controlled file.
with open(glob_conf_path) as config_file:
    exec(config_file.read())

## Extract features

In [4]:
# Nutritional Knowledge dataset

# Columns that are only needed to build the house / household / person keys
id_source_columns = ["UPM", "VIV_SEL", "HOGAR", "NUMREN"]

# Source column -> feature name used by this modality.
# The insertion order of this mapping is also the column order of the result.
feature_names = {
    "EDAD": "age_nut_know_adult",
    "SEXO": "sex_nut_know_adult",
    "P3": "nut_cont_knowledge",
    "P4": "nut_inf_decision",
    "P5_1": "read_dng",
    "P5_2": "read_nf_table",
    "P5_3": "read_ing_list",
    "P5_4": "read_none",
    "P5_5": "read_not_know",
    "P7": "buy_choice",
    "P8": "compare_choice",
    "P10": "time_choice",
    "P11_1": "use_dng",
    "P11_2": "use_nut_seal",
    "P11_3": "use_legends",
    "P11_4": "use_nut_inf",
    "P11_5": "use_ing_list",
    "P11_6": "use_none",
    "P11_7": "use_not_know",
    "P12": "unhealthy_product",
    "P13": "sodium_level",
    "P14": "calories",
    "P17_1": "product_A",
    "P17_2": "product_B",
    "P17_3": "product_C",
    "P18_1": "energy_importance",
    "P18_2": "sodium_importance",
    "P18_3": "sugars_importance",
    "P18_4": "fats_importance",
    "P18_5": "sat_fats_importance",
}

# Load the CSV, keep the key columns followed by the features, then rename
nutritional_knowledge_dataset = pd.read_csv(os.path.join(input_path, "CS_ETIQUETADO_FRONTAL.csv"))
nutritional_knowledge_dataset = nutritional_knowledge_dataset[id_source_columns + list(feature_names)]
nutritional_knowledge_dataset = nutritional_knowledge_dataset.rename(columns=feature_names)

# Build the house, household and person keys by joining the key columns
# (cast to text) with underscores; each key extends the previous one
key_parts = nutritional_knowledge_dataset[id_source_columns].astype(str)
house_key = key_parts["UPM"] + "_" + key_parts["VIV_SEL"]
household_key = house_key + "_" + key_parts["HOGAR"]
nutritional_knowledge_dataset["house_ID"] = house_key
nutritional_knowledge_dataset["household_ID"] = household_key
nutritional_knowledge_dataset["person_ID"] = household_key + "_" + key_parts["NUMREN"]

# The raw key columns are redundant once the composite IDs exist
nutritional_knowledge_dataset = nutritional_knowledge_dataset.drop(columns=id_source_columns)

nutritional_knowledge_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43157 entries, 0 to 43156
Data columns (total 33 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age_nut_know_adult   43157 non-null  int64  
 1   sex_nut_know_adult   43157 non-null  int64  
 2   nut_cont_knowledge   38586 non-null  float64
 3   nut_inf_decision     28769 non-null  float64
 4   read_dng             16366 non-null  float64
 5   read_nf_table        16366 non-null  float64
 6   read_ing_list        16366 non-null  float64
 7   read_none            16366 non-null  float64
 8   read_not_know        16366 non-null  float64
 9   buy_choice           38586 non-null  float64
 10  compare_choice       38586 non-null  float64
 11  time_choice          38586 non-null  float64
 12  use_dng              38586 non-null  float64
 13  use_nut_seal         38586 non-null  float64
 14  use_legends          38586 non-null  float64
 15  use_nut_inf          38586 non-null 

In [5]:
# Show the extracted table (last expression of the cell) for a visual check
# of the renamed features and the generated ID columns
nutritional_knowledge_dataset

Unnamed: 0,age_nut_know_adult,sex_nut_know_adult,nut_cont_knowledge,nut_inf_decision,read_dng,read_nf_table,read_ing_list,read_none,read_not_know,buy_choice,...,product_B,product_C,energy_importance,sodium_importance,sugars_importance,fats_importance,sat_fats_importance,house_ID,household_ID,person_ID
0,73,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,...,,,4.0,3.0,2.0,1.0,5.0,1_1,1_1_1,1_1_1_2
1,39,2,1.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,...,,,5.0,4.0,1.0,2.0,3.0,1_2,1_2_1,1_2_1_2
2,47,2,2.0,,,,,,,1.0,...,9.0,9.0,,,,,,1_3,1_3_1,1_3_1_2
3,30,2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,3.0,...,9.0,9.0,4.0,3.0,2.0,5.0,1.0,1_4,1_4_1,1_4_1_4
4,49,2,1.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,...,9.0,9.0,1.0,5.0,2.0,4.0,3.0,1_5,1_5_1,1_5_1_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43152,58,2,,,,,,,,,...,,,,,,,,6289_14,6289_14_1,6289_14_1_2
43153,34,2,,,,,,,,,...,,,,,,,,6289_15,6289_15_1,6289_15_1_2
43154,24,2,,,,,,,,,...,,,,,,,,6289_16,6289_16_1,6289_16_1_2
43155,20,2,,,,,,,,,...,,,,,,,,6289_17,6289_17_1,6289_17_1_2


In [6]:
# (rows, columns) of the extracted table, as a quick sanity check
nutritional_knowledge_dataset.shape

(43157, 33)

Before merging the four datasets, we first need to include and distinguish the kinship and personal IDs of the random adult and of the observation.

In [7]:
# Residents dataset: keep the key columns, age, sex and the P3_5 answer,
# which is renamed to "kinship"
residents_dataset = pd.read_csv(os.path.join(input_path, "CS_RESIDENTES.csv"))
residents_dataset = (
    residents_dataset
    .loc[:, ["UPM", "VIV_SEL", "HOGAR", "NUMREN", "EDAD", "SEXO", "P3_5"]]
    .rename(columns={"P3_5": "kinship"})
)

# Person-level primary key: the four key columns (as text) joined by "_"
resident_key_columns = ["UPM", "VIV_SEL", "HOGAR", "NUMREN"]
resident_keys = residents_dataset[resident_key_columns].astype(str)
residents_dataset["person_ID"] = (
    resident_keys["UPM"]
    + "_" + resident_keys["VIV_SEL"]
    + "_" + resident_keys["HOGAR"]
    + "_" + resident_keys["NUMREN"]
)

# The raw key columns are no longer needed once person_ID exists
residents_dataset = residents_dataset.drop(columns=resident_key_columns)

# Lookup table: person_ID -> kinship
kinships = residents_dataset[["person_ID", "kinship"]].copy()
kinships

Unnamed: 0,person_ID,kinship
0,34_1_1_2,2
1,34_1_1_3,3
2,34_2_1_1,1
3,34_2_1_2,2
4,34_2_1_3,3
...,...,...
158039,6287_20_1_3,3
158040,6287_20_1_4,3
158041,6287_21_1_1,1
158042,6287_21_1_2,3


In [8]:
# Attach the kinship value to each row by person_ID. The left join keeps every
# row of the nutritional knowledge table, whether or not a resident matches.
nutritional_knowledge_dataset = nutritional_knowledge_dataset.merge(
    kinships,
    on="person_ID",
    how="left",
    suffixes=("", "_remove"),
)
nutritional_knowledge_dataset.shape

# Mark the merged column as describing the random adult
random_adult_names = {"kinship": "kinship_random_adult"}
nutritional_knowledge_dataset = nutritional_knowledge_dataset.rename(columns=random_adult_names)

## Merge based on key

In [9]:
# Import observation dataset
# Loaded from input_path_sample; the merge in the next cell joins it to the
# modality features on its household_ID column.
sample_dataset = pd.read_csv(os.path.join(input_path_sample, "sample_dataset.csv"))

In [10]:
# Left-join the modality features onto the observation sample by household.
# Right-hand columns whose names clash with the sample get a "_remove" suffix
# so they can be dropped in the next cell.
m5_dataset = sample_dataset.merge(
    nutritional_knowledge_dataset,
    on="household_ID",
    how="left",
    suffixes=("", "_remove"),
)
m5_dataset.shape

(10303, 46)

In [11]:
# Drop the duplicate columns produced by the merge. They are recognised by the
# "_remove" suffix passed to merge(); matching the suffix exactly (instead of
# any name that merely contains "remove") avoids discarding an unrelated
# feature whose name happens to include that word.
merge_leftovers = [col for col in m5_dataset.columns if col.endswith("_remove")]
m5_dataset.drop(columns=merge_leftovers, inplace=True)
m5_dataset.columns

Index(['house_ID', 'household_ID', 'person_ID', 'region', 'strata',
       'locality_type', 'locality_size', 'age_months', 'age_years', 'sex',
       'BMI_SD', 'label_cat', 'label', 'age_nut_know_adult',
       'sex_nut_know_adult', 'nut_cont_knowledge', 'nut_inf_decision',
       'read_dng', 'read_nf_table', 'read_ing_list', 'read_none',
       'read_not_know', 'buy_choice', 'compare_choice', 'time_choice',
       'use_dng', 'use_nut_seal', 'use_legends', 'use_nut_inf', 'use_ing_list',
       'use_none', 'use_not_know', 'unhealthy_product', 'sodium_level',
       'calories', 'product_A', 'product_B', 'product_C', 'energy_importance',
       'sodium_importance', 'sugars_importance', 'fats_importance',
       'sat_fats_importance', 'kinship_random_adult'],
      dtype='object')

In [12]:
# (rows, columns) after the "_remove" columns have been dropped
m5_dataset.shape

(10303, 44)

## Export dataset

In [13]:
# Export dataset as a csv
# index=False is the documented way to leave the row index out of the file;
# the previous index=None only had that effect because None is falsy.
m5_dataset.to_csv(os.path.join(output_path, 'm5_feature_extraction.csv'), index=False, header=True)