# Feature extraction
## Modality 1: House characteristics
Each modality is a combination of features from the different CSV that conform the database.
- CS_HOGARES
    - P2_9_2: hired_care
    - P2_9_5: hired_cook
    - P6_1_1: has_TV
    - P6_1_2: has_paid_TV
    - P6_1_3: has_radio
    - P6_1_4: has_audio_system
    - P6_1_5: has_computer
    - P6_1_6: has_cell_phone
    - P6_1_7: has_internet
    - P6_1_8: has_phone
    - P6_1_9: has_iron
    - P6_1_10: has_blender
    - P6_1_11: has_refrigerator
    - P6_1_12: has_gas_stove
    - P6_1_13: has_other_stove
    - P6_1_14: has_wash_dry
    - P6_1_15: has_microwave

- CS_VIVIENDAS
    - P1_6: kitchen
    - P1_7: place_cook
    - P1_8: cook_sleep
    - P1_9: type_fuel
    - P1_10: type_stove
    - P2_1: food_expense

- CS_SEGURIDAD_ALIMENTARIA
    - P1: food_worry
    - P2: food_ran_out
    - P3: healthy_food_lack_adu
    - P4: low_food_variety_adu
    - P5: food_lack_meal_adu
    - P6: ate_less_food_adu
    - P7: hunger_adu
    - P8: ate_once_adu
    - P10: healthy_food_lack_minor
    - P11: low_food_variety_minor
    - P12: food_lack_meal_minor
    - P13: ate_less_food_minor
    - P14: ate_less_meals_minor
    - P15: hunger_minor
    - P16: ate_once_minor

## Setup

### Libraries

In [8]:
# Import libraries
import os
import pandas as pd
import numpy as np

### Paths

In [9]:
input_path = '../../0_source_csv/ensanut/'
input_path_sample = '../output'

output_path = '../output/'

# Global configuration path
glob_conf_path = '../../config/global_config_paper.py'

### Set local variables

In [10]:
exec(open(glob_conf_path).read())

## Extract features

In [11]:
# Household survey
household_dataset = pd.read_csv(os.path.join(input_path, "CS_HOGARES.csv"))

household_dataset = household_dataset[["UPM","VIV_SEL","HOGAR","NUMREN","P2_8","P6_1_1","P6_1_2","P6_1_3","P6_1_4",
                             "P6_1_5","P6_1_6","P6_1_7","P6_1_8","P6_1_9","P6_1_10","P6_1_11","P6_1_12","P6_1_13",
                             "P6_1_14","P6_1_15"]]

household_dataset = household_dataset.rename(columns={"P2_8": "hired_aid",
                                            "P6_1_1": "has_TV",
                                            "P6_1_2": "has_paid_TV",
                                            "P6_1_3": "has_radio",
                                            "P6_1_4": "has_audio_system",
                                            "P6_1_5": "has_computer",
                                            "P6_1_6": "has_cell_phone",
                                            "P6_1_7": "has_internet",
                                            "P6_1_8": "has_phone",
                                            "P6_1_9": "has_iron",
                                            "P6_1_10": "has_blender",
                                            "P6_1_11": "has_refrigerator",
                                            "P6_1_12": "has_gas_stove",
                                            "P6_1_13": "has_other_stove",
                                            "P6_1_14": "has_wash_dry",
                                            "P6_1_15": "has_microwave"})                      

# Add column with primary keys for house and household
household_dataset["house_ID"] = household_dataset["UPM"].astype(str)+'_'+household_dataset["VIV_SEL"].astype(str)
household_dataset["household_ID"] = household_dataset["UPM"].astype(str)+'_'+household_dataset["VIV_SEL"].astype(str)+'_'+household_dataset["HOGAR"].astype(str)

# Delete unnecesary columns 
del household_dataset["UPM"]
del household_dataset["VIV_SEL"]
del household_dataset["HOGAR"]
del household_dataset["NUMREN"]

household_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44612 entries, 0 to 44611
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   hired_aid         44612 non-null  int64 
 1   has_TV            44612 non-null  int64 
 2   has_paid_TV       44612 non-null  int64 
 3   has_radio         44612 non-null  int64 
 4   has_audio_system  44612 non-null  int64 
 5   has_computer      44612 non-null  int64 
 6   has_cell_phone    44612 non-null  int64 
 7   has_internet      44612 non-null  int64 
 8   has_phone         44612 non-null  int64 
 9   has_iron          44612 non-null  int64 
 10  has_blender       44612 non-null  int64 
 11  has_refrigerator  44612 non-null  int64 
 12  has_gas_stove     44612 non-null  int64 
 13  has_other_stove   44612 non-null  int64 
 14  has_wash_dry      44612 non-null  int64 
 15  has_microwave     44612 non-null  int64 
 16  house_ID          44612 non-null  object
 17  household_ID

In [12]:
# House survey
house_dataset = pd.read_csv(os.path.join(input_path, "CS_VIVIENDAS.csv"))

house_dataset = house_dataset[["UPM","VIV_SEL","P1_6","P1_7","P1_8","P1_9","P1_10","P2_1"]]

house_dataset = house_dataset.rename(columns={"P1_6": "kitchen",
                                              "P1_7": "place_cook",
                                              "P1_8": "cook_sleep",
                                              "P1_9": "type_fuel",
                                              "P1_10":" type_stove",
                                              "P2_1": "food_expense"})

# Add column with primary keys for house and household
house_dataset["house_ID"] = house_dataset["UPM"].astype(str)+'_'+house_dataset["VIV_SEL"].astype(str)

# Delete unnecesary columns 
del house_dataset["UPM"]
del house_dataset["VIV_SEL"]

house_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44069 entries, 0 to 44068
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   kitchen       44069 non-null  int64  
 1   place_cook    2344 non-null   float64
 2   cook_sleep    41725 non-null  float64
 3   type_fuel     43597 non-null  float64
 4    type_stove   43394 non-null  float64
 5   food_expense  44069 non-null  int64  
 6   house_ID      44069 non-null  object 
dtypes: float64(4), int64(2), object(1)
memory usage: 2.4+ MB


In [13]:
# Food safety
food_safety_dataset = pd.read_csv(os.path.join(input_path, "CS_SEGURIDAD_ALIMENTARIA.csv"))

food_safety_dataset = food_safety_dataset[["UPM","VIV_SEL","HOGAR","NUMREN","P1", "P2" ,"P3" ,"P4" ,"P5" ,"P6" ,"P7" ,
                                           "P8" ,"P10","P11","P12","P13","P14","P15","P16"]]

food_safety_dataset = food_safety_dataset.rename(columns={"P1" : "food_worry",
                                                          "P2" : "food_ran_out",
                                                          "P3" : "healthy_food_lack_adu",
                                                          "P4" : "low_food_variety_adu",
                                                          "P5" : "food_lack_meal_adu",
                                                          "P6" : "ate_less_food_adu",
                                                          "P7" : "hunger_adu",
                                                          "P8" : "ate_once_adu",
                                                          "P10": "healthy_food_lack_minor",
                                                          "P11": "low_food_variety_minor",
                                                          "P12": "food_lack_meal_minor",
                                                          "P13": "ate_less_food_minor",
                                                          "P14": "ate_less_meals_minor",
                                                          "P15": "hunger_minor",
                                                          "P16": "ate_once_minor"})

# Add column with primary keys for house and household
food_safety_dataset["house_ID"] = food_safety_dataset["UPM"].astype(str)+'_'+food_safety_dataset["VIV_SEL"].astype(str)
food_safety_dataset["household_ID"] = food_safety_dataset["UPM"].astype(str)+'_'+food_safety_dataset["VIV_SEL"].astype(str)+'_'+food_safety_dataset["HOGAR"].astype(str)

# Delete unnecesary columns 
del food_safety_dataset["UPM"]
del food_safety_dataset["VIV_SEL"]
del food_safety_dataset["HOGAR"]
del food_safety_dataset["NUMREN"]

food_safety_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44574 entries, 0 to 44573
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   food_worry               44574 non-null  int64  
 1   food_ran_out             44574 non-null  int64  
 2   healthy_food_lack_adu    44574 non-null  int64  
 3   low_food_variety_adu     44574 non-null  int64  
 4   food_lack_meal_adu       44574 non-null  int64  
 5   ate_less_food_adu        44574 non-null  int64  
 6   hunger_adu               44574 non-null  int64  
 7   ate_once_adu             44574 non-null  int64  
 8   healthy_food_lack_minor  25272 non-null  float64
 9   low_food_variety_minor   25272 non-null  float64
 10  food_lack_meal_minor     25272 non-null  float64
 11  ate_less_food_minor      25272 non-null  float64
 12  ate_less_meals_minor     25272 non-null  float64
 13  hunger_minor             25272 non-null  float64
 14  ate_once_minor        

## Merge based on key

Merging the data for this modality is tricky. First the information about the household and house needs to be concatenated and afterwards joined to the observations dataset (corresponding to the child or adolescent that lives in the same household).

In [14]:
# Import observation dataset
sample_dataset = pd.read_csv(os.path.join(input_path_sample, "sample_dataset.csv"))

In [15]:
# Concatenate based on household_ID or house_ID to evaluate size of n
n_merge1 = sample_dataset.merge(household_dataset, left_on="household_ID", right_on="household_ID")
n_merge2 = sample_dataset.merge(house_dataset, left_on="house_ID", right_on="house_ID")
n_merge3 = sample_dataset.merge(food_safety_dataset, left_on="household_ID", right_on="household_ID")

In [16]:
# See lenght of each concatenation
print("Size of n from Sample & Household survey: ",len(n_merge1))
print("Size of n from Sample & House services: ",len(n_merge2))
print("Size of n from Sample & Food safety survey: ",len(n_merge3))

Size of n from Sample & Household survey:  10301
Size of n from Sample & House services:  10301
Size of n from Sample & Food safety survey:  10299


Seems like there is information about most of the children and adolescents that are sample population. Next step is merging the datasets corresponding to the household or house.

In [17]:
m1_dataset = sample_dataset.merge(
    household_dataset, left_on="household_ID", right_on="household_ID", how="left", suffixes=('', '_remove')).merge(
    house_dataset,left_on="house_ID", right_on="house_ID", how="left", suffixes=('', '_remove')).merge(
        food_safety_dataset, left_on="household_ID", right_on="household_ID", how="left", suffixes=('', '_remove'))
    
m1_dataset.shape

(10301, 52)

In [18]:
print("Duplicated rows: ", m1_dataset.duplicated().sum())

Duplicated rows:  0


In [19]:
m1_dataset.columns

Index(['house_ID', 'household_ID', 'person_ID', 'region', 'strata',
       'locality_type', 'locality_size', 'age_months', 'age_years', 'sex',
       'BMI_SD', 'label_cat', 'label', 'hired_aid', 'has_TV', 'has_paid_TV',
       'has_radio', 'has_audio_system', 'has_computer', 'has_cell_phone',
       'has_internet', 'has_phone', 'has_iron', 'has_blender',
       'has_refrigerator', 'has_gas_stove', 'has_other_stove', 'has_wash_dry',
       'has_microwave', 'house_ID_remove', 'kitchen', 'place_cook',
       'cook_sleep', 'type_fuel', ' type_stove', 'food_expense', 'food_worry',
       'food_ran_out', 'healthy_food_lack_adu', 'low_food_variety_adu',
       'food_lack_meal_adu', 'ate_less_food_adu', 'hunger_adu', 'ate_once_adu',
       'healthy_food_lack_minor', 'low_food_variety_minor',
       'food_lack_meal_minor', 'ate_less_food_minor', 'ate_less_meals_minor',
       'hunger_minor', 'ate_once_minor', 'house_ID_remove'],
      dtype='object')

In [20]:
m1_dataset.drop([i for i in m1_dataset.columns if 'remove' in i],
               axis=1, inplace=True)
m1_dataset.columns

Index(['house_ID', 'household_ID', 'person_ID', 'region', 'strata',
       'locality_type', 'locality_size', 'age_months', 'age_years', 'sex',
       'BMI_SD', 'label_cat', 'label', 'hired_aid', 'has_TV', 'has_paid_TV',
       'has_radio', 'has_audio_system', 'has_computer', 'has_cell_phone',
       'has_internet', 'has_phone', 'has_iron', 'has_blender',
       'has_refrigerator', 'has_gas_stove', 'has_other_stove', 'has_wash_dry',
       'has_microwave', 'kitchen', 'place_cook', 'cook_sleep', 'type_fuel',
       ' type_stove', 'food_expense', 'food_worry', 'food_ran_out',
       'healthy_food_lack_adu', 'low_food_variety_adu', 'food_lack_meal_adu',
       'ate_less_food_adu', 'hunger_adu', 'ate_once_adu',
       'healthy_food_lack_minor', 'low_food_variety_minor',
       'food_lack_meal_minor', 'ate_less_food_minor', 'ate_less_meals_minor',
       'hunger_minor', 'ate_once_minor'],
      dtype='object')

In [21]:
m1_dataset.shape

(10301, 50)

In [22]:
m1_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10301 entries, 0 to 10300
Data columns (total 50 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   house_ID                 10301 non-null  object 
 1   household_ID             10301 non-null  object 
 2   person_ID                10301 non-null  object 
 3   region                   10301 non-null  object 
 4   strata                   10301 non-null  object 
 5   locality_type            10301 non-null  object 
 6   locality_size            10301 non-null  object 
 7   age_months               10301 non-null  int64  
 8   age_years                10301 non-null  int64  
 9   sex                      10301 non-null  object 
 10  BMI_SD                   10301 non-null  float64
 11  label_cat                10301 non-null  object 
 12  label                    10301 non-null  int64  
 13  hired_aid                10301 non-null  int64  
 14  has_TV                

## Check for missing values

Since a lot columns represent dummy variables for one answer and a lot of feature engineering is required, missing values would not be removed yet.

## Export dataset

In [23]:
#Export dataset as a csv
m1_dataset.to_csv(os.path.join(output_path,'m1_feature_extraction.csv'), index=None, header=True)