# Feature extraction
## Unimodal
This notebook merges the features of all five modalities into a single dataset, joined on `person_ID`:
- Modality 1: House characteristics
- Modality 2: Household income and expenses
- Modality 3: Health information
- Modality 4: Biometrics
- Modality 5: Nutritional Knowledge

## Setup

### Libraries

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np

### Paths

In [2]:
# --- Input locations ---
# Preprocessed per-modality CSV files (m1..m5)
input_path = "../../2_data_preprocessing/output/"
# Folder that holds the sample (observation) dataset
input_path_sample = "../output"

# --- Output location ---
output_path = "../output/"

# --- Global configuration script (executed in the next cell) ---
glob_conf_path = "../../config/global_config_paper.py"

### Set local variables

In [3]:
# Run the global configuration script in this notebook's namespace so that the
# variables it defines become available to the cells below.
# The file is opened in a `with` block so the handle is closed deterministically,
# and the source is compiled with its real path so tracebacks point at the
# configuration file instead of "<string>".
# NOTE(review): exec runs arbitrary code -- only point glob_conf_path at a trusted file.
with open(glob_conf_path) as _glob_conf_file:
    exec(compile(_glob_conf_file.read(), glob_conf_path, "exec"))

## Extract features

In [4]:
# Modality 1: read the preprocessed table and summarise its columns
m1_file = os.path.join(input_path, "m1_unimodal.csv")
m1_dataset = pd.read_csv(m1_file)
m1_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10301 entries, 0 to 10300
Data columns (total 33 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   house_ID          10301 non-null  object 
 1   household_ID      10301 non-null  object 
 2   person_ID         10301 non-null  object 
 3   region            10301 non-null  object 
 4   strata            10301 non-null  object 
 5   locality_type     10301 non-null  object 
 6   locality_size     10301 non-null  object 
 7   age_months        10301 non-null  int64  
 8   age_years         10301 non-null  int64  
 9   sex               10301 non-null  object 
 10  BMI_SD            10301 non-null  float64
 11  label_cat         10301 non-null  object 
 12  label             10301 non-null  int64  
 13  hired_aid         10301 non-null  int64  
 14  has_TV            10301 non-null  int64  
 15  has_paid_TV       10301 non-null  int64  
 16  has_radio         10301 non-null  int64 

In [5]:
# Modality 2: read the preprocessed table and summarise its columns
m2_file = os.path.join(input_path, "m2_unimodal.csv")
m2_dataset = pd.read_csv(m2_file)
m2_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10301 entries, 0 to 10300
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   house_ID           10301 non-null  object 
 1   household_ID       10301 non-null  object 
 2   person_ID          10301 non-null  object 
 3   region             10301 non-null  object 
 4   strata             10301 non-null  object 
 5   locality_type      10301 non-null  object 
 6   locality_size      10301 non-null  object 
 7   age_months         10242 non-null  float64
 8   age_years          10301 non-null  float64
 9   sex                10301 non-null  object 
 10  BMI_SD             10301 non-null  float64
 11  label_cat          10301 non-null  object 
 12  label              10301 non-null  float64
 13  exp_fruits         9651 non-null   float64
 14  exp_vegetables     9651 non-null   float64
 15  exp_corn           9651 non-null   float64
 16  exp_bread          965

In [6]:
# Modality 3: read the preprocessed table and summarise its columns
m3_file = os.path.join(input_path, "m3_unimodal.csv")
m3_dataset = pd.read_csv(m3_file)
m3_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10301 entries, 0 to 10300
Data columns (total 88 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   house_ID                     10301 non-null  object 
 1   household_ID                 10301 non-null  object 
 2   person_ID                    10301 non-null  object 
 3   region                       10301 non-null  object 
 4   strata                       10301 non-null  object 
 5   locality_type                10301 non-null  object 
 6   locality_size                10301 non-null  object 
 7   age_months                   10301 non-null  int64  
 8   age_years                    10301 non-null  int64  
 9   sex                          10301 non-null  object 
 10  BMI_SD                       10301 non-null  float64
 11  label_cat                    10301 non-null  object 
 12  label                        10301 non-null  int64  
 13  ob_diag         

In [7]:
# Modality 4: read the preprocessed table and summarise its columns
m4_file = os.path.join(input_path, "m4_unimodal.csv")
m4_dataset = pd.read_csv(m4_file)
m4_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10301 entries, 0 to 10300
Data columns (total 30 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   house_ID               10301 non-null  object 
 1   household_ID           10301 non-null  object 
 2   person_ID              10301 non-null  object 
 3   region                 10301 non-null  object 
 4   strata                 10301 non-null  object 
 5   locality_type          10301 non-null  object 
 6   locality_size          10301 non-null  object 
 7   age_months             10260 non-null  float64
 8   age_years              10301 non-null  float64
 9   sex                    10301 non-null  object 
 10  BMI_SD                 10301 non-null  float64
 11  label_cat              10301 non-null  object 
 12  label                  10301 non-null  float64
 13  glucose_value          7370 non-null   float64
 14  hba1c_value            7213 non-null   float64
 15  al

In [8]:
# Modality 5: read the preprocessed table and summarise its columns.
# The recorded output lists 8672 rows here versus 10301 for the other modality
# tables, so not every person has a row in this modality.
m5_file = os.path.join(input_path, "m5_unimodal.csv")
m5_dataset = pd.read_csv(m5_file)
m5_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8672 entries, 0 to 8671
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   house_ID                8672 non-null   object 
 1   household_ID            8672 non-null   object 
 2   person_ID               8672 non-null   object 
 3   region                  8672 non-null   object 
 4   strata                  8672 non-null   object 
 5   locality_type           8672 non-null   object 
 6   locality_size           8672 non-null   object 
 7   age_months              8672 non-null   int64  
 8   age_years               8672 non-null   int64  
 9   sex                     8672 non-null   object 
 10  BMI_SD                  8672 non-null   float64
 11  label_cat               8672 non-null   object 
 12  label                   8672 non-null   int64  
 13  age_nut_know_adult      8672 non-null   object 
 14  sex_nut_know_adult      8672 non-null   

## Merge based on key

In [9]:
# Load the observation (sample) dataset that the modality tables are joined onto
sample_file = os.path.join(input_path_sample, "sample_dataset.csv")
sample_dataset = pd.read_csv(sample_file)

In [10]:
# Left-join every modality table onto the sample dataset, one at a time, keyed on
# person_ID. Columns that already exist on the left side come back from the
# right-hand table with a '_remove' suffix so they can be discarded afterwards.
# NOTE(review): the recorded output has 10303 rows while the m1-m4 tables have
# 10301 rows each -- confirm whether sample_dataset itself has 10303 rows or
# whether some person_ID is duplicated in one of the tables.
unimodal = sample_dataset
for modality_dataset in (m1_dataset, m2_dataset, m3_dataset, m4_dataset, m5_dataset):
    unimodal = unimodal.merge(
        modality_dataset, on="person_ID", how="left", suffixes=('', '_remove'))

unimodal.shape

(10303, 229)

In [11]:
# Discard the duplicated columns that the merges tagged with the '_remove' suffix.
# Only the suffix is matched (not any name that merely contains 'remove'), so a
# genuine feature whose name happens to include that word is never dropped.
# The frame is rebound rather than mutated in place.
duplicated_columns = [col for col in unimodal.columns if str(col).endswith('_remove')]
unimodal = unimodal.drop(columns=duplicated_columns)
unimodal.columns

Index(['house_ID', 'household_ID', 'person_ID', 'region', 'strata',
       'locality_type', 'locality_size', 'age_months', 'age_years', 'sex',
       ...
       'calories', 'product_A', 'product_B', 'product_C', 'energy_importance',
       'sodium_importance', 'sugars_importance', 'fats_importance',
       'sat_fats_importance', 'kinship_nut_know_adult'],
      dtype='object', length=169)

In [12]:
# Show the full list of column names in the merged dataset
list(unimodal.columns)

['house_ID',
 'household_ID',
 'person_ID',
 'region',
 'strata',
 'locality_type',
 'locality_size',
 'age_months',
 'age_years',
 'sex',
 'BMI_SD',
 'label_cat',
 'label',
 'hired_aid',
 'has_TV',
 'has_paid_TV',
 'has_radio',
 'has_audio_system',
 'has_computer',
 'has_cell_phone',
 'has_internet',
 'has_phone',
 'has_iron',
 'has_blender',
 'has_refrigerator',
 'has_gas_stove',
 'has_other_stove',
 'has_wash_dry',
 'has_microwave',
 'cook_sleep',
 'food_expense',
 'cooking_room',
 'result_elcsa',
 'exp_fruits',
 'exp_vegetables',
 'exp_corn',
 'exp_bread',
 'exp_grains',
 'exp_oil_sugar',
 'exp_meat',
 'exp_dairy',
 'exp_eggs',
 'exp_snacks',
 'exp_fast_food',
 'exp_bottled_water',
 'exp_soda',
 'exp_alcohol',
 'exp_tobacco',
 'exp_not_homemade',
 'monthly_income',
 'total_food_exp',
 'total_medical_exp',
 'ob_diag',
 'dm_diag',
 'hbp_diag',
 'cvd_ha_hi',
 'cvd_chest_angina',
 'cvd_heart_failure',
 'kd_diag_uti',
 'kd_diag_k_stones',
 'kd_diag_renal_fail',
 'chol_diag',
 'trig_diag

## Export dataset

In [13]:
# Export the merged dataset as a CSV file.
# `index` is documented as a bool, so pass False explicitly (rather than relying on
# None being falsy) to leave the row index out of the file.
export_file = os.path.join(output_path, 'unimodal_feature_extraction.csv')
unimodal.to_csv(export_file, index=False, header=True)