# Feature extraction
## Modality 3: Health information
Each modality is a combination of features from the different CSV that conform the database. This modality's features should include the following:
- CS_RESIDENTES
    - P3_5: relationship
    - P3_6, P3_6_1: place_birth
    - P3_9_01: hc_inst_IMSS
    - P3_9_02: hc_inst_ISSSTE
    - P3_9_03: hc_inst_ISSSTE_st
    - P3_9_04: hc_inst_PEMEX
    - P3_9_05: hc_inst_Defensa
    - P3_9_06: hc_inst_Marina
    - P3_9_07: hc_inst_SSA
    - P3_9_08: hc_inst_IMSS_Pro
    - P3_9_09: hc_inst_pharma
    - P3_9_10: hc_inst_private
    - P3_9_11: hc_inst_selfmed
    - P3_9_77: hc_inst_other
    - P3_9_12: hc_inst_doesnotgo
    - P3_9_99: hc_inst_doesnotknow
    - P3_11: speak_ind_lang
    - P3_12: speak_spanish
    - P3_13: attend_school
    - NIVEL: grade_achieved
    - P3_18: read_write
    - P3_19: marital_status
    - P3_25: worked_hours
    - P3_26_1: income_period
    - P3_26_2: income
    - P4_1_1: diff_use_legs
    - P4_1_2: diff_see
    - P4_1_3: diff_use_arms
    - P4_1_4: diff_learn
    - P4_1_5: diff_hear
    - P4_1_6: diff_self_care
    - P4_1_7: diff_speak
    - P4_1_8: diff_daily_act

- CS_SERV_SALUD
    - P8_1_1: care_costs
    - P8_1_2: treat_cost
    - P8_1_3: meds_cost
    - P8_1_4: care_travel
    - P8_1_5: time_diagnosis
    - P8_5_1: hc_pers_know_patient
    - P8_5_2: hc_pers_questions
    - P8_5_3: hc_pers_time
    - P8_5_4: hc_pers_shared_dec
    - P8_5_5: hc_pers_explain
    - P8_5_6: hc_pers_coordination
    - P8_6: eval_care
    - P8_7: specialist_care
    - P8_8_1: medication_check

- CS_ADULTOS
    - P1_1: ob_diag
    - P3_1: dm_diag
    - P4_1: hbp_diag
    - P5_2_1: cvd_ha_hi
    - P5_2_2: cvd_chest_angina
    - P5_2_3: cvd_heart_failure
    - P5_6: cvd_cerebral_inf_emb
    - P6_1_1: kd_diag_uti
    - P6_1_2: kd_diag_k_stones
    - P6_1_3: kd_diag_renal_fail
    - P6_4: chol_diag
    - P6_6: trig_diag
    - P7_1_1: fmh_father_dm
    - P7_1_2: fmh_father_hbp
    - P7_1_3: fmh_father_hi
    - P7_5_1: fmh_father_chol_tri
    - P7_1_2: fmh_mother_dm
    - P7_2_2: fmh_mother_hbp
    - P7_3_2: fmh_mother_hi
    - P7_5_2: fmh_mother_chol_tri
    - P7_1_3: fmh_sibling_dm
    - P7_2_3: fmh_sibling_hbp
    - P7_3_3: fmh_sibling_hi
    - P7_5_3: fmh_sibling_chol_tri
    - P12_1: violence
    - P13_2: present_smoker
    - P13_4: past_smoker
    - P13_2: present_e_smoker
    - P3_11: present_alc_drinker
    - P13_12_1: freq_alc_glasses
    - P14_1: sight_aid
    - P14_2: hearing_aid
    - P14_5: walk_difficulty
    - P14_6: memory_difficulty
    - P14_7: self_care_difficulty
    - P14_6: communication_difficulty

- CN_FCA_ADU
    - FRUTAS: fruit_intake
    - VERDURAS: vegetable_intake
    - LEGUMINOSAS: legumes_intake
    - CARNES: meat_intake
    - CARNES_PROC: proc_meat_intake
    - COMIDA_RAP: fast_food_intake
    - DULCES: sweets_intake
    - CEREALES_DUL: sug_cereal_intake
    - BEB_NOLAC_ENDUL: nd_sug_bev_intake
    - AGUA: water_intake
    - BEB_LAC_ENDUL: dairy_sug_bev_intake
    - LACTEOS: dairy_intake
    - HUEVO: egg_intake

## Setup

### Libraries

In [1]:
# Import libraries
import os
import pandas as pd
import numpy as np

### Paths

In [2]:
input_path = '../../0_source_csv/ensanut/'
input_path_sample = '../output'

output_path = '../output/'

# Global configuration path
glob_conf_path = '../../config/global_config_paper.py'

### Set local variables

In [3]:
exec(open(glob_conf_path).read())

## Extract features

In [4]:
# Residents survey
residents_dataset = pd.read_csv(os.path.join(input_path, "CS_RESIDENTES.csv"))

residents_dataset = residents_dataset[["UPM","VIV_SEL","HOGAR","NUMREN","EDAD","SEXO","P3_5","P3_9_01","P3_9_02","P3_9_03",
                                       "P3_9_04","P3_9_05","P3_9_06","P3_9_07","P3_9_08","P3_9_09","P3_9_10","P3_9_11",
                                       "P3_9_77","P3_9_12","P3_9_99","P3_10_01","P3_10_02","P3_10_03","P3_10_04","P3_10_05",
                                       "P3_10_06","P3_10_07","P3_10_08","P3_10_09","P3_10_10","P3_10_11","P3_10_99","P3_11",
                                       "P3_12","P3_13","NIVEL","P3_18","P3_19","P3_25","P4_1_1","P4_1_2",
                                       "P4_1_3","P4_1_4","P4_1_5","P4_1_6","P4_1_7","P4_1_8"]]

residents_dataset = residents_dataset.rename(columns={"EDAD":"age_health_info_adult",
                                                      "SEXO":"sex_health_info_adult",
                                                      "P3_5": "kinship",
                                                      "P3_9_01": "hc_inst_IMSS",
                                                      "P3_9_02": "hc_inst_ISSSTE",
                                                      "P3_9_03": "hc_inst_ISSSTE_st",
                                                      "P3_9_04": "hc_inst_PEMEX",
                                                      "P3_9_05": "hc_inst_Defensa",
                                                      "P3_9_06": "hc_inst_Marina",
                                                      "P3_9_06": "hc_inst_Marina",
                                                      "P3_9_07": "hc_inst_SSA",
                                                      "P3_9_08": "hc_inst_IMSS_Pro",
                                                      "P3_9_09": "hc_inst_pharma",
                                                      "P3_9_10": "hc_inst_private",
                                                      "P3_9_11": "hc_inst_selfmed",
                                                      "P3_9_77": "hc_inst_other",
                                                      "P3_9_12": "hc_inst_doesnotgo",
                                                      "P3_9_99": "hc_inst_doesnotknow",
                                                      "P3_10_01": "hc_afill_IMSS",
                                                      "P3_10_02": "hc_afill_ISSSTE",
                                                      "P3_10_03": "hc_afill_ISSSTE_st",
                                                      "P3_10_04": "hc_afill_PEMEX",
                                                      "P3_10_05": "hc_afill_Defensa",
                                                      "P3_10_06": "hc_afill_Marina",
                                                      "P3_10_07": "hc_afill_SSA",
                                                      "P3_10_08": "hc_afill_IMSS_Pro",
                                                      "P3_10_09": "hc_afill_private",
                                                      "P3_10_10": "hc_afill_other",
                                                      "P3_10_11": "hc_afill_none",
                                                      "P3_10_99": "hc_afill_notknow",
                                                      "P3_11": "speak_ind_lang",
                                                      "P3_12": "speak_spanish",
                                                      "P3_13": "attend_school",
                                                      "NIVEL": "grade_achieved",
                                                      "P3_18": "read_write",
                                                      "P3_19": "marital_status",
                                                      "P3_25": "worked_hours",
                                                      "P4_1_1": "diff_use_legs",
                                                      "P4_1_2": "diff_see",
                                                      "P4_1_3": "diff_use_arms",
                                                      "P4_1_4": "diff_learn",
                                                      "P4_1_5": "diff_hear",
                                                      "P4_1_6": "diff_self_care",
                                                      "P4_1_7": "diff_speak",
                                                      "P4_1_8": "diff_daily_act"})

# Add column with primary keys for house and household
residents_dataset["house_ID"] = residents_dataset["UPM"].astype(str)+'_'+residents_dataset["VIV_SEL"].astype(str)
residents_dataset["household_ID"] = residents_dataset["UPM"].astype(str)+'_'+residents_dataset["VIV_SEL"].astype(str)+'_'+residents_dataset["HOGAR"].astype(str)
residents_dataset["person_ID"] = residents_dataset["UPM"].astype(str)+'_'+residents_dataset["VIV_SEL"].astype(str)+'_'+residents_dataset["HOGAR"].astype(str)+'_'+residents_dataset["NUMREN"].astype(str)

# Delete unnecesary columns 
del residents_dataset["UPM"]
del residents_dataset["VIV_SEL"]
del residents_dataset["HOGAR"]
del residents_dataset["NUMREN"]

residents_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158044 entries, 0 to 158043
Data columns (total 47 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   age_health_info_adult  158044 non-null  int64  
 1   sex_health_info_adult  158044 non-null  int64  
 2   kinship                158044 non-null  int64  
 3   hc_inst_IMSS           158044 non-null  int64  
 4   hc_inst_ISSSTE         158044 non-null  int64  
 5   hc_inst_ISSSTE_st      158044 non-null  int64  
 6   hc_inst_PEMEX          158044 non-null  int64  
 7   hc_inst_Defensa        158044 non-null  int64  
 8   hc_inst_Marina         158044 non-null  int64  
 9   hc_inst_SSA            158044 non-null  int64  
 10  hc_inst_IMSS_Pro       158044 non-null  int64  
 11  hc_inst_pharma         158044 non-null  int64  
 12  hc_inst_private        158044 non-null  int64  
 13  hc_inst_selfmed        158044 non-null  int64  
 14  hc_inst_other          158044 non-nu

In [5]:
# Health services survey
health_services_dataset = pd.read_csv(os.path.join(input_path, "CS_SERV_SALUD.csv"))

health_services_dataset = health_services_dataset[["UPM","VIV_SEL","HOGAR","NUMREN","P8_1_1","P8_1_2","P8_1_3","P8_1_4","P8_1_5","P8_5_1",
                                       "P8_5_2","P8_5_3","P8_5_4","P8_5_5","P8_5_6"]]

health_services_dataset = health_services_dataset.rename(columns={"P8_1_1": "care_costs",
                                                      "P8_1_2": "treat_cost",
                                                      "P8_1_3": "meds_cost",
                                                      "P8_1_4": "care_travel",
                                                      "P8_1_5": "time_diagnosis",
                                                      "P8_5_1": "hc_pers_know_patient",
                                                      "P8_5_2": "hc_pers_questions",
                                                      "P8_5_3": "hc_pers_time",
                                                      "P8_5_4": "hc_pers_shared_dec",
                                                      "P8_5_5": "hc_pers_explain",
                                                      "P8_5_6": "hc_pers_coordination",
                                                      "P8_6": "eval_care",
                                                      "P8_7": "specialist_care",
                                                      "P8_8_1": "medication_check"})
                                                      

# Add column with primary keys for house and household
health_services_dataset["house_ID"] = health_services_dataset["UPM"].astype(str)+'_'+health_services_dataset["VIV_SEL"].astype(str)
health_services_dataset["household_ID"] = health_services_dataset["UPM"].astype(str)+'_'+health_services_dataset["VIV_SEL"].astype(str)+'_'+health_services_dataset["HOGAR"].astype(str)
health_services_dataset["person_ID"] = health_services_dataset["UPM"].astype(str)+'_'+health_services_dataset["VIV_SEL"].astype(str)+'_'+health_services_dataset["HOGAR"].astype(str)+'_'+health_services_dataset["NUMREN"].astype(str)

# Delete unnecesary columns 
del health_services_dataset["UPM"]
del health_services_dataset["VIV_SEL"]
del health_services_dataset["HOGAR"]
del health_services_dataset["NUMREN"]

health_services_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8757 entries, 0 to 8756
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   care_costs            8757 non-null   int64 
 1   treat_cost            8757 non-null   int64 
 2   meds_cost             8757 non-null   int64 
 3   care_travel           8757 non-null   int64 
 4   time_diagnosis        8757 non-null   int64 
 5   hc_pers_know_patient  8757 non-null   int64 
 6   hc_pers_questions     8757 non-null   int64 
 7   hc_pers_time          8757 non-null   int64 
 8   hc_pers_shared_dec    8757 non-null   int64 
 9   hc_pers_explain       8757 non-null   int64 
 10  hc_pers_coordination  8757 non-null   int64 
 11  house_ID              8757 non-null   object
 12  household_ID          8757 non-null   object
 13  person_ID             8757 non-null   object
dtypes: int64(11), object(3)
memory usage: 957.9+ KB


In [6]:
# Adulth health survey
adult_survey_dataset = pd.read_csv(os.path.join(input_path, "CS_ADULTOS.csv"))

adult_survey_dataset = adult_survey_dataset[["UPM","VIV_SEL","HOGAR","NUMREN","P1_1", "P3_1","P4_1","P5_2_1","P5_2_2","P5_2_3",
                                             "P5_6","P6_1_1","P6_1_2","P6_1_3","P6_4","P6_6","P7_1_1","P7_2_1","P7_3_1",
                                              "P7_5_1","P7_1_2","P7_2_2","P7_3_2","P7_5_2","P7_1_3","P7_2_3","P7_3_3","P7_5_3",
                                              "P12_1","P13_2","P13_4","P3_11","P14_1","P14_2","P14_5","P14_6","P14_7","P14_8"]]

adult_survey_dataset = adult_survey_dataset.rename(columns={"P1_1": "ob_diag",
                                                                  "P3_1": "dm_diag",
                                                                  "P4_1": "hbp_diag",
                                                                  "P5_2_1": "cvd_ha_hi",
                                                                  "P5_2_2": "cvd_chest_angina",
                                                                  "P5_2_3": "cvd_heart_failure",
                                                                  "P5_6": "cvd_cerebral_inf_emb",
                                                                  "P6_1_1": "kd_diag_uti",
                                                                  "P6_1_2": "kd_diag_k_stones",
                                                                  "P6_1_3": "kd_diag_renal_fail",
                                                                  "P6_4": "chol_diag",
                                                                  "P6_6": "trig_diag",
                                                                  "P7_1_1": "fmh_father_dm",
                                                                  "P7_2_1": "fmh_father_hbp",
                                                                  "P7_3_1": "fmh_father_hi",
                                                                  "P7_5_1": "fmh_father_chol_tri",
                                                                  "P7_1_2": "fmh_mother_dm",
                                                                  "P7_2_2": "fmh_mother_hbp",
                                                                  "P7_3_2": "fmh_mother_hi",
                                                                  "P7_5_2": "fmh_mother_chol_tri",
                                                                  "P7_1_3": "fmh_sibling_dm",
                                                                  "P7_2_3": "fmh_sibling_hbp",
                                                                  "P7_3_3": "fmh_sibling_hi",
                                                                  "P7_5_3": "fmh_sibling_chol_tri",
                                                                  "P12_1": "violence",
                                                                  "P13_2": "present_smoker",
                                                                  "P13_4": "past_smoker",
                                                                  "P3_11": "present_alc_drinker",
                                                                  "P14_1": "sight_aid",
                                                                  "P14_2": "hearing_aid",
                                                                  "P14_5": "walk_difficulty",
                                                                  "P14_6": "memory_difficulty",
                                                                  "P14_7": "self_care_difficulty",
                                                                  "P14_8": "communication_difficulty"})

# Add column with primary keys for house and household
adult_survey_dataset["house_ID"] = adult_survey_dataset["UPM"].astype(str)+'_'+adult_survey_dataset["VIV_SEL"].astype(str)
adult_survey_dataset["household_ID"] = adult_survey_dataset["UPM"].astype(str)+'_'+adult_survey_dataset["VIV_SEL"].astype(str)+'_'+adult_survey_dataset["HOGAR"].astype(str)
adult_survey_dataset["person_ID"] = adult_survey_dataset["UPM"].astype(str)+'_'+adult_survey_dataset["VIV_SEL"].astype(str)+'_'+adult_survey_dataset["HOGAR"].astype(str)+'_'+adult_survey_dataset["NUMREN"].astype(str)

# Delete unnecesary columns 
del adult_survey_dataset["UPM"]
del adult_survey_dataset["VIV_SEL"]
del adult_survey_dataset["HOGAR"]
del adult_survey_dataset["NUMREN"]

adult_survey_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43070 entries, 0 to 43069
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ob_diag                   43070 non-null  int64  
 1   dm_diag                   43070 non-null  int64  
 2   hbp_diag                  43070 non-null  int64  
 3   cvd_ha_hi                 43070 non-null  int64  
 4   cvd_chest_angina          43070 non-null  int64  
 5   cvd_heart_failure         43070 non-null  int64  
 6   cvd_cerebral_inf_emb      602 non-null    float64
 7   kd_diag_uti               43070 non-null  int64  
 8   kd_diag_k_stones          43070 non-null  int64  
 9   kd_diag_renal_fail        43070 non-null  int64  
 10  chol_diag                 43070 non-null  int64  
 11  trig_diag                 43070 non-null  int64  
 12  fmh_father_dm             42436 non-null  float64
 13  fmh_father_hbp            42436 non-null  float64
 14  fmh_fa

In [7]:
# Food intake frequency adults
food_intake_dataset = pd.read_csv(os.path.join(input_path, "CN_FCA_ADU.csv"))

food_intake_dataset = food_intake_dataset[["UPM","VIV_SEL","HOGAR","NUMREN","FRUTAS","VERDURAS","LEGUMINOSAS","CARNES",
                                           "CARNES_PROC","COMIDA_RAP","DULCES","CEREALES_DUL", "BEB_NOLAC_ENDUL",
                                           "AGUA","BEB_LAC_ENDUL","LACTEOS","HUEVO"]]

food_intake_dataset = food_intake_dataset.rename(columns={"FRUTAS": "fruit_intake",
                                                          "VERDURAS": "vegetable_intake",
                                                          "LEGUMINOSAS": "legumes_intake",
                                                          "CARNES": "meat_intake",
                                                          "CARNES_PROC": "proc_meat_intake",
                                                          "COMIDA_RAP": "fast_food_intake",
                                                          "DULCES": "sweets_intake",
                                                          "CEREALES_DUL": "sug_cereal_intake",
                                                          "BEB_NOLAC_ENDUL": "nd_sug_bev_intake",
                                                          "AGUA": "water_intake",
                                                          "BEB_LAC_ENDUL": "dairy_sug_bev_intake",
                                                          "LACTEOS": "dairy_intake",
                                                          "HUEVO": "egg_intake"})

# Add column with primary keys for house and household
food_intake_dataset["house_ID"] = food_intake_dataset["UPM"].astype(str)+'_'+food_intake_dataset["VIV_SEL"].astype(str)
food_intake_dataset["household_ID"] = food_intake_dataset["UPM"].astype(str)+'_'+food_intake_dataset["VIV_SEL"].astype(str)+'_'+food_intake_dataset["HOGAR"].astype(str)
food_intake_dataset["person_ID"] = food_intake_dataset["UPM"].astype(str)+'_'+food_intake_dataset["VIV_SEL"].astype(str)+'_'+food_intake_dataset["HOGAR"].astype(str)+'_'+food_intake_dataset["NUMREN"].astype(str)

# Delete unnecesary columns 
del food_intake_dataset["UPM"]
del food_intake_dataset["VIV_SEL"]
del food_intake_dataset["HOGAR"]
del food_intake_dataset["NUMREN"]

food_intake_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15791 entries, 0 to 15790
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   fruit_intake          15791 non-null  int64 
 1   vegetable_intake      15791 non-null  int64 
 2   legumes_intake        15791 non-null  int64 
 3   meat_intake           15791 non-null  int64 
 4   proc_meat_intake      15791 non-null  int64 
 5   fast_food_intake      15791 non-null  int64 
 6   sweets_intake         15791 non-null  int64 
 7   sug_cereal_intake     15791 non-null  int64 
 8   nd_sug_bev_intake     15791 non-null  int64 
 9   water_intake          15791 non-null  int64 
 10  dairy_sug_bev_intake  15791 non-null  int64 
 11  dairy_intake          15791 non-null  int64 
 12  egg_intake            15791 non-null  int64 
 13  house_ID              15791 non-null  object
 14  household_ID          15791 non-null  object
 15  person_ID             15791 non-null

## Merge based on key

In [8]:
# Import observation dataset
sample_dataset = pd.read_csv(os.path.join(input_path_sample, "sample_dataset.csv"))

In [9]:
# Concatenate based on household_ID or house_ID to evaluate size of n
n_merge1 = sample_dataset.merge(residents_dataset, left_on="household_ID", right_on="household_ID", how="left")
n_merge2 = sample_dataset.merge(health_services_dataset, left_on="household_ID", right_on="household_ID", how="left")
n_merge3 = sample_dataset.merge(adult_survey_dataset, left_on="household_ID", right_on="household_ID", how="left")
n_merge4 = sample_dataset.merge(food_intake_dataset, left_on="household_ID", right_on="household_ID", how="left")

In [10]:
# See lenght of each concatenation
print("Size of n from Sample & Residents survey: ",len(n_merge1))
print("Size of n from Sample & Health services: ",len(n_merge2))
print("Size of n from Sample & Adult health survey: ",len(n_merge3))
print("Size of n from Sample & Food intake: ",len(n_merge4))

Size of n from Sample & Residents survey:  50211
Size of n from Sample & Health services:  10501
Size of n from Sample & Adult health survey:  10301
Size of n from Sample & Food intake:  10312


Before merging the four datasets, first we need to identify include and distinguish the kinship and personal IDs of the random adult and the observation.

In [11]:
# It is necessary to do the same as before but for the random adult
# Generate dataframe with person_ID and kinship
relationship_random_adult = pd.DataFrame(residents_dataset[["person_ID","kinship"]])

# Merge with sample dataset based on person_ID
adult_survey_dataset = adult_survey_dataset.merge(relationship_random_adult, left_on="person_ID", right_on="person_ID", how="left", suffixes=('', '_remove'))

# Remove duplicated columns
adult_survey_dataset.drop([i for i in adult_survey_dataset.columns if 'remove' in i],
               axis=1, inplace=True)
adult_survey_dataset.columns

# Rename columns to identify them as random adult columns
adult_survey_dataset = adult_survey_dataset.rename(columns={"kinship":"kinship_random_adult"})

adult_survey_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43070 entries, 0 to 43069
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ob_diag                   43070 non-null  int64  
 1   dm_diag                   43070 non-null  int64  
 2   hbp_diag                  43070 non-null  int64  
 3   cvd_ha_hi                 43070 non-null  int64  
 4   cvd_chest_angina          43070 non-null  int64  
 5   cvd_heart_failure         43070 non-null  int64  
 6   cvd_cerebral_inf_emb      602 non-null    float64
 7   kd_diag_uti               43070 non-null  int64  
 8   kd_diag_k_stones          43070 non-null  int64  
 9   kd_diag_renal_fail        43070 non-null  int64  
 10  chol_diag                 43070 non-null  int64  
 11  trig_diag                 43070 non-null  int64  
 12  fmh_father_dm             42436 non-null  float64
 13  fmh_father_hbp            42436 non-null  float64
 14  fmh_fa

Since the "kinship" column is no longer necessary it will be removed before merging the rest of the datasets. 

In [12]:
del residents_dataset["kinship"]

Next step is merging the datasets corresponding to the random adult in the house.

In [13]:
health_info_adult = adult_survey_dataset.merge(
    residents_dataset, left_on="person_ID", right_on="person_ID", how="left", suffixes=('', '_remove')).merge(
    food_intake_dataset,left_on="person_ID", right_on="person_ID", how="left", suffixes=('', '_remove')).merge(
        health_services_dataset, left_on="person_ID", right_on="person_ID", how="left", suffixes=('', '_remove'))
    
health_info_adult.shape

# Remove duplicated columns
health_info_adult.drop([i for i in health_info_adult.columns if 'remove' in i],
               axis=1, inplace=True)
health_info_adult.columns

Index(['ob_diag', 'dm_diag', 'hbp_diag', 'cvd_ha_hi', 'cvd_chest_angina',
       'cvd_heart_failure', 'cvd_cerebral_inf_emb', 'kd_diag_uti',
       'kd_diag_k_stones', 'kd_diag_renal_fail',
       ...
       'treat_cost', 'meds_cost', 'care_travel', 'time_diagnosis',
       'hc_pers_know_patient', 'hc_pers_questions', 'hc_pers_time',
       'hc_pers_shared_dec', 'hc_pers_explain', 'hc_pers_coordination'],
      dtype='object', length=105)

In [14]:
m3_dataset = sample_dataset.merge(health_info_adult, left_on="household_ID", right_on="household_ID", how="left", suffixes=('', '_remove'))
m3_dataset.shape

# Remove duplicated columns
m3_dataset.drop([i for i in m3_dataset.columns if 'remove' in i],
               axis=1, inplace=True)
m3_dataset.columns.tolist()

['house_ID',
 'household_ID',
 'person_ID',
 'region',
 'strata',
 'locality_type',
 'locality_size',
 'age_months',
 'age_years',
 'sex',
 'BMI_SD',
 'label_cat',
 'label',
 'ob_diag',
 'dm_diag',
 'hbp_diag',
 'cvd_ha_hi',
 'cvd_chest_angina',
 'cvd_heart_failure',
 'cvd_cerebral_inf_emb',
 'kd_diag_uti',
 'kd_diag_k_stones',
 'kd_diag_renal_fail',
 'chol_diag',
 'trig_diag',
 'fmh_father_dm',
 'fmh_father_hbp',
 'fmh_father_hi',
 'fmh_father_chol_tri',
 'fmh_mother_dm',
 'fmh_mother_hbp',
 'fmh_mother_hi',
 'fmh_mother_chol_tri',
 'fmh_sibling_dm',
 'fmh_sibling_hbp',
 'fmh_sibling_hi',
 'fmh_sibling_chol_tri',
 'violence',
 'present_smoker',
 'past_smoker',
 'present_alc_drinker',
 'sight_aid',
 'hearing_aid',
 'walk_difficulty',
 'memory_difficulty',
 'self_care_difficulty',
 'communication_difficulty',
 'kinship_random_adult',
 'age_health_info_adult',
 'sex_health_info_adult',
 'hc_inst_IMSS',
 'hc_inst_ISSSTE',
 'hc_inst_ISSSTE_st',
 'hc_inst_PEMEX',
 'hc_inst_Defensa',
 'hc_in

In [15]:
m3_dataset.shape

(10301, 115)

In [16]:
m3_dataset.duplicated().sum()

0

## Check for missing values

Since a lot columns represent dummy variables for one answer and a lot of feature engineering is required, missing values would not be removed yet.

## Export dataset

In [17]:
#Export dataset as a csv
m3_dataset.to_csv(os.path.join(output_path,'m3_feature_extraction.csv'), index=None, header=True)