# ENIGH Data

This notebook extracts and cleans data from the National Survey of Household Income and Expenditure (ENIGH). The source files can be downloaded from INEGI's website at the following link:
https://www.inegi.org.mx/programas/enigh/nc/2018/default.html#Datos_abiertos

In [4]:
import pandas as pd
from functools import reduce

### Directories and file paths

In [5]:
dataRoot = r'D:\Tesis\Datos'  # path of the data folder

# Files: each path is relative to dataRoot and starts with a backslash, so it
# can be appended to dataRoot with plain string concatenation.
# Every fragment is a raw string: a backslash followed by "c" is not a valid
# escape sequence, and non-raw literals containing it trigger a warning (and
# are slated to become a syntax error in future Python versions).
tabConHogar_2018 = r'\Ingresos y Gastos de los Hogares\2018\conjunto_de_datos_concentradohogar_enigh_2018_ns' \
                   r'\conjunto_de_datos\conjunto_de_datos_concentradohogar_enigh_2018_ns.csv'
tabPoblacion_2018 = r'\Ingresos y Gastos de los Hogares\2018\conjunto_de_datos_poblacion_enigh_2018_ns' \
                    r'\conjunto_de_datos\conjunto_de_datos_poblacion_enigh_2018_ns.csv'
# NOTE(review): the folder name says "vivienda" while the file name says
# "viviendas" -- confirm against the extracted archive.
tabViviendas_2018 = r'\Ingresos y Gastos de los Hogares\2018\conjunto_de_datos_vivienda_enigh_2018_ns' \
                    r'\conjunto_de_datos\conjunto_de_datos_viviendas_enigh_2018_ns.csv'
tabGastosH_2018 = r'\Ingresos y Gastos de los Hogares\2018\conjunto_de_datos_gastoshogar_enigh_2018_ns' \
                  r'\conjunto_de_datos\conjunto_de_datos_gastoshogar_enigh_2018_ns.csv'
tabGastosP_2018 = r'\Ingresos y Gastos de los Hogares\2018\conjunto_de_datos_gastospersona_enigh_2018_ns' \
                  r'\conjunto_de_datos\conjunto_de_datos_gastospersona_enigh_2018_ns.csv'

### Read files
The selected columns correspond to demographic and energy-consumption indicators.

In [32]:
# Column positions kept from the concentrador hogar table:
# the first 17 columns plus five further ones.
concentrador_cols = [*range(17), 19, 22, 56, 82, 92]
# Load the concentrador hogar table; a cell holding a single space is read as missing.
concentrador_2018 = pd.read_csv(
    dataRoot + tabConHogar_2018,
    na_values=' ',
    usecols=concentrador_cols,
)

In [33]:
# Column positions kept from the poblacion table.
poblacion_cols = [0, 1, 2, 5, 40]
# Load the poblacion table; a cell holding a single space is read as missing.
poblacion_2018 = pd.read_csv(
    dataRoot + tabPoblacion_2018,
    na_values=' ',
    usecols=poblacion_cols,
)

In [34]:
# Column positions kept from the vivienda table.
vivienda_cols = [0, 1, 5, 10, *range(21, 25), 27, 46, 47, 50, 51, 52]
# Load the vivienda table; a cell holding a single space is read as missing.
vivienda_2018 = pd.read_csv(
    dataRoot + tabViviendas_2018,
    na_values=' ',
    usecols=vivienda_cols,
)

In [11]:
# Column positions kept from the gastoshogar table.
gastosH_cols = [0, 1, 2, 23]
# Load the gastoshogar table; a cell holding a single space is read as missing.
gastosH_2018 = pd.read_csv(
    dataRoot + tabGastosH_2018,
    na_values=' ',
    usecols=gastosH_cols,
)

FileNotFoundError: [Errno 2] File D:\Tesis\DatosC:\Users\mrbon\Downloads\enigh2018_ns_gastoshogar_csv\gastoshogar.csv does not exist: 'D:\\Tesis\\DatosC:\\Users\\mrbon\\Downloads\\enigh2018_ns_gastoshogar_csv\\gastoshogar.csv'

In [79]:
# Column positions kept from the gastospersona table.
gastosP_cols = [0, 1, 2, 3, 17, 19]
# Load the gastospersona table; a cell holding a single space is read as missing.
gastosP_2018 = pd.read_csv(
    dataRoot + tabGastosP_2018,
    na_values=' ',
    usecols=gastosP_cols,
)

### Age and Education
This section uses variables from the Poblacion table to build more accurate age and education variables.

In [18]:
# Recode 'nivelaprob' into school years.
# Lookup table: each 'nivelaprob' value (0-9) -> equivalent number of school years.
dic_edu = dict(zip(range(10), (0, 0, 6, 9, 12, 15, 15, 16, 18, 21)))
# Values that are not keys of the lookup table come out of .map() as NaN.
# NOTE: this overwrites the column, so the cell must be run only once per load.
escolaridad = poblacion_2018['nivelaprob'].map(dic_edu)
poblacion_2018['nivelaprob'] = escolaridad

In [19]:
# Columns used throughout this cell to group and join rows per household
hogar_keys = ['folioviv', 'foliohog']

# Age: mean 'edad' over all members of each (folioviv, foliohog) group
pobEdad = poblacion_2018.groupby(hogar_keys, sort=False, as_index=False)['edad'].mean()

# Education: mean 'nivelaprob' over members aged 15 or older
pobEdu = (poblacion_2018.loc[poblacion_2018['edad'] >= 15]
          .groupby(hogar_keys, sort=False, as_index=False)['nivelaprob'].mean())
# Fallback: mean 'nivelaprob' over every member, used for households that
# have nobody aged 15 or older
pobEdu_men15 = poblacion_2018.groupby(hogar_keys, sort=False, as_index=False)['nivelaprob'].mean()

# merge
pob = pobEdad.merge(pobEdu, on=hogar_keys, how='left')
# Bring the fallback in through an explicit join on the household keys instead
# of relying on both frames happening to share the same row order and index;
# the fill then stays correct even if either frame is later sorted or filtered.
pob = pob.merge(pobEdu_men15.rename(columns={'nivelaprob': 'nivelaprob_all'}),
                on=hogar_keys, how='left')
pob['nivelaprob'] = pob['nivelaprob'].fillna(pob['nivelaprob_all'])
pob = pob.drop(columns='nivelaprob_all')

In [None]:
# Attach the per-household age/education frame (pob) to the concentrador hogar
# table, keeping every concentrador row.
con_2018 = pd.merge(
    concentrador_2018,
    pob,
    how='left',
    on=['folioviv', 'foliohog'],
)

In [20]:
# del concentrador_2018, poblacion_2018, pob, pobEdad, pobEdu, pobEdu_men15

### Expenses
 - Electricity 
 - Natural Gas
 - LPG
 - Gasoline
 - Diesel

In [13]:
# Expense keys and the short suffix used to name each expense column
dic_cves = {
    'R001': 'ele',
    'R003': 'gas',
    'G009': 'glp',
    'G010': 'pet',
    'G011': 'die',
    'G012': 'car',
    'G013': 'len',
    'G014': 'heat',
}
# Plain list of the same keys, in the same order
claves = list(dic_cves)

In [14]:
# Convert every type of expense into its own column (one row per household)
gastoList = []
for clave, sufijo in dic_cves.items():
    # Keep only the rows of this expense key and collapse them to a single row
    # per (folioviv, foliohog). Without this step, a household with several
    # rows for the same key would be multiplied by the outer merges below.
    # min_count=1 keeps NaN when a household has no non-missing amount at all,
    # instead of turning it into 0.
    gasto_x = (
        gastosH_2018.loc[gastosH_2018['clave'] == clave]
        .groupby(['folioviv', 'foliohog'], sort=False, as_index=False)['gasto_tri']
        .sum(min_count=1)
        .rename(columns={'gasto_tri': 'gasto_tri_' + sufijo})
    )
    gastoList.append(gasto_x)

# merge all the expenses; the outer join keeps households that appear under
# any of the keys, leaving NaN where a household has no rows for a given key
gastH_2018 = reduce(
    lambda left, right: pd.merge(left, right, on=['folioviv', 'foliohog'], how='outer'),
    gastoList,
)

In [9]:
# Remove the raw gastoshogar table from the namespace (presumably to free
# memory once gastH_2018 has been built). Re-run the read cell before
# re-running any cell that still uses gastosH_2018.
del gastosH_2018