# ENIGH Data

This notebook extracts and cleans data from the 2018 National Survey of Household Income and Expenditure (ENIGH). The source files can be downloaded from INEGI's website:
https://www.inegi.org.mx/programas/enigh/nc/2018/default.html#Microdatos

In [1]:
import pandas as pd
from functools import reduce

### Directories and file paths

In [2]:
# Root folder that holds the data files.
dataRoot = r'D:\Tesis\Datos'

# Every 2018 table lives in the same sub-folder of dataRoot. The leading
# backslash is kept because the paths are appended to dataRoot by plain string
# concatenation when the files are read.
_enigh_2018 = r'\Ingresos y Gastos de los Hogares\2018'

# files
tabConHogar_2018 = _enigh_2018 + r'\concentradohogar.csv'
tabPoblacion_2018 = _enigh_2018 + r'\poblacion.csv'
tabViviendas_2018 = _enigh_2018 + r'\viviendas.csv'
tabGastosH_2018 = _enigh_2018 + r'\gastoshogar.csv'
tabGastosP_2018 = _enigh_2018 + r'\gastospersona.csv'

### Read files
The selected columns correspond to demographic and energy-consumption indicators.

In [3]:
# Columns of interest, selected by position: the first 17 plus five more.
concentrador_cols = [*range(17), 19, 22, 56, 82, 92]

# Read the "concentrador hogar" table; cells holding a single space become NaN.
concentrador_2018 = pd.read_csv(
    dataRoot + tabConHogar_2018,
    usecols=concentrador_cols,
    na_values=' ',
)

In [4]:
# Columns of interest, selected by position.
poblacion_cols = [0, 1, 2, 5, 40]

# Read the "poblacion" table; cells holding a single space become NaN.
poblacion_2018 = pd.read_csv(
    dataRoot + tabPoblacion_2018,
    usecols=poblacion_cols,
    na_values=' ',
)

In [5]:
# Columns of interest, selected by position.
vivienda_cols = [0, 1, 5, 10, *range(21, 25), 27, 46, 47, 50, 51, 52]

# Read the "vivienda" table. Both '&' and a single space are treated as missing,
# and 'combustible' is forced to float so it can hold NaN.
vivienda_2018 = pd.read_csv(
    dataRoot + tabViviendas_2018,
    usecols=vivienda_cols,
    na_values=['&', ' '],
    dtype={'combustible': float},
)

In [6]:
# Columns of interest, selected by position.
gastosH_cols = [0, 1, 2, 23]

# Read the "gastoshogar" table and add up the records that share the same
# household and expense key. min_count=1 keeps the total as NaN when every
# record of the group is missing, instead of turning it into 0.
gastosH_2018 = (
    pd.read_csv(dataRoot + tabGastosH_2018, usecols=gastosH_cols, na_values=' ')
    .groupby(['folioviv', 'foliohog', 'clave'], as_index=False)
    .sum(min_count=1)
)

In [7]:
# Columns of interest, selected by position.
gastosP_cols = [0, 1, 3, 17]

# Read the "gastospersona" table and add up the records that share the same
# household and expense key, so person-level expenses become household totals.
# min_count=1 keeps the total as NaN when every record of the group is missing.
gastosP_2018 = (
    pd.read_csv(dataRoot + tabGastosP_2018, usecols=gastosP_cols, na_values=' ')
    .groupby(['folioviv', 'foliohog', 'clave'], as_index=False)
    .sum(min_count=1)
)

### Age and Education
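This section uses variables from the Poblacion table to build household-level age and education variables: the mean age of the household members and the mean years of schooling of the members aged 15 or older. Households with no member aged 15 or older use the mean over all their members instead.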

In [8]:
# Transform values to school years
dic_edu = {0:0,1:0,2:6,3:9,4:12,5:15,6:15,7:16,8:18,9:21} # for mapping the number of school years equivalent to each value

# Keep the original codes in a separate column and always map from it.
# Mapping 'nivelaprob' onto itself would corrupt the data if this cell were run
# twice (e.g. 6 -> 15 -> NaN), because the mapped years overlap with the codes.
if 'nivelaprob_raw' not in poblacion_2018.columns:
    poblacion_2018['nivelaprob_raw'] = poblacion_2018['nivelaprob']
poblacion_2018['nivelaprob'] = poblacion_2018['nivelaprob_raw'].map(dic_edu)

In [9]:
# Columns that identify a household in the tables used here
hh_keys = ['folioviv', 'foliohog']

# Age: mean age of all the members of each household
pobEdad = poblacion_2018.groupby(hh_keys, sort=False, as_index=False)['edad'].mean()

# Education: mean education of the members aged 15 or older
pobEdu = (poblacion_2018.loc[poblacion_2018['edad'] >= 15]
          .groupby(hh_keys, sort=False, as_index=False)['nivelaprob'].mean())
# Mean education over ALL members; only used as a fallback for households
# that have no member aged 15 or older
pobEdu_men15 = poblacion_2018.groupby(hh_keys, sort=False, as_index=False)['nivelaprob'].mean()

# Merge on the household keys. The fallback is attached as a temporary column
# ('nivelaprob_all') so each household receives its own value by key, instead of
# relying on two separately built frames happening to share the same row order.
pob = (pobEdad
       .merge(pobEdu, on=hh_keys, how='left')
       .merge(pobEdu_men15, on=hh_keys, how='left', suffixes=('', '_all')))
fallback = pob.pop('nivelaprob_all')  # removes the temporary column from pob
pob['nivelaprob'] = pob['nivelaprob'].fillna(fallback)  # households with no member aged 15+

In [10]:
# Attach the household-level age and education to the concentrador hogar table.
# A left join keeps every household of concentrador_2018.
con_2018 = pd.merge(
    concentrador_2018,
    pob,
    how='left',
    on=['folioviv', 'foliohog'],
)

In [11]:
# del concentrador_2018, poblacion_2018, pob, pobEdad, pobEdu, pobEdu_men15

### Housing characteristics
Housing variables from the Vivienda table, joined to each household on `folioviv`.

In [12]:
# Add the housing variables to every household, matching on 'folioviv' only.
# NOTE: con_2018 is overwritten with the merged result, so running this cell a
# second time would merge the housing columns again.
con_2018 = pd.merge(con_2018, vivienda_2018, how='left', on='folioviv')

### Expenses
Energy expenses

In [13]:
# Expense keys.
# NOTE(review): `claves` does not include the F007-F009 keys present in
# `dic_cves`, and mergeGastos iterates over `dic_cves` - confirm whether
# `claves` is still needed.
claves = [
    'R001', 'R003',
    'G009', 'G010', 'G011', 'G012', 'G013', 'G014',
]

# Expense key -> suffix of the column created for it by mergeGastos.
dic_cves = {
    'R001': 'ele',
    'R003': 'gas',
    'G009': 'lpg',
    'G010': 'oil',
    'G011': 'diesel',
    'G012': 'coal',
    'G013': 'wood',
    'G014': 'heat',
    'F007': 'Magna',
    'F008': 'Premium',
    'F009': 'Die-Gas',
}

In [14]:
# Sort every type of expenses into columns

def mergeGastos(gastoDF, cves=None, value_col='gasto_tri'):
    """Reshape long-format expenses into one column per expense key.

    Parameters
    ----------
    gastoDF : pandas.DataFrame
        Long table with the columns 'folioviv', 'foliohog', 'clave' and
        ``value_col``. It should hold at most one row per
        (folioviv, foliohog, clave); repeated rows would be multiplied by the
        merges below. The frame itself is not modified.
    cves : dict, optional
        Maps each expense key (a value of 'clave') to the suffix of the column
        created for it. Defaults to the notebook-level ``dic_cves``.
    value_col : str, default 'gasto_tri'
        Name of the column that holds the expense amount.

    Returns
    -------
    pandas.DataFrame
        One row per (folioviv, foliohog) that has a record for at least one of
        the requested keys, with one ``<value_col>_<suffix>`` column per key.
        A cell is NaN where the household has no record for that key.
    """
    if cves is None:
        cves = dic_cves  # mapping defined in an earlier cell of the notebook

    hh_keys = ['folioviv', 'foliohog']

    # One small table per expense key: household keys + the renamed amount column
    per_clave = [
        gastoDF.loc[gastoDF['clave'] == clave]
               .drop(columns='clave')
               .rename(columns={value_col: value_col + '_' + suffix})
        for clave, suffix in cves.items()
    ]

    # Outer merges keep a household even when it only has records for some keys
    return reduce(
        lambda left, right: pd.merge(left, right, on=hh_keys, how='outer'),
        per_clave,
    )

In [15]:
# Spread the household-level and the person-level expenses into one column per
# expense key (same reshaping for both tables).
gastH_2018, gastP_2018 = (
    mergeGastos(tabla) for tabla in (gastosH_2018, gastosP_2018)
)

In [16]:
# Sum expenses from hogar and persona, household by household.
# fill_value=0 lets an amount present in only one of the two tables pass through
# unchanged; a cell stays NaN only when it is missing in both.
_hh_index = ['folioviv', 'foliohog']
gasto_2018 = (
    gastH_2018.set_index(_hh_index)
    .add(gastP_2018.set_index(_hh_index), fill_value=0)
    .reset_index()
)

In [17]:
# Attach the energy expenses to the household table. A left join keeps the
# households without any recorded expense (their expense columns are NaN).
# NOTE: con_2018 is overwritten with the merged result, so running this cell a
# second time would merge the expense columns again.
con_2018 = pd.merge(
    con_2018,
    gasto_2018,
    how='left',
    on=['folioviv', 'foliohog'],
)

### Completeness

In [76]:
# Columns checked for completeness: everything from 'folioviv' up to (but not
# including) 'factor', plus everything from 'disp_elect' to the last column.
_cols = list(con_2018)
_first_part = _cols[_cols.index('folioviv'):_cols.index('factor')]
_second_part = _cols[_cols.index('disp_elect'):]
subset = _first_part + _second_part

In [None]:
# Share of non-missing values (in %) for each column of interest.
_checked = con_2018[subset]
percent_completeness = _checked.notna().sum() * 100 / len(_checked)
percent_completeness

In [None]:
# Labels of the codes used by each categorical housing variable.
_yes_no = {1:'yes', 2:'no'}
_code_books = {
    'disp_elect': {1:'public', 2:'priv plant', 3:'solar', 4:'other', 5:'no elec'},
    'combustible': {1:'wood', 2:'coal', 3:'gas tank', 4:'gas pipe', 5:'elec', 6:'other'},
    'calent_sol': _yes_no,
    'calent_gas': _yes_no,
    'tanque_gas': _yes_no,
    'aire_acond': _yes_no,
    'calefacc': _yes_no,
    'tenencia': {1:'rented', 2:'lend', 3:'owned and paying', 4:'own', 5:'litigated', 6:'other'},
}


def _share_table(frame, column, labels):
    """Return the share (in %) of each value of `column`, with its label.

    Missing values are counted as their own category (dropna=False). Values
    that have no entry in `labels` get NaN in the 'key' column.
    """
    table = (frame[column].value_counts(normalize=True, dropna=False) * 100).to_frame()
    table['key'] = table.index.to_series().map(labels)
    return table


# One table per variable, printed in the order of _code_books.
for _column, keyDict in _code_books.items():
    s = _share_table(con_2018, _column, keyDict)
    print(s,'\n','-'*30)