# ENIGH Data

This notebook extracts and cleans data from the National Household Income and Expenditure Survey (ENIGH). The microdata files can be downloaded from INEGI's website at the following link:
https://www.inegi.org.mx/programas/enigh/nc/2018/default.html#Microdatos

In [1]:
import os
from functools import reduce

import pandas as pd

## 2018

### Directories and files' paths

In [2]:
# Root folder that holds the survey CSV files. Set the ENIGH_DATA_ROOT environment
# variable to run the notebook on another machine; the original local folder is
# kept as the default so existing setups keep working unchanged.
dataRoot = os.environ.get('ENIGH_DATA_ROOT', r'D:\Tesis\Datos') #path of data folder

# files (each path is appended to dataRoot when the table is read)
tabConHogar_2018 = r'\Ingresos y Gastos de los Hogares\2018\concentradohogar.csv'

tabPoblacion_2018 = r'\Ingresos y Gastos de los Hogares\2018\poblacion.csv'

tabViviendas_2018 = r'\Ingresos y Gastos de los Hogares\2018\viviendas.csv'

tabGastosH_2018 = r'\Ingresos y Gastos de los Hogares\2018\gastoshogar.csv'

tabGastosP_2018 = r'\Ingresos y Gastos de los Hogares\2018\gastospersona.csv'

### Read files
The selected columns correspond to demographic and energy consumption indicators.

In [3]:
# Household summary table ("concentradohogar"): keep positional columns 0-16 plus
# five further columns selected by position.
concentrador_cols = [*range(17), 19, 22, 56, 82, 92]
concentrador_2018 = pd.read_csv(
    dataRoot + tabConHogar_2018,
    usecols=concentrador_cols,
    na_values=' ',  # a single blank is read as a missing value
)

In [4]:
# Population table ("poblacion"): five columns selected by position; later cells
# rely on folioviv, foliohog, edad and nivelaprob being among them.
poblacion_cols = [0, 1, 2, 5, 40]
poblacion_2018 = pd.read_csv(
    dataRoot + tabPoblacion_2018,
    usecols=poblacion_cols,
    na_values=' ',  # a single blank is read as a missing value
)

In [5]:
# Dwelling table ("viviendas"): columns selected by position.
vivienda_cols = [0, 1, 5, 10, *range(21, 25), 27, 46, 47, 50, 51, 52]
vivienda_2018 = pd.read_csv(
    dataRoot + tabViviendas_2018,
    usecols=vivienda_cols,
    na_values=['&', ' '],           # '&' and a single blank are both read as missing
    dtype={'combustible': float},   # presumably float so missing entries can be stored as NaN
)

In [6]:
# Household expenses table ("gastoshogar"): read four columns by position, then
# collapse to one row per (folioviv, foliohog, clave).
# min_count=1 leaves a group whose values are all missing as NaN instead of 0.
gastosH_cols = [0, 1, 2, 23]
gastosH_2018 = (
    pd.read_csv(dataRoot + tabGastosH_2018, usecols=gastosH_cols, na_values=' ')
    .groupby(['folioviv', 'foliohog', 'clave'], as_index=False)
    .sum(min_count=1)
)

In [7]:
# Person expenses table ("gastospersona"): read four columns by position, then
# collapse to one row per (folioviv, foliohog, clave).
# min_count=1 leaves a group whose values are all missing as NaN instead of 0.
gastosP_cols = [0, 1, 3, 17]
gastosP_2018 = (
    pd.read_csv(dataRoot + tabGastosP_2018, usecols=gastosP_cols, na_values=' ')
    .groupby(['folioviv', 'foliohog', 'clave'], as_index=False)
    .sum(min_count=1)
)

### Age and Education
This section uses variables from the Poblacion table to build household-level age and education variables.

In [8]:
# Transform values to school years
dic_edu = {0:0,1:0,2:6,3:9,4:12,5:15,6:15,7:16,8:18,9:21} # for mapping the number of school years equivalent to each value 

# Keep the original codes in a side column the first time this cell runs and always
# map from that copy. Mapping 'nivelaprob' onto itself would corrupt the data when
# the cell is re-run (e.g. 6 -> 15, 9 -> 21, 12 -> NaN).
if 'nivelaprob_raw' not in poblacion_2018.columns:
    poblacion_2018['nivelaprob_raw'] = poblacion_2018['nivelaprob']
# Codes that are not keys of dic_edu (including missing values) become NaN.
poblacion_2018['nivelaprob'] = poblacion_2018['nivelaprob_raw'].map(dic_edu)

In [9]:
# Age: mean age of all members of each household
pobEdad = poblacion_2018.groupby(['folioviv','foliohog'], sort=False, as_index=False)['edad'].mean()

# Education: mean school years of household members aged 15 or older
pobEdu = poblacion_2018.loc[poblacion_2018['edad']>=15].groupby(['folioviv','foliohog'],
                            sort=False, as_index=False)['nivelaprob'].mean()
# Fallback: mean school years over ALL members, used for households with no member aged 15+
pobEdu_men15 = poblacion_2018.groupby(['folioviv','foliohog'],
                            sort=False, as_index=False)['nivelaprob'].mean()

# merge
pob = pobEdad.merge(pobEdu, on=['folioviv','foliohog'], how='left')
# Series.fillna aligns on the index, so the fallback is first aligned to pob by
# household key (left merge keeps pob's row order and yields a fresh 0..n-1 index)
# instead of relying on both groupbys returning rows in the same position.
pob['nivelaprob'] = pob['nivelaprob'].fillna(
    pob[['folioviv','foliohog']]
    .merge(pobEdu_men15, on=['folioviv','foliohog'], how='left')['nivelaprob']
) # give a value for households with no population aged 15 or older

In [10]:
# Attach household-level age and education to the household summary table.
# Left join: every household of concentrador_2018 is kept.
con_2018 = pd.merge(
    concentrador_2018,
    pob,
    how='left',
    on=['folioviv', 'foliohog'],
)

In [11]:
# del concentrador_2018, poblacion_2018, pob, pobEdad, pobEdu, pobEdu_men15

### Housing characteristics
From Vivienda table

In [12]:
# Attach dwelling characteristics; the join key is the dwelling id only (folioviv).
# NOTE: this cell reassigns con_2018 from itself, so re-running it without
# re-running the previous merge joins the dwelling columns a second time.
con_2018 = pd.merge(
    con_2018,
    vivienda_2018,
    how='left',
    on=['folioviv'],
)

### Expenses
Energy expenses

In [13]:
# Expense keys (values of the `clave` column).
# NOTE(review): `claves` does not appear to be used in the visible cells; kept as is.
claves = [
    'R001', 'R003',
    'G009', 'G010', 'G011', 'G012', 'G013', 'G014',
]
# Expense key -> suffix of the output column built by mergeGastos ('gasto_tri_<suffix>').
# Insertion order determines the order of the resulting columns.
dic_cves = {
    'R001': 'ele',
    'R003': 'gas',
    'G009': 'lpg',
    'G010': 'oil',
    'G011': 'diesel',
    'G012': 'coal',
    'G013': 'wood',
    'G014': 'heat',
    'F007': 'Magna',
    'F008': 'Premium',
    'F009': 'Die-Gas',
}

In [14]:
# Sort every type of expenses into columns

def mergeGastos(gastoDF, cves=None, value_col='gasto_tri'):
    """Spread a long expense table into one column per expense key.

    Parameters
    ----------
    gastoDF : pandas.DataFrame
        Long table with the columns 'folioviv', 'foliohog', 'clave' and
        `value_col`. It is not modified.
    cves : dict, optional
        Mapping ``expense key -> column suffix``. Defaults to the notebook-level
        ``dic_cves`` so existing calls ``mergeGastos(df)`` behave as before.
    value_col : str, default 'gasto_tri'
        Name of the column holding the amounts.

    Returns
    -------
    pandas.DataFrame
        One row per (folioviv, foliohog) that has at least one of the requested
        keys, with one column ``<value_col>_<suffix>`` per entry of `cves`
        (NaN where the household has no row for that key). With an empty
        mapping, an empty frame holding only the two key columns is returned.
    """
    if cves is None:
        cves = dic_cves  # default mapping defined at notebook level
    keys = ['folioviv', 'foliohog']

    # One frame per expense key: rows of that key, with the amount column renamed.
    gastoList = [
        gastoDF.loc[gastoDF['clave'] == clave]
        .drop(columns=['clave'])
        .rename(columns={value_col: value_col + '_' + suffix})
        for clave, suffix in cves.items()
    ]

    # reduce() without an initial value raises on an empty list.
    if not gastoList:
        return gastoDF[keys].iloc[0:0].copy()

    # merge all the expenses (outer join keeps households present in any frame)
    return reduce(lambda left, right: pd.merge(left, right, on=keys, how='outer'), gastoList)

In [15]:
# Spread the household-level and person-level expense tables into one column per key.
gastH_2018, gastP_2018 = map(mergeGastos, (gastosH_2018, gastosP_2018))

In [16]:
# Add household-level and person-level expenses per (folioviv, foliohog).
# fill_value=0 treats an amount missing on one side as 0; a cell missing on both
# sides stays NaN.
gasto_2018 = (
    gastH_2018.set_index(['folioviv', 'foliohog'])
    .add(gastP_2018.set_index(['folioviv', 'foliohog']), fill_value=0)
    .reset_index()
)

In [17]:
# Attach the energy expense columns; households without any listed expense get NaN.
# NOTE: con_2018 is reassigned from itself, so this cell is not safe to re-run in isolation.
con_2018 = pd.merge(
    con_2018,
    gasto_2018,
    how='left',
    on=['folioviv', 'foliohog'],
)

### Completeness

In [18]:
# Columns checked for completeness: from 'folioviv' up to (not including) 'factor',
# plus everything from 'disp_elect' to the last column.
col_names = con_2018.columns.tolist()
subset = (
    col_names[col_names.index('folioviv'):col_names.index('factor')]
    + col_names[col_names.index('disp_elect'):]
)

In [19]:
# Percentage of non-missing values in each column of `subset`.
percent_completeness = (
    con_2018[subset]
    .notna()
    .sum()
    .mul(100)
    .div(len(con_2018[subset]))
)
percent_completeness

folioviv             100.000000
foliohog             100.000000
ubica_geo            100.000000
tam_loc              100.000000
est_socio            100.000000
est_dis              100.000000
upm                  100.000000
disp_elect           100.000000
focos_inca            99.332860
focos_ahor            99.332860
combustible           99.998660
tenencia             100.000000
calent_sol           100.000000
calent_gas           100.000000
tanque_gas           100.000000
aire_acond           100.000000
calefacc             100.000000
gasto_tri_ele         88.250030
gasto_tri_gas          6.146262
gasto_tri_lpg         50.797755
gasto_tri_oil          0.146021
gasto_tri_diesel       0.179512
gasto_tri_coal         2.330971
gasto_tri_wood         4.176993
gasto_tri_heat         0.336249
gasto_tri_Magna       42.735810
gasto_tri_Premium      3.284794
gasto_tri_Die-Gas      0.288022
dtype: float64

In [20]:
# For each categorical dwelling variable, print the percentage share of every code
# (missing values included) next to its label. The (column, labels) pairs are listed
# once and processed in a loop instead of repeating the same three statements.
yes_no = {1:'yes', 2:'no'}
category_labels = [
    ('disp_elect', {1:'public', 2:'priv plant', 3:'solar', 4:'other', 5:'no elec'}),
    ('combustible', {1:'wood', 2:'coal', 3:'gas tank', 4:'gas pipe', 5:'elec', 6:'other'}),
    ('calent_sol', yes_no),
    ('calent_gas', yes_no),
    ('tanque_gas', yes_no),
    ('aire_acond', yes_no),
    ('calefacc', yes_no),
    ('tenencia', {1:'rented', 2:'lend', 3:'owned and paying', 4:'own', 5:'litigated', 6:'other'}),
]

for column, keyDict in category_labels:
    # Share of each code in percent; dropna=False keeps NaN as its own row.
    s = (con_2018[column].value_counts(normalize=True, dropna=False) * 100).to_frame()
    # Codes without an entry in keyDict (e.g. NaN) get a NaN label.
    s['key'] = s.index.to_series().map(keyDict)
    print(s,'\n','-'*30)

   disp_elect         key
1   98.263828      public
5    0.667140     no elec
4    0.610875       other
3    0.282664       solar
2    0.175493  priv plant 
 ------------------------------
     combustible       key
3.0    74.854984  gas tank
1.0    17.635002      wood
4.0     5.141533  gas pipe
5.0     1.302129      elec
6.0     0.683216     other
2.0     0.381797      coal
NaN     0.001340       NaN 
 ------------------------------
   calent_sol  key
2   92.043887   no
1    7.956113  yes 
 ------------------------------
   calent_gas  key
2   65.710611   no
1   34.289389  yes 
 ------------------------------
   tanque_gas  key
2   91.087385   no
1    8.912615  yes 
 ------------------------------
   aire_acond  key
2   81.484855   no
1   18.515145  yes 
 ------------------------------
    calefacc  key
2  97.366271   no
1   2.633729  yes 
 ------------------------------
    tenencia               key
4  62.819671               own
2  13.721918              lend
1  12.351468          

## 2016

### Directories and files' paths

In [21]:
# Root folder that holds the survey CSV files. Set the ENIGH_DATA_ROOT environment
# variable to run the notebook on another machine; the original local folder is
# kept as the default so existing setups keep working unchanged.
dataRoot = os.environ.get('ENIGH_DATA_ROOT', r'D:\Tesis\Datos') #path of data folder

# files (each path is appended to dataRoot when the table is read)
tabConHogar_2016 = r'\Ingresos y Gastos de los Hogares\2016\concentradohogar.csv'

tabPoblacion_2016 = r'\Ingresos y Gastos de los Hogares\2016\poblacion.csv'

tabViviendas_2016 = r'\Ingresos y Gastos de los Hogares\2016\viviendas.csv'

tabGastosH_2016 = r'\Ingresos y Gastos de los Hogares\2016\gastoshogar.csv'

tabGastosP_2016 = r'\Ingresos y Gastos de los Hogares\2016\gastospersona.csv'

### Read files
The selected columns correspond to demographic and energy consumption indicators.

In [22]:
# Household summary table ("concentradohogar"), 2016 layout: positional columns
# 0-2 and 4-17 plus five further columns selected by position.
concentrador_cols = [0, 1, 2, *range(4, 18), 20, 23, 57, 83, 93]
concentrador_2016 = pd.read_csv(
    dataRoot + tabConHogar_2016,
    usecols=concentrador_cols,
    na_values=' ',  # a single blank is read as a missing value
)

In [23]:
# Population table ("poblacion"): five columns selected by position; later cells
# rely on folioviv, foliohog, edad and nivelaprob being among them.
poblacion_cols = [0, 1, 2, 5, 40]
poblacion_2016 = pd.read_csv(
    dataRoot + tabPoblacion_2016,
    usecols=poblacion_cols,
    na_values=' ',  # a single blank is read as a missing value
)

In [24]:
# Dwelling table ("viviendas"): columns selected by position.
vivienda_cols = [0, 1, 5, 10, *range(21, 25), 27, 46, 47, 50, 51, 52]
vivienda_2016 = pd.read_csv(
    dataRoot + tabViviendas_2016,
    usecols=vivienda_cols,
    na_values=['&', ' '],           # '&' and a single blank are both read as missing
    dtype={'combustible': float},   # presumably float so missing entries can be stored as NaN
)

In [25]:
# Household expenses table ("gastoshogar"): read four columns by position, then
# collapse to one row per (folioviv, foliohog, clave).
# min_count=1 leaves a group whose values are all missing as NaN instead of 0.
gastosH_cols = [0, 1, 2, 23]
gastosH_2016 = (
    pd.read_csv(dataRoot + tabGastosH_2016, usecols=gastosH_cols, na_values=' ')
    .groupby(['folioviv', 'foliohog', 'clave'], as_index=False)
    .sum(min_count=1)
)

In [26]:
# Person expenses table ("gastospersona"): read four columns by position, then
# collapse to one row per (folioviv, foliohog, clave).
# min_count=1 leaves a group whose values are all missing as NaN instead of 0.
gastosP_cols = [0, 1, 3, 17]
gastosP_2016 = (
    pd.read_csv(dataRoot + tabGastosP_2016, usecols=gastosP_cols, na_values=' ')
    .groupby(['folioviv', 'foliohog', 'clave'], as_index=False)
    .sum(min_count=1)
)

### Age and Education
This section uses variables from the Poblacion table to build household-level age and education variables.

In [27]:
# Transform values to school years
dic_edu = {0:0,1:0,2:6,3:9,4:12,5:15,6:15,7:16,8:18,9:21} # for mapping the number of school years equivalent to each value 

# Keep the original codes in a side column the first time this cell runs and always
# map from that copy. Mapping 'nivelaprob' onto itself would corrupt the data when
# the cell is re-run (e.g. 6 -> 15, 9 -> 21, 12 -> NaN).
if 'nivelaprob_raw' not in poblacion_2016.columns:
    poblacion_2016['nivelaprob_raw'] = poblacion_2016['nivelaprob']
# Codes that are not keys of dic_edu (including missing values) become NaN.
poblacion_2016['nivelaprob'] = poblacion_2016['nivelaprob_raw'].map(dic_edu)

In [28]:
# Age: mean age of all members of each household
pobEdad = poblacion_2016.groupby(['folioviv','foliohog'], sort=False, as_index=False)['edad'].mean()

# Education: mean school years of household members aged 15 or older
pobEdu = poblacion_2016.loc[poblacion_2016['edad']>=15].groupby(['folioviv','foliohog'],
                            sort=False, as_index=False)['nivelaprob'].mean()
# Fallback: mean school years over ALL members, used for households with no member aged 15+
pobEdu_men15 = poblacion_2016.groupby(['folioviv','foliohog'],
                            sort=False, as_index=False)['nivelaprob'].mean()

# merge
pob = pobEdad.merge(pobEdu, on=['folioviv','foliohog'], how='left')
# Series.fillna aligns on the index, so the fallback is first aligned to pob by
# household key (left merge keeps pob's row order and yields a fresh 0..n-1 index)
# instead of relying on both groupbys returning rows in the same position.
pob['nivelaprob'] = pob['nivelaprob'].fillna(
    pob[['folioviv','foliohog']]
    .merge(pobEdu_men15, on=['folioviv','foliohog'], how='left')['nivelaprob']
) # give a value for households with no population aged 15 or older

In [29]:
# Attach household-level age and education to the household summary table.
# Left join: every household of concentrador_2016 is kept.
con_2016 = pd.merge(
    concentrador_2016,
    pob,
    how='left',
    on=['folioviv', 'foliohog'],
)

In [30]:
# del concentrador_2016, poblacion_2016, pob, pobEdad, pobEdu, pobEdu_men15

### Housing characteristics
From Vivienda table

In [31]:
# Attach dwelling characteristics; the join key is the dwelling id only (folioviv).
# NOTE: this cell reassigns con_2016 from itself, so re-running it without
# re-running the previous merge joins the dwelling columns a second time.
con_2016 = pd.merge(
    con_2016,
    vivienda_2016,
    how='left',
    on=['folioviv'],
)

### Expenses
Energy expenses

In [32]:
# Expense keys (values of the `clave` column).
# NOTE(review): `claves` does not appear to be used in the visible cells; kept as is.
claves = [
    'R001', 'R003',
    'G009', 'G010', 'G011', 'G012', 'G013', 'G014',
]
# Expense key -> suffix of the output column built by mergeGastos ('gasto_tri_<suffix>').
# Insertion order determines the order of the resulting columns.
dic_cves = {
    'R001': 'ele',
    'R003': 'gas',
    'G009': 'lpg',
    'G010': 'oil',
    'G011': 'diesel',
    'G012': 'coal',
    'G013': 'wood',
    'G014': 'heat',
    'F007': 'Magna',
    'F008': 'Premium',
    'F009': 'Die-Gas',
}

In [33]:
# Spread the household-level and person-level expense tables into one column per key.
gastH_2016, gastP_2016 = map(mergeGastos, (gastosH_2016, gastosP_2016))

In [34]:
# Add household-level and person-level expenses per (folioviv, foliohog).
# fill_value=0 treats an amount missing on one side as 0; a cell missing on both
# sides stays NaN.
gasto_2016 = (
    gastH_2016.set_index(['folioviv', 'foliohog'])
    .add(gastP_2016.set_index(['folioviv', 'foliohog']), fill_value=0)
    .reset_index()
)

In [35]:
# Attach the energy expense columns; households without any listed expense get NaN.
# NOTE: con_2016 is reassigned from itself, so this cell is not safe to re-run in isolation.
con_2016 = pd.merge(
    con_2016,
    gasto_2016,
    how='left',
    on=['folioviv', 'foliohog'],
)

### Completeness

In [36]:
# Columns checked for completeness: from 'folioviv' up to (not including) 'factor',
# plus everything from 'disp_elect' to the last column.
col_names = con_2016.columns.tolist()
subset = (
    col_names[col_names.index('folioviv'):col_names.index('factor')]
    + col_names[col_names.index('disp_elect'):]
)

In [40]:
# Percentage of non-missing values in each column of `subset`.
percent_completeness = (
    con_2016[subset]
    .notna()
    .sum()
    .mul(100)
    .div(len(con_2016[subset]))
)
percent_completeness

folioviv             100.000000
foliohog             100.000000
ubica_geo            100.000000
tam_loc              100.000000
est_socio            100.000000
est_dis              100.000000
upm                  100.000000
disp_elect           100.000000
focos_inca            99.270384
focos_ahor            99.270384
combustible          100.000000
tenencia             100.000000
calent_sol           100.000000
calent_gas           100.000000
tanque_gas           100.000000
aire_acond           100.000000
calefacc             100.000000
gasto_tri_ele         88.848118
gasto_tri_gas          6.354624
gasto_tri_lpg         48.103426
gasto_tri_oil          0.146492
gasto_tri_diesel       0.257428
gasto_tri_coal         2.311160
gasto_tri_wood         3.788881
gasto_tri_heat         0.732460
gasto_tri_Magna       40.170101
gasto_tri_Premium      4.936639
gasto_tri_Die-Gas      0.391119
dtype: float64

In [38]:
# For each categorical dwelling variable, print the percentage share of every code
# (missing values included) next to its label. The (column, labels) pairs are listed
# once and processed in a loop instead of repeating the same three statements.
yes_no = {1:'yes', 2:'no'}
category_labels = [
    ('disp_elect', {1:'public', 2:'priv plant', 3:'solar', 4:'other', 5:'no elec'}),
    ('combustible', {1:'wood', 2:'coal', 3:'gas tank', 4:'gas pipe', 5:'elec', 6:'other'}),
    ('calent_sol', yes_no),
    ('calent_gas', yes_no),
    ('tanque_gas', yes_no),
    ('aire_acond', yes_no),
    ('calefacc', yes_no),
    ('tenencia', {1:'rented', 2:'lend', 3:'owned and paying', 4:'own', 5:'litigated', 6:'other'}),
]

for column, keyDict in category_labels:
    # Share of each code in percent; dropna=False keeps NaN as its own row.
    s = (con_2016[column].value_counts(normalize=True, dropna=False) * 100).to_frame()
    # Codes without an entry in keyDict (e.g. NaN) get a NaN label.
    s['key'] = s.index.to_series().map(keyDict)
    print(s,'\n','-'*30)

   disp_elect         key
1   98.311786      public
5    0.729616     no elec
4    0.520544       other
3    0.285873       solar
2    0.152181  priv plant 
 ------------------------------
     combustible       key
3.0    75.980999  gas tank
1.0    16.651733      wood
4.0     5.194066  gas pipe
5.0     1.099401      elec
6.0     0.768016     other
2.0     0.305784      coal 
 ------------------------------
   calent_sol  key
2   95.131629   no
1    4.868371  yes 
 ------------------------------
   calent_gas  key
2   64.600134   no
1   35.399866  yes 
 ------------------------------
   tanque_gas  key
2   91.354127   no
1    8.645873  yes 
 ------------------------------
   aire_acond  key
2   81.933126   no
1   18.066874  yes 
 ------------------------------
    calefacc  key
2  97.603504   no
1   2.396496  yes 
 ------------------------------
    tenencia               key
4  62.141059               own
2  13.754605              lend
1  12.165948            rented
3   9.645717  owne