# ENIGH Data

This notebook extracts and cleans data from the National Survey of Household Income and Expenditure (ENIGH). The microdata files can be downloaded from INEGI's website at the following link:
https://www.inegi.org.mx/programas/enigh/nc/2018/default.html#Microdatos

In [None]:
import pandas as pd
from functools import reduce

In [None]:
def extractENIGH(dataRoot, year):
    """Build one household-level table from the ENIGH microdata CSV files.

    The function reads, relative to ``dataRoot``:

    * ``Ingresos y Gastos de los Hogares/<year>/concentradohogar.csv``
    * ``Ingresos y Gastos de los Hogares/<year>/poblacion.csv``
    * ``Ingresos y Gastos de los Hogares/<year>/viviendas.csv``
    * ``Ingresos y Gastos de los Hogares/<year>/gastoshogar.csv``
    * ``Ingresos y Gastos de los Hogares/<year>/gastospersona.csv``
    * ``Zonas metropolitanas/ZM_2015.csv``

    Columns are selected by position, so the files must keep the column
    layout this function was written for.

    Parameters
    ----------
    dataRoot : str
        Root folder that contains the two sub-folders listed above.
    year : str
        Survey edition, ``'2016'`` or ``'2018'`` (an int is accepted too).

    Returns
    -------
    pandas.DataFrame
        The selected ``concentradohogar`` columns, left-joined with the
        metropolitan-area columns (``CVE_ZM``, ``NOM_ZM``, ``CVE_MUN``,
        ``NOM_MUN``), the household mean of ``edad`` and ``nivelaprob``,
        the selected ``viviendas`` columns, and one ``gasto_tri_<label>``
        column per expense key.

    Raises
    ------
    ValueError
        If ``year`` is neither ``'2016'`` nor ``'2018'``.
    """
    import os  # local import so the cell does not depend on another cell's imports

    year = str(year)  # the folder name is a string; tolerate an integer year

    # Positional columns of interest in concentradohogar.csv (they differ by edition).
    # Validated first so an unsupported year fails before any file is touched.
    if year == '2018':
        concentrador_cols = list(range(17)) + [19, 22, 56, 82, 92]
    elif year == '2016':
        concentrador_cols = [0, 1, 2] + list(range(4, 18)) + [20, 23, 57, 83, 93]
    else:
        raise ValueError('There is no data for the given year')

    # Build paths with os.path.join so they resolve on any operating system
    enighDir = os.path.join(dataRoot, 'Ingresos y Gastos de los Hogares', year)
    tabConHogar = os.path.join(enighDir, 'concentradohogar.csv')
    tabPoblacion = os.path.join(enighDir, 'poblacion.csv')
    tabViviendas = os.path.join(enighDir, 'viviendas.csv')
    tabGastosH = os.path.join(enighDir, 'gastoshogar.csv')
    tabGastosP = os.path.join(enighDir, 'gastospersona.csv')
    tabZM = os.path.join(dataRoot, 'Zonas metropolitanas', 'ZM_2015.csv')

    hogar_keys = ['folioviv', 'foliohog']  # join keys used for every household-level merge

    # Read files (a single blank is treated as a missing value)
    concentrador = pd.read_csv(tabConHogar, usecols=concentrador_cols, na_values=' ')  # tabla concentrador hogar

    poblacion_cols = [0, 1, 2, 5, 40]  # columns of interest
    poblacion = pd.read_csv(tabPoblacion, usecols=poblacion_cols, na_values=' ')  # tabla poblacion

    vivienda_cols = [0, 1, 5, 10] + list(range(21, 25)) + [27, 46, 47, 50, 51, 52]  # columns of interest
    vivienda = pd.read_csv(tabViviendas, usecols=vivienda_cols, na_values=['&', ' '],
                           dtype={'combustible': float})  # tabla vivienda

    # Expenses: add up the rows that share household and expense key.
    # min_count=1 keeps an all-missing group as NaN instead of turning it into 0.
    gastosH_cols = [0, 1, 2, 23]
    gastosH = pd.read_csv(tabGastosH, usecols=gastosH_cols, na_values=' ')  # tabla gastoshogar
    gastosH = gastosH.groupby(hogar_keys + ['clave'], as_index=False).sum(min_count=1)

    gastosP_cols = [0, 1, 3, 17]
    gastosP = pd.read_csv(tabGastosP, usecols=gastosP_cols, na_values=' ')  # tabla gastospersona
    gastosP = gastosP.groupby(hogar_keys + ['clave'], as_index=False).sum(min_count=1)

    # Add metropolitan area code and municipality name
    ZM_2015 = pd.read_csv(tabZM, encoding='latin-1', usecols=list(range(6)))
    concentrador = concentrador.merge(ZM_2015[['CVE_ZM', 'NOM_ZM', 'CVE_MUN', 'NOM_MUN']],
                                      left_on='ubica_geo', right_on='CVE_MUN', how='left')
    # Move the four columns just appended by the merge right after the first three columns
    cols = list(concentrador)
    concentrador = concentrador[cols[:3] + cols[-4:] + cols[3:-4]]

    # Transform values to school years
    dic_edu = {0: 0, 1: 0, 2: 6, 3: 9, 4: 12, 5: 15, 6: 15, 7: 16, 8: 18, 9: 21}  # school years equivalent to each value
    poblacion['nivelaprob'] = poblacion['nivelaprob'].map(dic_edu)

    # Age: household average
    pobEdad = poblacion.groupby(hogar_keys, sort=False, as_index=False)['edad'].mean()

    # Education: household average over members aged 15 or more ...
    pobEdu = poblacion.loc[poblacion['edad'] >= 15].groupby(
        hogar_keys, sort=False, as_index=False)['nivelaprob'].mean()
    # ... and over all members, used as a fallback for households without anyone aged 15+
    pobEduTodos = poblacion.groupby(hogar_keys, sort=False, as_index=False)['nivelaprob'].mean()
    pobEduTodos = pobEduTodos.rename(columns={'nivelaprob': 'nivelaprob_todos'})

    # Merge on the household keys so the fallback never depends on row order
    pob = pobEdad.merge(pobEdu, on=hogar_keys, how='left')
    pob = pob.merge(pobEduTodos, on=hogar_keys, how='left')
    pob['nivelaprob'] = pob['nivelaprob'].fillna(pob['nivelaprob_todos'])
    pob = pob.drop(columns='nivelaprob_todos')

    # Merge with concentrador hogar
    con = concentrador.merge(pob, on=hogar_keys, how='left')

    # Housing
    con = con.merge(vivienda, on='folioviv', how='left')

    # Expense keys and the label used as suffix of the output column
    dic_cves = {'R001': 'ele', 'R003': 'gas', 'G009': 'lpg', 'G010': 'oil', 'G011': 'diesel', 'G012': 'coal',
                'G013': 'wood', 'G014': 'heat', 'F007': 'Magna', 'F008': 'Premium', 'F009': 'Die-Gas'}

    def mergeGastos(gastoDF):
        """Turn the long (household, clave, gasto_tri) table into one column per expense key."""
        gastoList = [
            gastoDF.loc[gastoDF['clave'] == clave]
            .drop(columns='clave')
            .rename(columns={'gasto_tri': 'gasto_tri_' + etiqueta})
            for clave, etiqueta in dic_cves.items()
        ]
        # Outer merge keeps every household that has at least one of the keys
        return reduce(lambda left, right: pd.merge(left, right, on=hogar_keys, how='outer'), gastoList)

    gastH = mergeGastos(gastosH)
    gastP = mergeGastos(gastosP)

    # Sum expenses from hogar and persona; fill_value=0 lets a value present in only
    # one of the two tables pass through, while a value missing in both stays NaN
    gasto = gastH.set_index(hogar_keys).add(gastP.set_index(hogar_keys), fill_value=0)
    gasto = gasto.reset_index()

    # Merge with concentrador hogar
    con = con.merge(gasto, on=hogar_keys, how='left')

    return con

## Data extraction
Set the root folder of the data files and the edition (year) of the ENIGH survey to analyze. Only the 2016 and 2018 editions are supported.

In [None]:
# Root folder of the downloaded data and survey edition (passed as a string)
dataRoot, year = r'D:\Tesis\Datos', '2018'

# Build the household-level table for the chosen edition
con_2018 = extractENIGH(dataRoot=dataRoot, year=year)

In [None]:
# Bare expression: in a notebook the dataframe is rendered as this cell's output
con_2018

### Completeness

In [None]:
con_df = con_2018  # dataframe to check

# Columns for the completeness check: everything from 'folioviv' up to (but not
# including) 'factor', then 'publico', then everything from 'disp_elect' onwards
all_cols = list(con_df)
first, last = all_cols.index('folioviv'), all_cols.index('factor')
tail_start = all_cols.index('disp_elect')
subset = all_cols[first:last] + ['publico'] + all_cols[tail_start:]

In [None]:
# Percentage of non-null values in each checked column
checked = con_df[subset]
percent_completeness = checked.notnull().sum() * 100 / len(checked)
print(percent_completeness,'\n','-'*30)

# Columns whose value distribution is printed, each with the labels for its codes
yes_no = {1:'yes', 2:'no'}
coded_columns = [
    ('disp_elect', {1:'public', 2:'priv plant', 3:'solar', 4:'other', 5:'no elec'}),
    ('combustible', {1:'wood', 2:'coal', 3:'gas tank', 4:'gas pipe', 5:'elec', 6:'other'}),
    ('calent_sol', yes_no),
    ('calent_gas', yes_no),
    ('tanque_gas', yes_no),
    ('aire_acond', yes_no),
    ('calefacc', yes_no),
    ('tenencia', {1:'rented', 2:'lend', 3:'owned and paying', 4:'own', 5:'litigated', 6:'other'}),
]

for column, keyDict in coded_columns:
    # Share of each code in percent, missing values included, plus its label
    s = (con_df[column].value_counts(normalize=True, dropna=False) * 100).to_frame()
    s['key'] = s.index.to_series().map(keyDict)
    print(s,'\n','-'*30)

## Save data
Choose the folder where the dataframe will be saved in pickle format.

In [None]:
import os  # cell-local import, used only to build the output path

# Output folder for pickles; both names are kept in case other cells use either one
pkls_path = pkls = r'D:\Tesis\ResEleCon-MX\pickles'

# os.path.join replaces the former '\concentrador.pkl' literal, whose '\c' is an
# invalid escape sequence and whose separator only works on Windows
con_df.to_pickle(os.path.join(pkls, 'concentrador.pkl'))