# Data Check: Metropolitan areas

In [26]:
import pandas as pd
from functools import reduce

In [27]:
# Folder holding the pickled inputs. `pkls` is kept as an alias of `pkls_path`
# in case cells outside this view still refer to it.
pkls_path = pkls = r'D:\Tesis\ResEleCon-MX\pickles'
# The file name is a raw string: '\c' is not a valid escape sequence, so a plain
# literal only works by accident and emits a warning on recent Python versions.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
concentrador = pd.read_pickle(pkls_path + r'\concentrador_2018.pkl')

In [28]:
# Each total column is the row-wise sum of its component spending columns.
# min_count=1 leaves the total as NaN when every component is missing,
# instead of silently turning "no data" into 0.
spending_totals = {
    'gasto_tri_total_gas': ['gasto_tri_gas', 'gasto_tri_lpg'],
    'gasto_tri_total_gasolina': ['gasto_tri_Magna', 'gasto_tri_Premium'],
}
for total_col, component_cols in spending_totals.items():
    concentrador[total_col] = concentrador[component_cols].sum(axis=1, min_count=1)

## Check for true nulls

In [29]:
# A missing total is only a true null when the household could have had that
# expense. Where the survey answers rule the expense out, the gap is set to 0.

# Gas: cooking fuel is not a gas code (3 or 4) and both the gas heater and the
# gas tank questions were answered with code 2.
no_gas_equipment = (
    ~concentrador['combustible'].isin([3, 4])
    & concentrador['calent_gas'].eq(2)
    & concentrador['tanque_gas'].eq(2)
)
gas_total_missing = concentrador['gasto_tri_total_gas'].isna()
concentrador.loc[no_gas_equipment & gas_total_missing, 'gasto_tri_total_gas'] = 0

# Gasoline: the household reports zero vehicles.
no_vehicles = concentrador['vehiculos'].eq(0)
gasolina_total_missing = concentrador['gasto_tri_total_gasolina'].isna()
concentrador.loc[no_vehicles & gasolina_total_missing, 'gasto_tri_total_gasolina'] = 0

## Create DataFrame

In [30]:
dataRoot = r'D:\Tesis\Datos' #path of data folder
# The relative path is a raw string: '\Z' is not a valid escape sequence, so a
# plain literal only works by accident and emits a warning on recent Python versions.
# Only the first six columns of the catalogue are read.
ZM_2015 = pd.read_csv(dataRoot + r"\Zonas metropolitanas\ZM_2015.csv", encoding='latin-1', usecols=list(range(6)))
# Despite its name this is a dict: metropolitan-zone code (CVE_ZM) -> zone name (NOM_ZM).
metropolis_list = ZM_2015.set_index('CVE_ZM')['NOM_ZM'].to_dict()

In [31]:
# One column per metropolitan area; each row is a checked variable and each
# value is the percentage of households in that area with a non-null answer.
percent_completeness = pd.DataFrame()
col_names = list(concentrador)

# Columns for completeness check: everything from 'folioviv' up to (but not
# including) 'factor', the 'publico' column, and everything from 'disp_elect' on.
first_block_start = col_names.index('folioviv')
first_block_stop = col_names.index('factor')
second_block_start = col_names.index('disp_elect')
subset = (
    col_names[first_block_start:first_block_stop]
    + ['publico']
    + col_names[second_block_start:]
)

for key, name in metropolis_list.items():
    # Households belonging to this metropolitan area, restricted to the checked columns
    con_df = concentrador.loc[concentrador['CVE_ZM'].eq(key), subset]
    filled_counts = con_df.notna().sum()
    percent_completeness[name] = filled_counts.mul(100).div(len(con_df))

In [32]:
def Service_Availability(keyDict, infra, legend, data=None, metropolis=None):
    """Tabulate, per metropolitan area, the percentage share of each value of a column.

    Parameters
    ----------
    keyDict : dict
        Maps each code found in ``infra`` to a readable label. Codes without an
        entry (including NaN) get NaN in the 'Type' row.
    infra : str
        Name of the column whose value distribution is computed.
    legend : str
        Title for the table; returned unchanged alongside it.
    data : pandas.DataFrame, optional
        Household table with a 'CVE_ZM' column and the ``infra`` column.
        Defaults to the notebook-level ``concentrador``.
    metropolis : dict, optional
        Mapping of 'CVE_ZM' code -> metropolitan area name.
        Defaults to the notebook-level ``metropolis_list``.

    Returns
    -------
    (pandas.DataFrame, str)
        The transposed table and ``legend``. The first row, 'Type', holds the
        labels from ``keyDict``; every other row is one metropolitan area and
        every column is one distinct value of ``infra`` (missing values are
        counted too). Cells are percentages; a value never observed in an area
        is NaN.

    Raises
    ------
    TypeError
        If ``metropolis`` is empty (``reduce`` has nothing to merge).
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if data is None:
        data = concentrador
    if metropolis is None:
        metropolis = metropolis_list

    df_list = []
    for key, name in metropolis.items():
        con_df = data.loc[data['CVE_ZM'] == key]
        # dropna=False keeps missing answers as their own category
        shares = con_df[infra].value_counts(normalize=True, dropna=False) * 100
        df_list.append(shares.to_frame(name))

    # Outer-join on the index so a value seen in only some areas is still kept.
    s = reduce(
        lambda left, right: pd.merge(left, right, left_index=True, right_index=True, how='outer'),
        df_list,
    )
    s['Type'] = s.index.to_series().map(keyDict)
    # Move the label column (appended last) to the front so it becomes the first row of s.T.
    cols = list(s.columns)
    s = s[[cols[-1]] + cols[:-1]]
    return s.T, legend


In [33]:
# Yes/no coding shared by every equipment question below.
yes_no = {1:'yes', 2:'no'}

# (code -> label mapping, column in concentrador, table title), in output order.
service_specs = [
    ({1:'public', 2:'priv plant', 3:'solar', 4:'other', 5:'no elec'}, 'disp_elect', "Electricity source"),
    ({1:'wood', 2:'coal', 3:'gas tank', 4:'gas pipe', 5:'elec', 6:'other'}, 'combustible', "Energy source for cooking"),
    (yes_no, 'calent_sol', "Solar heater"),
    (yes_no, 'calent_gas', "Gas heater"),
    (yes_no, 'tanque_gas', "Gas tank"),
    (yes_no, 'aire_acond', "Air conditioned"),
    (yes_no, 'calefacc', "Heating"),
    ({1:'rented', 2:'lend', 3:'own and paying', 4:'own', 5:'litigated', 6:'other'}, 'tenencia', "Household ownership"),
]

# One (table, title) pair per service, in the order listed above.
frames_list = []
for keyDict, infra, legend in service_specs:
    frames_list.append(Service_Availability(keyDict, infra, legend))

In [34]:
# Write everything to a single workbook: the completeness table first (one row
# per metropolitan area after transposing), then one sheet per service table,
# each named after its legend.
excel_path = 'csv_files/data_check_2018_nanAsZero.xlsx'
completeness_by_metropolis = percent_completeness.T
with pd.ExcelWriter(excel_path) as writer:
    completeness_by_metropolis.to_excel(writer, sheet_name='Data available')
    for df, legend in frames_list:
        df.to_excel(writer, sheet_name=legend)

## Data Metropolis/State