# Sample Size

In [5]:
import pandas as pd
import os

In [6]:
# Let rendered DataFrames show up to 200 columns and 200 rows before truncating.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200

## CPV2010 data extraction

In [7]:
# Read the 2015 metropolitan-zone (ZM) csv file, keeping only its first seven columns.
# The path is a raw string so the Windows backslashes are never treated as escape sequences.
ZM_2015 = pd.read_csv(r"D:\Tesis\Datos\Zonas metropolitanas\ZM_2015.csv", encoding='latin-1', usecols=list(range(7)))

In [4]:
# Positional indices of the columns to keep from the Censo de Población y Vivienda files
cpyv_col = (
    [0, 2]
    + list(range(4, 12))
    + [29, 32, 35, 38, 41, 44, 26, 48, 54]
    + [108, 129]
    + [132, 135, 138, 141]
    + list(range(157, 165))
    + [171, 172]
    + list(range(189, 198))
)

# Root folder of the Censo de Población y Vivienda files
CPV_path = 'D:/Tesis/Datos/Censo Poblacion y Vivienda 2010/AGEB/conjunto_de_datos'

# Locate filenames: every file under CPV_path (subfolders included) is collected.
# NOTE(review): no extension filter is applied, so the folder is assumed to hold only csv files.
CPV_files = [
    os.path.join(root, filename)
    for root, _dirs, filenames in os.walk(CPV_path)
    for filename in filenames
]
if not CPV_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError('No files found under ' + CPV_path)

# Read every file and stack them into a single dataframe.
# 'ageb' is read as text; '*' and 'N/D' are treated as missing values.
CPVlist = [
    pd.read_csv(file, usecols=cpyv_col, dtype={'ageb': str}, na_values=['*', 'N/D'])
    for file in CPV_files
]
CPV = pd.concat(CPVlist)

# Create a new CVE_MUN index: entidad * 1000 + mun
CPV['mun'] = CPV['entidad'] * 1000 + CPV['mun']

# Set names for the merging
CPV = CPV.rename(columns={'entidad': 'CVE_ENT', 'loc': 'CVE_LOC', 'mun': 'CVE_MUN'})

# Drop summary rows (mza == 0). The explicit copy makes the column insertion below
# act on an independent frame rather than on a slice of the concatenated data.
CPV = CPV.loc[CPV['mza'] != 0].copy()

# Create the missing age range (18 and over, minus 18-24, minus 60 and over)
# and place it immediately before 'p_60ymas'.
CPV.insert(
    CPV.columns.get_loc('p_60ymas'),
    'p_25a59',
    CPV['p_18ymas'] - CPV['p_18a24'] - CPV['p_60ymas'],
)

# Merge CPV with ZM dataframe; the left join keeps every CPV row.
ZM_CPV = CPV.merge(ZM_2015, how='left', on=['CVE_MUN', 'CVE_ENT']).reset_index(drop=True)

# Save CPV (raw strings keep the Windows backslashes literal)
pkls_path = pkls = r'D:\Tesis\ResEleCon-MX\pickles'
ZM_CPV.to_pickle(pkls + r'\CPV2010_ZM.pkl')

# Release the large intermediate frame; only ZM_CPV is needed from here on.
del CPV

KeyboardInterrupt: 

In [8]:
# Load the pickled ENIGH and CPV dataframes from the local pickles folder.
# File names are raw strings so '\e' and '\C' are not parsed as escape sequences.
# NOTE: pickle files can execute code when loaded; only read files you produced yourself.
pkls_path = pkls = r'D:\Tesis\ResEleCon-MX\pickles'
enigh = pd.read_pickle(pkls_path + r'\enigh_fullMX_2016.pkl')
ZM_CPV = pd.read_pickle(pkls_path + r'\CPV2010_ZM.pkl')

### CPV

In [9]:
# Sum of 'tothog' per state (CVE_ENT) and per state / metropolitan zone (CVE_ZM).
# 'tothog' is presumably the total number of households -- confirm against the census dictionary.
sample_estado = ZM_CPV.groupby('CVE_ENT')[['tothog']].sum()
sample_zm = ZM_CPV.groupby(['CVE_ENT', 'CVE_ZM'])[['tothog']].sum().reset_index()

# Put the state total next to each zone total and compute the zone's share of its state.
sample = (
    pd.merge(sample_estado, sample_zm, on='CVE_ENT')
    .rename(columns={'tothog_x': 'estado_CVP', 'tothog_y': 'ZM_CVP'})
    .assign(ratio_CVP=lambda totals: totals['ZM_CVP'] / totals['estado_CVP'])
)

### ENIGH

In [10]:
# Number of ENIGH rows with a non-null 'clase_hog' per state and per state / metropolitan zone.
sample_e1 = enigh.groupby('estado')[['clase_hog']].count()
sample_e2 = enigh.groupby(['estado', 'CVE_ZM'])[['clase_hog']].count().reset_index()

# Put the state count next to each zone count, rename 'estado' to 'CVE_ENT',
# and compute the zone's share of its state.
sample_e = (
    pd.merge(sample_e1, sample_e2, on='estado')
    .rename(columns={'estado': 'CVE_ENT', 'clase_hog_x': 'estado_enigh', 'clase_hog_y': 'ZM_enigh'})
    .assign(ratio_enigh=lambda counts: counts['ZM_enigh'] / counts['estado_enigh'])
)

### Both

In [11]:
# Join the CPV and ENIGH summaries; only state / zone pairs present in both are kept.
S_comparison = pd.merge(sample, sample_e, how='inner', on=['CVE_ENT', 'CVE_ZM'])

In [12]:
# ENIGH count divided by the CPV total, at state level and at metropolitan-zone level.
S_comparison = S_comparison.assign(
    enigh_cpv_e=lambda both: both['estado_enigh'] / both['estado_CVP'],
    enigh_cpv_zm=lambda both: both['ZM_enigh'] / both['ZM_CVP'],
)

In [13]:
# Display the comparison table: one row per state / metropolitan zone pair.
S_comparison

Unnamed: 0,CVE_ENT,estado_CVP,CVE_ZM,ZM_CVP,ratio_CVP,estado_enigh,ZM_enigh,ratio_enigh,enigh_cpv_e,enigh_cpv_zm
0,1,237057.0,1.01,207901.0,0.877008,2382,1695,0.711587,0.010048,0.008153
1,2,789668.0,2.01,109210.0,0.138299,3583,831,0.231929,0.004537,0.007609
2,2,789668.0,2.02,231740.0,0.293465,3583,1162,0.324309,0.004537,0.005014
3,2,789668.0,2.03,448718.0,0.568236,3583,1590,0.443762,0.004537,0.003543
4,3,149748.0,3.01,62860.0,0.419772,2251,796,0.353621,0.015032,0.012663
5,4,161662.0,4.01,64856.0,0.401183,1732,546,0.315242,0.010714,0.008419
6,5,642275.0,5.01,186258.0,0.289997,3289,1061,0.32259,0.005121,0.005696
7,5,642275.0,5.02,87501.0,0.136236,3289,318,0.096686,0.005121,0.003634
8,5,642275.0,5.03,45781.0,0.071279,3289,146,0.04439,0.005121,0.003189
9,5,642275.0,5.04,198641.0,0.309277,3289,946,0.287625,0.005121,0.004762


In [14]:
# Copy the comparison table to the system clipboard, without the row index.
S_comparison.to_clipboard(index=False)