# Administrative Descriptive Stats Preparation

After appending travel time information to each populated place in an administrative center we can prepare any number of descriptive stats. Given the quantity of data in question these are best prepared with Dask Dataframes. This notebook separates out the descriptive stats preparations,

In [1]:
import os, sys
from datetime import date

import pandas as pd
import geopandas as gpd
import numpy as np
from scipy import stats

import re

# custom functions
import sensitivity_testing as st

## Setup

### Data prep

Dates

In [2]:
today = date.today().strftime("%y%m%d")

In [3]:
# data_date = '211022'
data_date = '211221'

Directories

In [4]:
geo_dir = r'P:\PAK\GEO'
data_dir = r'../../data'

rast_dir = r'rast_inputs'
vect_in_dir = r'vect_inputs'
vect_out_dir = r'vect_out'

rds_dir = r'roads'
dest_dir = r'destinations'
speed_dir = r'speed'
fric_dir = r'friction'
acc_dir = r'access'
tab_dir = r'tabular'

Projections

In [5]:
# change this to whatever the desired output projection is
DEST_CRS = 'EPSG:32642'

dcrs_int = int(re.findall('[0-9]+',DEST_CRS)[0])
dcrs_int

32642

## Create master files merging admin aggregates and secondary data

There's a lot of primary (modeled) data, secondary data, and spatial data to bring together. Let's do that first

Spatial data

In [6]:
##### TEMPORARILY INSERTING "BEFORE OSM INVESTMENT" DATA HERE

In [7]:
# adm2_geo = gpd.read_file('../../Boundaries/KP_Analysis/KP_Analysis_Focus_Districts.gpkg')
# adm3_geo = gpd.read_file(os.path.join(geo_dir,'Boundaries/KP_Analysis/KP_Analysis_Focus_Tehsils.gpkg')).rename({'ADM1_PCODE':'Adm1_Code','ADM2_PCODE' : 'Adm2_Code','ADM3_PCODE':'Adm3_Code'},axis=1)
adm3_geo = gpd.read_file(os.path.join(geo_dir,'Boundaries/KP_Analysis/KP_Analysis_All_Tehsils.gpkg')).rename({'ADM1_PCODE':'Adm1_Code','ADM2_PCODE' : 'Adm2_Code','ADM3_PCODE':'Adm3_Code'},axis=1)

In [8]:
adm2_ds = adm3_geo.dissolve(by='Adm2_Code')
adm2_geo = adm2_ds.reset_index()[['geometry','ADM1_EN','ADM2_EN','Adm1_Code','Adm2_Code']]
adm3_geo = adm3_geo.drop('Adm2_Code',axis=1)

Tabular data

In [9]:
# access mean

adm2_acc =  pd.read_csv(os.path.join(data_dir,tab_dir,f"processed//BeforeOSMInvestment_adm2_mean_{data_date}.csv"))
adm3_acc =  pd.read_csv(os.path.join(data_dir,tab_dir,f"processed//BeforeOSMInvestment_adm3_mean_{data_date}.csv"))

# # access standard deviation

# adm2_sd =  pd.read_csv(os.path.join(data_dir,tab_dir,f"processed//adm2_sd_{data_date}.csv"))
# adm3_sd =  pd.read_csv(os.path.join(data_dir,tab_dir,f"processed//adm3_sd_{data_date}.csv"))

# elevation
adm2_elev =  pd.read_csv(os.path.join(data_dir,tab_dir,"processed//adm2_elev.csv"))
adm3_elev =  pd.read_csv(os.path.join(data_dir,tab_dir,"processed//adm3_elev.csv"))

# educational gender ratios
adm3_educ_ratio = pd.read_csv(os.path.join(data_dir,tab_dir,'processed//adm3_educ_gender_ratio.csv')).drop(['Adm2_Code'],axis=1)

# agricultural land usage
adm3_agr = pd.read_csv(os.path.join(data_dir,tab_dir,'processed//adm3_KP_Agricultural_Areas.csv'))

# agricultural production
adm2_agrprod = pd.read_csv(os.path.join(data_dir,tab_dir,'processed//adm2_crop_hazard_d4p_KPK_211015.csv'))

# terrain roughness index
adm3_TRI = pd.read_csv(os.path.join(data_dir,tab_dir,'processed//adm3_TRI.csv'))


In [10]:
# clean up ag data slightly
adm2_agrprod = adm2_agrprod[adm2_agrprod['year'] == 2013].iloc[:,5:].drop(['district_data4pakistan','district_data4pakistan_num'],axis=1)

Unify tabular data

In [11]:
from functools import reduce

adm2_dfs = [adm2_acc,adm2_elev] # agrprod is missing 3 districts and thus returning nulls for them. Took it out for now.
adm3_dfs = [adm3_acc,adm3_elev,adm3_TRI,adm3_agr,adm3_educ_ratio] # 

adm2_tab = reduce(lambda left,right: pd.merge(left,right,on='Adm2_Code'), adm2_dfs)
adm3_tab = reduce(lambda left,right: pd.merge(left,right,how='left',on='Adm3_Code'), adm3_dfs)

In [12]:
adm3_tab

Unnamed: 0.1,Unnamed: 0,Adm3_Code,childwalk_dry_District_HQs_avg_adm3,childwalk_dry_education_allboys_avg_adm3,childwalk_dry_education_allgirls_avg_adm3,childwalk_dry_education_boys_avg_adm3,childwalk_dry_education_boys_high_avg_adm3,childwalk_dry_education_boys_middle_avg_adm3,childwalk_dry_education_boys_primary_avg_adm3,childwalk_dry_education_girls_avg_adm3,...,irrig_sqkm,non_irrig_sqkm,total_agr_sqkm,orch_pct,irrig_pct,non_irrig_pct,total_agr_pct,Adm3_En,Boys_schools,Girls_schools
0,0,PK20602,27.598384,0.597785,1.364126,0.597785,1.939344,1.756274,0.606823,1.364126,...,7.007183,43.932157,54.911789,0.000455,0.000802,0.005031,0.006288,Mastuj,2.402586,9.160957
1,1,PK20601,7.395280,0.474356,0.937578,0.474356,1.300650,1.894393,0.490474,0.937578,...,6.931465,31.564670,42.336770,0.000649,0.001171,0.005335,0.007155,Chitral,2.476649,10.744338
2,2,PK21201,9.349773,1.298185,1.698798,1.298185,5.097626,3.042713,1.314433,1.698798,...,1.589743,28.673123,30.635728,0.000066,0.000283,0.005102,0.005451,Dassu,1.417964,inf
3,3,PK22206,24.582286,0.621943,0.815934,0.621943,1.657484,1.508284,0.627695,0.815934,...,1.395410,22.138573,23.610657,0.000036,0.000663,0.010514,0.011214,Kalam,2.156381,inf
4,4,PK23003,15.762571,0.698686,1.356720,0.698686,1.911438,1.877487,0.729373,1.356720,...,0.907668,16.886898,18.000187,0.000268,0.001181,0.021972,0.023420,Kalkot,13.805527,inf
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,111,PK23808,2.904620,0.737904,0.814151,0.737904,1.383095,1.181543,0.786788,0.814151,...,3.268404,54.088002,58.332932,0.000437,0.001462,0.024187,0.026085,,,
112,112,PK20703,8.089872,0.218852,0.550919,0.219352,0.996699,1.532518,0.229353,0.552720,...,67.146160,717.361577,796.755828,0.010939,0.059968,0.640669,0.711576,Kulachi,5.683960,56.793002
113,113,PK23807,11.271521,2.150485,2.178054,2.150485,4.459155,2.507794,2.155306,2.178054,...,0.915049,34.596888,35.544239,0.000062,0.001746,0.066024,0.067832,,,
114,114,PK20701,16.702657,0.224358,0.586715,0.224358,2.454329,1.580521,0.230002,0.586752,...,30.244979,1154.694670,1209.756072,0.014533,0.017713,0.676233,0.708479,Daraban,7.875748,18.156053


In [13]:
adm3_tab = pd.merge(adm3_tab,adm2_agrprod,how='left',on='Adm2_Code')

In [14]:
adm3_tab.columns

Index(['Unnamed: 0', 'Adm3_Code', 'childwalk_dry_District_HQs_avg_adm3',
       'childwalk_dry_education_allboys_avg_adm3',
       'childwalk_dry_education_allgirls_avg_adm3',
       'childwalk_dry_education_boys_avg_adm3',
       'childwalk_dry_education_boys_high_avg_adm3',
       'childwalk_dry_education_boys_middle_avg_adm3',
       'childwalk_dry_education_boys_primary_avg_adm3',
       'childwalk_dry_education_girls_avg_adm3',
       ...
       'sp3avg_by_s_cat56', 'sp3avg_by_s_cat5', 'sp3avg_by_s_cat6',
       'mean_povrate', 'pov_quintile', 'pov_quintile_2014',
       'flood_1in10_mean_tercile', 'flood_1in10_max_tercile',
       'flood_1in50_mean_tercile', 'flood_1in50_max_tercile'],
      dtype='object', length=409)

#### Compute agricultural totals

In [15]:
yield_cols = ['yield_wt','yield_ba','yield_by','yield_cn','yield_gs','yield_jr','yield_re','yield_st','yield_se']
share_irriP_cols = ['share_irriP_wt','share_irriP_ba','share_irriP_by','share_irriP_cn','share_irriP_gs','share_irriP_jr','share_irriP_re','share_irriP_st','share_irriP_se']
share_irriA_cols = ['share_irriA_wt','share_irriA_ba','share_irriA_by','share_irriA_cn','share_irriA_gs','share_irriA_jr','share_irriA_re','share_irriA_st','share_irriA_se']

In [16]:
adm3_tab['yield_to'] = np.sum(adm3_tab[yield_cols],axis=1)
adm3_tab['share_irriP_to'] = np.sum(adm3_tab[share_irriP_cols],axis=1)
adm3_tab['share_irriA_to'] = np.sum(adm3_tab[share_irriA_cols],axis=1)

In [17]:
adm3_tab[['yield_to','share_irriP_to','share_irriA_to']]

Unnamed: 0,yield_to,share_irriP_to,share_irriA_to
0,5.010840,1.977012,2.963415
1,5.010840,1.977012,2.963415
2,2.857143,0.653846,0.571429
3,4.204707,0.343217,0.594551
4,3.869048,0.565217,0.444444
...,...,...,...
111,1.041667,0.600000,0.583333
112,57.634802,3.066572,3.039518
113,1.041667,0.600000,0.583333
114,57.634802,3.066572,3.039518


#### Consolidate final datasets

Unify spatial and tabular data

In [18]:
adm2_final = pd.merge(adm2_geo,adm2_tab,how='left',on='Adm2_Code')
adm3_final = pd.merge(adm3_geo,adm3_tab,how='left',on='Adm3_Code')

In [19]:
adm3_final

Unnamed: 0,Shape_Leng,Shape_Area,ADM3_EN,Adm3_Code,ADM3_REF,ADM3ALT1EN,ADM3ALT2EN,ADM2_EN,ADM1_EN,Adm1_Code,...,sp3avg_by_s_cat56,sp3avg_by_s_cat5,sp3avg_by_s_cat6,mean_povrate,pov_quintile,pov_quintile_2014,flood_1in10_mean_tercile,flood_1in10_max_tercile,flood_1in50_mean_tercile,flood_1in50_max_tercile
0,2.187495,0.171564,Abbottabad,PK20101,,,,Abbottabad,Khyber Pakhtunkhwa,PK2,...,0.0,0.0,0.0,,,1.0,3.0,2.0,3.0,2.0
1,1.581975,0.080696,Alai,PK20301,,,,Batagram,Khyber Pakhtunkhwa,PK2,...,,,,,,3.0,3.0,2.0,3.0,2.0
2,1.189227,0.056426,Alpuri,PK22001,,,,Shangla,Khyber Pakhtunkhwa,PK2,...,0.0,0.0,0.0,,,3.0,3.0,2.0,3.0,2.0
3,0.818266,0.022875,Ambar Utman Khel,PK23501,,,,Mohmand,Khyber Pakhtunkhwa,PK2,...,,,,,,,3.0,2.0,3.0,2.0
4,0.948441,0.031578,Babuzai,PK22201,,,,Swat,Khyber Pakhtunkhwa,PK2,...,0.0,0.0,0.0,,,2.0,3.0,2.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,0.671976,0.016740,Utman Khel Tehsil,PK23207,,Utman Khel,,Bajaur,Khyber Pakhtunkhwa,PK2,...,0.0,0.0,0.0,,,,2.0,2.0,2.0,2.0
112,3.496818,0.213873,Wana,PK23808,,,,South Waziristan,Khyber Pakhtunkhwa,PK2,...,0.0,0.0,0.0,,,,2.0,2.0,1.0,2.0
113,1.075965,0.064361,Wari,PK23004,,,,Upper Dir,Khyber Pakhtunkhwa,PK2,...,0.0,0.0,0.0,,,3.0,3.0,2.0,3.0,2.0
114,1.995740,0.079313,Wazir,PK23901,,,,FR Bannu,Khyber Pakhtunkhwa,PK2,...,,,,,,,2.0,2.0,2.0,2.0


#### Export

Export these interim products

In [20]:
## TEMP BEFORE OSM INVESTMENT 
# all KP
adm2_tab.to_csv(os.path.join(data_dir,tab_dir,f"final//BeforeOSMInvestment_adm2_all_KP_raw_access_aggregates_{today}.csv"),index=False)
adm3_tab.to_csv(os.path.join(data_dir,tab_dir,f"final//BeforeOSMInvestment_adm3_all_KP_raw_access_aggregates_{today}.csv"),index=False)

# # all KP
# adm2_tab.to_csv(os.path.join(data_dir,tab_dir,f"final//adm2_all_KP_raw_access_aggregates_{today}.csv"),index=False)
# adm3_tab.to_csv(os.path.join(data_dir,tab_dir,f"final//adm3_all_KP_raw_access_aggregates_{today}.csv"),index=False)

In [21]:
# all KP -- geo
# adm2_final.to_file(os.path.join(data_dir,acc_dir,f"vector/adm2_all_KP_access_raw_aggregates_{today}.gpkg"),driver="GPKG")
# adm3_final.to_file(os.path.join(data_dir,acc_dir,f"vector/adm3_all_KP_access_raw_aggregates_{today}.gpkg"),driver="GPKG")

## Consolidate access variables into master columns per category, and thence an overall dataset

This step slims down the administrative dataset into only the columns we anticipate using for our analysis (ie. we drop children's walking speeds to Peshawar).</br></br>
Seasonal figures are weighted by the season's length in that tehsil and then merged into a master value per column (e.g. health_primary) and then again merged, with weighting, into a sectoral master (e.g. health_idx). These are merged into a final inaccessibility index value.</br></br>We preserve the intermediate columns in this process to enable sensitivity testing of the importance of the weights used.

#### Define admin level of analysis

In [86]:
adm_level = 'adm2'
# adm_level = 'adm3'

In [87]:
if adm_level == 'adm2':
    df = adm2_final.copy()
    adm_cols = ('ADM2_EN','Adm2_Code')
elif adm_level == 'adm3':
    df = adm3_final.copy()
    adm_cols = ('ADM2_EN','Adm2_Code','ADM3_EN','Adm3_Code')

#### Load in data

Spatial data

In [88]:
# adm2_geo = gpd.read_file('../../Boundaries/KP_Analysis/KP_Analysis_Focus_Districts.gpkg')
# adm3_geo = gpd.read_file(os.path.join(geo_dir,'Boundaries/KP_Analysis/KP_Analysis_Focus_Tehsils.gpkg')).rename({'ADM1_PCODE':'Adm1_Code','ADM2_PCODE' : 'Adm2_Code','ADM3_PCODE':'Adm3_Code'},axis=1)
adm3_geo = gpd.read_file(os.path.join(geo_dir,'Boundaries/KP_Analysis/KP_Analysis_All_Tehsils.gpkg')).rename({'ADM1_PCODE':'Adm1_Code','ADM2_PCODE' : 'Adm2_Code','ADM3_PCODE':'Adm3_Code'},axis=1)

Tabular data

In [89]:
# adm3 = pd.read_csv(os.path.join(data_dir,tab_dir,r"final//adm3_allKP_raw_access_aggregates_{today}.csv"))

In [90]:
df.head(2)

Unnamed: 0.1,geometry,ADM1_EN,ADM2_EN,Adm1_Code,Adm2_Code,Unnamed: 0,childwalk_dry_District_HQs_avg_adm2,childwalk_dry_education_allboys_avg_adm2,childwalk_dry_education_allgirls_avg_adm2,childwalk_dry_education_boys_avg_adm2,...,winter_health_pharmacies_avg_adm2,winter_health_primary_avg_adm2,winter_health_private_avg_adm2,winter_health_public_avg_adm2,winter_health_secondary_avg_adm2,winter_health_tertiary_avg_adm2,winter_markets_All_avg_adm2,winter_markets_Central_avg_adm2,winter_Provincial_HQ_avg_adm2,Mean_elevation
0,"POLYGON ((910881.442 3803046.496, 911557.385 3...",Khyber Pakhtunkhwa,Abbottabad,PK2,PK201,14,5.719609,0.235093,0.295345,0.248319,...,0.548865,0.413843,0.618175,0.370501,0.634336,1.104411,1.02003,1.135257,3.231324,1317.718793
1,"POLYGON ((660737.131 3632666.421, 660402.682 3...",Khyber Pakhtunkhwa,Bannu,PK2,PK202,30,3.744117,0.199008,0.237959,0.199008,...,0.512182,0.328809,0.459506,0.30751,0.493972,0.525173,0.535008,2.1496,2.819661,722.978865


### Prepare for consolidation

Prepare a slimmed down dataframe only containing the access columns for analysis

In [91]:
# first save out the raw data -- we'll join this back in later
unfiltered_acc_cols = list(df.filter(regex='^(childwalk|dry|msn|winter)',axis=1).columns)
raw_acc_data = df[unfiltered_acc_cols].copy().add_suffix('_hrs')

In [92]:
df[unfiltered_acc_cols][:3]

Unnamed: 0,childwalk_dry_District_HQs_avg_adm2,childwalk_dry_education_allboys_avg_adm2,childwalk_dry_education_allgirls_avg_adm2,childwalk_dry_education_boys_avg_adm2,childwalk_dry_education_boys_high_avg_adm2,childwalk_dry_education_boys_middle_avg_adm2,childwalk_dry_education_boys_primary_avg_adm2,childwalk_dry_education_girls_avg_adm2,childwalk_dry_education_girls_high_avg_adm2,childwalk_dry_education_girls_middle_avg_adm2,...,winter_health_family_avg_adm2,winter_health_pharmacies_avg_adm2,winter_health_primary_avg_adm2,winter_health_private_avg_adm2,winter_health_public_avg_adm2,winter_health_secondary_avg_adm2,winter_health_tertiary_avg_adm2,winter_markets_All_avg_adm2,winter_markets_Central_avg_adm2,winter_Provincial_HQ_avg_adm2
0,5.719609,0.235093,0.295345,0.248319,0.731196,0.890366,0.259262,0.309646,1.196463,0.872648,...,1.025457,0.548865,0.413843,0.618175,0.370501,0.634336,1.104411,1.02003,1.135257,3.231324
1,3.744117,0.199008,0.237959,0.199008,0.567903,0.615294,0.213073,0.237959,1.076659,0.892447,...,0.764053,0.512182,0.328809,0.459506,0.30751,0.493972,0.525173,0.535008,2.1496,2.819661
2,5.042905,0.299211,0.503555,0.299211,0.919168,1.192296,0.307101,0.503555,4.06838,1.637842,...,2.337019,1.364274,0.703493,1.227318,0.682792,0.889203,2.358741,2.255746,2.653706,4.988345


In [93]:
raw_acc_data.filter(regex='girls',axis=1).columns[::15]

Index(['childwalk_dry_education_allgirls_avg_adm2_hrs', 'dry_education_allgirls_avg_adm2_hrs'], dtype='object')

In [94]:
df[unfiltered_acc_cols][:3]

Unnamed: 0,childwalk_dry_District_HQs_avg_adm2,childwalk_dry_education_allboys_avg_adm2,childwalk_dry_education_allgirls_avg_adm2,childwalk_dry_education_boys_avg_adm2,childwalk_dry_education_boys_high_avg_adm2,childwalk_dry_education_boys_middle_avg_adm2,childwalk_dry_education_boys_primary_avg_adm2,childwalk_dry_education_girls_avg_adm2,childwalk_dry_education_girls_high_avg_adm2,childwalk_dry_education_girls_middle_avg_adm2,...,winter_health_family_avg_adm2,winter_health_pharmacies_avg_adm2,winter_health_primary_avg_adm2,winter_health_private_avg_adm2,winter_health_public_avg_adm2,winter_health_secondary_avg_adm2,winter_health_tertiary_avg_adm2,winter_markets_All_avg_adm2,winter_markets_Central_avg_adm2,winter_Provincial_HQ_avg_adm2
0,5.719609,0.235093,0.295345,0.248319,0.731196,0.890366,0.259262,0.309646,1.196463,0.872648,...,1.025457,0.548865,0.413843,0.618175,0.370501,0.634336,1.104411,1.02003,1.135257,3.231324
1,3.744117,0.199008,0.237959,0.199008,0.567903,0.615294,0.213073,0.237959,1.076659,0.892447,...,0.764053,0.512182,0.328809,0.459506,0.30751,0.493972,0.525173,0.535008,2.1496,2.819661
2,5.042905,0.299211,0.503555,0.299211,0.919168,1.192296,0.307101,0.503555,4.06838,1.637842,...,2.337019,1.364274,0.703493,1.227318,0.682792,0.889203,2.358741,2.255746,2.653706,4.988345


In [95]:
# assign the three sets of access stats to separate lists

childwalk_cols = list(df.filter(regex='^(childwalk)',axis=1).columns)
walk_cols = list(df.filter(regex='^(walk)',axis=1).columns)
multimodal_cols = list(df.filter(regex='^(dry|msn|winter)',axis=1))
multimodal_cols = [item for item in multimodal_cols if not re.match('^(walk|childwalk)',item)]

# create a master list of all access cols

import itertools
acc_cols = list(itertools.chain(childwalk_cols,walk_cols,multimodal_cols))

# save out non access values to later join back in as needed

df_non_acc = df.drop(df[acc_cols].columns,axis=1)

In [96]:
childwalk_cols[::20]

['childwalk_dry_District_HQs_avg_adm2',
 'childwalk_dry_health_tertiary_avg_adm2',
 'childwalk_msn_health_primary_avg_adm2',
 'childwalk_winter_education_middle_avg_adm2']

In [97]:
walk_cols[::20]

[]

In [98]:
multimodal_cols[::20]

['dry_District_HQs_avg_adm2',
 'dry_health_tertiary_avg_adm2',
 'msn_health_primary_avg_adm2',
 'winter_education_middle_avg_adm2']

In [99]:
# Children's walking speeds should be used for primary school access. This requires some adjustment

# create sets of the columns to add and remove
childwalk_ed_add = set([f'childwalk_dry_education_primary_avg_{adm_level}',f'childwalk_msn_education_primary_avg_{adm_level}',f'childwalk_winter_education_primary_avg_{adm_level}',
                        f'childwalk_dry_education_girls_primary_avg_{adm_level}',f'childwalk_msn_education_girls_primary_avg_{adm_level}',f'childwalk_winter_education_girls_primary_avg_{adm_level}',
                        f'childwalk_dry_education_boys_primary_avg_{adm_level}',f'childwalk_msn_education_boys_primary_avg_{adm_level}',f'childwalk_winter_education_boys_primary_avg_{adm_level}'])

ed_remove = set([f'dry_education_primary_avg_{adm_level}',f'childwalk_dry_education_middle_avg_{adm_level}',
f'msn_education_primary_avg_{adm_level}',f'childwalk_msn_education_middle_avg_{adm_level}',
f'winter_education_primary_avg_{adm_level}',f'childwalk_winter_education_middle_avg_{adm_level}',
f'dry_education_girls_primary_avg_{adm_level}',f'childwalk_dry_education_girls_middle_avg_{adm_level}',f'dry_education_girls_middle_avg_{adm_level}'
f'msn_education_girls_primary_avg_{adm_level}',f'childwalk_msn_education_girls_middle_avg_{adm_level}',f'msn_education_girls_middle_avg_{adm_level}'
f'winter_education_girls_primary_avg_{adm_level}',f'childwalk_winter_education_girls_middle_avg_{adm_level}' ,f'winter_education_girls_middle_avg_{adm_level}'
f'dry_education_boys_primary_avg_{adm_level}',f'childwalk_dry_education_boys_middle_avg_{adm_level}',f'dry_education_boys_middle_avg_{adm_level}'
f'msn_education_boys_primary_avg_{adm_level}',f'childwalk_msn_education_boys_middle_avg_{adm_level}',f'msn_education_boys_middle_avg_{adm_level}'
f'winter_education_boys_primary_avg_{adm_level}',f'childwalk_winter_education_boys_middle_avg_{adm_level}'])

# Remove and add these columns
standard_cols = list(set(multimodal_cols).difference(ed_remove))
standard_cols.extend(childwalk_ed_add)

In [100]:
standard_cols

['winter_education_boys_primary_avg_adm2',
 'msn_markets_Central_avg_adm2',
 'msn_education_girls_high_avg_adm2',
 'winter_health_public_avg_adm2',
 'msn_education_middle_avg_adm2',
 'dry_health_public_avg_adm2',
 'msn_health_secondary_avg_adm2',
 'msn_Provincial_HQ_avg_adm2',
 'winter_education_allgirls_avg_adm2',
 'winter_education_boys_middle_avg_adm2',
 'dry_health_secondary_avg_adm2',
 'winter_education_allboys_avg_adm2',
 'msn_education_girls_avg_adm2',
 'dry_District_HQs_avg_adm2',
 'dry_education_boys_middle_avg_adm2',
 'msn_education_boys_primary_avg_adm2',
 'dry_education_allboys_avg_adm2',
 'msn_education_boys_middle_avg_adm2',
 'winter_education_middle_avg_adm2',
 'msn_health_tertiary_avg_adm2',
 'winter_health_tertiary_avg_adm2',
 'msn_health_pharmacies_avg_adm2',
 'msn_education_high_avg_adm2',
 'winter_health_pharmacies_avg_adm2',
 'dry_education_boys_high_avg_adm2',
 'dry_education_allgirls_avg_adm2',
 'winter_health_family_avg_adm2',
 'dry_Provincial_HQ_avg_adm2',
 'ms

In [101]:
# reduce the DF down to just identifying information, elevation, and the columns being used for analysis
df = pd.concat([df[(['Mean_elevation'] + list(adm_cols))],df[standard_cols]],axis=1,ignore_index=False)

In [102]:
df.head(3)

Unnamed: 0,Mean_elevation,ADM2_EN,Adm2_Code,winter_education_boys_primary_avg_adm2,msn_markets_Central_avg_adm2,msn_education_girls_high_avg_adm2,winter_health_public_avg_adm2,msn_education_middle_avg_adm2,dry_health_public_avg_adm2,msn_health_secondary_avg_adm2,...,winter_education_girls_middle_avg_adm2,childwalk_dry_education_boys_primary_avg_adm2,childwalk_msn_education_primary_avg_adm2,childwalk_msn_education_boys_primary_avg_adm2,childwalk_dry_education_primary_avg_adm2,childwalk_winter_education_boys_primary_avg_adm2,childwalk_msn_education_girls_primary_avg_adm2,childwalk_dry_education_girls_primary_avg_adm2,childwalk_winter_education_girls_primary_avg_adm2,childwalk_winter_education_primary_avg_adm2
0,1317.718793,Abbottabad,PK201,0.165632,1.262006,0.602283,0.370501,0.410571,0.344358,0.742125,...,0.428063,0.259262,0.289952,0.340668,0.221225,0.259866,0.42854,0.325166,0.325708,0.22176
1,722.978865,Bannu,PK202,0.13749,2.515163,0.530147,0.30751,0.325927,0.30751,0.676689,...,0.350885,0.213073,0.223484,0.280073,0.169659,0.213073,0.323946,0.245005,0.245005,0.169659
2,3220.849075,Batagram,PK203,0.254726,2.87405,1.614119,0.682792,0.673078,0.599131,0.981075,...,0.825492,0.307101,0.355866,0.398854,0.274783,0.326274,0.664016,0.505933,0.526454,0.292014


### Prepare seasonal master values

### Education overall

Education is a bit complicated because of Transport's requirements. First we have to define our custom weighting schemas and the main columns to operate on

In [103]:
# equal weighting schema for comparison's sake

educ_equal_wts = np.array([1,1,1,1,1,1]) / 6

# set up weighting schema to prioritize lower schools and girls' education

girls_wts = np.array([.4,.4,.2]) * (2/3)
boys_wts = np.array([.4,.4,.2]) * (1/3)

# put the girls/boys arrays together
educ_mast_wts = np.concatenate((girls_wts,boys_wts),axis=0)
educ_mast_wts

array([0.26666667, 0.26666667, 0.13333333, 0.13333333, 0.13333333,
       0.06666667])

In [104]:
# because we mix childwalking and multimodal for education's master value it's easiest just to manually specify which columns to use
educ_mast_cols = [f'childwalk_dry_education_girls_primary_avg_{adm_level}',\
                  f'dry_education_girls_middle_avg_{adm_level}',\
                  f'dry_education_girls_high_avg_{adm_level}',\
                  f'childwalk_dry_education_boys_primary_avg_{adm_level}',\
                  f'dry_education_boys_middle_avg_{adm_level}',\
                  f'dry_education_boys_high_avg_{adm_level}']

Convert every column to an index (so features with naturally higher travel times, like hospitals, don't unduly  weight results)

In [105]:
# OLD

filtered_acc_cols = list(df.filter(regex='^(dry|msn|winter)',axis=1).columns) # only the access columns remaining after we remove those note used for index calcs

max_acc = np.array(np.max(df[filtered_acc_cols],axis=0)) # column-wise max

df[filtered_acc_cols] = df[filtered_acc_cols] / max_acc # divide by max for column-wise index
df[filtered_acc_cols][:3]

Unnamed: 0,winter_education_boys_primary_avg_adm2,msn_markets_Central_avg_adm2,msn_education_girls_high_avg_adm2,winter_health_public_avg_adm2,msn_education_middle_avg_adm2,dry_health_public_avg_adm2,msn_health_secondary_avg_adm2,msn_Provincial_HQ_avg_adm2,winter_education_allgirls_avg_adm2,winter_education_boys_middle_avg_adm2,...,dry_education_boys_primary_avg_adm2,msn_education_boys_avg_adm2,winter_education_high_avg_adm2,winter_education_girls_avg_adm2,dry_education_girls_middle_avg_adm2,msn_education_girls_middle_avg_adm2,dry_education_middle_avg_adm2,dry_health_family_avg_adm2,msn_health_primary_avg_adm2,winter_education_girls_middle_avg_adm2
0,0.196108,0.185992,0.143824,0.159272,0.222141,0.148034,0.179563,0.401334,0.172033,0.229335,...,0.232962,0.233979,0.139294,0.175397,0.169857,0.168208,0.219943,0.172092,0.118516,0.148294
1,0.162788,0.37068,0.126598,0.132193,0.176344,0.132193,0.16373,0.363105,0.135024,0.152893,...,0.201365,0.199242,0.107203,0.135024,0.152095,0.161958,0.171228,0.139824,0.107411,0.121557
2,0.301596,0.423572,0.385448,0.293521,0.364171,0.257557,0.237379,0.609048,0.332713,0.353313,...,0.340572,0.341331,0.23703,0.332713,0.310063,0.300085,0.366233,0.38828,0.190639,0.285976


In [106]:
# # NEW -- have lower values represent worse index, and vice-versa
# # filter columns
# filtered_acc_cols = list(df.filter(regex='^(dry|msn|winter)',axis=1).columns) # only the access columns remaining after we remove those note used for index calcs

# # compute min and max for indexing calculations
# min_acc = np.array(np.min(df[filtered_acc_cols],axis=0)) # column-wise min
# max_acc = np.array(np.max(df[filtered_acc_cols],axis=0)) # column-wise max

# # calculate highest TT value as worst (lowest) index value and lowest TT as best (1)
# df[filtered_acc_cols] = (max_acc - df[filtered_acc_cols]) / (max_acc - min_acc)
# df[filtered_acc_cols][:3]

In [107]:
np.max(df[filtered_acc_cols].iloc[:,4])

1.0

In [108]:
df = pd.concat([df,raw_acc_data],axis=1,ignore_index=False)

Now consolidate into master values, weighting appropriately

In [109]:
# education by levels

df['dry_educ_idx'] = np.nansum((df[educ_mast_cols] * educ_mast_wts),axis=1)

df['msn_educ_idx'] = np.nansum((df[[col.replace('dry','msn') for col in educ_mast_cols]] * educ_mast_wts),axis=1) # use the same mast_cols list, but with monsoon data instead

df['winter_educ_idx'] = np.nansum((df[[col.replace('dry','winter') for col in educ_mast_cols]] * educ_mast_wts),axis=1)

# education weighting equally -- for comparison's sake

df['dry_educ_eqwt_idx'] = np.nansum((df[educ_mast_cols] * educ_equal_wts),axis=1)

df['msn_educ_eqwt_idx'] = np.nansum((df[[col.replace('dry','msn') for col in educ_mast_cols]] * educ_equal_wts),axis=1)

df['winter_educ_eqwt_idx'] = np.nansum((df[[col.replace('dry','winter') for col in educ_mast_cols]] * educ_equal_wts),axis=1)


**Optional**</br>
Sensitivity test educational weighting schemes

In [110]:
wt_samp_arr, vals_arr, ranks_arr = st.Sensitivity_weighting(df,educ_mast_cols,iterations=500000)

Calculate descriptive stats for sensitivity tested rankings of education per admin

In [111]:
# compute basic stats for each entity's ranking
# axis=0 for operating by column
ranks_mode = stats.mode(ranks_arr,axis=0)
ranks_mean = np.mean(ranks_arr,axis=0)
ranks_std = np.std(ranks_arr,axis=0)
vals_mean = np.sum(np.mean(vals_arr,axis=0),axis=1)

In [112]:
ranks_std

array([0.85052572, 1.26817909, 1.06700351, 0.53379792, 0.83268411,
       2.34421906, 0.63920259, 1.35692967, 0.76928055, 1.32332408,
       0.67904164, 0.09114718, 1.63227849, 1.56854995, 0.57425919,
       0.22671839, 1.31804812, 1.0635501 , 0.24725016, 0.66763948,
       0.76939186, 0.66470672, 0.76181308, 0.39236408, 1.79946729,
       0.81906874, 0.87666994, 1.23813756, 0.78593975, 1.09631983,
       1.69761426, 1.25263462, 0.95833227, 0.60941235, 1.525163  ,
       1.05793055, 1.43773021, 1.38242321])

In [113]:
# join in the key stats for sensitivity tested educational rankings
df['sens_test_educ_rank_mean'] = ranks_mean
df['sens_test_educ_rank_mode'] = ranks_mode[0][0]
df['sens_test_educ_rank_std'] = ranks_std

In [114]:
# Now calculate the same for an equal weighting schema and the unbalanced weighting scheme requested by Transport

df['educ_eqwt_rank'] = st.rank_by_weight(df,educ_mast_cols,educ_equal_wts)
df['educ_eqwt_val'] = np.nansum(df[educ_mast_cols] * educ_equal_wts, axis=1)

df['educ_altwt_rank'] = st.rank_by_weight(df,educ_mast_cols,educ_mast_wts)
df['educ_altwt_val'] = np.nansum(df[educ_mast_cols] * educ_mast_wts, axis=1)

df['educ_sens_test_val_mean'] = vals_mean

In [115]:
df[['sens_test_educ_rank_mean','sens_test_educ_rank_mode','sens_test_educ_rank_std']].head(10)

Unnamed: 0,sens_test_educ_rank_mean,sens_test_educ_rank_mode,sens_test_educ_rank_std
0,26.24998,27,0.850526
1,31.615212,31,1.268179
2,16.189234,17,1.067004
3,20.547284,21,0.533798
4,35.487624,35,0.832684
5,8.900446,11,2.344219
6,17.802906,18,0.639203
7,24.951032,24,1.35693
8,28.71398,29,0.769281
9,25.56133,26,1.323324


In [116]:
df[['educ_sens_test_val_mean','educ_eqwt_val','educ_altwt_val']].head(10)

Unnamed: 0,educ_sens_test_val_mean,educ_eqwt_val,educ_altwt_val
0,0.22056,0.220523,0.231124
1,0.17557,0.175548,0.183821
2,0.363776,0.363754,0.381845
3,0.277256,0.277208,0.306948
4,0.159398,0.159368,0.171111
5,0.568801,0.568695,0.638116
6,0.330706,0.330662,0.363756
7,0.230324,0.230291,0.25392
8,0.199038,0.199004,0.219041
9,0.223801,0.223757,0.239144


In [117]:
# join in the key stats for sensitivity tested educational rankings
df['sens_test_educ_rank_mean'] = ranks_mean
df['sens_test_educ_rank_mode'] = ranks_mode[0][0]
df['sens_test_educ_rank_std'] = ranks_std

In [118]:
# Now calculate the same for an equal weighting schema and the unbalanced weighting scheme requested by Transport

df['educ_eqwt_rank'] = st.rank_by_weight(df,educ_mast_cols,educ_equal_wts)
df['educ_eqwt_val'] = np.nansum(df[educ_mast_cols] * educ_equal_wts, axis=1)

df['educ_altwt_rank'] = st.rank_by_weight(df,educ_mast_cols,educ_mast_wts)
df['educ_altwt_val'] = np.nansum(df[educ_mast_cols] * educ_mast_wts, axis=1)

df['educ_sens_test_val_mean'] = vals_mean

In [119]:
df[['sens_test_educ_rank_mean','sens_test_educ_rank_mode','sens_test_educ_rank_std']].head(10)

Unnamed: 0,sens_test_educ_rank_mean,sens_test_educ_rank_mode,sens_test_educ_rank_std
0,26.24998,27,0.850526
1,31.615212,31,1.268179
2,16.189234,17,1.067004
3,20.547284,21,0.533798
4,35.487624,35,0.832684
5,8.900446,11,2.344219
6,17.802906,18,0.639203
7,24.951032,24,1.35693
8,28.71398,29,0.769281
9,25.56133,26,1.323324


In [120]:
df[['educ_sens_test_val_mean','educ_eqwt_val','educ_altwt_val']].head(10)

Unnamed: 0,educ_sens_test_val_mean,educ_eqwt_val,educ_altwt_val
0,0.22056,0.220523,0.231124
1,0.17557,0.175548,0.183821
2,0.363776,0.363754,0.381845
3,0.277256,0.277208,0.306948
4,0.159398,0.159368,0.171111
5,0.568801,0.568695,0.638116
6,0.330706,0.330662,0.363756
7,0.230324,0.230291,0.25392
8,0.199038,0.199004,0.219041
9,0.223801,0.223757,0.239144


#### Other destinations

Health, markets, and administrative access are more straightforward to consolidate

In [121]:
# health by levels

df[f'dry_health_idx'] = np.mean(df[[f'dry_health_primary_avg_{adm_level}',f'dry_health_secondary_avg_{adm_level}',f'dry_health_tertiary_avg_{adm_level}']],axis=1) # using np.mean implies equal weighting of sub-categories
df[f'msn_health_idx'] = np.mean(df[[f'msn_health_primary_avg_{adm_level}',f'msn_health_secondary_avg_{adm_level}',f'msn_health_tertiary_avg_{adm_level}']],axis=1)
df[f'winter_health_idx'] = np.mean(df[[f'winter_health_primary_avg_{adm_level}',f'winter_health_secondary_avg_{adm_level}',f'winter_health_tertiary_avg_{adm_level}']],axis=1)

In [122]:
# markets
df[f'dry_markets_idx'] = np.mean(df[[f'dry_markets_All_avg_{adm_level}',f'dry_markets_Central_avg_{adm_level}']],axis=1) # using np.mean implies equal weighting of sub-categories
df[f'msn_markets_idx'] = np.mean(df[[f'msn_markets_All_avg_{adm_level}',f'msn_markets_Central_avg_{adm_level}']],axis=1)
df[f'winter_markets_idx'] = np.mean(df[[f'winter_markets_All_avg_{adm_level}',f'winter_markets_Central_avg_{adm_level}']],axis=1)


In [123]:
# administrative
# I assume that access to the Provincial HQ is not materially significant for administrative functions -- therefore this is just District HQ access

df[f'dry_admin_idx'] = df[f'dry_District_HQs_avg_{adm_level}']
df[f'msn_admin_idx'] = df[f'msn_District_HQs_avg_{adm_level}']
df[f'winter_admin_idx'] = df[f'winter_District_HQs_avg_{adm_level}']


#### Prepare categorical and overall master values, weighted by season

In [124]:
# equal weights for variables
eq_wts = [0.3333, 0.3333, 0.3333]

In [125]:

# categorize admins  by the population-weighted mean elevation of populated places in that tehsil
df['Elevation_category'] = pd.cut(df['Mean_elevation'],bins=[0,1500,2250,100000],labels=['Low','Medium','High'])

# Define the weighting of a season according to the tehsils's classification

season_wts_dct = {
    float("NaN") : np.array([0.3333,0.3333,0.3333]),
    'Low' : np.array([0.3333,0.3333,0.3333]),
    'Medium' : np.array([0.2667,0.3333,0.4]),
    'High' : np.array([0.25,0.25,0.5])   
}

# Assign the seasonal weighting as a list, for later use
df['seasonal_wts'] = df['Elevation_category'].map(season_wts_dct)
seasonal_wts_arr = np.stack(df['seasonal_wts']) # turn the Series of weights into an array of shape (3,number_of_admins) so it can be multiplied by the 3 seasonal values for each category

TypeError: unhashable type: 'numpy.ndarray'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'numpy.ndarray'


Master indices

In [126]:
# calculate the master value for each feature type by weighting each tehsil's seasonal master values by its seasonal weights
df['educ_altwt_idx'] = np.nansum(np.multiply(df[['dry_educ_idx','msn_educ_idx','winter_educ_idx']],seasonal_wts_arr),axis=1)
df['educ_eqwt_idx'] = np.nansum(np.multiply(df[['dry_educ_eqwt_idx','msn_educ_eqwt_idx','winter_educ_eqwt_idx']],seasonal_wts_arr),axis=1)
df['health_idx'] = np.nansum(np.multiply(df[['dry_health_idx','msn_health_idx','winter_health_idx']],seasonal_wts_arr),axis=1)
df['markets_idx'] = np.nansum(np.multiply(df[['dry_markets_idx','msn_markets_idx','winter_markets_idx']],seasonal_wts_arr),axis=1)
df['admin_idx'] = np.nansum(np.multiply(df[['dry_admin_idx','msn_admin_idx','winter_admin_idx']],seasonal_wts_arr),axis=1)


#### Specialty educational index calculations

In [127]:
# Differences in index values

# overall index vals -- with and without private schools

df[f'dry_educ_alllevels_gender_dif'] = df[f'dry_education_girls_avg_{adm_level}'] - df[f'dry_education_boys_avg_{adm_level}']
df[f'msn_educ_alllevels_gender_dif'] = df[f'msn_education_girls_avg_{adm_level}'] - df[f'msn_education_boys_avg_{adm_level}']
df[f'winter_educ_alllevels_gender_dif'] = df[f'winter_education_girls_avg_{adm_level}'] - df[f'winter_education_boys_avg_{adm_level}']

df[f'dry_educ_alllevels_w_privschool_dif'] = df[f'dry_education_allgirls_avg_{adm_level}'] - df[f'dry_education_allboys_avg_{adm_level}']
df[f'msn_educ_alllevels_w_privschool_dif'] = df[f'msn_education_allgirls_avg_{adm_level}'] - df[f'msn_education_allboys_avg_{adm_level}']
df[f'winter_educ_alllevels_w_privschool_dif'] = df[f'winter_education_allgirls_avg_{adm_level}'] - df[f'winter_education_allboys_avg_{adm_level}']

# primary

df[f'childwalk_dry_educ_primary_gender_dif'] = df[f'childwalk_dry_education_girls_primary_avg_{adm_level}'] - df[f'childwalk_dry_education_boys_primary_avg_{adm_level}']
df[f'childwalk_msn_educ_primary_gender_dif'] = df[f'childwalk_msn_education_girls_primary_avg_{adm_level}'] - df[f'childwalk_msn_education_boys_primary_avg_{adm_level}']
df[f'childwalk_winter_educ_primary_gender_dif'] = df[f'childwalk_winter_education_girls_primary_avg_{adm_level}'] - df[f'childwalk_winter_education_boys_primary_avg_{adm_level}']

# middle

df[f'dry_educ_middle_gender_dif'] = df[f'dry_education_girls_middle_avg_{adm_level}'] - df[f'dry_education_boys_middle_avg_{adm_level}']
df[f'msn_educ_middle_gender_dif'] = df[f'msn_education_girls_middle_avg_{adm_level}'] - df[f'msn_education_boys_middle_avg_{adm_level}']
df[f'winter_educ_middle_gender_dif'] = df[f'winter_education_girls_middle_avg_{adm_level}'] - df[f'winter_education_boys_middle_avg_{adm_level}']

# high

df[f'dry_educ_high_gender_dif'] = df[f'dry_education_girls_high_avg_{adm_level}'] - df[f'dry_education_boys_high_avg_{adm_level}']
df[f'msn_educ_high_gender_dif'] = df[f'msn_education_girls_high_avg_{adm_level}'] - df[f'msn_education_boys_high_avg_{adm_level}']
df[f'winter_educ_high_gender_dif'] = df[f'winter_education_girls_high_avg_{adm_level}'] - df[f'winter_education_boys_high_avg_{adm_level}']


In [128]:
# Differences in hours

# overall in hours -- with and without private schools

df[f'dry_educ_gender_dif_hrs'] = df[f'dry_education_girls_avg_{adm_level}_hrs'] - df[f'dry_education_boys_avg_{adm_level}_hrs']
df[f'msn_educ_gender_dif_hrs'] = df[f'msn_education_girls_avg_{adm_level}_hrs'] - df[f'msn_education_boys_avg_{adm_level}_hrs']
df[f'winter_educ_gender_dif_hrs'] = df[f'winter_education_girls_avg_{adm_level}_hrs'] - df[f'winter_education_boys_avg_{adm_level}_hrs']

df[f'dry_educ_allgender_dif_hrs'] = df[f'dry_education_allgirls_avg_{adm_level}_hrs'] - df[f'dry_education_allboys_avg_{adm_level}_hrs']
df[f'msn_educ_allgender_dif_hrs'] = df[f'msn_education_allgirls_avg_{adm_level}_hrs'] - df[f'msn_education_allboys_avg_{adm_level}_hrs']
df[f'winter_educ_allgender_dif_hrs'] = df[f'winter_education_allgirls_avg_{adm_level}_hrs'] - df[f'winter_education_allboys_avg_{adm_level}_hrs']

# primary

df[f'childwalk_dry_educ_primary_gender_dif_hrs'] = df[f'childwalk_dry_education_girls_primary_avg_{adm_level}_hrs'] - df[f'childwalk_dry_education_boys_primary_avg_{adm_level}_hrs']
df[f'childwalk_msn_educ_primary_gender_dif_hrs'] = df[f'childwalk_msn_education_girls_primary_avg_{adm_level}_hrs'] - df[f'childwalk_msn_education_boys_primary_avg_{adm_level}_hrs']
df[f'childwalk_winter_educ_primary_gender_dif_hrs'] = df[f'childwalk_winter_education_girls_primary_avg_{adm_level}_hrs'] - df[f'childwalk_winter_education_boys_primary_avg_{adm_level}_hrs']

# middle

df[f'dry_educ_middle_gender_dif_hrs'] = df[f'dry_education_girls_middle_avg_{adm_level}_hrs'] - df[f'dry_education_boys_middle_avg_{adm_level}_hrs']
df[f'msn_educ_middle_gender_dif_hrs'] = df[f'msn_education_girls_middle_avg_{adm_level}_hrs'] - df[f'msn_education_boys_middle_avg_{adm_level}_hrs']
df[f'winter_educ_middle_gender_dif_hrs'] = df[f'winter_education_girls_middle_avg_{adm_level}_hrs'] - df[f'winter_education_boys_middle_avg_{adm_level}_hrs']

# high

df[f'dry_educ_high_gender_dif_hrs'] = df[f'dry_education_girls_high_avg_{adm_level}_hrs'] - df[f'dry_education_boys_high_avg_{adm_level}_hrs']
df[f'msn_educ_high_gender_dif_hrs'] = df[f'msn_education_girls_high_avg_{adm_level}_hrs'] - df[f'msn_education_boys_high_avg_{adm_level}_hrs']
df[f'winter_educ_high_gender_dif_hrs'] = df[f'winter_education_girls_high_avg_{adm_level}_hrs'] - df[f'winter_education_boys_high_avg_{adm_level}_hrs']


In [129]:
# use dry only as we'll replace inline below

educ_dif_cols = ['childwalk_dry_educ_primary_gender_dif',
 'dry_educ_middle_gender_dif',
 'dry_educ_high_gender_dif']

# create weights for just three columns

educ_dif_wts = [0.4,0.4,0.2]
educ_eq_dif_wts = [1/3, 1/3, 1/3]

In [130]:
# education access differences by levels

df['dry_educ_gender_dif_idx'] = np.nansum((df[educ_dif_cols] * educ_dif_wts),axis=1)
df['msn_educ_gender_dif_idx'] = np.nansum((df[[col.replace('dry','msn') for col in educ_dif_cols]] * educ_dif_wts),axis=1) # use the same dif_cols list, but with monsoon data instead
df['winter_educ_gender_dif_idx'] = np.nansum((df[[col.replace('dry','winter') for col in educ_dif_cols]] * educ_dif_wts),axis=1)

# education access differences weighting equally -- for comparison's sake

df['dry_educ_eqwt_gender_dif_idx'] = np.nansum((df[educ_dif_cols] * educ_eq_dif_wts),axis=1)
df['msn_educ_eqwt_gender_dif_idx'] = np.nansum((df[[col.replace('dry','msn') for col in educ_dif_cols]] * educ_eq_dif_wts),axis=1)
df['winter_educ_eqwt_gender_dif_idx'] = np.nansum((df[[col.replace('dry','winter') for col in educ_dif_cols]] * educ_eq_dif_wts),axis=1)


In [131]:
# Differences in hours

# overall in hours -- with and without private schools

df[f'dry_educ_gender_dif_hrs'] = df[f'dry_education_girls_avg_{adm_level}_hrs'] - df[f'dry_education_boys_avg_{adm_level}_hrs']
df[f'msn_educ_gender_dif_hrs'] = df[f'msn_education_girls_avg_{adm_level}_hrs'] - df[f'msn_education_boys_avg_{adm_level}_hrs']
df[f'winter_educ_gender_dif_hrs'] = df[f'winter_education_girls_avg_{adm_level}_hrs'] - df[f'winter_education_boys_avg_{adm_level}_hrs']

df[f'dry_educ_allgender_dif_hrs'] = df[f'dry_education_allgirls_avg_{adm_level}_hrs'] - df[f'dry_education_allboys_avg_{adm_level}_hrs']
df[f'msn_educ_allgender_dif_hrs'] = df[f'msn_education_allgirls_avg_{adm_level}_hrs'] - df[f'msn_education_allboys_avg_{adm_level}_hrs']
df[f'winter_educ_allgender_dif_hrs'] = df[f'winter_education_allgirls_avg_{adm_level}_hrs'] - df[f'winter_education_allboys_avg_{adm_level}_hrs']

# primary

df[f'childwalk_dry_educ_primary_gender_dif_hrs'] = df[f'childwalk_dry_education_girls_primary_avg_{adm_level}_hrs'] - df[f'childwalk_dry_education_boys_primary_avg_{adm_level}_hrs']
df[f'childwalk_msn_educ_primary_gender_dif_hrs'] = df[f'childwalk_msn_education_girls_primary_avg_{adm_level}_hrs'] - df[f'childwalk_msn_education_boys_primary_avg_{adm_level}_hrs']
df[f'childwalk_winter_educ_primary_gender_dif_hrs'] = df[f'childwalk_winter_education_girls_primary_avg_{adm_level}_hrs'] - df[f'childwalk_winter_education_boys_primary_avg_{adm_level}_hrs']

# middle

df[f'dry_educ_middle_gender_dif_hrs'] = df[f'dry_education_girls_middle_avg_{adm_level}_hrs'] - df[f'dry_education_boys_middle_avg_{adm_level}_hrs']
df[f'msn_educ_middle_gender_dif_hrs'] = df[f'msn_education_girls_middle_avg_{adm_level}_hrs'] - df[f'msn_education_boys_middle_avg_{adm_level}_hrs']
df[f'winter_educ_middle_gender_dif_hrs'] = df[f'winter_education_girls_middle_avg_{adm_level}_hrs'] - df[f'winter_education_boys_middle_avg_{adm_level}_hrs']

# high

df[f'dry_educ_high_gender_dif_hrs'] = df[f'dry_education_girls_high_avg_{adm_level}_hrs'] - df[f'dry_education_boys_high_avg_{adm_level}_hrs']
df[f'msn_educ_high_gender_dif_hrs'] = df[f'msn_education_girls_high_avg_{adm_level}_hrs'] - df[f'msn_education_boys_high_avg_{adm_level}_hrs']
df[f'winter_educ_high_gender_dif_hrs'] = df[f'winter_education_girls_high_avg_{adm_level}_hrs'] - df[f'winter_education_boys_high_avg_{adm_level}_hrs']


In [132]:
# Education w/ and w/out private schools, per gender

## BY INDEX VALUES
df['educ_girls_alllevels_pubschool_idx'] = np.nansum(df[[f'dry_education_girls_avg_{adm_level}',f'msn_education_girls_avg_{adm_level}',f'winter_education_girls_avg_{adm_level}']] * seasonal_wts_arr,axis=1)

df['educ_boys_alllevels_pubschool_idx'] = np.nansum(df[[f'dry_education_boys_avg_{adm_level}',f'msn_education_boys_avg_{adm_level}',f'winter_education_boys_avg_{adm_level}']] * seasonal_wts_arr,axis=1)

df['educ_girls_alllevels_w_privschool_idx'] = np.nansum(df[[f'dry_education_allgirls_avg_{adm_level}',f'msn_education_allgirls_avg_{adm_level}',f'winter_education_allgirls_avg_{adm_level}']] * seasonal_wts_arr,axis=1)

df['educ_boys_alllevels_w_privschool_idx'] = np.nansum(df[[f'dry_education_allboys_avg_{adm_level}',f'msn_education_allboys_avg_{adm_level}',f'winter_education_allboys_avg_{adm_level}']] * seasonal_wts_arr,axis=1)

## BY HOURS

df['educ_girls_alllevels_pubschool_hrs'] = np.nansum(df[[f'dry_education_girls_avg_{adm_level}_hrs',f'msn_education_girls_avg_{adm_level}_hrs',f'winter_education_girls_avg_{adm_level}_hrs']] * seasonal_wts_arr,axis=1)

df['educ_boys_alllevels_pubschool_hrs'] = np.nansum(df[[f'dry_education_boys_avg_{adm_level}_hrs',f'msn_education_boys_avg_{adm_level}_hrs',f'winter_education_boys_avg_{adm_level}_hrs']] * seasonal_wts_arr,axis=1)

df['educ_girls_alllevels_w_privschool_hrs'] = np.nansum(df[[f'dry_education_allgirls_avg_{adm_level}_hrs',f'msn_education_allgirls_avg_{adm_level}_hrs',f'winter_education_allgirls_avg_{adm_level}_hrs']] * seasonal_wts_arr,axis=1)

df['educ_boys_alllevels_w_privschool_hrs'] = np.nansum(df[[f'dry_education_allboys_avg_{adm_level}_hrs',f'msn_education_allboys_avg_{adm_level}_hrs',f'winter_education_allboys_avg_{adm_level}_hrs']] * seasonal_wts_arr,axis=1)

In [133]:
# Calculate educational difference indices, weighting by seasons.

## One index
df['educ_gender_dif_idx'] = np.nansum(np.multiply(df[['dry_educ_gender_dif_idx',\
                                                      'msn_educ_gender_dif_idx',\
                                                      'winter_educ_gender_dif_idx']],\
                                                  seasonal_wts_arr),axis=1)

df['educ_gender_eqwt_dif_idx'] = np.nansum(np.multiply(df[['dry_educ_eqwt_gender_dif_idx',\
                                                           'msn_educ_eqwt_gender_dif_idx',\
                                                           'winter_educ_eqwt_gender_dif_idx']],\
                                                  seasonal_wts_arr),axis=1)

## Differences in hours
df['educ_primary_gender_dif_hrs'] = np.nansum(np.multiply(df[['childwalk_dry_educ_primary_gender_dif_hrs',\
                                                              'childwalk_msn_educ_primary_gender_dif_hrs',\
                                                              'childwalk_winter_educ_primary_gender_dif_hrs']],seasonal_wts_arr),axis=1)

df['educ_middle_gender_dif_hrs'] = np.nansum(np.multiply(df[['dry_educ_middle_gender_dif_hrs',\
                                                              'msn_educ_middle_gender_dif_hrs',\
                                                              'winter_educ_middle_gender_dif_hrs']],seasonal_wts_arr),axis=1)


df['educ_high_gender_dif_hrs'] = np.nansum(np.multiply(df[['dry_educ_high_gender_dif_hrs',\
                                                              'msn_educ_high_gender_dif_hrs',\
                                                              'winter_educ_high_gender_dif_hrs']],seasonal_wts_arr),axis=1)

# Overall difference indices for all levels,w/out and w/ private schools included

df['educ_alllevels_dif'] = np.nansum(np.multiply(df[[f'dry_educ_alllevels_gender_dif',\
                                                              f'msn_educ_alllevels_gender_dif',\
                                                              f'winter_educ_alllevels_gender_dif']],seasonal_wts_arr),axis=1)

df['educ_alllevels_w_privschool_dif'] = np.nansum(np.multiply(df[[f'dry_educ_alllevels_w_privschool_dif',\
                                                              f'msn_educ_alllevels_w_privschool_dif',\
                                                              f'winter_educ_alllevels_w_privschool_dif']],seasonal_wts_arr),axis=1)

#### Finalize

Check out the indices

In [134]:
df[['educ_eqwt_idx','health_idx','markets_idx','admin_idx']].tail(5)

Unnamed: 0,educ_eqwt_idx,health_idx,markets_idx,admin_idx
33,0.781065,0.637092,0.523565,0.80711
34,0.218679,0.17553,0.180659,0.234544
35,0.746447,0.824063,0.737597,0.9999
36,0.47355,0.509018,0.474449,0.72659
37,0.579848,0.613028,0.505264,0.671387


In [135]:
# normalize all values to enable comparability

df['educ_altwt_idx'] = df['educ_altwt_idx'] / np.max(df['educ_altwt_idx'])
df['educ_eqwt_idx'] = df['educ_eqwt_idx'] / np.max(df['educ_eqwt_idx'])
df['health_idx'] = df['health_idx'] / np.max(df['health_idx'])
df['markets_idx'] = df['markets_idx'] / np.max(df['markets_idx'])
df['admin_idx'] = df['admin_idx'] / np.max(df['admin_idx'])


Finally, rank the admins by their overall inaccessibility index, weighting each of education, health, and markets equally (admin_idx is held out for separate analysis)

In [136]:
# weight education double relative to other sectors
educ_priority_wts = [0.5,0.25,0.25]

# identify rank according to equal weighting schemes

df['overall_eq_wt_rank'] = st.rank_by_weight(df,['educ_eqwt_idx','health_idx','markets_idx'],eq_wts)
df['overall_eq_wt_idx'] = np.nansum(df[['educ_eqwt_idx','health_idx','markets_idx']] * eq_wts, axis=1)

df['overall_educ_altwt_rank'] = st.rank_by_weight(df,['educ_altwt_idx','health_idx','markets_idx'],eq_wts)
df['overall_educ_altwt_idx'] = np.nansum(df[['educ_altwt_idx','health_idx','markets_idx']] * eq_wts, axis=1)

df['overall_educ_priority_wt_rank'] = st.rank_by_weight(df,['educ_altwt_idx','health_idx','markets_idx'],educ_priority_wts)
df['overall_educ_priority_wt_idx'] = np.nansum(df[['educ_altwt_idx','health_idx','markets_idx']] * educ_priority_wts, axis=1)


In [137]:
df[['educ_eqwt_idx','educ_altwt_idx','health_idx','markets_idx']].tail(5)

Unnamed: 0,educ_eqwt_idx,educ_altwt_idx,health_idx,markets_idx
33,0.740467,0.761943,0.740918,0.52706
34,0.207313,0.202354,0.204135,0.181865
35,0.707648,0.671018,0.958359,0.74252
36,0.448936,0.445246,0.591972,0.477616
37,0.549709,0.51695,0.712933,0.508636


#### Find top N% for overall and each index

In [138]:
def col_pctile(df,col):
    
    col_pctile = df[col].rank(method='max').apply(lambda x: 100.0 * (x-1) / (df[col].size -1))
    
    return col_pctile

In [139]:
df['educ_ewqt_pctile'] = col_pctile(df,'educ_eqwt_idx')
df['educ_altwt_pctile'] = col_pctile(df,'educ_altwt_idx')
df['health_pctile'] = col_pctile(df,'health_idx')
df['markets_pctile'] = col_pctile(df,'markets_idx')
df['admin_pctile'] = col_pctile(df,'admin_idx')
df['overall_pctile'] = col_pctile(df,'overall_eq_wt_idx')
df['overall_educpriority_pctile'] = col_pctile(df,'overall_educ_priority_wt_idx')

In [140]:
pctile_cols = ['educ_altwt_pctile','health_pctile','markets_pctile','admin_pctile']

In [141]:
df[pctile_cols]

Unnamed: 0,educ_altwt_pctile,health_pctile,markets_pctile,admin_pctile
0,29.72973,32.432432,24.324324,48.648649
1,16.216216,21.621622,45.945946,16.216216
2,56.756757,59.459459,64.864865,67.567568
3,48.648649,45.945946,37.837838,37.837838
4,5.405405,8.108108,8.108108,8.108108
5,86.486486,81.081081,91.891892,83.783784
6,54.054054,51.351351,35.135135,51.351351
7,37.837838,29.72973,43.243243,24.324324
8,27.027027,27.027027,18.918919,27.027027
9,32.432432,40.540541,59.459459,32.432432


In [142]:
pctiles_20pct = np.digitize(df[pctile_cols], bins = [0,80,100])
pctiles_20pct_overall = np.nansum(np.where(pctiles_20pct == 2, 1, 0),axis=1) / pctiles_20pct.shape[1]

In [143]:
df['deprivation_20pct'] = pctiles_20pct_overall

#### Append focus district yes/no info

In [144]:
non_focus_adm2_lst = ['PK201','PK204','PK205','PK241','PK243','PK209','PK211','PK215','PK216','PK217','PK218','PK219','PK221','PK222']

df['Adm2_Focus'] = 'Yes'
df.loc[df['Adm2_Code'].isin(non_focus_adm2_lst),'Adm2_Focus'] = 'No'

In [145]:
df[['Adm2_Code','Adm2_Focus']]

Unnamed: 0,Adm2_Code,Adm2_Focus
0,PK201,No
1,PK202,Yes
2,PK203,Yes
3,PK204,No
4,PK205,No
5,PK206,Yes
6,PK207,Yes
7,PK208,Yes
8,PK209,No
9,PK210,Yes


In [146]:
# checking this worked
df['Adm2_Focus'].unique()

array(['No', 'Yes'], dtype=object)

TEMP -- Quick analysis of all schools vs. just public schools results

In [147]:
pubschool_girls = df['educ_girls_alllevels_pubschool_idx'] - df['educ_girls_alllevels_w_privschool_idx']
pubschool_boys = df['educ_boys_alllevels_pubschool_idx'] - df['educ_boys_alllevels_w_privschool_idx']

allschools_girls = (df['educ_girls_alllevels_pubschool_hrs'] - df['educ_girls_alllevels_w_privschool_hrs']) * 60
allschools_boys = (df['educ_boys_alllevels_pubschool_hrs'] - df['educ_boys_alllevels_w_privschool_hrs']) * 60

### Export

Export tabular data for charting, sensitivity analysis, etc.

In [148]:
df.sort_values('overall_eq_wt_rank').to_csv(os.path.join(data_dir,tab_dir,f"final//BeforeOSMInvestment_{adm_level}_idx_access_stats_{today}.csv"),index=False)
# df.sort_values('overall_eq_wt_rank').to_csv(os.path.join(data_dir,tab_dir,f"final//{adm_level}_idx_access_stats_{today}.csv"),index=False)

Export shapefile for use elsewhere

In [149]:
# export tehsils to geographic file
if adm_level == 'adm3':
    
    adm3_geo = pd.merge(df[(['ADM2_EN','Adm2_Code',
           'Elevation_category',
           'dry_educ_idx', 'msn_educ_idx','winter_educ_idx',
           'educ_gender_dif_idx','educ_gender_eqwt_dif_idx',
           'educ_primary_gender_dif_hrs','educ_middle_gender_dif_hrs','educ_high_gender_dif_hrs',
           'dry_educ_alllevels_gender_dif','msn_educ_alllevels_gender_dif', 'winter_educ_alllevels_gender_dif',
           'dry_educ_alllevels_w_privschool_dif','msn_educ_alllevels_w_privschool_dif', 'winter_educ_alllevels_w_privschool_dif',
           'dry_educ_gender_dif_hrs', 'msn_educ_gender_dif_hrs','winter_educ_gender_dif_hrs',
           'dry_educ_allgender_dif_hrs','msn_educ_allgender_dif_hrs', 'winter_educ_allgender_dif_hrs',
           'dry_health_idx', 'msn_health_idx', 'winter_health_idx',
           'dry_markets_idx', 'msn_markets_idx', 'winter_markets_idx',
           'dry_admin_idx', 'msn_admin_idx', 'winter_admin_idx',
           'educ_eqwt_idx','educ_altwt_idx', 'health_idx', 'markets_idx', 'admin_idx',
           'overall_eq_wt_rank', 'overall_eq_wt_idx', 'overall_educ_altwt_rank','overall_educ_altwt_idx', 'overall_educ_priority_wt_rank', 'overall_educ_priority_wt_idx'] + list(adm_cols))], adm3_geo[['Adm3_Code','geometry']],on='Adm3_Code').sort_values('overall_eq_wt_rank')

    adm3_geo.Elevation_category = adm3_geo.Elevation_category.astype(str) # geopackages can't handle categorical variable types

    gpd.GeoDataFrame(adm3_geo,geometry='geometry').to_file(os.path.join(data_dir,acc_dir,f"vector/BeforeOSMInvestment_adm3_idx_access_stats_{today}.gpkg"),driver="GPKG")
#     gpd.GeoDataFrame(adm3_geo,geometry='geometry').to_file(os.path.join(data_dir,acc_dir,f"vector/adm3_idx_access_stats_{today}.gpkg"),driver="GPKG")
else:
    None