# Roads Descriptive Stats Preparation

After appending travel time information to each populated place in an administrative center we can prepare any number of descriptive stats. Given the quantity of data in question these are best prepared with Dask Dataframes. This notebook separates out the descriptive stats preparations,

In [3]:
import os, sys
from datetime import date

import pandas as pd
import geopandas as gpd
import numpy as np
from scipy import stats

import re

# custom functions
import sensitivity_testing as st

## Setup

Dates

In [4]:
today = date.today().strftime("%y%m%d")

In [5]:
data_date = '211215'

Directories

In [6]:
geo_dir = r'P:\PAK\GEO'
data_dir = r'../../data'

rast_dir = r'rast_inputs'
vect_in_dir = r'vect_inputs'
vect_out_dir = r'vect_out'

rds_dir = r'roads'
dest_dir = r'destinations'
speed_dir = r'speed'
fric_dir = r'friction'
acc_dir = r'access'
tab_dir = r'tabular'

Projections

In [7]:
# change this to whatever the desired output projection is
DEST_CRS = 'EPSG:32642'

dcrs_int = int(re.findall('[0-9]+',DEST_CRS)[0])
dcrs_int

32642

## Consolidate access variables into master indices per category and overall

This step slims down the dataset into only the columns we anticipate using for our analysis (ie. we drop children's walking speeds to Peshawar).</br></br>
Seasonal figures are weighted by the season's length in that tehsil and then merged into a master value per column (health_primary) and then again merged, with weighting, into a sectoral master (health_idx). These are merged into a final inaccessibility index value.</br></br>We preserve the intermediate columns in this process to enable sensitivity testing of the importance of the weights used.

#### Load in data and join relevant info

In [59]:
df_pth = os.path.join(data_dir,tab_dir,f'final//roads_absolute_access_improvement_data_{data_date}.csv')
rds = pd.read_csv(df_pth)
rds = rds.rename({'Cost':'Cost_PKR'},axis=1)

Pull in elevation data, for classifying and thence weighting by season

In [60]:
rds_extra = gpd.read_file(os.path.join(data_dir,rds_dir,f'Proposed_final//Proposed_roads_processed_211208.gpkg'),driver="GPKG")
rds_extra = rds_extra[['SN', 'SLength', 'Z_Mean','geometry']].rename({'SN':'Road_ID','SLength': 'length', 'Z_Mean':'Mean_elevation'},axis=1)
rds_extra.head(2)

Unnamed: 0,Road_ID,length,Mean_elevation,geometry
0,1,14183.006138,2276.075397,"MULTILINESTRING ((772711.858 4003961.232, 7727..."
1,2,53706.61617,2365.936973,"MULTILINESTRING ((802013.540 4030501.458, 8020..."


In [61]:
rds = gpd.GeoDataFrame(pd.merge(rds,rds_extra,how='left',on='Road_ID'),geometry='geometry')
rds.head(2)

Unnamed: 0,Road_ID,District,Current_Road_Cond,Current_Surface,Current_Road_Class,Upgrade_Road_Cond,Upgrade_Surface,Upgrade_Road_Class,Terrain,Cost_PKR,...,winter_health_primary,winter_health_private,winter_health_public,winter_health_secondary,winter_health_tertiary,winter_markets_All,winter_markets_Central,length,Mean_elevation,geometry
0,1,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Mountains,531862700.0,...,299.80243,2260.7961,1727.482,2365.0818,604863.3,2386.4465,310162.94,14183.006138,2276.075397,"MULTILINESTRING ((772711.858 4003961.232, 7727..."
1,2,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Mountains,2013998000.0,...,17603.945,17923.287,14925.823,17923.287,643576.0,34185.12,630105.8,53706.61617,2365.936973,"MULTILINESTRING ((802013.540 4030501.458, 8020..."


Refactor PKR costs to USD

In [62]:
# using the exchange rate reported by Google on Oct 27, 2021
rds['Cost_USD'] = rds['Cost_PKR'] / 174.88

Spatial data

In [63]:
# pull in admin data so we can classify roads by district/tehsil if needed

adm3_geo = gpd.read_file(os.path.join(geo_dir,'Boundaries/KP_Analysis/KP_Analysis_All_Tehsils.gpkg'))
adm3_geo = adm3_geo[['ADM3_EN', 'ADM3_PCODE','ADM2_EN', 'ADM2_PCODE','geometry']].rename({'ADM2_PCODE' : 'Adm2_Code','ADM3_PCODE' : 'Adm3_Code'},axis=1)
adm3_geo = adm3_geo.to_crs(32642)

In [64]:
adm3_geo.head(2)

Unnamed: 0,ADM3_EN,Adm3_Code,ADM2_EN,Adm2_Code,geometry
0,Abbottabad,PK20101,Abbottabad,PK201,"MULTIPOLYGON (((910881.442 3803046.496, 911557..."
1,Alai,PK20301,Batagram,PK203,"MULTIPOLYGON (((875813.940 3872821.650, 878768..."


In [65]:
# assign admin data to rds based on where the roads' centroid falls

rds['geom_center'] = rds.geometry.centroid
rds = rds.set_geometry('geom_center')
rds = gpd.sjoin(rds,adm3_geo,how='left',op='intersects')
rds.set_geometry('geometry')
rds = rds.drop({'geom_center','index_right'},axis=1)

In [66]:
rds.head(3)

Unnamed: 0,Road_ID,District,Current_Road_Cond,Current_Surface,Current_Road_Class,Upgrade_Road_Cond,Upgrade_Surface,Upgrade_Road_Class,Terrain,Cost_PKR,...,winter_markets_All,winter_markets_Central,length,Mean_elevation,geometry,Cost_USD,ADM3_EN,Adm3_Code,ADM2_EN,Adm2_Code
0,1,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Mountains,531862700.0,...,2386.4465,310162.94,14183.006138,2276.075397,"MULTILINESTRING ((772711.858 4003961.232, 7727...",3041301.0,Mastuj,PK20602,Chitral,PK206
1,2,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Mountains,2013998000.0,...,34185.12,630105.8,53706.61617,2365.936973,"MULTILINESTRING ((802013.540 4030501.458, 8020...",11516460.0,Mastuj,PK20602,Chitral,PK206
2,3,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Plains,253587200.0,...,17877.154,325982.34,11270.543504,2359.721489,"MULTILINESTRING ((815812.425 4020685.300, 8158...",1450064.0,Mastuj,PK20602,Chitral,PK206


### Prepare for consolidation

#### Prepare seasonal master values

Education is a bit complicated. First we have to define our custom weighting schemas and the main columns to operate on

In [67]:
# equal weighting schema for comparison's sake

educ_equal_wts = np.array([1,1,1,1,1,1]) / 6

# set up weighting schema to prioritize lower schools and girls' education

girls_wts = np.array([.4,.4,.2]) * (2/3)
boys_wts = np.array([.4,.4,.2]) * (1/3)

# put the girls/boys arrays together
educ_mast_wts = np.concatenate((girls_wts,boys_wts),axis=0)
educ_mast_wts

array([0.26666667, 0.26666667, 0.13333333, 0.13333333, 0.13333333,
       0.06666667])

In [68]:
# because we mix childwalking and multimodal for education's master value it's easiest just to manually specify which columns to use
educ_mast_cols = ['dry_education_girls_primary','dry_education_girls_middle','dry_education_girls_high',
                 'dry_education_boys_primary','dry_education_boys_middle','dry_education_boys_high']

In [69]:
rds.columns

Index(['Road_ID', 'District', 'Current_Road_Cond', 'Current_Surface',
       'Current_Road_Class', 'Upgrade_Road_Cond', 'Upgrade_Surface',
       'Upgrade_Road_Class', 'Terrain', 'Cost_PKR',
       ...
       'winter_markets_All', 'winter_markets_Central', 'length',
       'Mean_elevation', 'geometry', 'Cost_USD', 'ADM3_EN', 'Adm3_Code',
       'ADM2_EN', 'Adm2_Code'],
      dtype='object', length=104)

Compute all figures as indexes

In [70]:
# first save out the raw data -- we'll join this back in later
acc_cols = list(rds.filter(regex='^(dry|msn|winter)',axis=1).columns)
raw_acc_data = rds[acc_cols].copy().add_suffix('_hrs')

In [71]:
rds[acc_cols][:3]

Unnamed: 0,dry_District_HQs,dry_Provincial_HQ,dry_education_allboys,dry_education_allgirls,dry_education_boys,dry_education_boys_high,dry_education_boys_middle,dry_education_boys_primary,dry_education_girls,dry_education_girls_high,...,winter_education_primary,winter_health_family,winter_health_pharmacies,winter_health_primary,winter_health_private,winter_health_public,winter_health_secondary,winter_health_tertiary,winter_markets_All,winter_markets_Central
0,31276.365,145493.16,0.574245,5.970305,0.574245,10.605741,46.245304,334.00314,5.970305,1204.3168,...,1.591759,2366.6921,2307.788,299.80243,2260.7961,1727.482,2365.0818,604863.3,2386.4465,310162.94
1,125080.67,153855.7,540.8683,1044.5232,540.8683,2996.6067,9186.516,617.97754,1044.5232,9448.1,...,1069.4727,18927.045,24472.879,17603.945,17923.287,14925.823,17923.287,643576.0,34185.12,630105.8
2,40349.496,162369.12,186.0485,333.99506,186.0485,1646.1072,583.0483,224.03566,333.99506,12786.724,...,236.13625,17156.736,2533.783,2510.559,17651.578,818.38,78986.24,535013.3,17877.154,325982.34


In [72]:
max_acc = np.array(np.max(rds[acc_cols],axis=0)) # column-wise max
rds[acc_cols] = rds[acc_cols] / max_acc # divide by max for column-wise index
rds[acc_cols][:3]

Unnamed: 0,dry_District_HQs,dry_Provincial_HQ,dry_education_allboys,dry_education_allgirls,dry_education_boys,dry_education_boys_high,dry_education_boys_middle,dry_education_boys_primary,dry_education_girls,dry_education_girls_high,...,winter_education_primary,winter_health_family,winter_health_pharmacies,winter_health_primary,winter_health_private,winter_health_public,winter_health_secondary,winter_health_tertiary,winter_markets_All,winter_markets_Central
0,0.208161,0.129367,1.85139e-07,3.090037e-07,1.850989e-07,0.000177,1.4e-05,0.000119,3.090037e-07,0.012323,...,1.12574e-07,0.009782,0.015147,0.005902,0.000938,0.015679,0.010322,0.005523,0.009646,0.137828
1,0.832478,0.136802,0.0001743781,5.406114e-05,0.0001743403,0.049907,0.002763,0.000219,5.406114e-05,0.096675,...,7.563633e-05,0.078231,0.160626,0.346528,0.007439,0.135471,0.078222,0.005877,0.13818,0.280001
2,0.268547,0.144372,5.998277e-05,1.72865e-05,5.996978e-05,0.027415,0.000175,8e-05,1.72865e-05,0.130836,...,1.670027e-05,0.070914,0.01663,0.049419,0.007326,0.007428,0.344716,0.004885,0.072261,0.144857


In [73]:
rds = pd.concat([rds,raw_acc_data],axis=1,ignore_index=False)

Now consolidate into master values, weighting appropriately

In [74]:
# education by levels

rds['dry_educ_idx'] = np.nansum((rds[educ_mast_cols] * educ_mast_wts),axis=1)

rds['msn_educ_idx'] = np.nansum((rds[[col.replace('dry','msn') for col in educ_mast_cols]] * educ_mast_wts),axis=1) # use the same mast_cols list, but with monsoon data instead

rds['winter_educ_idx'] = np.nansum((rds[[col.replace('dry','winter') for col in educ_mast_cols]] * educ_mast_wts),axis=1)

# education weighting equally -- for comparison's sake

rds['dry_educ_eqwt_idx'] = np.nansum((rds[educ_mast_cols] * educ_equal_wts),axis=1)

rds['msn_educ_eqwt_idx'] = np.nansum((rds[[col.replace('dry','msn') for col in educ_mast_cols]] * educ_equal_wts),axis=1)

rds['winter_educ_eqwt_idx'] = np.nansum((rds[[col.replace('dry','winter') for col in educ_mast_cols]] * educ_equal_wts),axis=1)


**Optional**</br>
Sensitivity test educational weighting schemes

In [75]:
wt_samp_arr, vals_arr, ranks_arr = st.Sensitivity_weighting(rds,educ_mast_cols,iterations=500000)

Calculate descriptive stats for sensitivity tested rankings of education per admin

In [76]:
# compute basic stats for each entity's ranking
# axis=0 for operating by column
ranks_mode = stats.mode(ranks_arr,axis=0)
ranks_mean = np.mean(ranks_arr,axis=0)
ranks_std = np.std(ranks_arr,axis=0)
vals_mean = np.sum(np.mean(vals_arr,axis=0),axis=1)

In [77]:
vals_mean

array([5.75918938e-03, 3.66855066e-02, 3.32522608e-02, 1.73608553e-02,
       7.25063850e-03, 2.13248187e-01, 3.77016305e-01, 1.65655609e-01,
       4.52026688e-03, 6.14308570e-02, 3.13526434e-02, 4.47988404e-02,
       1.07386079e-01, 1.94200760e-01, 3.23527355e-02, 1.32056781e-01,
       4.73774464e-01, 1.92420788e-02, 4.80822984e-03, 1.04074944e-02,
       9.30470321e-04, 2.60777700e-02, 1.27600834e-03, 1.37189733e-01,
       3.24544990e-03, 3.33661713e-03, 1.73141463e-01, 4.26710653e-04,
       8.75531752e-04, 4.74135732e-03, 7.04708967e-03, 0.00000000e+00,
       7.06460130e-03, 6.58746123e-02, 4.18926575e-03, 1.43381016e-02,
       7.38993694e-03, 2.81508895e-03, 1.51387786e-02, 7.63146062e-03,
       7.23160262e-03, 5.13653214e-03, 1.08293106e-02, 2.23048715e-03,
       2.19145582e-03, 3.87692582e-02, 1.09229225e-02, 9.01239021e-02,
       9.37820015e-03, 8.06776882e-02, 2.28300707e-02, 5.19892308e-02,
       5.52897889e-03, 1.10608223e-02, 5.30830431e-02, 7.28151098e-02,
      

In [78]:
# join in the key stats for sensitivity tested educational rankings
rds['sens_test_educ_rank_mean'] = ranks_mean
rds['sens_test_educ_val_mean'] = vals_mean
rds['sens_test_educ_rank_mode'] = ranks_mode[0][0]
rds['sens_test_educ_rank_std'] = ranks_std

In [79]:
# Now calculate the same for an equal weighting schema and the unbalanced weighting scheme requested by Transport

rds['educ_eqwt_rank'] = st.rank_by_weight(rds,educ_mast_cols,educ_equal_wts)
rds['educ_eqwt_val'] = np.nansum(rds[educ_mast_cols] * educ_equal_wts, axis=1)

rds['educ_altwt_rank'] = st.rank_by_weight(rds,educ_mast_cols,educ_mast_wts)
rds['educ_altwt_val'] = np.nansum(rds[educ_mast_cols] * educ_mast_wts, axis=1)

In [80]:
rds[['sens_test_educ_rank_mean','sens_test_educ_val_mean','sens_test_educ_rank_mode','sens_test_educ_rank_std']].head(10)

Unnamed: 0,sens_test_educ_rank_mean,sens_test_educ_val_mean,sens_test_educ_rank_mode,sens_test_educ_rank_std
0,60.926246,0.005759,60,4.339355
1,32.764738,0.036686,35,2.487476
2,34.469736,0.033252,36,4.697008
3,42.25754,0.017361,43,2.9216
4,58.147432,0.007251,54,10.270143
5,5.095878,0.213248,3,1.938825
6,2.531704,0.377016,2,1.024408
7,7.306416,0.165656,7,2.161681
8,64.208322,0.00452,65,2.622876
9,22.04812,0.061431,22,2.931689


In [81]:
rds[['sens_test_educ_val_mean','educ_eqwt_val','educ_altwt_val']].head(10)

Unnamed: 0,sens_test_educ_val_mean,educ_eqwt_val,educ_altwt_val
0,0.005759,0.005757,0.007515
1,0.036686,0.03667,0.035403
2,0.033252,0.033217,0.030185
3,0.017361,0.017371,0.018583
4,0.007251,0.007261,0.002946
5,0.213248,0.213167,0.262759
6,0.377016,0.376942,0.40478
7,0.165656,0.165498,0.170857
8,0.00452,0.004518,0.004121
9,0.061431,0.061402,0.057531


Health, markets, and administrative access are more straightforward to consolidate

In [82]:
# health by levels

rds['dry_health_idx'] = np.mean(rds[['dry_health_primary','dry_health_secondary','dry_health_tertiary']],axis=1) # using np.mean implies equal weighting of sub-categories
rds['msn_health_idx'] = np.mean(rds[['msn_health_primary','msn_health_secondary','msn_health_tertiary']],axis=1)
rds['winter_health_idx'] = np.mean(rds[['winter_health_primary','winter_health_secondary','winter_health_tertiary']],axis=1)

In [83]:
# markets
rds['dry_markets_idx'] = np.mean(rds[['dry_markets_All','dry_markets_Central']],axis=1) # using np.mean implies equal weighting of sub-categories
rds['msn_markets_idx'] = np.mean(rds[['msn_markets_All','msn_markets_Central']],axis=1)
rds['winter_markets_idx'] = np.mean(rds[['winter_markets_All','winter_markets_Central']],axis=1)


In [84]:
# administrative
# I assume that access to the Provincial HQ is not materially significant for administrative functions -- therefore this is just District HQ access

rds['dry_admin_idx'] = rds['dry_District_HQs']
rds['msn_admin_idx'] = rds['msn_District_HQs']
rds['winter_admin_idx'] = rds['winter_District_HQs']


#### Prepare categorical and overall master values, weighted by season

In [85]:
# equal weights for variables
eq_wts = [0.3333, 0.3333, 0.3333]

In [86]:
# Version 2
# categorize Tehsils by the population-weighted mean elevation of populated places in that tehsil
rds['Elevation_category'] = pd.cut(rds['Mean_elevation'],bins=[0,1500,2250,100000],labels=['Low','Medium','High'])

# Define the weighting of a season according to the tehsils's classification

season_wts_dct = {
    'Low' : np.array([0.3333,0.3333,0.3333]),
    'Medium' : np.array([0.2667,0.3333,0.4]),
    'High' : np.array([0.25,0.25,0.5])   
}

# Assign the seasonal weighting as a list, for later use
rds['seasonal_wts'] = rds['Elevation_category'].map(season_wts_dct, na_action='ignore')
seasonal_wts_arr = np.stack(rds['seasonal_wts']) # turn the Series of weights into an array of shape (3,number_of_admins) so it can be multiplied by the 3 seasonal values for each category

rds.head()

TypeError: unhashable type: 'numpy.ndarray'

Exception ignored in: 'pandas._libs.index.IndexEngine._call_map_locations'
Traceback (most recent call last):
  File "pandas\_libs\hashtable_class_helper.pxi", line 5231, in pandas._libs.hashtable.PyObjectHashTable.map_locations
TypeError: unhashable type: 'numpy.ndarray'


Unnamed: 0,Road_ID,District,Current_Road_Cond,Current_Surface,Current_Road_Class,Upgrade_Road_Cond,Upgrade_Surface,Upgrade_Road_Class,Terrain,Cost_PKR,...,msn_health_idx,winter_health_idx,dry_markets_idx,msn_markets_idx,winter_markets_idx,dry_admin_idx,msn_admin_idx,winter_admin_idx,Elevation_category,seasonal_wts
0,1,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Mountains,531862700.0,...,0.009814,0.007249,0.115839,0.049228,0.073737,0.208161,0.078224,0.114108,High,"[0.25, 0.25, 0.5]"
1,2,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Mountains,2013998000.0,...,0.130875,0.143542,0.313742,0.173186,0.209091,0.832478,0.603507,0.68958,High,"[0.25, 0.25, 0.5]"
2,3,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Plains,253587200.0,...,0.184129,0.133007,0.173298,0.143567,0.108559,0.268547,0.247768,0.168372,High,"[0.25, 0.25, 0.5]"
3,4,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Mountains,711654400.0,...,0.018085,0.01757,0.115779,0.04736,0.073146,0.365808,0.176785,0.210553,High,"[0.25, 0.25, 0.5]"
4,5,Upper Chitral,Poor,Earthen,Collector Road,Very Good,Asphaltic,Collector Road,Mountains,196137900.0,...,0.004687,0.004084,0.200715,0.083395,0.118774,0.17752,0.06805,0.10686,Medium,"[0.2667, 0.3333, 0.4]"


Master indices

In [87]:
# calculate the master value for each feature type by weighting each tehsil's seasonal master values by its seasonal weights
rds['educ_altwt_idx'] = np.nansum(np.multiply(rds[['dry_educ_idx','msn_educ_idx','winter_educ_idx']],seasonal_wts_arr),axis=1)
rds['educ_eqwt_idx'] = np.nansum(np.multiply(rds[['dry_educ_eqwt_idx','msn_educ_eqwt_idx','winter_educ_eqwt_idx']],seasonal_wts_arr),axis=1)
rds['health_idx'] = np.nansum(np.multiply(rds[['dry_health_idx','msn_health_idx','winter_health_idx']],seasonal_wts_arr),axis=1)
rds['markets_idx'] = np.nansum(np.multiply(rds[['dry_markets_idx','msn_markets_idx','winter_markets_idx']],seasonal_wts_arr),axis=1)
rds['admin_idx'] = np.nansum(np.multiply(rds[['dry_admin_idx','msn_admin_idx','winter_admin_idx']],seasonal_wts_arr),axis=1)
rds['educ_girls_idx'] = np.nansum(np.multiply(rds[['dry_education_girls','msn_education_girls','winter_education_girls']],seasonal_wts_arr),axis=1)



#### Specialty educational index calculations

In [88]:
# Differences in index values

# overall index vals -- with and without private schools

rds[f'dry_educ_alllevels_gender_dif'] = rds[f'dry_education_girls'] - rds[f'dry_education_boys']
rds[f'msn_educ_alllevels_gender_dif'] = rds[f'msn_education_girls'] - rds[f'msn_education_boys']
rds[f'winter_educ_alllevels_gender_dif'] = rds[f'winter_education_girls'] - rds[f'winter_education_boys']

rds[f'dry_educ_alllevels_w_privschool_dif'] = rds[f'dry_education_allgirls'] - rds[f'dry_education_allboys']
rds[f'msn_educ_alllevels_w_privschool_dif'] = rds[f'msn_education_allgirls'] - rds[f'msn_education_allboys']
rds[f'winter_educ_alllevels_w_privschool_dif'] = rds[f'winter_education_allgirls'] - rds[f'winter_education_allboys']

# primary

rds[f'dry_educ_primary_gender_dif'] = rds[f'dry_education_girls_primary'] - rds[f'dry_education_boys_primary']
rds[f'msn_educ_primary_gender_dif'] = rds[f'msn_education_girls_primary'] - rds[f'msn_education_boys_primary']
rds[f'winter_educ_primary_gender_dif'] = rds[f'winter_education_girls_primary'] - rds[f'winter_education_boys_primary']

# middle

rds[f'dry_educ_middle_gender_dif'] = rds[f'dry_education_girls_middle'] - rds[f'dry_education_boys_middle']
rds[f'msn_educ_middle_gender_dif'] = rds[f'msn_education_girls_middle'] - rds[f'msn_education_boys_middle']
rds[f'winter_educ_middle_gender_dif'] = rds[f'winter_education_girls_middle'] - rds[f'winter_education_boys_middle']

# high

rds[f'dry_educ_high_gender_dif'] = rds[f'dry_education_girls_high'] - rds[f'dry_education_boys_high']
rds[f'msn_educ_high_gender_dif'] = rds[f'msn_education_girls_high'] - rds[f'msn_education_boys_high']
rds[f'winter_educ_high_gender_dif'] = rds[f'winter_education_girls_high'] - rds[f'winter_education_boys_high']


In [89]:
# Differences in hours

# overall in hours -- with and without private schools

rds[f'dry_educ_gender_dif_hrs'] = rds[f'dry_education_girls_hrs'] - rds[f'dry_education_boys_hrs']
rds[f'msn_educ_gender_dif_hrs'] = rds[f'msn_education_girls_hrs'] - rds[f'msn_education_boys_hrs']
rds[f'winter_educ_gender_dif_hrs'] = rds[f'winter_education_girls_hrs'] - rds[f'winter_education_boys_hrs']

rds[f'dry_educ_allgender_dif_hrs'] = rds[f'dry_education_allgirls_hrs'] - rds[f'dry_education_allboys_hrs']
rds[f'msn_educ_allgender_dif_hrs'] = rds[f'msn_education_allgirls_hrs'] - rds[f'msn_education_allboys_hrs']
rds[f'winter_educ_allgender_dif_hrs'] = rds[f'winter_education_allgirls_hrs'] - rds[f'winter_education_allboys_hrs']

# primary

rds[f'dry_educ_primary_gender_dif_hrs'] = rds[f'dry_education_girls_primary_hrs'] - rds[f'dry_education_boys_primary_hrs']
rds[f'msn_educ_primary_gender_dif_hrs'] = rds[f'msn_education_girls_primary_hrs'] - rds[f'msn_education_boys_primary_hrs']
rds[f'winter_educ_primary_gender_dif_hrs'] = rds[f'winter_education_girls_primary_hrs'] - rds[f'winter_education_boys_primary_hrs']

# middle

rds[f'dry_educ_middle_gender_dif_hrs'] = rds[f'dry_education_girls_middle_hrs'] - rds[f'dry_education_boys_middle_hrs']
rds[f'msn_educ_middle_gender_dif_hrs'] = rds[f'msn_education_girls_middle_hrs'] - rds[f'msn_education_boys_middle_hrs']
rds[f'winter_educ_middle_gender_dif_hrs'] = rds[f'winter_education_girls_middle_hrs'] - rds[f'winter_education_boys_middle_hrs']

# high

rds[f'dry_educ_high_gender_dif_hrs'] = rds[f'dry_education_girls_high_hrs'] - rds[f'dry_education_boys_high_hrs']
rds[f'msn_educ_high_gender_dif_hrs'] = rds[f'msn_education_girls_high_hrs'] - rds[f'msn_education_boys_high_hrs']
rds[f'winter_educ_high_gender_dif_hrs'] = rds[f'winter_education_girls_high_hrs'] - rds[f'winter_education_boys_high_hrs']


In [90]:
# use dry only as we'll replace inline below

educ_dif_cols = ['dry_educ_primary_gender_dif',
 'dry_educ_middle_gender_dif',
 'dry_educ_high_gender_dif']

# create weights for just three columns

educ_dif_wts = [0.4,0.4,0.2]
educ_eq_dif_wts = [1/3, 1/3, 1/3]

In [91]:
# education access differences by levels

rds['dry_educ_gender_dif_idx'] = np.nansum((rds[educ_dif_cols] * educ_dif_wts),axis=1)
rds['msn_educ_gender_dif_idx'] = np.nansum((rds[[col.replace('dry','msn') for col in educ_dif_cols]] * educ_dif_wts),axis=1) # use the same dif_cols list, but with monsoon data instead
rds['winter_educ_gender_dif_idx'] = np.nansum((rds[[col.replace('dry','winter') for col in educ_dif_cols]] * educ_dif_wts),axis=1)

# education access differences weighting equally -- for comparison's sake

rds['dry_educ_eqwt_gender_dif_idx'] = np.nansum((rds[educ_dif_cols] * educ_eq_dif_wts),axis=1)
rds['msn_educ_eqwt_gender_dif_idx'] = np.nansum((rds[[col.replace('dry','msn') for col in educ_dif_cols]] * educ_eq_dif_wts),axis=1)
rds['winter_educ_eqwt_gender_dif_idx'] = np.nansum((rds[[col.replace('dry','winter') for col in educ_dif_cols]] * educ_eq_dif_wts),axis=1)


In [93]:
# Education w/ and w/out private schools, per gender

## BY INDEX VALUES
rds['educ_girls_alllevels_pubschool_idx'] = np.nansum(rds[[f'dry_education_girls',f'msn_education_girls',f'winter_education_girls']] * seasonal_wts_arr,axis=1)

rds['educ_boys_alllevels_pubschool_idx'] = np.nansum(rds[[f'dry_education_boys',f'msn_education_boys',f'winter_education_boys']] * seasonal_wts_arr,axis=1)

rds['educ_girls_alllevels_w_privschool_idx'] = np.nansum(rds[[f'dry_education_allgirls',f'msn_education_allgirls',f'winter_education_allgirls']] * seasonal_wts_arr,axis=1)

rds['educ_boys_alllevels_w_privschool_idx'] = np.nansum(rds[[f'dry_education_allboys',f'msn_education_allboys',f'winter_education_allboys']] * seasonal_wts_arr,axis=1)

## BY HOURS

rds['educ_girls_alllevels_pubschool_hrs'] = np.nansum(rds[[f'dry_education_girls_hrs',f'msn_education_girls_hrs',f'winter_education_girls_hrs']] * seasonal_wts_arr,axis=1)

rds['educ_boys_alllevels_pubschool_hrs'] = np.nansum(rds[[f'dry_education_boys_hrs',f'msn_education_boys_hrs',f'winter_education_boys_hrs']] * seasonal_wts_arr,axis=1)

rds['educ_girls_alllevels_w_privschool_hrs'] = np.nansum(rds[[f'dry_education_allgirls_hrs',f'msn_education_allgirls_hrs',f'winter_education_allgirls_hrs']] * seasonal_wts_arr,axis=1)

rds['educ_boys_alllevels_w_privschool_hrs'] = np.nansum(rds[[f'dry_education_allboys_hrs',f'msn_education_allboys_hrs',f'winter_education_allboys_hrs']] * seasonal_wts_arr,axis=1)

In [94]:
# Calculate educational difference indices, weighting by seasons.

## One index
rds['educ_gender_dif_idx'] = np.nansum(np.multiply(rds[['dry_educ_gender_dif_idx',\
                                                      'msn_educ_gender_dif_idx',\
                                                      'winter_educ_gender_dif_idx']],\
                                                  seasonal_wts_arr),axis=1)

rds['educ_gender_eqwt_dif_idx'] = np.nansum(np.multiply(rds[['dry_educ_eqwt_gender_dif_idx',\
                                                           'msn_educ_eqwt_gender_dif_idx',\
                                                           'winter_educ_eqwt_gender_dif_idx']],\
                                                  seasonal_wts_arr),axis=1)

## Differences in hours
rds['educ_primary_gender_dif_hrs'] = np.nansum(np.multiply(rds[['dry_educ_primary_gender_dif_hrs',\
                                                              'msn_educ_primary_gender_dif_hrs',\
                                                              'winter_educ_primary_gender_dif_hrs']],seasonal_wts_arr),axis=1)

rds['educ_middle_gender_dif_hrs'] = np.nansum(np.multiply(rds[['dry_educ_middle_gender_dif_hrs',\
                                                              'msn_educ_middle_gender_dif_hrs',\
                                                              'winter_educ_middle_gender_dif_hrs']],seasonal_wts_arr),axis=1)


rds['educ_high_gender_dif_hrs'] = np.nansum(np.multiply(rds[['dry_educ_high_gender_dif_hrs',\
                                                              'msn_educ_high_gender_dif_hrs',\
                                                              'winter_educ_high_gender_dif_hrs']],seasonal_wts_arr),axis=1)

# Overall difference indices for all levels, w/out and w/ private schools included

rds['educ_alllevels_dif'] = np.nansum(np.multiply(rds[[f'dry_educ_alllevels_gender_dif',\
                                                              f'msn_educ_alllevels_gender_dif',\
                                                              f'winter_educ_alllevels_gender_dif']],seasonal_wts_arr),axis=1)

rds['educ_alllevels_w_privschool_dif'] = np.nansum(np.multiply(rds[[f'dry_educ_alllevels_w_privschool_dif',\
                                                              f'msn_educ_alllevels_w_privschool_dif',\
                                                              f'winter_educ_alllevels_w_privschool_dif']],seasonal_wts_arr),axis=1)

#### Finalize

Check out the indices

In [95]:
rds[['educ_eqwt_idx','health_idx','markets_idx','admin_idx', 'educ_girls_idx']].tail(5)

Unnamed: 0,educ_eqwt_idx,health_idx,markets_idx,admin_idx,educ_girls_idx
74,0.004685,0.031413,0.002367,0.004398,2.1e-05
75,0.00777,0.019691,0.003443,0.00546,2e-06
76,0.02986,0.033087,0.091949,0.082477,1.3e-05
77,0.059742,0.162906,0.10929,0.225093,2.8e-05
78,0.052029,0.12347,0.107158,0.048916,8.3e-05


In [96]:
# normalize all values to enable comparability

rds['educ_altwt_idx'] = rds['educ_altwt_idx'] / np.max(rds['educ_altwt_idx'])
rds['educ_eqwt_idx'] = rds['educ_eqwt_idx'] / np.max(rds['educ_eqwt_idx'])
rds['health_idx'] = rds['health_idx'] / np.max(rds['health_idx'])
rds['markets_idx'] = rds['markets_idx'] / np.max(rds['markets_idx'])
rds['admin_idx'] = rds['admin_idx'] / np.max(rds['admin_idx'])


Finally, rank the admins by their overall inaccessibility index, weighting each of education, health, and markets equally (admin_idx is held out for separate analysis)

In [97]:
# weight education double relative to other sectors
educ_priority_wts = [0.5,0.25,0.25]

# identify rank according to equal weighting schemes

rds['overall_eq_wt_rank'] = st.rank_by_weight(rds,['educ_eqwt_idx','health_idx','markets_idx'],eq_wts)
rds['overall_eq_wt_idx'] = np.nansum(rds[['educ_eqwt_idx','health_idx','markets_idx']] * eq_wts, axis=1)

rds['overall_educ_altwt_rank'] = st.rank_by_weight(rds,['educ_altwt_idx','health_idx','markets_idx'],eq_wts)
rds['overall_educ_altwt_idx'] = np.nansum(rds[['educ_altwt_idx','health_idx','markets_idx']] * eq_wts, axis=1)

rds['overall_educ_priority_wt_rank'] = st.rank_by_weight(rds,['educ_altwt_idx','health_idx','markets_idx'],educ_priority_wts)
rds['overall_educ_priority_wt_idx'] = np.nansum(rds[['educ_altwt_idx','health_idx','markets_idx']] * educ_priority_wts, axis=1)


In [98]:
rds[['educ_eqwt_idx','educ_altwt_idx','health_idx','markets_idx']].tail(5)

Unnamed: 0,educ_eqwt_idx,educ_altwt_idx,health_idx,markets_idx
74,0.009523,0.009822,0.048196,0.002389
75,0.015794,0.014603,0.030211,0.003475
76,0.060695,0.077588,0.050764,0.092798
77,0.121433,0.127743,0.249939,0.1103
78,0.105756,0.110564,0.189433,0.108148


#### Find top N% for overall and each index

In [99]:
def col_pctile(df,col):
    
    col_pctile = df[col].rank(method='max').apply(lambda x: 100.0 * (x-1) / (df[col].size -1))
    
    return col_pctile

In [100]:
rds['educ_ewqt_pctile'] = col_pctile(rds,'educ_eqwt_idx')
rds['educ_altwt_pctile'] = col_pctile(rds,'educ_altwt_idx')

rds['health_pctile'] = col_pctile(rds,'health_idx')
rds['markets_pctile'] = col_pctile(rds,'markets_idx')
rds['admin_pctile'] = col_pctile(rds,'admin_idx')

rds['overall_pctile'] = col_pctile(rds,'overall_eq_wt_idx')
rds['overall_educ_priority_pctile'] = col_pctile(rds,'overall_educ_priority_wt_idx')

In [101]:
pctile_cols = ['educ_altwt_pctile','health_pctile','markets_pctile','admin_pctile']

In [102]:
rds[pctile_cols]

Unnamed: 0,educ_altwt_pctile,health_pctile,markets_pctile,admin_pctile
0,38.461538,16.666667,50.000000,55.128205
1,60.256410,79.487179,87.179487,96.153846
2,55.128205,84.615385,74.358974,66.666667
3,46.153846,30.769231,48.717949,70.512821
4,10.256410,10.256410,71.794872,50.000000
...,...,...,...,...
74,24.358974,44.871795,5.128205,5.128205
75,32.051282,35.897436,8.974359,7.692308
76,58.974359,47.435897,56.410256,43.589744
77,74.358974,87.179487,70.512821,67.948718


In [103]:
pctiles_20pct = np.digitize(rds[pctile_cols], bins = [0,80,100])
pctiles_20pct_overall = np.nansum(np.where(pctiles_20pct == 2, 1, 0),axis=1) / pctiles_20pct.shape[1]

In [104]:
rds['deprivation_20pct'] = pctiles_20pct_overall

In [105]:
rds[['ADM3_EN','overall_educ_priority_wt_idx','overall_educ_priority_wt_rank','deprivation_20pct']].sort_values('deprivation_20pct')

Unnamed: 0,ADM3_EN,overall_educ_priority_wt_idx,overall_educ_priority_wt_rank,deprivation_20pct
0,Mastuj,0.031273,54,0.00
34,Karak,0.025378,61,0.00
35,Karak,0.022632,63,0.00
36,Takht E Nasrati,0.009935,72,0.00
37,Karak,0.016835,66,0.00
...,...,...,...,...
47,Lakki Marwat,0.169387,17,0.75
23,Karak,0.287400,10,0.75
62,Dir,0.404395,3,1.00
7,Chitral,0.350412,6,1.00


#### Append focus district yes/no info

In [106]:
non_focus_adm2_lst = ['PK201','PK204','PK205','PK241','PK243','PK209','PK211','PK215','PK216','PK217','PK218','PK219','PK221','PK222']

rds['Adm2_Focus'] = 'Yes'
rds.loc[rds['Adm2_Code'].isin(non_focus_adm2_lst),'Adm2_Focus'] = 'No'

In [107]:
rds[['Adm2_Code','Adm3_Code','Adm2_Focus']]

Unnamed: 0,Adm2_Code,Adm3_Code,Adm2_Focus
0,PK206,PK20602,Yes
1,PK206,PK20602,Yes
2,PK206,PK20602,Yes
3,PK206,PK20602,Yes
4,PK206,PK20602,Yes
...,...,...,...
74,PK234,PK23403,Yes
75,PK234,PK23403,Yes
76,PK236,PK23603,Yes
77,PK236,PK23601,Yes


In [108]:
rds['Adm2_Focus'].unique()

array(['Yes', 'No'], dtype=object)

### Export

Export tabular data for charting, sensitivity analysis, etc.

In [109]:
rds.drop('geometry',axis=1).sort_values('overall_eq_wt_rank').to_csv(os.path.join(data_dir,tab_dir,f"final//rds_idx_access_stats_{today}.csv"),index=False)

Export shapefile for use elsewhere

In [728]:
# rds_geo = pd.merge(rds[['ADM2_EN','rds_EN','Adm2_Code','rds_Code',
#        'Elevation_category','dry_educ_idx', 'msn_educ_idx',
#        'winter_educ_idx', 'dry_educ_gender_dif', 'msn_educ_gender_dif',
#        'winter_educ_gender_dif', 'dry_educ_allgender_dif',
#        'msn_educ_allgender_dif', 'winter_educ_allgender_dif',
#        'dry_health_idx', 'msn_health_idx', 'winter_health_idx',
#        'dry_markets_idx', 'msn_markets_idx', 'winter_markets_idx',
#        'dry_admin_idx', 'msn_admin_idx', 'winter_admin_idx',
#        'educ_eqwt_idx','educ_altwt_idx', 'health_idx', 'markets_idx', 'admin_idx',
#        'overall_eq_wt_rank', 'overall_eq_wt_idx', 'overall_educ_altwt_rank','educ_altwt_val']], rds_geo[['rds_Code','geometry']],on='rds_Code').sort_values('overall_eq_wt_rank')

rds.Elevation_category = rds.Elevation_category.astype(str)
rds.seasonal_wts = rds.seasonal_wts.astype(str)

In [729]:
rds.set_geometry('geometry').to_file(os.path.join(data_dir,rds_dir,f"vector/rds_idx_access_stats_{today}.gpkg"),driver="GPKG")