# Import packages

In [1]:
import os
import time 
import numpy as np
import pandas as pd
import geopandas as gp

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
import xarray as xr
import rioxarray
from rasterio.enums import Resampling

In [4]:
# Time the script
start_time = time.perf_counter ()

# Set up directories and file paths

In [5]:
path_data_in = '../2 Raw Data'
path_data_out = '../3 Output Data'

## Input data

In [6]:
path_pop_count = os.path.join(path_data_in, 
                              'Population', 
                              'Count',
                              'gha_ppp_2015_1km_Aggregated.tiff')
dir_pop_agesex= os.path.join(path_data_in, 
                             'Population', 
                             'AgeSex')
path_gdp=os.path.join(path_data_in,
                      'GDP_HDI',
                      'GDP_PPP_30arcsec_v3.nc')
path_nl = os.path.join(path_data_in,
                       'Nighttime_Lights',
                       'gha_viirs_100m_2015.tiff')
# path_birth=os.path.join(path_data_in,
#                         'Birth',
#                         'Ghana_1km_births',
#                         'GHA_births_pp_v2_2015.tif')
# path_death = os.path.join(path_data_in,
#                           'Death',
#                           'Egypt_death_data_by_age_and_sex_2019.csv') # using Egypt death counts data as a place holder 
#                                                                       # because we don't have Ghana data yet

## Output data

In [7]:
path_healthfacility_public = os.path.join(path_data_out,"public_health_facilities.csv")
path_healthfacility_private = os.path.join(path_data_out,"private_health_facilities.csv")
path_agesex = os.path.join(path_data_out,"age_sex_distribution_1km_2019.csv")
#bank_csv_path = os.path.join(data_in_path,"Egypt_Banks_and_ Atms.csv")

# User-defined functions

In [8]:
def find_grid_index(x,y,longs,lats,x_min=None,x_max=None,y_min=None,y_max=None):
    """
    DESCRIPTION: This function helps to find the index of the longitudes and 
    latitudes in the given longitude/latitude lists that is closest to the 
    given point (x,y) 
    
    INPUT -
    x: latitude of the given point
    y: longtidue of the given point
    longs: reference list of longitudes
    lats: reference list of latitudes
    x_min, x_max: min/max boundaries of longitude
    y_min, y_max: min/max boundaries of latitude
    
    OUTPUT -
    Index of the closest longitude/latitude to point (x,y)
    """
    if x_min is None and x_max is None:
        x_l = longs[0]- (longs[1]-longs[0])
        x_r = longs[-1] + (longs[-1]-longs[-2])
        x_min = min(x_l, x_r)
        x_max = max(x_l, x_r)
    if x < x_min or x > x_max:
        return -1,-1
    
    if y_min is None and y_max is None:
        y_l = lats[0]- (lats[1]-lats[0])
        y_r = lats[-1] + (lats[-1]-lats[-2])
        y_min = min(y_l, y_r)
        y_max = max(y_l, y_r)
    if y < y_min or y > y_max:
        return -1,-1
    
    ncol = np.argmin(abs(longs-x)) 
    nrow = np.argmin(abs(lats-y))
    if nrow<0 or nrow>=(len(lats)-1) or ncol<0 or ncol>=(len(longs)-1):
        return -1, -1
    else:
        return nrow,ncol

In [9]:
def points_in_grid_new(points_df,ref_data,value_colname=None):
    """
    DESCRIPTION: This function finds how many points (and how many non nan points and their sum  
    if a value_colname is provided) lying in the grids centered at the coordinates of the reference data
    
    INPUT -
    points_df: pandas dataframe that needs to be count 
    ref_data: xarray DataArray with reference coordinates
    value_colname: column name for the values in the dataframe
    
    OUTPUT -
    count: How many points are in each grid of the reference coordinate
    non_nan_count: How many non nan points are in each grid of the reference coordinate
    value_sum: The sum of the values of the non nan points within each grid of the reference coordinate
    """
    long_list = ref_data.coords['x'].values
    lat_list = ref_data.coords['y'].values
    long_list = np.append(long_list, long_list[-1]+ref_data.rio.resolution()[0])
    lat_list = np.append(lat_list, lat_list[-1]+ref_data.rio.resolution()[1])
    count = np.zeros(ref_data.values.shape)
    non_nan_count = np.zeros(ref_data.values.shape)
    if value_colname is not None:
        value_sum = np.empty(ref_data.values.shape)
        value_sum[:] = np.nan
        values = points_df[value_colname]
    else:
        value_sum = None
    grid_index = points_df.apply(lambda row: find_grid_index(row['Long'],row['Lat'],long_list,lat_list), axis=1)
    for j, x in enumerate(grid_index):
        if x[0]!=-1 and x[1]!=-1:
            count[x[0],x[1]] += 1
            if value_colname is not None and ~np.isnan(values[j]):
                non_nan_count[x[0],x[1]] = non_nan_count[x[0],x[1]]+1 
                value_sum[x[0],x[1]] = np.nansum([value_sum[x[0],x[1]],values[j]])
    return count, non_nan_count, value_sum

In [10]:
def spatial_aggregate(count_in,value_sum_in,block_size,agg_mode='sum', non_nan_count_in=None):
    """
    DESCRIPTION: This function apply square sum aggregation with stride 1 for 2D numpy array
    with the same padding
    
    INPUT -
    count_in: 2D numpy array for the count at each grid
    value_sum_in: 2D numpy array for the sum of values at each grid
    block_size: the size of the square sum aggregation
    agg_mode: aggregation operation mode "sum" or "mean"
    non_nan_count_in: 2D array for the non nan count at each grid, only needed for "mean" mode
    
    
    OUTPUT - 
    a new 2D numpy array with square sum aggreation in the same size
    """
    if block_size%2 != 1:
        raise Exception('block_size must be an odd integer!')
    half = block_size//2

    new_count_in = np.zeros(count_in.shape)
    count_in = np.pad(count_in,half)

    return_value = np.zeros(value_sum_in.shape)
    value_sum_in = np.pad(value_sum_in,half)
    
    if agg_mode == 'mean':
        new_nonnan_count_in = np.zeros(non_nan_count_in.shape)
        non_nan_count_in = np.pad(non_nan_count_in,half)
    elif agg_mode == 'sum':
        new_nonnan_count_in = None
    else:
        raise Exception('Error: The function only supports agg_mode for sum or mean.')
    for nrow in range(half,count_in.shape[0]-half):
        for ncol in range(half,count_in.shape[1]-half):
            new_count_in[nrow-half,ncol-half] = np.sum(count_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
            if agg_mode == 'mean':
                new_nonnan_count_in[nrow-half,ncol-half] = np.sum(non_nan_count_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
            return_value[nrow-half,ncol-half] = np.nansum(value_sum_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
    if agg_mode == 'mean':
        new_nonnan_count_in = np.where(new_nonnan_count_in == 0, np.nan, new_nonnan_count_in)
        return_value = return_value/new_nonnan_count_in
        return new_count_in, return_value, new_nonnan_count_in
    else:
        return new_count_in, return_value, new_nonnan_count_in

In [11]:
def layer_aggregate_helper(count_in,value_sum_in,block_size,non_nan_count_in):
    """
    DESCRIPTION: This function apply square sum aggregation with stride 1 for 2D numpy array
    with the same padding
    
    INPUT -
    count_in: 2D numpy array for the count at each grid
    value_sum_in: 2D numpy array for the sum of values at each grid
    block_size: the size of the square sum aggregation
    non_nan_count_in: 2D array for the non nan count at each grid
    
    
    OUTPUT - 
    a new 2D numpy array with square sum aggreation in the same size
    """
    if block_size%2 != 1:
        raise Exception('block_size must be an odd integer!')
    half = block_size//2

    new_count_in = np.zeros(count_in.shape)
    count_in = np.pad(count_in,half)

    return_value = np.zeros(value_sum_in.shape)
    value_sum_in = np.pad(value_sum_in,half)
    
    new_nonnan_count_in = np.zeros(non_nan_count_in.shape)
    non_nan_count_in = np.pad(non_nan_count_in,half)
  
    for nrow in range(half,count_in.shape[0]-half):
        for ncol in range(half,count_in.shape[1]-half):
            new_count_in[nrow-half,ncol-half] = np.sum(count_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
            new_nonnan_count_in[nrow-half,ncol-half] = np.sum(non_nan_count_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
            return_value[nrow-half,ncol-half] = np.nansum(value_sum_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
    return_value = np.where(new_nonnan_count_in == 0, np.nan, return_value)
    return new_count_in, return_value, new_nonnan_count_in

In [12]:
def layer_aggregate(value1,count1,value2,count2,mode = 'sum'):
    value1 = np.array(value1)
    value2 = np.array(value2)
    count1 = np.array(count1)
    count2 = np.array(count2)
    value2[np.where(count1==count2)]=value1[np.where(count1==count2)]
    value1 = np.where(np.isnan(value1), 0, value1)
    value2 = np.where(np.isnan(value2), 0, value2)
    value_diff = value2-value1
    count_diff = count2-count1
    if mode == 'sum':
        return_value = value_diff
        return_value = np.where(count_diff==0,np.nan,return_value)
    elif mode == 'mean':
        count_diff = np.where(count_diff == 0, np.nan, count_diff)
        return_value = value_diff/count_diff
    else:
        raise Exception('Only support sum or mean for mode')
    return return_value

In [13]:
def square_helper(arr, y, x, mode):
    """
    DESCRIPTION: This is a helper funtion to do mean or sum in square for 2D numpy array
    
    INPUT - 
    arr: 2D numpy array to be averaged
    (y,x): output dimension
    mode: sum or mean, NAs are ignored 

    OUTPUT - 
    Averaged 2D array
    """
    yy, xx = arr.shape
    if mode == "sum":
        vals = np.nansum(arr.reshape(y, yy//y, x, xx//x),(1,3))
    elif mode == "mean":
        vals = np.nanmean(arr.reshape(y, yy//y, x, xx//x),(1,3))
    else:
        raise Exception("Mode is not suppported, please input `sum' or `mean'")
    return vals

In [14]:
def xarray_square_aggregate(df_raw,stride=10,var_name="band_data",mode="mean"):
    """
    DESCRITPION: This is a function to do square aggregation (mean or sum) for a
    xarray dataArray, generating a new xarray dataArray
    
    INPUT -
    df_raw: 2D xarray DataArray
    stride: Size of the non-overlap square aggregation
    var_name: new DataArray variable name
    mode: square aggregation function
    
    OUTPU -
    A new 2D xarray DataArray with non-overlap square aggregation
    """
    nrow, ncol = df_raw.values.squeeze().shape
    new_nrow,new_ncol = int(nrow/stride), int(ncol/stride)
    data = df_raw.values.squeeze()[0:(new_nrow*stride),0:(new_ncol*stride)]
    new_data = square_helper(data, new_nrow, new_ncol, mode)
    new_x = np.mean(df_raw.coords['x'].values[0:(new_ncol*stride)].reshape(-1,stride),axis=1)
    new_y = np.mean(df_raw.coords['y'].values[0:(new_nrow*stride)].reshape(-1,stride),axis=1)
    # create new datarray
    df = xr.DataArray(
        data=new_data,
        dims=["y", "x"],
        coords=dict(
            x= new_x,
            y= new_y,
        ),
    )
    df.name = var_name
    if df_raw.rio.crs is not None:
        df = df.rio.write_crs(df_raw.rio.crs)
    return df

In [15]:
def grid_reproject(df,ref_data,resample_method = Resampling.nearest):
    """
    DESCRIPTION: This is a function to reproject a xarray dataArray to the coordinates
    of another refrence xarray dataArray.
    
    INPUT - 
    df: The xarray DataArray needing to be reprojected to the new grids
    ref_data: The reference xarray DataArray with reference coordinates (the new grids)
    resample_method: resample method
    
    OUTPU -
    A new xarray DataArray reprojected to the coordinates of ref_data 
    """
    df = df.rio.reproject_match(ref_data, resampling = Resampling.nearest, nodata= np.nan)
    df = df.assign_coords({
        "x": ref_data.x,
        "y": ref_data.y,
    })
    return df

In [16]:
def points_in_grid(points_df,ref_data):
    """
    DESCRIPTION: This function finds how many points are in the grids centered 
    at the coordinates of the reference data
    
    INPUT -
    points_df: xarray DataArray that needs to be count 
    ref_data: xarray DataArray with reference coordinates
    
    OUTPUT -
    How many points are in each grid of the reference coordinates
    """
    long_list = ref_data.coords['x'].values
    lat_list = ref_data.coords['y'].values
    long_list = np.append(long_list, long_list[-1]+ref_data.rio.resolution()[0])
    lat_list = np.append(lat_list, lat_list[-1]+ref_data.rio.resolution()[1])
    count = np.zeros(ref_data.values.shape)
    grid_index = points_df.apply(lambda row: find_grid_index(row['Long'],row['Lat'],long_list,lat_list), axis=1)
    for x in grid_index:
        if x[0]!=-1 and x[1]!=-1:
            count[x[0],x[1]] += 1
    return count

In [17]:
def aggregate_count(count,block_size):
    """
    DESCRIPTION: This function apply square sum aggregation with sride 1 for 2D numpy array
    with same padding
    
    INPUT -
    count: 2D numpy array
    block_size: the size of the square sum aggregation
    
    OUTPUT - 
    a new 2D numpy array with square sum aggreation in the same size
    """
    if block_size%2 != 1:
        raise Exception('block_size must be an odd integer!')
    half = block_size//2
    new_count = np.zeros(count.shape)
    count = np.pad(count,half)
    for nrow in range(half,count.shape[0]-half):
        for ncol in range(half,count.shape[1]-half):
            new_count[nrow-half,ncol-half] = np.sum(count[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
    return new_count

In [18]:
def print_summary(data):
    """
    DESCRIPTION: This is a function to print the sumamry of a xarray DataArray or Dataset
    
    INPUT - 
    data: xarray DataArray or Dataset
    
    OUTPUT - 
    summary statistics of data
    """
    print(f"shape: {data.rio.shape}")
    print(f"resolution: {data.rio.resolution()}")
    print(f"coordinates boundary: {data.rio.bounds()}")
    print(f"CRS: {data.rio.crs}")

# Specify reference coordinates/grids

1. The spatial distribution of population in 2015 Ghana
2. gha_ppp_2015_1km_Aggregated.tif
3. https://www.worldpop.org/geodata/summary?id=31435

Here we use the grids of Ghana population data as the reference data.
All other data will be reprojected to this reference grids to make all data consistent in space.

## Population

### Read in population data

In [19]:
pop_raw = rioxarray.open_rasterio(path_pop_count)

### Take a look at the data

In [20]:
pop_raw

## Clean variables and dimensions

### Select band index at 0 

In [21]:
pop = pop_raw.isel(band=0)

### Drop extra coordinates dimensions beyond latitude and longtitude


In [22]:
pop = pop.reset_coords(names=['band'],drop=True)
pop.name = 'population'

### Make population DataArray as the reference grids

In [23]:
ref_da = pop.copy()
ref_ds = ref_da.to_dataset()
ref_da

# Data Preprocessing

## Process healthcare facilities data from Healthsites.io into csv

1. Health facility data with OpenStreetMap
2. Ghana-node.shp
3. https://healthsites.io/map?country=Ghana

### Identifying files

In [24]:
file_shp=os.path.join(path_data_in, 'Facilities', "Ghana-node.shp")
file_csv=os.path.join(path_data_in, 'Facilities', "Ghana-node.csv")
file_out=os.path.join(path_data_out,"Ghana_healthsites_io.csv")

### Reading in shps and saving as CSV

In [25]:
file=gp.read_file(file_shp)
file.to_csv(file_csv)

### Clean CSV file

In [26]:
df_healthsites_io=pd.read_csv(file_csv)

#### Remove redundent characters in the geometry variable

In [27]:
df_healthsites_io['geometry']=df_healthsites_io['geometry'].str.replace('POINT \(','', regex=True)
df_healthsites_io['geometry']=df_healthsites_io['geometry'].str.replace('\)','', regex=True)

#### Create Longtitude and Latitude variables

In [28]:
df_healthsites_io_latlong=df_healthsites_io["geometry"].str.split(" ", expand=True)
df_healthsites_io_latlong=df_healthsites_io_latlong.rename(columns={0: "Long", 
                                                                    1: "Lat"})

#### Destring the Lat and Long variables

In [29]:
df_healthsites_io_latlong["Lat"]=pd.to_numeric(df_healthsites_io_latlong["Lat"])
df_healthsites_io_latlong["Long"]=pd.to_numeric(df_healthsites_io_latlong["Long"])

### Create final health facilities dataframe

In [30]:
df_healthsites_io = pd.concat([df_healthsites_io['name'],
                               df_healthsites_io['osm_id'],
                               df_healthsites_io['amenity'],
                               df_healthsites_io['operator_ty'],
                               df_healthsites_io_latlong], 
                              axis=1, 
                              join='inner')

#### Rename variables

In [31]:
df_healthsites_io=df_healthsites_io.rename(columns={"name": "Facility name", 
                                                    "amenity": "Amenity",
                                                    "operator_ty": "Ownership"})

In [32]:
df_healthsites_io

Unnamed: 0,Facility name,osm_id,Amenity,Ownership,Long,Lat
0,Pro-Life Pharmacy,1590950161,pharmacy,,-0.119871,5.601035
1,Beaver Clinic,1700719794,dentist,private,-0.178916,5.605701
2,Bethel Dental Clinic,1700719799,dentist,,-0.192542,5.616803
3,Adabraka clinic,1728032224,clinic,private,-0.209600,5.566273
4,Iran Clinic,1728032238,doctors,private,-0.212442,5.568037
...,...,...,...,...,...,...
1160,Dohamm Clinic Ltd,1378720189,clinic,,-1.780210,4.930476
1161,Eye Link Clinic,1385353948,doctors,,-0.222045,5.612410
1162,Odorna Medical Center,1405374124,clinic,,-0.215609,5.559084
1163,West African Rescue Association Head Office - ...,1417019958,doctors,,-0.174880,5.565374


### Two-way table for Facility type and Amenity of Ghana

In [33]:
df_healthsites_io["Ownership"].fillna("NA",inplace=True)
df_healthsites_io["Amenity"].fillna("NA",inplace=True)
pd.crosstab(index=df_healthsites_io["Ownership"], 
            columns=df_healthsites_io["Amenity"], 
            margins=True)

Amenity,NA,clinic,dentist,doctors,hospital,pharmacy,All
Ownership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,17,237,16,47,45,791,1153
community,0,2,0,0,0,0,2
government,0,1,0,0,0,0,1
private,0,2,2,2,0,2,8
public,0,1,0,0,0,0,1
All,17,243,18,49,45,793,1165


In [34]:
df_healthsites_io["Amenity"].value_counts()

pharmacy    793
clinic      243
doctors      49
hospital     45
dentist      18
NA           17
Name: Amenity, dtype: int64

Note: There're many missing values for both Amenity and Facility type. The data is unreliable for classifying public and private healthcare facilities

In [35]:
df_healthsites_io.to_csv(file_out)

## Run this part for Ghana:

## Process healthcare facilities data from WHO into csv

1. Health Facilities Sub-Saharan Africa	
2. who-cds-gmp-2019-01-eng.xlsx	
3. https://data.humdata.org/dataset/health-facilities-in-sub-saharan-africa

Abbreviation table for Ownership values

|Abbreviation| Meaning|
| ----- | ----------- |
|CBO	|Community Based Organization|
|FBO	|Faith Based Organization|
|Govt.	|Government|
|MoH	|Ministry of Health|
|MoHCDGEC	|Ministry of Health, Community Development, Gender, Elderly and Children|
|MoHL	|Ministry of Health and Labour|
|MoHQL	|Ministry of Health and Quality of Life |
|MoHSS	|Ministry of Health and Social Services|
|NGO	|Non Governmental Organization|
|ONG/non-lucratif	|Non Governmental Organization|
|SDA	|Seventh Day Adventist|

### Import WHO health facilities data

In [36]:
# set up the path
who_shp=os.path.join(path_data_in, 'Facilities' ,'suhsharan_health_facilities', 'sub-saharan_health_facilities.shp')
who_csv1=os.path.join(path_data_in, 'Facilities', 'suhsharan_health_facilities', 'sub-saharan_health_facilities.csv')
who_csv2 = os.path.join(path_data_out, 'Ghana_WHO_health_facilities.csv')

In [37]:
df=gp.read_file(who_shp)
df.to_csv(who_csv1)
df_healthsites_WHO = pd.read_csv(who_csv1)
df_healthsites_WHO = df_healthsites_WHO.drop(["Unnamed: 0","geometry"], axis=1)
df_healthsites_WHO = df_healthsites_WHO.rename(columns={"Facility n": "Facility name", "Facility t": "Facility type"})

#### WHO health facilities data overview

In [38]:
df_healthsites_WHO

Unnamed: 0,Country,Admin1,Facility name,Facility type,Ownership,Lat,Long,LL source
0,Angola,Bengo,Hospital Barra Do Dande,Hospital,Govt.,-8.6560,13.4919,Google Earth
1,Angola,Bengo,Hospital Dos Dembos,Hospital,Govt.,-8.5026,14.5862,Google Earth
2,Angola,Bengo,Hospital Municipal de Ambriz,Municipal Hospital,Govt.,-7.8522,13.1307,Google Earth
3,Angola,Bengo,Hospital Municipal de Bula Atumba,Municipal Hospital,Govt.,-8.6742,14.7925,Google Earth
4,Angola,Bengo,Hospital Municipal de Dande,Municipal Hospital,Govt.,-8.5835,13.6569,Google Earth
...,...,...,...,...,...,...,...,...
98740,Zimbabwe,Midlands,Zhombe Rural Hospital,Rural Hospital,FBO,-18.7034,29.3849,GPS
98741,Zimbabwe,Midlands,Zvamabande Rural Health Clinic,Rural Health Clinic,MoH,-19.8018,30.1359,GPS
98742,Zimbabwe,Midlands,Zvarota Rural Health Clinic,Rural Health Clinic,MoH,-19.5565,30.2212,GPS
98743,Zimbabwe,Midlands,Zvishavane District Hospital,District Hospital,MoH,-20.3058,30.0524,GPS


In [39]:
print('Table: Included countries in WHO health facilities data')
pd.crosstab(index=df_healthsites_WHO['Country'], 
            columns='count', 
            margins=True)

Table: Included countries in WHO health facilities data


col_0,count,All
Country,Unnamed: 1_level_1,Unnamed: 2_level_1
Angola,1575,1575
Benin,819,819
Botswana,624,624
Burkina Faso,1721,1721
Burundi,665,665
Cameroon,3061,3061
Cape Verde,66,66
Central African Republic,555,555
Chad,1283,1283
Comoros,66,66


#### Keep only requried variables for WHO data

In [40]:
df_healthsites_WHO = pd.concat([df_healthsites_WHO['Country'],
                                df_healthsites_WHO['Facility name'],
                                df_healthsites_WHO['Facility type'],
                                df_healthsites_WHO['Ownership'],
                                df_healthsites_WHO['Lat'],
                                df_healthsites_WHO['Long']], 
                               axis=1, 
                               join='inner')

### Filter data to Ghana

In [41]:
df_healthsites_WHO_Ghana=df_healthsites_WHO[df_healthsites_WHO['Country']=="Ghana"]

In [42]:
print('Table: Overview of Ghana health facilities data from WHO')
df_healthsites_WHO_Ghana

Table: Overview of Ghana health facilities data from WHO


Unnamed: 0,Country,Facility name,Facility type,Ownership,Lat,Long
33518,Ghana,A.M.E Zion Clinic,Clinic,FBO,7.4080,-1.9632
33519,Ghana,Aboabo Health Centre,Health Centre,MoH,6.2239,-1.3498
33520,Ghana,Aboabogya Health Centre,Health Centre,MoH,6.8418,-1.6110
33521,Ghana,Aboaboso Community-based Health Planning and S...,Community-based Health Planning and Services,MoH,6.1746,-1.9304
33522,Ghana,Aboaso Health Centre,Health Centre,MoH,6.8418,-1.6110
...,...,...,...,...,...,...
35473,Ghana,Wiawso Government Hospital,Hospital,MoH,6.2159,-2.4851
35474,Ghana,Wuratrem Community-based Health Planning and S...,Community-based Health Planning and Services,MoH,,
35475,Ghana,Yamfo Community-based Health Planning and Serv...,Community-based Health Planning and Services,MoH,5.9853,-2.7843
35476,Ghana,Yawkrom Community-based Health Planning and Se...,Community-based Health Planning and Services,MoH,6.2264,-2.6842


In [43]:
print('Table: Facility type and Ownership in Ghana - WHO data')
pd.crosstab(index=df_healthsites_WHO_Ghana['Facility type'], 
            columns=df_healthsites_WHO_Ghana['Ownership'], 
            margins=True)

Table: Facility type and Ownership in Ghana - WHO data


Ownership,FBO,MoH,All
Facility type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Clinic,126,272,398
Community-based Health Planning and Services,1,631,632
District Hospital,8,73,81
General Hospital,0,3,3
Health Centre,45,695,740
Hospital,44,33,77
Municipal Hospital,1,5,6
Polyclinic,1,11,12
Regional Hospital,0,8,8
Teaching Hospital,0,3,3


In [44]:
# save the data
df_healthsites_WHO_Ghana.to_csv(who_csv2)

We can see that Ghana has only two types of distinct ownership, Faith Based Organization (FBO) and Ministry of Health (MoH), which can be classified clearly as Private for the former and Public for the latter

#### **Unclear: public and private classication**

#### Reclassifile ownership for healthcare facilities at Ghana - WHO data

In [41]:
#Private facilities
df_healthsites_WHO_Ghana['Ownership'].replace(
    to_replace=['FBO'],
    value='Private',
    inplace=True
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_healthsites_WHO_Ghana['Ownership'].replace(


In [42]:
#Public facilities
df_healthsites_WHO_Ghana['Ownership'].replace(
    to_replace=['MoH'],
    value='Public',
    inplace=True
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_healthsites_WHO_Ghana['Ownership'].replace(


#### Separate public and private facilites for Ghana

##### Public health facilites

In [43]:
healthfacility_public = df_healthsites_WHO_Ghana[df_healthsites_WHO_Ghana['Ownership']=='Public'].reset_index(drop=True)
healthfacility_public = healthfacility_public.dropna().copy()
healthfacility_public.reset_index(drop=True)
#Export to CSV
healthfacility_public.to_csv(path_healthfacility_public)

##### Private health facilites

In [44]:
healthfacility_private = df_healthsites_WHO_Ghana[df_healthsites_WHO_Ghana['Ownership']=='Private'].reset_index(drop=True)
healthfacility_private = healthfacility_private.dropna().copy()
healthfacility_private.reset_index(drop=True)
#Export to CSV
healthfacility_private.to_csv(path_healthfacility_private)

##### Overview of Public and Private health facilities

In [45]:
print("Number of public facility: %s, Number of private facility: %s" % 
      (len(healthfacility_public),
       len(healthfacility_private)))

print('Table: Public facility')
display(healthfacility_public)

print('Table: Private facility')
display(healthfacility_private)

Number of public facility: 1658, Number of private facility: 220
Table: Public facility


Unnamed: 0,Country,Facility name,Facility type,Ownership,Lat,Long
0,Ghana,Aboabo Health Centre,Health Centre,Public,6.223930,-1.349820
1,Ghana,Aboabogya Health Centre,Health Centre,Public,6.841820,-1.611030
2,Ghana,Aboaboso Community-based Health Planning and S...,Community-based Health Planning and Services,Public,6.174609,-1.930386
3,Ghana,Aboaso Health Centre,Health Centre,Public,6.841770,-1.610980
4,Ghana,Aboffour Health Centre,Health Centre,Public,7.129860,-1.732940
...,...,...,...,...,...,...
1728,Ghana,Wassa Saa Health Centre,Health Centre,Public,5.919240,-2.015330
1729,Ghana,Wiawso Government Hospital,Hospital,Public,6.215862,-2.485146
1731,Ghana,Yamfo Community-based Health Planning and Serv...,Community-based Health Planning and Services,Public,5.985319,-2.784344
1732,Ghana,Yawkrom Community-based Health Planning and Se...,Community-based Health Planning and Services,Public,6.226362,-2.684242


Table: Private facility


Unnamed: 0,Country,Facility name,Facility type,Ownership,Lat,Long
0,Ghana,A.M.E Zion Clinic,Clinic,Private,7.408010,-1.963170
1,Ghana,Aburaso Methodist clinic,Clinic,Private,6.661330,-1.676620
2,Ghana,Agogo Presby Hospital,Hospital,Private,6.796180,-1.085290
3,Ghana,Akomaa Memorial Hospital,Hospital,Private,6.439660,-1.538370
4,Ghana,Anyinasu SDA Clinic,Clinic,Private,7.240034,-1.580578
...,...,...,...,...,...,...
221,Ghana,St. Johns Of God Hospital,Hospital,Private,6.394010,-2.657790
222,Ghana,St. Martin de Porres Hospital,Hospital,Private,4.965194,-2.470262
223,Ghana,St.Marks Anglican Clinic,Clinic,Private,6.277150,-2.290920
224,Ghana,St.Theresa Clinic,Clinic,Private,5.441350,-2.503210


## Process age and gender population data

1. Ghana 100m Age structures in 2020
2.
3. https://www.worldpop.org/geodata/summary?id=16839

We have one Geotiff file for each sex class and each age bin, here we aggregate the 36 Geotiff files into one clean csv file, which will be used to calculate death data at each grid later.

### Set sex bins and age bins 

In [46]:
sex_bins = ['Female','Male']
age_bins = ['Age <1 year', 'Age 1 to 4','Age 5 to 9',
            'Age 10 to 14','Age 15 to 19','Age 20 to 24',
            'Age 25 to 29','Age 30 to 34','Age 35 to 39',
            'Age 40 to 44','Age 45 to 49','Age 50 to 54',
            'Age 55 to 59','Age 60 to 64','Age 65 to 69',
            'Age 70 to 74','Age 75 to 79','Age 80 plus']
age_start_bins = [0,1] + list(np.arange(5,80+1,5))

In [47]:
%%time
age_sex_df = None
for sex in sex_bins:
    for file_no, age in enumerate(age_bins):
        file_name ="gha_"+sex[0].lower()+"_"+str(age_start_bins[file_no])+"_2020_constrained.tif"
        col_name = sex + ' ' + age
        age_sex_raw = xr.open_dataarray(os.path.join(dir_pop_agesex,file_name))
        age_sex = xarray_square_aggregate(age_sex_raw,stride=10,var_name=col_name, mode="sum")
        age_sex = grid_reproject(age_sex, pop)
        df = age_sex.to_dataframe()
        df = df.drop("spatial_ref",axis=1)
        age_sex_df = df if age_sex_df is None else age_sex_df.merge(df, left_index=True, right_index=True) 
        print(col_name)

Female Age <1 year
Female Age 1 to 4
Female Age 5 to 9
Female Age 10 to 14
Female Age 15 to 19
Female Age 20 to 24
Female Age 25 to 29
Female Age 30 to 34
Female Age 35 to 39
Female Age 40 to 44
Female Age 45 to 49
Female Age 50 to 54
Female Age 55 to 59
Female Age 60 to 64
Female Age 65 to 69
Female Age 70 to 74
Female Age 75 to 79
Female Age 80 plus
Male Age <1 year
Male Age 1 to 4
Male Age 5 to 9
Male Age 10 to 14
Male Age 15 to 19
Male Age 20 to 24
Male Age 25 to 29
Male Age 30 to 34
Male Age 35 to 39
Male Age 40 to 44
Male Age 45 to 49
Male Age 50 to 54
Male Age 55 to 59
Male Age 60 to 64
Male Age 65 to 69
Male Age 70 to 74
Male Age 75 to 79
Male Age 80 plus
CPU times: user 19.7 s, sys: 5 s, total: 24.7 s
Wall time: 25.1 s


### Rename and take a look at age and sex distribution

In [48]:
%%time
age_sex_df = age_sex_df.reset_index().rename({'x':'Long','y':'Lat'},
                                             axis=1)
age_sex_df

CPU times: user 10 ms, sys: 16.7 ms, total: 26.7 ms
Wall time: 25.6 ms


Unnamed: 0,Lat,Long,Female Age <1 year,Female Age 1 to 4,Female Age 5 to 9,Female Age 10 to 14,Female Age 15 to 19,Female Age 20 to 24,Female Age 25 to 29,Female Age 30 to 34,...,Male Age 35 to 39,Male Age 40 to 44,Male Age 45 to 49,Male Age 50 to 54,Male Age 55 to 59,Male Age 60 to 64,Male Age 65 to 69,Male Age 70 to 74,Male Age 75 to 79,Male Age 80 plus
0,11.170417,-3.247083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11.170417,-3.238750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11.170417,-3.230417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11.170417,-3.222083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11.170417,-3.213750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412004,4.737084,1.152917,,,,,,,,,...,,,,,,,,,,
412005,4.737084,1.161250,,,,,,,,,...,,,,,,,,,,
412006,4.737084,1.169583,,,,,,,,,...,,,,,,,,,,
412007,4.737084,1.177917,,,,,,,,,...,,,,,,,,,,


In [49]:
%%time
age_sex_df.to_csv(path_agesex)

CPU times: user 4.19 s, sys: 127 ms, total: 4.32 s
Wall time: 4.33 s


## Process GDP data

1. Gridded global datasets for Gross Domestic Product and Human Development Index over 1990-2015
2. GDP_PPP_30arcsec_v3.nc
3. https://datadryad.org/stash/dataset/doi:10.5061/dryad.dk1j0

### Read in raw gdp data 

In [50]:
gdp_raw = rioxarray.open_rasterio(path_gdp)

### Clean gdp variables and dimensions

In [51]:
# select year 2015 gdp 
gdp = gdp_raw.isel(time=2)
# drop extra coordinates dimensions beyond latitude and longtitude
gdp = gdp.reset_coords(names=['time'],drop=True)
# specify CRS
gdp = gdp.rio.write_crs("epsg:4326", inplace=True)

### Reproject gdp to the refrence coordinates

In [52]:
gdp = grid_reproject(gdp, ref_da)
gdp

## Process Night life lights data

1. VIIRS night-time lights (2012-2016), Ghana
2. gha_viirs_100m_2016.tif
3. https://www.worldpop.org/geodata/summary?id=18619

### Read in raw nightlight data 

In [53]:
nl_raw = rioxarray.open_rasterio(path_nl)

### Clean nightlight variables and dimensions

In [54]:
#Aggregate the 100m nightlight to 1km with square mean
nl = xarray_square_aggregate(nl_raw,stride=10,var_name="nightlight", mode="mean")

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


### Reproject the nightlight to reference coordinates

In [55]:
nl = grid_reproject(nl, ref_da)
nl

## Process Birth data

1. Ghana 1km births
2. GHA_births_pp_v2_2015.tif
3. https://www.worldpop.org/geodata/summary?id=772

### Read in the raw birth data

In [56]:
birth_raw = rioxarray.open_rasterio(path_birth)

### Clean birth vairables and dimensions

In [57]:
# Select band index at 0 
birth = birth_raw.isel(band=0)
# Drop extra coordinates dimensions beyond latitude and longtitude
birth = birth.reset_coords(names=['band'],drop=True)
birth.name = 'birth'

### Reproject the birth data to reference coordinates

In [58]:
birth = grid_reproject(birth, ref_da)
birth

## Merge GDP, population, nightlight, birth into one dataset

In [59]:
merge_ds = xr.merge([gdp, pop, nl, birth])
merge_ds

# Data Analysis

## Hospital facility counts within nearby grids

In [60]:
# count public/private facilities in each grid
public_count_1x1_grid = points_in_grid(healthfacility_public,pop)
private_count_1x1_grid = points_in_grid(healthfacility_private,pop)
# count public/private facilities in each 5x5 grids
public_count_5x5_grid=aggregate_count(public_count_1x1_grid,5)
private_count_5x5_grid=aggregate_count(private_count_1x1_grid,5)

In [61]:
# add the facilities counts to the merged dataset
merge_ds = merge_ds.assign(public_count_1x1_grid = (["y","x"],public_count_1x1_grid))
merge_ds = merge_ds.assign(public_count_5x5_grid = (["y","x"],public_count_5x5_grid))
merge_ds = merge_ds.assign(private_count_1x1_grid = (["y","x"],private_count_1x1_grid))
merge_ds = merge_ds.assign(private_count_5x5_grid = (["y","x"],private_count_5x5_grid))

In [62]:
merge_ds

In [63]:
#%%time
#merge_ds.to_netcdf(os.path.join(path_data_out,"intermediate_data_v0_2021_12_14.nc"))

In [64]:
# covert from geospatial xarray dataset to pandas dataframe
merge_df = merge_ds.to_dataframe().reset_index()
# assign the flag variable of whether the 1x1 grid (or 5x5 grid) has public/private facilities or not
for facility_type in ['public','private']:
    for grid_size in [1,5]:
        merge_df[f'{facility_type}_flag_{grid_size}x{grid_size}_grid'] = \
        merge_df[f'{facility_type}_count_{grid_size}x{grid_size}_grid'] > 0 

In [65]:
# take a look at the number of locations having public/private facilities
merge_df.loc[:,['public_flag_1x1_grid',
                'public_flag_5x5_grid',
                'private_flag_1x1_grid',
                'private_flag_5x5_grid']].sum(axis=0)

public_flag_1x1_grid      1607
public_flag_5x5_grid     34780
private_flag_1x1_grid      219
private_flag_5x5_grid     5052
dtype: int64

## GDP rating

In [66]:
# assign gdp rating 
gdp_values = merge_df['GDP_PPP']
gdp_cutoff = np.nanquantile(gdp_values,[0.7,0.95])
gdp_levels = np.array(['nan']*len(gdp_values))
gdp_levels[~np.isnan(gdp_values)] = 'C'
gdp_levels[gdp_values > gdp_cutoff[0]] = 'B'
gdp_levels[gdp_values > gdp_cutoff[1]] = 'A'
merge_df['gdp_rating'] = gdp_levels
merge_df['gdp_rating'].value_counts()

nan    347509
C       45150
B       16125
A        3225
Name: gdp_rating, dtype: int64

In [67]:
merge_df

Unnamed: 0,x,y,spatial_ref,GDP_PPP,population,nightlight,birth,public_count_1x1_grid,public_count_5x5_grid,private_count_1x1_grid,private_count_5x5_grid,public_flag_1x1_grid,public_flag_5x5_grid,private_flag_1x1_grid,private_flag_5x5_grid,gdp_rating
0,-3.247083,11.170417,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,
1,-3.247083,11.162083,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,
2,-3.247083,11.153750,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,
3,-3.247083,11.145417,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,
4,-3.247083,11.137083,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412004,1.186250,4.770417,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,
412005,1.186250,4.762084,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,
412006,1.186250,4.753750,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,
412007,1.186250,4.745417,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,


## Death data
Note: Death data is not available for Ghana yet so we won't be using it yet for Ghana

### Read in death data

In [68]:
death_dist = pd.read_csv(path_death)

In [69]:
death_dist = death_dist.loc[(death_dist['metric']=='Rate')&(death_dist['sex']!='Both'),].copy().reset_index(drop=True)
death_dist['age_sex_bin'] = death_dist['sex'] + ' Age ' + death_dist['age']
death_dist

Unnamed: 0,measure,location,sex,age,cause,metric,year,val,upper,lower,age_sex_bin
0,Deaths,Egypt,Male,1 to 4,All causes,Rate,2019,80.385925,103.658216,61.726483,Male Age 1 to 4
1,Deaths,Egypt,Female,1 to 4,All causes,Rate,2019,69.988965,87.356305,55.654315,Female Age 1 to 4
2,Deaths,Egypt,Male,5 to 9,All causes,Rate,2019,42.035679,48.677685,36.377695,Male Age 5 to 9
3,Deaths,Egypt,Female,5 to 9,All causes,Rate,2019,30.696493,34.229181,27.623722,Female Age 5 to 9
4,Deaths,Egypt,Male,10 to 14,All causes,Rate,2019,47.754604,59.045287,38.308904,Male Age 10 to 14
5,Deaths,Egypt,Female,10 to 14,All causes,Rate,2019,31.990091,38.543189,26.453484,Female Age 10 to 14
6,Deaths,Egypt,Male,15 to 19,All causes,Rate,2019,88.182445,112.092075,68.526601,Male Age 15 to 19
7,Deaths,Egypt,Female,15 to 19,All causes,Rate,2019,40.839282,49.404875,33.582003,Female Age 15 to 19
8,Deaths,Egypt,Male,20 to 24,All causes,Rate,2019,115.116449,140.946893,93.523528,Male Age 20 to 24
9,Deaths,Egypt,Female,20 to 24,All causes,Rate,2019,51.357819,67.300059,38.864239,Female Age 20 to 24


### Read in age and sex data

In [70]:
age_sex_df = pd.read_csv(path_agesex,index_col=0)
# divided by the row sum to get the age and sex distrubution
age_sex_df.iloc[:,2:] = age_sex_df.iloc[:,2:].div(age_sex_df.iloc[:,2:].sum(axis=1), axis=0)
A = age_sex_df[death_dist['age_sex_bin'].to_list()].values
nan_index =age_sex_df[death_dist['age_sex_bin'].to_list()].isnull().all(axis=1)
A = np.nan_to_num(A)
B = death_dist['val'].values
age_sex_df['death_rate'] = pd.DataFrame(A).dot(pd.DataFrame(B))
age_sex_df['death_rate'] = age_sex_df['death_rate']/100000
age_sex_df.loc[nan_index,'death_rate'] = np.nan
age_sex_df

Unnamed: 0,Lat,Long,Female Age <1 year,Female Age 1 to 4,Female Age 5 to 9,Female Age 10 to 14,Female Age 15 to 19,Female Age 20 to 24,Female Age 25 to 29,Female Age 30 to 34,...,Male Age 40 to 44,Male Age 45 to 49,Male Age 50 to 54,Male Age 55 to 59,Male Age 60 to 64,Male Age 65 to 69,Male Age 70 to 74,Male Age 75 to 79,Male Age 80 plus,death_rate
0,11.170417,-3.247083,,,,,,,,,...,,,,,,,,,,
1,11.170417,-3.238750,,,,,,,,,...,,,,,,,,,,
2,11.170417,-3.230417,,,,,,,,,...,,,,,,,,,,
3,11.170417,-3.222083,,,,,,,,,...,,,,,,,,,,
4,11.170417,-3.213750,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412004,4.737084,1.152917,,,,,,,,,...,,,,,,,,,,
412005,4.737084,1.161250,,,,,,,,,...,,,,,,,,,,
412006,4.737084,1.169583,,,,,,,,,...,,,,,,,,,,
412007,4.737084,1.177917,,,,,,,,,...,,,,,,,,,,


In [71]:
merge_df = merge_df.rename({'x':'Long','y':'Lat'},axis=1)
# if read in age sex csv, there will be some decimal rounding difference due to float type
merge_df['Lat_5_decimal'] = np.round(merge_df['Lat'],5).astype('str')
merge_df['Long_5_decimal'] = np.round(merge_df['Long'],5).astype('str')
display(merge_df.head())
age_sex_df['Lat_5_decimal'] = np.round(age_sex_df['Lat'],5).astype('str')
age_sex_df['Long_5_decimal'] = np.round(age_sex_df['Long'],5).astype('str')
death = age_sex_df[['Lat_5_decimal','Long_5_decimal','death_rate']].copy()
display(death.head())

Unnamed: 0,Long,Lat,spatial_ref,GDP_PPP,population,nightlight,birth,public_count_1x1_grid,public_count_5x5_grid,private_count_1x1_grid,private_count_5x5_grid,public_flag_1x1_grid,public_flag_5x5_grid,private_flag_1x1_grid,private_flag_5x5_grid,gdp_rating,Lat_5_decimal,Long_5_decimal
0,-3.247083,11.170417,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,11.17042,-3.24708
1,-3.247083,11.162083,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,11.16208,-3.24708
2,-3.247083,11.15375,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,11.15375,-3.24708
3,-3.247083,11.145417,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,11.14542,-3.24708
4,-3.247083,11.137083,0,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,11.13708,-3.24708


Unnamed: 0,Lat_5_decimal,Long_5_decimal,death_rate
0,11.17042,-3.24708,
1,11.17042,-3.23875,
2,11.17042,-3.23042,
3,11.17042,-3.22208,
4,11.17042,-3.21375,


In [72]:
merge_df = merge_df.merge(death,how='left',on=['Lat_5_decimal','Long_5_decimal'])
merge_df['death'] = merge_df['population'] * merge_df['death_rate']

In [73]:
merge_df = merge_df.drop(['Lat_5_decimal','Long_5_decimal', 'spatial_ref'], axis=1)
merge_df.head()

Unnamed: 0,Long,Lat,GDP_PPP,population,nightlight,birth,public_count_1x1_grid,public_count_5x5_grid,private_count_1x1_grid,private_count_5x5_grid,public_flag_1x1_grid,public_flag_5x5_grid,private_flag_1x1_grid,private_flag_5x5_grid,gdp_rating,death_rate,death
0,-3.247083,11.170417,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,,
1,-3.247083,11.162083,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,,
2,-3.247083,11.15375,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,,
3,-3.247083,11.145417,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,,
4,-3.247083,11.137083,,-99999.0,inf,,0.0,0.0,0.0,0.0,False,False,False,False,,,


In [74]:
merge_df[~(merge_df.population.isna() & merge_df.GDP_PPP.isna())].groupby(['gdp_rating'])\
.agg({'public_count_1x1_grid':'mean',
      'public_count_5x5_grid':'mean',
      'private_count_1x1_grid':'mean',
      'private_count_5x5_grid':'mean',})

Unnamed: 0_level_0,public_count_1x1_grid,public_count_5x5_grid,private_count_1x1_grid,private_count_5x5_grid
gdp_rating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,0.087132,0.981395,0.017674,0.24186
B,0.036341,0.420155,0.005023,0.066109
C,0.0068,0.18392,0.000819,0.025736
,0.00139,0.066677,0.000129,0.007171


# Save the final data

In [75]:
%%time
merge_df.to_csv(os.path.join(path_data_out, "ghana_final_data_v0_2022_April.csv"))

CPU times: user 1.84 s, sys: 59.6 ms, total: 1.9 s
Wall time: 1.92 s


# Total runtime

In [76]:
end_time = time.perf_counter ()
print(round((end_time - start_time)/60,2), "minutes")

0.81 minutes


# Data that has not yet included into the final data

## Phamarcies

In [77]:
df_healthsites_io

Unnamed: 0,Facility name,osm_id,Amenity,Ownership,Lat,Long
0,Pro-Life Pharmacy,1590950161,pharmacy,,-0.119871,5.601035
1,Beaver Clinic,1700719794,dentist,private,-0.178916,5.605701
2,Bethel Dental Clinic,1700719799,dentist,,-0.192542,5.616803
3,Adabraka clinic,1728032224,clinic,private,-0.209600,5.566273
4,Iran Clinic,1728032238,doctors,private,-0.212442,5.568037
...,...,...,...,...,...,...
1160,Dohamm Clinic Ltd,1378720189,clinic,,-1.780210,4.930476
1161,Eye Link Clinic,1385353948,doctors,,-0.222045,5.612410
1162,Odorna Medical Center,1405374124,clinic,,-0.215609,5.559084
1163,West African Rescue Association Head Office - ...,1417019958,doctors,,-0.174880,5.565374


In [78]:
df_phamarcy_io = df_healthsites_io[['osm_id','Facility name','Ownership','Amenity','Long','Lat']][df_healthsites_io.Amenity.isin(['pharmacy'])].copy()

In [79]:
df_phamarcy_io.groupby('Amenity').count()

Unnamed: 0_level_0,osm_id,Facility name,Ownership,Long,Lat
Amenity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
pharmacy,793,632,793,793,793


In [80]:
count_pharmacy,non_nan_count_pharmacy, value_sum_pharmacy = points_in_grid_new(df_phamarcy_io,ref_da)

In [81]:
%%time
var = 'pha'
merge_ds = ref_ds.copy()
size_list = [3,5,11,25,51,75,101]
for size in size_list:
    new_count, new_value, _ = spatial_aggregate(count_pharmacy,count_pharmacy,size,'sum')
    merge_ds = merge_ds.assign({f"{var}_{size}_cnt" :(["y","x"],new_value)})
    print(f"size {size} x {size} done.")

size 3 x 3 done.
size 5 x 5 done.
size 11 x 11 done.
size 25 x 25 done.
size 51 x 51 done.
size 75 x 75 done.
size 101 x 101 done.
CPU times: user 32 s, sys: 143 ms, total: 32.1 s
Wall time: 32.1 s


In [82]:
merge_ds

In [83]:
# covert from geospatial xarray dataset to pandas dataframe
merge_ds = merge_ds.assign({f"{var}_1_cnt" :(["y","x"],count_pharmacy)})
merge_df = merge_ds.to_dataframe().reset_index()
merge_df = merge_df.drop(['spatial_ref','population'],axis=1)
merge_df = merge_df.rename({'x':'Long','y':'Lat'},axis=1)
all_sizes = [1,3,5,11,25,51,75,101]
merge_df = merge_df[['Long','Lat'] + [f"{var}_{size}_cnt" for size in all_sizes]]
for size in all_sizes:
    merge_df[f"{var}_{size}_flag"] = merge_df[f"{var}_{size}_cnt"] > 0 
for idx in range(len(all_sizes)-1):
    size1 = all_sizes[idx]
    size2 = all_sizes[idx+1]
    merge_df[f'{var}_{size1}-{size2}'] = merge_df[f'{var}_{size2}_cnt'] - merge_df[f'{var}_{size1}_cnt']
merge_df

Unnamed: 0,Long,Lat,pha_1_cnt,pha_3_cnt,pha_5_cnt,pha_11_cnt,pha_25_cnt,pha_51_cnt,pha_75_cnt,pha_101_cnt,...,pha_51_flag,pha_75_flag,pha_101_flag,pha_1-3,pha_3-5,pha_5-11,pha_11-25,pha_25-51,pha_51-75,pha_75-101
0,-3.247083,11.170417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-3.247083,11.162083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-3.247083,11.153750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-3.247083,11.145417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-3.247083,11.137083,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412004,1.186250,4.770417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
412005,1.186250,4.762084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
412006,1.186250,4.753750,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0
412007,1.186250,4.745417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
%%time 
# save the data to shapefile
pharmacy = gp.GeoDataFrame(
     merge_df, geometry=gp.points_from_xy(merge_df.Long, merge_df.Lat))
pharmacy.to_file(os.path.join(path_data_out,f'{var}_2022_April.shp') )

  pd.Int64Index,


CPU times: user 59.8 s, sys: 1.64 s, total: 1min 1s
Wall time: 1min 1s


## Banks and ATMs
Note: Data is not yet available