# Import packages

In [1]:
import os
import time 
import numpy as np
import pandas as pd
import geopandas as gp

In [2]:
from sklearn.linear_model import LinearRegression

In [3]:
import xarray as xr
import rioxarray
from rasterio.enums import Resampling

In [4]:
# Time the script
start_time = time.perf_counter ()

# Set up directories and file paths

In [5]:
path_data_in = '../2 Raw Data'
path_data_out = '../3 Output Data'

## Input data

In [6]:
path_pop_count = os.path.join(path_data_in, 
                              'Population', 
                              'Count',
                              'zaf_ppp_2015_1km_Aggregated.tif')  


dir_pop_agesex= os.path.join(path_data_in, 
                             'Population', 
                             'AgeSex')
#file_healthsiteIO_shp=os.path.join(path_data_in, 
                                 #  'Facilities',
                                  # 'Uzbekistan_HealthsitesIO_SHP',
                                  # "Uzbekistan-node.shp")
#file_healthsiteIO_csv=os.path.join(path_data_out,
                                #   'Uzbekistan_healthsites_io.csv')
path_gdp=os.path.join(path_data_in,
                      'GDP_HDI',
                      'GDP_PPP_30arcsec_v3.nc')
path_nl = os.path.join(path_data_in,
                       'Nighttime_Lights',
                       'zaf_viirs_100m_2015.tiff')
#no birth data for Albania
#path_birth=os.path.join(path_data_in,
 #                       'Birth',
  #                      'Uzbekistan_1km_Births',
   #                     'uzb_births_pp_v2_2015.tif')
# path_death = os.path.join(path_data_in,
#                           'Death',
#                           'Albania_death_data_by_age_and_sex_2019.csv') 

## Output data

In [7]:
#path_healthfacility_public = os.path.join(path_data_out,"public_health_facilities.csv")
#path_healthfacility_private = os.path.join(path_data_out,"private_health_facilities.csv")
path_agesex = os.path.join(path_data_out,"age_sex_distribution_1km_2019.csv")
#bank_csv_path = os.path.join(data_in_path,"Egypt_Banks_and_ Atms.csv")

# User-defined functions

In [8]:
def find_grid_index(x,y,longs,lats,x_min=None,x_max=None,y_min=None,y_max=None):
    """
    DESCRIPTION: This function helps to find the index of the longitudes and 
    latitudes in the given longitude/latitude lists that is closest to the 
    given point (x,y) 
    
    INPUT -
    x: latitude of the given point
    y: longtidue of the given point
    longs: reference list of longitudes
    lats: reference list of latitudes
    x_min, x_max: min/max boundaries of longitude
    y_min, y_max: min/max boundaries of latitude
    
    OUTPUT -
    Index of the closest longitude/latitude to point (x,y)
    """
    if x_min is None and x_max is None:
        x_l = longs[0]- (longs[1]-longs[0])
        x_r = longs[-1] + (longs[-1]-longs[-2])
        x_min = min(x_l, x_r)
        x_max = max(x_l, x_r)
    if x < x_min or x > x_max:
        return -1,-1
    
    if y_min is None and y_max is None:
        y_l = lats[0]- (lats[1]-lats[0])
        y_r = lats[-1] + (lats[-1]-lats[-2])
        y_min = min(y_l, y_r)
        y_max = max(y_l, y_r)
    if y < y_min or y > y_max:
        return -1,-1
    
    ncol = np.argmin(abs(longs-x)) 
    nrow = np.argmin(abs(lats-y))
    if nrow<0 or nrow>=(len(lats)-1) or ncol<0 or ncol>=(len(longs)-1):
        return -1, -1
    else:
        return nrow,ncol

In [9]:
def points_in_grid_new(points_df,ref_data,value_colname=None):
    """
    DESCRIPTION: This function finds how many points (and how many non nan points and their sum  
    if a value_colname is provided) lying in the grids centered at the coordinates of the reference data
    
    INPUT -
    points_df: pandas dataframe that needs to be count 
    ref_data: xarray DataArray with reference coordinates
    value_colname: column name for the values in the dataframe
    
    OUTPUT -
    count: How many points are in each grid of the reference coordinate
    non_nan_count: How many non nan points are in each grid of the reference coordinate
    value_sum: The sum of the values of the non nan points within each grid of the reference coordinate
    """
    long_list = ref_data.coords['x'].values
    lat_list = ref_data.coords['y'].values
    long_list = np.append(long_list, long_list[-1]+ref_data.rio.resolution()[0])
    lat_list = np.append(lat_list, lat_list[-1]+ref_data.rio.resolution()[1])
    count = np.zeros(ref_data.values.shape)
    non_nan_count = np.zeros(ref_data.values.shape)
    if value_colname is not None:
        value_sum = np.empty(ref_data.values.shape)
        value_sum[:] = np.nan
        values = points_df[value_colname]
    else:
        value_sum = None
    grid_index = points_df.apply(lambda row: find_grid_index(row['Long'],row['Lat'],long_list,lat_list), axis=1)
    for j, x in enumerate(grid_index):
        if x[0]!=-1 and x[1]!=-1:
            count[x[0],x[1]] += 1
            if value_colname is not None and ~np.isnan(values[j]):
                non_nan_count[x[0],x[1]] = non_nan_count[x[0],x[1]]+1 
                value_sum[x[0],x[1]] = np.nansum([value_sum[x[0],x[1]],values[j]])
    return count, non_nan_count, value_sum

In [10]:
def spatial_aggregate(count_in,value_sum_in,block_size,agg_mode='sum', non_nan_count_in=None):
    """
    DESCRIPTION: This function apply square sum aggregation with stride 1 for 2D numpy array
    with the same padding
    
    INPUT -
    count_in: 2D numpy array for the count at each grid
    value_sum_in: 2D numpy array for the sum of values at each grid
    block_size: the size of the square sum aggregation
    agg_mode: aggregation operation mode "sum" or "mean"
    non_nan_count_in: 2D array for the non nan count at each grid, only needed for "mean" mode
    
    
    OUTPUT - 
    a new 2D numpy array with square sum aggreation in the same size
    """
    if block_size%2 != 1:
        raise Exception('block_size must be an odd integer!')
    half = block_size//2

    new_count_in = np.zeros(count_in.shape)
    count_in = np.pad(count_in,half)

    return_value = np.zeros(value_sum_in.shape)
    value_sum_in = np.pad(value_sum_in,half)
    
    if agg_mode == 'mean':
        new_nonnan_count_in = np.zeros(non_nan_count_in.shape)
        non_nan_count_in = np.pad(non_nan_count_in,half)
    elif agg_mode == 'sum':
        new_nonnan_count_in = None
    else:
        raise Exception('Error: The function only supports agg_mode for sum or mean.')
    for nrow in range(half,count_in.shape[0]-half):
        for ncol in range(half,count_in.shape[1]-half):
            new_count_in[nrow-half,ncol-half] = np.sum(count_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
            if agg_mode == 'mean':
                new_nonnan_count_in[nrow-half,ncol-half] = np.sum(non_nan_count_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
            return_value[nrow-half,ncol-half] = np.nansum(value_sum_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
    if agg_mode == 'mean':
        new_nonnan_count_in = np.where(new_nonnan_count_in == 0, np.nan, new_nonnan_count_in)
        return_value = return_value/new_nonnan_count_in
        return new_count_in, return_value, new_nonnan_count_in
    else:
        return new_count_in, return_value, new_nonnan_count_in

In [11]:
def layer_aggregate_helper(count_in,value_sum_in,block_size,non_nan_count_in):
    """
    DESCRIPTION: This function apply square sum aggregation with stride 1 for 2D numpy array
    with the same padding
    
    INPUT -
    count_in: 2D numpy array for the count at each grid
    value_sum_in: 2D numpy array for the sum of values at each grid
    block_size: the size of the square sum aggregation
    non_nan_count_in: 2D array for the non nan count at each grid
    
    
    OUTPUT - 
    a new 2D numpy array with square sum aggreation in the same size
    """
    if block_size%2 != 1:
        raise Exception('block_size must be an odd integer!')
    half = block_size//2

    new_count_in = np.zeros(count_in.shape)
    count_in = np.pad(count_in,half)

    return_value = np.zeros(value_sum_in.shape)
    value_sum_in = np.pad(value_sum_in,half)
    
    new_nonnan_count_in = np.zeros(non_nan_count_in.shape)
    non_nan_count_in = np.pad(non_nan_count_in,half)
  
    for nrow in range(half,count_in.shape[0]-half):
        for ncol in range(half,count_in.shape[1]-half):
            new_count_in[nrow-half,ncol-half] = np.sum(count_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
            new_nonnan_count_in[nrow-half,ncol-half] = np.sum(non_nan_count_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
            return_value[nrow-half,ncol-half] = np.nansum(value_sum_in[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
    return_value = np.where(new_nonnan_count_in == 0, np.nan, return_value)
    return new_count_in, return_value, new_nonnan_count_in

In [12]:
def layer_aggregate(value1,count1,value2,count2,mode = 'sum'):
    value1 = np.array(value1)
    value2 = np.array(value2)
    count1 = np.array(count1)
    count2 = np.array(count2)
    value2[np.where(count1==count2)]=value1[np.where(count1==count2)]
    value1 = np.where(np.isnan(value1), 0, value1)
    value2 = np.where(np.isnan(value2), 0, value2)
    value_diff = value2-value1
    count_diff = count2-count1
    if mode == 'sum':
        return_value = value_diff
        return_value = np.where(count_diff==0,np.nan,return_value)
    elif mode == 'mean':
        count_diff = np.where(count_diff == 0, np.nan, count_diff)
        return_value = value_diff/count_diff
    else:
        raise Exception('Only support sum or mean for mode')
    return return_value

In [13]:
def square_helper(arr, y, x, mode):
    """
    DESCRIPTION: This is a helper funtion to do mean or sum in square for 2D numpy array
    
    INPUT - 
    arr: 2D numpy array to be averaged
    (y,x): output dimension
    mode: sum or mean, NAs are ignored 

    OUTPUT - 
    Averaged 2D array
    """
    yy, xx = arr.shape
    if mode == "sum":
        vals = np.nansum(arr.reshape(y, yy//y, x, xx//x),(1,3))
    elif mode == "mean":
        vals = np.nanmean(arr.reshape(y, yy//y, x, xx//x),(1,3))
    else:
        raise Exception("Mode is not suppported, please input `sum' or `mean'")
    return vals

In [14]:
def xarray_square_aggregate(df_raw,stride=10,var_name="band_data",mode="mean"):
    """
    DESCRITPION: This is a function to do square aggregation (mean or sum) for a
    xarray dataArray, generating a new xarray dataArray
    
    INPUT -
    df_raw: 2D xarray DataArray
    stride: Size of the non-overlap square aggregation
    var_name: new DataArray variable name
    mode: square aggregation function
    
    OUTPU -
    A new 2D xarray DataArray with non-overlap square aggregation
    """
    nrow, ncol = df_raw.values.squeeze().shape
    new_nrow,new_ncol = int(nrow/stride), int(ncol/stride)
    data = df_raw.values.squeeze()[0:(new_nrow*stride),0:(new_ncol*stride)]
    new_data = square_helper(data, new_nrow, new_ncol, mode)
    new_x = np.mean(df_raw.coords['x'].values[0:(new_ncol*stride)].reshape(-1,stride),axis=1)
    new_y = np.mean(df_raw.coords['y'].values[0:(new_nrow*stride)].reshape(-1,stride),axis=1)
    # create new datarray
    df = xr.DataArray(
        data=new_data,
        dims=["y", "x"],
        coords=dict(
            x= new_x,
            y= new_y,
        ),
    )
    df.name = var_name
    if df_raw.rio.crs is not None:
        df = df.rio.write_crs(df_raw.rio.crs)
    return df

In [15]:
def grid_reproject(df,ref_data,resample_method = Resampling.nearest):
    """
    DESCRIPTION: This is a function to reproject a xarray dataArray to the coordinates
    of another refrence xarray dataArray.
    
    INPUT - 
    df: The xarray DataArray needing to be reprojected to the new grids
    ref_data: The reference xarray DataArray with reference coordinates (the new grids)
    resample_method: resample method
    
    OUTPUT -
    A new xarray DataArray reprojected to the coordinates of ref_data 
    """
    df = df.rio.reproject_match(ref_data, resampling = Resampling.nearest, nodata= np.nan)
    df = df.assign_coords({
        "x": ref_data.x,
        "y": ref_data.y,
    })
    return df

In [16]:
def points_in_grid(points_df,ref_data):
    """
    DESCRIPTION: This function finds how many points are in the grids centered 
    at the coordinates of the reference data
    
    INPUT -
    points_df: xarray DataArray that needs to be count 
    ref_data: xarray DataArray with reference coordinates
    
    OUTPUT -
    How many points are in each grid of the reference coordinates
    """
    long_list = ref_data.coords['x'].values
    lat_list = ref_data.coords['y'].values
    long_list = np.append(long_list, long_list[-1]+ref_data.rio.resolution()[0])
    lat_list = np.append(lat_list, lat_list[-1]+ref_data.rio.resolution()[1])
    count = np.zeros(ref_data.values.shape)
    grid_index = points_df.apply(lambda row: find_grid_index(row['Long'],row['Lat'],long_list,lat_list), axis=1)
    for x in grid_index:
        if x[0]!=-1 and x[1]!=-1:
            count[x[0],x[1]] += 1
    return count

In [17]:
def aggregate_count(count,block_size):
    """
    DESCRIPTION: This function apply square sum aggregation with sride 1 for 2D numpy array
    with same padding
    
    INPUT -
    count: 2D numpy array
    block_size: the size of the square sum aggregation
    
    OUTPUT - 
    a new 2D numpy array with square sum aggreation in the same size
    """
    if block_size%2 != 1:
        raise Exception('block_size must be an odd integer!')
    half = block_size//2
    new_count = np.zeros(count.shape)
    count = np.pad(count,half)
    for nrow in range(half,count.shape[0]-half):
        for ncol in range(half,count.shape[1]-half):
            new_count[nrow-half,ncol-half] = np.sum(count[(nrow-half):(nrow+half+1),(ncol-half):(ncol+half+1)])
    return new_count

In [18]:
def print_summary(data):
    """
    DESCRIPTION: This is a function to print the sumamry of a xarray DataArray or Dataset
    
    INPUT - 
    data: xarray DataArray or Dataset
    
    OUTPUT - 
    summary statistics of data
    """
    print(f"shape: {data.rio.shape}")
    print(f"resolution: {data.rio.resolution()}")
    print(f"coordinates boundary: {data.rio.bounds()}")
    print(f"CRS: {data.rio.crs}")

# Specify reference coordinates/grids

1. The spatial distribution of population in 2015 South Africa
2. zaf_ppp_2015_1km_Aggregated.tif
3. https://hub.worldpop.org/geodata/summary?id=33887

Here we use the grids of Uzbekistan population data as the reference data.
All other data will be reprojected to this reference grids to make all data consistent in space.

## Population

### Read in population data

In [19]:
pop_raw = rioxarray.open_rasterio(path_pop_count)
display(pop_raw)
print_summary(pop_raw)

shape: (2983, 2586)
resolution: (0.0083333333, -0.0083333333)
coordinates boundary: (16.457083269328677, -46.983749590994, 38.00708318312868, -22.125416357093997)
CRS: EPSG:4326


## Clean variables and dimensions

### Select band index at 0 

In [20]:
pop = pop_raw.isel(band=0)
pop

### Drop extra coordinates dimensions beyond latitude and longtitude


In [21]:
pop = pop.reset_coords(names=['band'],drop=True)
pop.name = 'population'
pop

### Make population DataArray as the reference grids

In [22]:
ref_da = pop.copy()
ref_ds = ref_da.to_dataset()
ref_da

# Data Preprocessing

Process age and gender population data

1. Albania 100m Age and Gender structures in 2020
2.
3. https://www.worldpop.org/geodata/summary?id=50113

We have one Geotiff file for each sex class and each age bin, here we aggregate the 36 Geotiff files into one clean csv file, which will be used to calculate death data at each grid later.

### Set sex bins and age bins 

In [23]:
# sex_bins = ['Female','Male']
# age_bins = ['Age <1 year', 'Age 1 to 4','Age 5 to 9',
#             'Age 10 to 14','Age 15 to 19','Age 20 to 24',
#             'Age 25 to 29','Age 30 to 34','Age 35 to 39',
#             'Age 40 to 44','Age 45 to 49','Age 50 to 54',
#             'Age 55 to 59','Age 60 to 64','Age 65 to 69',
#             'Age 70 to 74','Age 75 to 79','Age 80 plus']
# age_start_bins = [0,1] + list(np.arange(5,80+1,5))

In [24]:
# %%time
# age_sex_df = None
# for sex in sex_bins:
#     for file_no, age in enumerate(age_bins):
#         file_name ="alb_"+sex[0].lower()+"_"+str(age_start_bins[file_no])+"_2020_constrained.tif"
#         col_name = sex + ' ' + age
#         age_sex_raw = xr.open_dataarray(os.path.join(dir_pop_agesex,file_name))
#         age_sex = xarray_square_aggregate(age_sex_raw,stride=10,var_name=col_name, mode="sum")
#         age_sex = grid_reproject(age_sex, pop)
#         df = age_sex.to_dataframe()
#         df = df.drop("spatial_ref",axis=1)
#         age_sex_df = df if age_sex_df is None else age_sex_df.merge(df, left_index=True, right_index=True) 
#         print(col_name)

### Rename and take a look at age and sex distribution

In [25]:
# %%time
# age_sex_df = age_sex_df.reset_index().rename({'x':'Long','y':'Lat'},
#                                              axis=1)
# age_sex_df

In [26]:
# %%time
# age_sex_df.to_csv(path_agesex)

## Process GDP data

1. Gridded global datasets for Gross Domestic Product and Human Development Index over 1990-2015
2. GDP_PPP_30arcsec_v3.nc
3. https://datadryad.org/stash/dataset/doi:10.5061/dryad.dk1j0

### Read in raw gdp data 

In [19]:
gdp_raw = rioxarray.open_rasterio(path_gdp)
display(gdp_raw)
print_summary(gdp_raw)

shape: (21600, 43200)
resolution: (0.008333333380429452, -0.00833333342752775)
coordinates boundary: (-180.00000101727616, -90.00000101729967, 180.00000101727616, 90.0000010172997)
CRS: None


In [20]:
type(gdp_raw)

xarray.core.dataarray.DataArray

### Clean gdp variables and dimensions

In [28]:
# select year 2015 gdp 
gdp = gdp_raw.isel(time=2)
# drop extra coordinates dimensions beyond latitude and longtitude
gdp = gdp.reset_coords(names=['time'],drop=True)
# specify CRS
gdp = gdp.rio.write_crs("epsg:4326", inplace=True)

gdp

### Reproject gdp to the refrence coordinates

In [29]:
gdp = grid_reproject(gdp, ref_da)
gdp.name = 'GDP_PPP'
gdp

## Process Night life lights data

1. VIIRS night-time lights (2012-2016), South Africa
2. zaf_viirs_100m_2015.tiff
3. https://hub.worldpop.org/geodata/summary?id=18738 

### Read in raw nightlight data 

In [30]:
nl_raw = rioxarray.open_rasterio(path_nl)
display(nl_raw)
print_summary(nl_raw)

shape: (29827, 25856)
resolution: (0.00083333333, -0.00083333333)
coordinates boundary: (16.457916615829987, -46.98291627736, 38.00458319630999, -22.12708304345)
CRS: EPSG:4326


### Clean nightlight variables and dimensions

In [31]:
# Aggregate the 100m nightlight to 1km with square mean
nl = xarray_square_aggregate(nl_raw,stride=10,
                             var_name="nightlight", 
                             mode="mean")

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


### Reproject the nightlight to reference coordinates

In [32]:
nl = grid_reproject(nl, ref_da)
nl.name = 'nightlight'
display(nl)

## Merge GDP, population, nightlight into one dataset

In [33]:
merge_ds = xr.merge([gdp, pop, nl])
display(merge_ds)
print_summary(merge_ds) 

shape: (2983, 2586)
resolution: (0.0083333333, -0.0083333333)
coordinates boundary: (16.457083269328677, -46.983749590994, 38.00708318312868, -22.125416357093997)
CRS: EPSG:4326


In [34]:
merge_df = merge_ds.to_dataframe().reset_index()
merge_df

Unnamed: 0,x,y,spatial_ref,GDP_PPP,population,nightlight
0,16.461250,-22.129583,0,,-99999.0,inf
1,16.461250,-22.137916,0,,-99999.0,inf
2,16.461250,-22.146250,0,,-99999.0,inf
3,16.461250,-22.154583,0,,-99999.0,inf
4,16.461250,-22.162916,0,,-99999.0,inf
...,...,...,...,...,...,...
7714033,38.002917,-46.946250,0,,-99999.0,
7714034,38.002917,-46.954583,0,,-99999.0,
7714035,38.002917,-46.962916,0,,-99999.0,
7714036,38.002917,-46.971250,0,,-99999.0,


In [35]:
merge_df[merge_df['population']<=0]['population'].value_counts()

-99999.0    6082529
 0.0          56163
Name: population, dtype: int64

In [36]:
merge_df.isnull().sum()

x                    0
y                    0
spatial_ref          0
GDP_PPP        7345006
population           0
nightlight        5568
dtype: int64

In [37]:
# drop observations with missing population or missing GDP [note: missing was coded as -99999]
merge_df2 = (
    merge_df[(merge_df["population"] > 0) & (merge_df["GDP_PPP"].notnull())]
    .copy()
    .reset_index(drop=True)
)

# generate GDP PC
merge_df2["pc"] = merge_df2["GDP_PPP"] / merge_df2["population"]

# rename x and y to longitude and latitude
merge_df2.rename({"x": "Long", "y": "Lat"}, axis="columns", inplace=True)

# drop spatial ref and band
merge_df2.drop(["spatial_ref"], axis="columns", inplace=True)

# take a look at the data
merge_df2


Unnamed: 0,Long,Lat,GDP_PPP,population,nightlight,pc
0,16.461250,-28.629583,4.084906e+01,0.496830,inf,82.219322
1,16.461250,-28.637916,1.384301e+02,0.031538,inf,4389.322266
2,16.469583,-28.596250,1.267384e+05,2.256491,inf,56166.128906
3,16.469583,-28.604583,5.550490e+04,1.165269,inf,47632.695312
4,16.469583,-28.612916,7.755882e+04,0.555839,inf,139534.656250
...,...,...,...,...,...,...
281679,32.869583,-26.904583,3.856997e+05,3.285650,0.032067,117389.164062
281680,32.869583,-26.954583,5.806393e+05,18.166996,inf,31961.216797
281681,32.877917,-26.896250,2.201439e+05,6.398738,inf,34404.273438
281682,32.886250,-26.854583,1.059373e+06,46.258816,inf,22901.000000


In [38]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)
merge_df2.describe()

Unnamed: 0,Long,Lat,GDP_PPP,population,nightlight,pc
count,281684.0,281684.0,281684.0,281684.0,281684.0,281684.0
mean,27.47276,-28.40912,2421425.0,184.50818,inf,220910.625
std,3.4149,3.02859,10262999.0,798.21094,,19777708.0
min,16.46125,-34.83792,0.01906,4e-05,-0.0276,0.01142
25%,26.16958,-30.80458,16909.39648,1.36368,0.05182,5874.85217
50%,28.26958,-28.09625,117215.73438,7.80724,0.15034,13947.30762
75%,29.91125,-26.01292,858486.14062,59.60357,0.67458,34991.27441
max,32.88625,-22.12958,818588736.0,52822.44922,inf,9298811904.0


### Get percentiles for mapping

GDP PC percentiles - bottom 40%, 40-80%, 80-95%, above 95%

In [39]:
gdp_pc_values = merge_df2['pc']
gdp_pc_cutoff = np.nanquantile(gdp_pc_values,[0.4, 0.8, 0.95, 1])

# turn off scientific notation in numpy arrays
np.set_printoptions(formatter={'float': '{:0.4f}'.format})
print(gdp_pc_cutoff)

[10394.1750 46228.4672 232189.8328 9298811904.0000]


Population percentiles - bottom 10%, 10-50%, 50-75%, 75-90%, 90-95%, 95-99%, above 99%

In [40]:
pop_values = merge_df2['population']
pop_cutoff = np.nanquantile(pop_values,[0.1, 0.5, 0.75, 0.9, 0.95, 0.99, 1])
pop_cutoff

array([0.2487, 7.8072, 59.6036, 315.9083, 833.0137, 3540.1640, 52822.4492])

# Data Analysis

## GDP rating

In [41]:
# # assign gdp rating 
# gdp_values = merge_df2['GDP_PPP']
# gdp_cutoff = np.nanquantile(gdp_values,[0.7,0.95])
# gdp_levels = np.array(['nan']*len(gdp_values))
# gdp_levels[~np.isnan(gdp_values)] = 'C'
# gdp_levels[gdp_values > gdp_cutoff[0]] = 'B'
# gdp_levels[gdp_values > gdp_cutoff[1]] = 'A'
# merge_df2['gdp_rating'] = gdp_levels
# merge_df2['gdp_rating'].value_counts()

In [42]:
# %%time
# merge_df2 = merge_df2.rename({'x':'Long','y':'Lat'},axis=1)
# if read in age sex csv, there will be some decimal rounding difference due to float type
# merge_df2['Lat_5_decimal'] = np.round(merge_df2['Lat'],5).astype('str')
# merge_df2['Long_5_decimal'] = np.round(merge_df2['Long'],5).astype('str')
# display(merge_df2.head())


# Save the final data

In [43]:
%%time
merge_df2.to_csv(os.path.join(path_data_out, "zaf_final_data_2022_0712_SY.csv"))

CPU times: user 1.64 s, sys: 118 ms, total: 1.75 s
Wall time: 1.92 s


# Total runtime

In [44]:
end_time = time.perf_counter ()
print(round((end_time - start_time)/60,2), "minutes")

0.99 minutes
