## Process a raster into a large collection of points, then use Dask to clip those points and spatially join key information from the study's focus area

In [1]:
import os, sys
import re
import pprint
# from pprint import pprint

import numpy as np

import rasterio
from rasterio import features, transform
from rasterio.mask import mask
from rasterio.transform import Affine
from rasterio.warp import calculate_default_transform, reproject, Resampling

import pandas as pd
import geopandas as gpd

import shapely
from shapely.geometry import shape, box, Polygon

### Setup

Directories

In [2]:
# Root of the shared geodata store. Defaults to the original network drive,
# but can be redirected with the PAK_GEO_DIR environment variable so the
# notebook also runs where that drive is mounted elsewhere.
geo_dir = os.environ.get('PAK_GEO_DIR', r'P:\PAK\GEO')
# Project data folder, relative to this notebook's location
data_dir = r'../../data'

# Sub-folder names for raster / vector inputs and vector outputs
rast_dir = r'rast_inputs'
vect_in_dir = r'vect_inputs'
vect_out_dir = r'vect_out'

# Sub-folder names for the individual thematic datasets
rds_dir = r'roads'
dest_dir = r'destinations'
speed_dir = r'speed'
fric_dir = r'friction'
acc_dir = r'access'
tab_dir = r'tabular'

Projections

In [3]:
# Projected CRS used for the analysis, as an authority string
dest_crs = 'EPSG:32642'
# Integer EPSG code: the first run of digits found in dest_crs
dcrs_int = int(re.search('[0-9]+', dest_crs).group())

### Set up Dask cluster (if this is/will be a lot of points)

In [4]:

import dask
import coiled
from dask.distributed import Client, LocalCluster, Lock
from dask.utils import SerializableLock
import dask.dataframe as dd

from dask_control import *

In [6]:
# Create the Dask client; get_dask_client is presumably supplied by the
# `from dask_control import *` star import above (verify).
client = get_dask_client(
    cluster_type='local',
    n_workers=4,
    processes=True,
    threads_per_worker=4,
)
# Last expression of the cell: render the client summary
client

0,1
Connection method: Direct,
Dashboard: http://127.0.0.1:8787/status,

0,1
Comm: tcp://127.0.0.1:8786,Workers: 2
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: 4 minutes ago,Total memory: 63.98 GiB

0,1
Comm: tcp://10.175.66.81:63232,Total threads: 4
Dashboard: http://10.175.66.81:63233/status,Memory: 31.99 GiB
Nanny: tcp://127.0.0.1:61983,
Local directory: P:\PAK\Code\kpgit_robert\notebooks\RB\dask-worker-space\worker-sny__hm8,Local directory: P:\PAK\Code\kpgit_robert\notebooks\RB\dask-worker-space\worker-sny__hm8
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 42.56 MiB,Spilled bytes: 0 B
Read bytes: 0.0 B,Write bytes: 59.70 kiB

0,1
Comm: tcp://10.175.66.81:63229,Total threads: 4
Dashboard: http://10.175.66.81:63230/status,Memory: 31.99 GiB
Nanny: tcp://127.0.0.1:61984,
Local directory: P:\PAK\Code\kpgit_robert\notebooks\RB\dask-worker-space\worker-ep2od3c1,Local directory: P:\PAK\Code\kpgit_robert\notebooks\RB\dask-worker-space\worker-ep2od3c1
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 0.0%,Last seen: Just now
Memory usage: 42.52 MiB,Spilled bytes: 0 B
Read bytes: 0.95 MiB,Write bytes: 1.02 MiB


### Load and process raster to points

Load in the base raster we are using as a template so we can match up exactly to its grid and cell size

In [None]:
# Path (under geo_dir) of the raster that the next cell converts to points
rast_pth = os.path.join(geo_dir,r'Population/Settlements/KPK_WSF2019.tif')

In [None]:
# Polygonize band 1 of the raster and reduce each polygon to its centroid.
# NOTE(review): rasterio.features.shapes returns one polygon per connected
# region of equal value, so a centroid stands for a region rather than for a
# single cell, and no mask is passed so every value is polygonized -- confirm
# this is the intended definition of "points".
with rasterio.open(rast_pth, 'r') as src1:

    # shapes() needs a supported dtype; band 1 is cast to float32
    rast = src1.read(1).astype(np.float32)

    # One GeoJSON-like feature per (geometry, value) pair
    geoms = [
        {'properties': {'VALUE': val}, 'geometry': geom}
        for geom, val in rasterio.features.shapes(rast, transform=src1.transform)
    ]

    # Build a GeoDataFrame of polygons (no CRS is attached here), then a copy
    # whose geometry is replaced by the polygon centroids
    poly = gpd.GeoDataFrame.from_features(geoms)
    pts = poly.copy()
    pts.geometry = pts.geometry.centroid
    

In [None]:
# Declare (not reproject) the points as EPSG:4326 and store their coordinates.
# NOTE(review): this assumes the source raster is in EPSG:4326 -- confirm
# against the raster's own CRS.
pts = pts.set_crs(4326)
pts = pts.assign(
    lon_4326=pts.geometry.x,
    lat_4326=pts.geometry.y,
)

In [None]:
# Reproject to EPSG:32642 (the same code as dest_crs) and store the projected
# coordinates alongside the geographic ones
pts = pts.to_crs(32642)
pts = pts.assign(
    lon_32642=pts.geometry.x,
    lat_32642=pts.geometry.y,
)

In [None]:
# Inspect the column dtypes of the points frame
pts.dtypes

In [None]:
# Number of rows in the points frame
len(pts)

In [None]:
import dask.dataframe as dd
# pts_dd = dd.from_pandas(pts,chunksize=100000)
# pts_dd.to_csv(os.path.join(geo_dir,'Population/Settlements/KPK_WSF2019_pts.csv'), header=True, index=True, single_file=True)

#### (Re-)Load in points data

In [7]:
# pts_loc = r'Population/HRSL/kp_general_v15_pts.csv'
# Location (relative to geo_dir) of the points CSV that is read in the next cell
pts_loc = r'Population/Settlements/KPK_WSF2019_pts.csv'

In [8]:
# Lazily read the points CSV as a Dask DataFrame, split into ~100 MB blocks.
# NOTE: this rebinds `pts`; any in-memory GeoDataFrame of that name created
# earlier in the notebook is no longer reachable through it.
pts_csv = os.path.join(geo_dir, pts_loc)
pts = dd.read_csv(
    pts_csv,
    na_values=' ',
    blocksize='100mb',
)

# pts = pts_dd

In [None]:
# pts = gpd.GeoDataFrame(pts, geometry = gpd.points_from_xy(x=pts.lon_4326,y=pts.lat_4326)).set_crs("EPSG:4326")

#### Clip to desired extent

Load in KP as clipping object

In [None]:
# Read the OCHA admin-1 boundaries, keep only Khyber Pakhtunkhwa, and
# reproject to the analysis CRS
adm1_pth = os.path.join(geo_dir,'Boundaries/OCHA/pak_admbnda_adm1_ocha_pco_gaul_20181218.shp')
kp = (
    gpd.read_file(adm1_pth)
    .loc[lambda adm1: adm1['ADM1_EN'] == 'Khyber Pakhtunkhwa']
    .to_crs(dest_crs)
)

# Buffer the polygon by 20km so we take in nearby markets and roads that may be used
# kp.geometry = kp.buffer(20000)

In [None]:
# Reproject the clip polygon to EPSG:4326, the CRS that clip_pts assigns to the points
kp = kp.to_crs(4326)

In [None]:
# kp_pts = gpd.clip(pts,kp)

In [None]:
# len(gpd.points_from_xy(pts.lon_4326, y = pts.lat_4326))

In [None]:
# pts = gpd.GeoDataFrame(
#     pts, geometry = gpd.points_from_xy(pts.lon_4326, pts.lat_4326, crs = "EPSG:4326"))

In [None]:
def clip_pts(df, polys):
    """Clip one partition of points to ``polys`` and return a plain DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Partition of points; must provide ``lon_4326`` and ``lat_4326`` columns.
    polys : geopandas.GeoDataFrame or future-like
        Clipping polygons. Anything that is not already a GeoDataFrame is
        resolved by calling its ``.result()`` method first.

    Returns
    -------
    pandas.DataFrame
        The rows kept by ``gpd.clip``, with the ``geometry`` column removed.
    """
    # A broadcast handle must be turned into the actual polygons before clipping
    if not isinstance(polys, gpd.GeoDataFrame):
        polys = polys.result()

    # Rebuild point geometries from the stored lon/lat columns
    point_geoms = gpd.points_from_xy(df.lon_4326, df.lat_4326)
    pts_gdf = gpd.GeoDataFrame(df, geometry=point_geoms).set_crs("EPSG:4326")

    # Keep only the points inside the clip polygons
    clipped = gpd.clip(pts_gdf, polys)

    # Dask does not understand geometry metadata and the column is not needed
    # downstream, so hand back an ordinary DataFrame without it
    return pd.DataFrame(clipped.drop('geometry', axis=1))
    

In [None]:
# Broadcast the KP clip polygon to the workers; clip_pts resolves the returned handle with .result()
kp_dist = client.scatter(kp, broadcast=True)

In [None]:
# Distributed clip: apply clip_pts to every partition of the points, passing the broadcast polygon
kp_pts = pts.map_partitions(clip_pts, kp_dist)

#### Spatial join information from a large collection of polygons to a large collection of points

In [23]:
# Read the settlement buffer polygons from the project's vector inputs
settle_buff_pth = os.path.join(data_dir, vect_in_dir, "KP_Settlements_NGA_2017_200mBuff.gpkg")
settle_buff = gpd.read_file(settle_buff_pth, driver="GPKG")
# Preview the first record
settle_buff.head(1)

Unnamed: 0,FID_Settle,RC,UFI,UNI,LAT,LONG,DMS_LAT,DMS_LONG,MGRS,JOG,...,PROVINCE,PROVINCE_C,DISTRICT,DISTRICT_C,TEHSIL,TEHSIL_C,Remarks,hfs,sum,geometry
0,105912,5,6048637.0,15681640.0,31.304456,70.340982,31:18:16N,70:20:28E,42RXV2761564120,NH42-03,...,Khyber Pakhtunkhwa,4.0,FR Dera Ismail Khan,409,FR Dera Ismail Khan,40901.0,Pcode change,0,,"POLYGON ((627940.770 3464116.408, 627939.807 3..."


In [29]:
# Reproject the buffers to EPSG:4326, the CRS that get_sj assigns to the points
settle_buff = settle_buff.to_crs(4326)

In [12]:
# adm3 = gpd.read_file(os.path.join(geo_dir,'Boundaries/KP_Analysis/KP_Analysis_All_Tehsils.gpkg'),driver="GPKG")
# adm3 = adm3[['geometry','ADM1_EN','ADM2_EN','ADM3_EN','ADM1_PCODE','ADM2_PCODE','ADM3_PCODE']]
# adm3 = adm3.rename({'ADM1_PCODE':'Adm1_Code','ADM2_PCODE':'Adm2_Code','ADM3_PCODE':'Adm3_Code'},axis=1)
# adm3 = adm3.to_crs(4326)
# adm3.head()

In [30]:
# Broadcast polygons
# adm3_dist = client.scatter(adm3, broadcast=True)
# Only the geometry and the settlement ID are needed for the join
settle_join_cols = settle_buff[['geometry','FID_Settle']]
settle_dist = client.scatter(settle_join_cols, broadcast=True)

In [31]:
def get_sj(df, polys):
    """Left spatial-join the attributes of ``polys`` onto one partition of points.

    Points are rebuilt from the ``lon_4326`` / ``lat_4326`` columns and tagged
    as EPSG:4326, so ``polys`` is expected to be in EPSG:4326 as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Partition of points; must provide ``lon_4326`` and ``lat_4326`` columns.
    polys : geopandas.GeoDataFrame or future-like
        Polygons to join. Anything that is not already a GeoDataFrame is
        resolved by calling its ``.result()`` method first.

    Returns
    -------
    pandas.DataFrame
        Result of the left "within" join, with the ``geometry`` column removed.
    """
    import inspect  # local import: only used to choose the sjoin keyword below

    # A broadcast handle must be turned into the actual polygons before joining
    if not isinstance(polys, gpd.GeoDataFrame):
        polys = polys.result()

    # Convert the partition to a GeoDataFrame of points in EPSG:4326
    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df.lon_4326, df.lat_4326)
    ).set_crs("EPSG:4326")

    # geopandas renamed sjoin's ``op`` keyword to ``predicate`` and later
    # dropped ``op``; use whichever keyword the installed version accepts.
    sjoin_params = inspect.signature(gpd.sjoin).parameters
    predicate_kw = 'predicate' if 'predicate' in sjoin_params else 'op'

    # Left join keeps every point, matched or not
    gdf = gpd.sjoin(gdf, polys, how='left', **{predicate_kw: 'within'})

    # Return a plain DataFrame without geometry for Dask
    return pd.DataFrame(gdf.drop('geometry', axis=1))
    

In [32]:
# kp_pts_adm = kp_pts.map_partitions(get_sj, adm3_dist)
# len(kp_pts_adm.Adm3_Code.unique())

In [33]:
# Distributed spatial join: apply get_sj to every partition of the points.
# NOTE(review): this joins the full `pts` frame, not the clipped `kp_pts` -- confirm that is intended.
settle_pts = pts.map_partitions(get_sj, settle_dist)

In [34]:
# Give every joined row a constant 1 so that a per-group sum is a row count
settle_pts = settle_pts.assign(const=1)

# Number of joined points for each settlement ID
settle_wsf_count = settle_pts.groupby(['FID_Settle'])['const'].sum()

#### Export

Export dask outputs

In [None]:
# pts.drop('geometry',axis=1).to_csv(os.path.join(geo_dir,'Population/HRSL/pak_general_v15_pts.csv'))
# Write the Dask result to a single CSV file.
# NOTE(review): `kp_pts_adm` is only assigned in a commented-out cell in the visible
# notebook, so this cell raises NameError on a fresh run unless that cell is restored.
# NOTE(review): the target is an HRSL file name although the points loaded above are
# the WSF2019 ones -- confirm the intended output path.
kp_pts_adm.to_csv(os.path.join(geo_dir,'Population/HRSL/kp_general_v15_pts.csv'), header=True, index=True, single_file=True)

Export normal routine spatial outputs

In [None]:
# Write the points to one GeoPackage as two layers: EPSG:4326 and the analysis CRS.
# NOTE(review): `.to_crs` needs a GeoDataFrame; once the "(Re-)Load in points data"
# cell has run, `pts` is a Dask DataFrame read from CSV, so this presumably only works
# on the in-memory points created from the raster -- verify before running.
pts.to_crs(4326).to_file(os.path.join(geo_dir,'Population/HRSL/pak_general_v15_pts.gpkg'),layer="pak_general_v15_4326",driver='GPKG')
pts.to_crs(dcrs_int).to_file(os.path.join(geo_dir,'Population/HRSL/pak_general_v15_pts.gpkg'),layer=f"pak_general_v15_{dcrs_int}",driver='GPKG')

In [None]:
# Write the clipped points to one GeoPackage as two layers: EPSG:4326 and the analysis CRS.
# NOTE(review): in the visible notebook `kp_pts` comes from map_partitions(clip_pts, ...),
# and clip_pts drops the geometry column, so `.to_crs` is not expected to be available
# here -- verify or rebuild a GeoDataFrame first.
kp_pts.to_crs(4326).to_file(os.path.join(geo_dir,'Population/HRSL/kp_hrsl_v15_pts.gpkg'),layer="kp_general_v15_4326",driver='GPKG')
kp_pts.to_crs(dcrs_int).to_file(os.path.join(geo_dir,'Population/HRSL/kp_hrsl_v15_pts.gpkg'),layer=f"kp_general_v15_{dcrs_int}",driver='GPKG')

Export any custom object created by the SJ

In [39]:
# Write the per-settlement point counts to a single CSV file in the vector output folder
settle_wsf_count.to_csv(os.path.join(data_dir,vect_out_dir,r'settle_wsf_count.csv'),header=True,index=True,single_file=True)

['P:/PAK/Code/kpgit_robert/data/vect_out/settle_wsf_count.csv']