# Spark notebook for spatially joining GIS points with administrative boundaries

## Installation and setup

In [0]:
# Install the Python packages this notebook uses, via pip.
# NOTE(review): no versions are pinned, so each run may resolve to different releases.
!pip install pandas
!pip install numpy
!pip install geopandas
!pip install descartes
!pip install fiona
!pip install shapely
!pip install pyproj
!pip install matplotlib
!pip install pyspark
!pip install geospark

# importing the required python packages
import time
import datetime

import pandas as pd
import geopandas as gp

from pyspark.sql import SparkSession

from geospark.register import upload_jars
from geospark.register import GeoSparkRegistrator

upload_jars() # necessary to load in GeoSpark libraries manually installed to the server

# Reuse the active Spark session if there is one, otherwise create it.
# Every later cell refers to this object as `spark`.
spark = SparkSession.builder.getOrCreate()

# Register GeoSpark on the session; presumably this is what makes the ST_* SQL
# functions used in later cells (ST_Point, ST_Intersects) available.
GeoSparkRegistrator.registerAll(spark)

In [0]:
# import a more extensive list of pySpark and GeoSpark packages
# NOTE(review): SparkSession, GeoSparkRegistrator and upload_jars were already
# imported in the first cell, and SparkContext is imported twice below.
import warnings
# Silences every Python warning from this point on.
warnings.filterwarnings("ignore")

import pyspark
from pyspark.sql import SparkSession
# Wildcard import: brings every public name of pyspark.sql.types into the notebook namespace.
from pyspark.sql.types import *
import pyspark.sql.functions as fs
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.sql.functions import count
from pyspark.sql.functions import col, countDistinct
from pyspark import SparkContext

import pylab as plt
from pyspark.sql.functions import lag
from pyspark.sql.window import Window
from pyspark.sql.functions import acos, cos, sin, lit, toRadians

# GeoSpark pieces: the Kryo serializer/registrator names are used by the Spark
# configuration cell that follows.
import geospark
from geospark.register import GeoSparkRegistrator
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars
from geospark.core.formatMapper.shapefileParser import ShapefileReader
from geospark.core import SpatialRDD

In [0]:
# These settings help optimize the operations of Spark for the large spatial join.
#
# Size and time values carry explicit units. Without a suffix Spark reads
# spark.driver.memory in MiB and spark.driver.maxResultSize in bytes, so the
# bare numbers 20 and 10 would have meant 20 MiB and 10 bytes rather than 20 GB and 10 GB.
#
# NOTE(review): spark.driver.memory, spark.driver.maxResultSize, spark.serializer
# and spark.kryo.registrator are launch-time settings. Setting them on an
# already-running session may have no effect (and newer Spark versions may reject
# it) - confirm they are also set in the cluster / session-builder configuration.

spark.conf.set("spark.sql.shuffle.partitions", 1000)
spark.conf.set("spark.network.timeout", "1000s")
spark.conf.set("spark.driver.memory", "20g")
# spark.conf.set("spark.driver.cores", 20) # appears unnecessary with autoscale
spark.conf.set("spark.driver.maxResultSize", "10g")
# Kryo serialization with GeoSpark's registrator for its geometry types.
spark.conf.set("spark.serializer", KryoSerializer.getName)
spark.conf.set("spark.kryo.registrator", GeoSparkKryoRegistrator.getName)

In [0]:
%scala

// Scala-side setup: import the JTS geometry classes and the GeoSpark helpers,
// then register the GeoSpark SQL functions on the notebook's Scala sqlContext.
import com.vividsolutions.jts.geom.{Coordinate, Geometry, GeometryFactory}
import org.datasyslab.geospark.formatMapper.shapefileParser.ShapefileReader
import org.datasyslab.geospark.spatialRDD.SpatialRDD
import org.datasyslab.geosparksql.utils.{Adapter, GeoSparkSQLRegistrator}
GeoSparkSQLRegistrator.registerAll(sqlContext)

## Read shapefiles and mobility data

In [0]:
# Location of the CubeIQ mobility Delta table. It is left blank here and must be
# filled in before running; check it first so a missing value fails immediately
# with a clear message instead of after the slow GeoPackage reads below.
data_path = ''
if not data_path:
    raise ValueError(
        "data_path is empty: set it to the location of the CubeIQ Delta table "
        "before running this cell"
    )

# let's read admin3 and admin5 layers from the GeoPackage (as geopandas GeoDataFrames)
admin_gpkg_path = "/dbfs/mnt/CUBEIQ/esapv/India_Administrative_Boundaries.gpkg"
admin3 = gp.read_file(admin_gpkg_path, layer="Admin3")
admin5 = gp.read_file(admin_gpkg_path, layer="Admin5_TownVillageWard")

# load the whole cubeiq data into spark dataframe
sdf = spark.read.format('delta').load(data_path)

## Convert latitude and longitudes to GIS points

In [0]:
# Expose the mobility dataframe to Spark SQL under the view name sdf_.
sdf.createOrReplaceTempView('sdf_')

# Build one GeoSpark point per record from lon/lat (both cast to Decimal(24,20)),
# keeping device_id and timestamp next to the new geometry column.
points_query = "SELECT sdf_.device_id, sdf_.timestamp, ST_Point(CAST(sdf_.lon AS Decimal(24,20)),CAST(sdf_.lat AS Decimal(24,20))) AS geometry FROM sdf_"
points = spark.sql(points_query)

## Create spark dataframe from geodataframe

In [0]:
# Enable Arrow-based columnar data transfer for speeding up this process
# Interestingly this operation can NOT make use of additional workers, which is why it takes around ~45 minutes to convert Admin3 and Admin5 to dataframes
# admin5 takes considerably more time
# NOTE(review): only Admin3 is converted in this cell; admin5 stays a GeoDataFrame.

spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Creating the spark dataframe from the geopandas GeoDataFrame.
# Only the L3_CODE and geometry columns are carried over.
#
# The result is bound to the same name (`admin3`) that later cells use, so the
# conversion is guarded: if this cell is run again, `admin3` is already a Spark
# dataframe and is left untouched instead of being passed to createDataFrame a
# second time (which fails).
if isinstance(admin3, gp.GeoDataFrame):
    admin3 = spark.createDataFrame(
        admin3[['L3_CODE', 'geometry']]
    )

## Data filtering and spatial join

In [0]:
# Register the spark dataframes as temp views so the SQL queries in later cells
# can refer to them by name.
# createOrReplaceTempView is called on the dataframes themselves (as is already
# done for sdf_), so this cell does not depend on a `sqlContext` global, which
# no Python cell in this notebook defines.
points.createOrReplaceTempView("points")
admin3.createOrReplaceTempView("admin3_tbl")
# admin5.createOrReplaceTempView("admin5_tbl")

In [0]:
# Optional: restrict the points to a date window before the spatial join.
# The bounds are written as day/month/year.
date_format = "%d/%m/%Y"
from_ = "01/01/2020"
to_ = "07/07/2020"

# Convert both bounds to epoch seconds. time.mktime interprets each parsed date
# as midnight in the driver's local timezone.
# NOTE(review): this assumes the `timestamp` column holds epoch seconds - confirm
# against the source table.
from_ = time.mktime(datetime.datetime.strptime(from_, date_format).timetuple())
to_ = time.mktime(datetime.datetime.strptime(to_, date_format).timetuple())

# between() is inclusive at both ends; because the upper bound is midnight at the
# start of the `to_` date, records later on that day are not included.
timestamps = (from_, to_)
points1 = points.where(col('timestamp').between(*timestamps))

# Register the filtered points as a temp view for the spatial join query.
# createOrReplaceTempView is called on the dataframe itself so this cell does not
# depend on a `sqlContext` global, which no Python cell in this notebook defines.
points1.createOrReplaceTempView("pts_filter_tbl")

In [0]:
# Spatial join for adm3: pair every filtered point with each Admin3 polygon it intersects.
# The query reads the pts_filter_tbl and admin3_tbl tables and keeps the polygon's
# L3_CODE next to the point's device_id, timestamp and geometry.
# A point that intersects more than one polygon yields one row per polygon.
intersect_query_adm3 =  """
        SELECT s.L3_CODE, p.device_id, p.timestamp, p.geometry 
        FROM pts_filter_tbl AS p, admin3_tbl AS s 
        WHERE ST_Intersects(p.geometry, s.geometry)
"""

# The joined dataframe is registered as a table and exported by the cells below.
spatial_join_result = spark.sql(intersect_query_adm3)

In [0]:
# Register the spatial join result as a temp view named sjr_tbl.
# createOrReplaceTempView is called on the dataframe itself so this cell does not
# depend on a `sqlContext` global, which no Python cell in this notebook defines.
spatial_join_result.createOrReplaceTempView("sjr_tbl")
# admin5.createOrReplaceTempView("admin5_tbl")

## Export

In [0]:
# Exporting spatial join results as a delta table we can access at any time
# Partitioning by L3 CODE for sanity's sake
# Based off here https://docs.databricks.com/delta/intro-notebooks.html

# Destination of the Delta table. It is left blank here and must be filled in
# before running; an empty value is rejected up front with a clear message
# rather than surfacing as a path error from inside the Spark writer.
export_path = ''
if not export_path:
    raise ValueError(
        "export_path is empty: set it to the destination of the Delta table "
        "before running this cell"
    )

# mode("overwrite") replaces whatever already exists at export_path.
(
    spatial_join_result.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("L3_CODE")
    .save(export_path)
)