# Spatial Data Analysis

In this notebook:
* Loading spatial data with geospark
* Using JoinQuery without indexing
* Trying different indexing methods

## Imports

In [1]:
%load_ext memory_profiler

In [2]:
import os

In [3]:
import folium
from time import time
import geopandas as gpd
import copy
from pyspark.sql import SparkSession
from geospark.core.SpatialRDD import PointRDD
from geospark.core.SpatialRDD import PolygonRDD
from pyspark import StorageLevel
from pyspark.sql import SQLContext
from geospark.register import GeoSparkRegistrator
from geospark.utils import GeoSparkKryoRegistrator, KryoSerializer
from geospark.register import upload_jars
from geospark.core.enums import FileDataSplitter
from geospark.core.spatialOperator import RangeQuery
from geospark.sql.types import GeometryType
from geospark.core.enums import IndexType
from geospark.core.spatialOperator import JoinQuery
from pyspark.sql.types import LongType
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from geospark.core.enums import GridType

## Environment

In [4]:
%env SPARK_HOME /opt/conda/envs/geospark_demo/lib/python3.7/site-packages/pyspark
%env ARROW_PRE_0_15_IPC_FORMAT 1
%env JAVA_HOME /opt/conda/envs/geospark_demo

env: SPARK_HOME=/opt/conda/envs/geospark_demo/lib/python3.7/site-packages/pyspark
env: ARROW_PRE_0_15_IPC_FORMAT=1
env: JAVA_HOME=/opt/conda/envs/geospark_demo


In [5]:
#Generate spark session
upload_jars()
spark = SparkSession.builder.\
        master("local[1]").\
        appName("SpatialAnalysis").\
        config("spark.serializer", KryoSerializer.getName).\
        config("spark.kryo.registrator", GeoSparkKryoRegistrator.getName) .\
        getOrCreate()

In [6]:
GeoSparkRegistrator.registerAll(spark)

True

In [7]:
sc = spark.sparkContext
sqlContext = SQLContext(sc)

## Loading Data

In [8]:
data_path = "../../../data/HW2/nyc-data"
nyc_tweets_location = "file://" + os.path.abspath(data_path)+ "/nyc-tweets.txt"
nyc_neighborhoods_location = "file://" + os.path.abspath(data_path)+ "/nyc-neighborhoods.wkt"

In [9]:
# Loading point data set from twitter tweets 
def load_data():
    point_rdd = PointRDD(sc, nyc_tweets_location, 0, FileDataSplitter.CSV, False) #, 10, StorageLevel.MEMORY_ONLY, "epsg:4326", "epsg:4326")
    point_rdd.analyze()
    point_rdd.spatialPartitioning(GridType.KDBTREE)

    # Loading polygon dataset corresponding to the neighborhood regions in New York
    polygon_rdd = PolygonRDD(sc, nyc_neighborhoods_location, FileDataSplitter.WKT, False)
    polygon_rdd.spatialPartitioning(point_rdd.getPartitioner())
    return point_rdd, polygon_rdd

In [19]:
%memit point_rdd, polygon_rdd = load_data()

peak memory: 175.24 MiB, increment: 0.00 MiB


## Without Index

Finding the numbers of tweets that are contained within each one of the neighborhoord polygons without using spatial indexes.

In [20]:
start = time()
spatial_join_result_non_flat = JoinQuery.SpatialJoinQuery(point_rdd, polygon_rdd, False, False)
print(f"Finished after {time() - start} seconds")

Finished after 0.021333694458007812 seconds


In [12]:
start = time()
number_of_points = spatial_join_result_non_flat.map(lambda x: [x[0].geom, x[1].__len__()])
schema = StructType([
    StructField("neighborhood", GeometryType(), False),
    StructField("number_of_points", LongType(), False)
])
df = spark.createDataFrame(number_of_points, schema, verifySchema=False)
df.show()
print(f"Created dataframe in {time() - start} seconds")

+--------------------+----------------+
|        neighborhood|number_of_points|
+--------------------+----------------+
|POLYGON ((-73.740...|           53103|
|POLYGON ((-73.883...|          175269|
|POLYGON ((-73.975...|          318134|
|POLYGON ((-73.847...|          110461|
|POLYGON ((-73.902...|          299634|
|POLYGON ((-73.898...|          128726|
|POLYGON ((-73.918...|          101676|
|POLYGON ((-73.932...|          283657|
|POLYGON ((-73.883...|          140596|
|POLYGON ((-73.793...|           85819|
|POLYGON ((-73.887...|          172802|
|POLYGON ((-73.862...|           71699|
|POLYGON ((-73.894...|           62533|
|POLYGON ((-73.974...|          111565|
|POLYGON ((-73.993...|          499036|
|POLYGON ((-73.854...|           69036|
|POLYGON ((-73.992...|          123127|
|POLYGON ((-73.940...|          184815|
|POLYGON ((-73.896...|           85414|
|POLYGON ((-73.849...|          117237|
+--------------------+----------------+
only showing top 20 rows

Created datafr

In [13]:
# Save
df.coalesce(1).toPandas().to_csv('output.csv',  index=False)

## With Indexing

Finding the numbers of tweets that are contained within each one of the neighborhoord polygons using spatial indexes.

In [14]:
index_types = [False, IndexType.QUADTREE, IndexType.RTREE]

In [15]:
for point_index in index_types:
    for polygon_index in index_types:
        point_rdd, polygon_rdd = load_data()
        if point_index:
            point_rdd.buildIndex(point_index, True)
        if polygon_index:
            polygon_rdd.buildIndex(polygon_index, True)

        start = time()
        spatial_join_result_non_flat = JoinQuery.SpatialJoinQuery(point_rdd, polygon_rdd, True, False)
        print(f" Point index: {point_index}, Polygon index: {polygon_index}")
        print(f"Finished after {time() - start} seconds \n")

 Point index: False, Polygon index: False
Finished after 0.027492046356201172 seconds 

 Point index: False, Polygon index: IndexType.QUADTREE
Finished after 0.019048213958740234 seconds 

 Point index: False, Polygon index: IndexType.RTREE
Finished after 0.015477657318115234 seconds 

 Point index: IndexType.QUADTREE, Polygon index: False
Finished after 0.018654346466064453 seconds 

 Point index: IndexType.QUADTREE, Polygon index: IndexType.QUADTREE
Finished after 0.013472318649291992 seconds 

 Point index: IndexType.QUADTREE, Polygon index: IndexType.RTREE
Finished after 0.01818108558654785 seconds 

 Point index: IndexType.RTREE, Polygon index: False
Finished after 0.014382600784301758 seconds 

 Point index: IndexType.RTREE, Polygon index: IndexType.QUADTREE
Finished after 0.02059030532836914 seconds 

 Point index: IndexType.RTREE, Polygon index: IndexType.RTREE
Finished after 0.013378143310546875 seconds 

