In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, LongType

# Start (or reuse) the Spark session that the rest of the notebook works with.
spark = (
    SparkSession.builder
    .appName("Read GeoJSON with PySpark")
    .getOrCreate()
)


# Explicit schema for one GeoJSON feature record: top-level type/id, a "properties"
# struct carrying the borough fields, and a "geometry" struct whose coordinates are
# nested three levels deep (array of rings, each an array of [x, y] doubles).
custom_schema = (
    StructType()
    .add("type", StringType(), True)
    .add("id", LongType(), True)
    .add(
        "properties",
        StructType()
        .add("boroughCode", LongType(), True)
        .add("borough", StringType(), True)
        .add("@id", StringType(), True),
        True,
    )
    .add(
        "geometry",
        StructType()
        .add("type", StringType(), True)
        .add("coordinates", ArrayType(ArrayType(ArrayType(DoubleType()))), True),
        True,
    )
)

# Load the GeoJSON file, applying the schema above instead of inferring one.
geojson_df = spark.read.json("nyc-boroughs.geojson", schema=custom_schema)



In [2]:
# Print the schema that was applied when reading the file
geojson_df.printSchema()

# Display the rows exactly as they were read
geojson_df.show()

# Keep only rows without a NULL in any column (dropna is the alias of na.drop)
geojson_df = geojson_df.dropna()

# Display the rows that remain
geojson_df.show()

# Count the remaining rows and print the total
num_rows = geojson_df.count()
print(num_rows)

root
 |-- type: string (nullable = true)
 |-- id: long (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- boroughCode: long (nullable = true)
 |    |-- borough: string (nullable = true)
 |    |-- @id: string (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)

+-------+----+--------------------+--------------------+
|   type|  id|          properties|            geometry|
+-------+----+--------------------+--------------------+
|   NULL|NULL|                NULL|                NULL|
|   NULL|NULL|                NULL|                NULL|
|   NULL|NULL|                NULL|                NULL|
|Feature|   0|{5, Staten Island...|{Polygon, [[[-74....|
|   NULL|NULL|                NULL|                NULL

In [3]:
!pip install shapely



In [4]:
from pyspark.sql.functions import udf, col
from shapely.geometry import shape, Point
# Enrich the GeoJSON data using Shapely

#define a UDF to convert the geometry array to a Shapely polygon
def array_to_polygon(coordinates):
    """Build a Shapely geometry from GeoJSON "Polygon" coordinates.

    Args:
        coordinates: Nested sequence of rings, each ring a sequence of positions,
            as stored in the ``geometry.coordinates`` column; may be None.

    Returns:
        The object returned by ``shape`` for a "Polygon" mapping, or None when
        ``coordinates`` is None.
    """
    # A null column value reaches a Python UDF as None; propagate the null instead of
    # letting shape() raise on it.
    if coordinates is None:
        return None
    return shape({"type": "Polygon", "coordinates": coordinates})

def polygon_area(coordinates):
    """Return the area of the polygon described by GeoJSON "Polygon" coordinates.

    Args:
        coordinates: Nested sequence of rings, each ring a sequence of positions,
            as stored in the ``geometry.coordinates`` column; may be None.

    Returns:
        The ``area`` attribute of the Shapely geometry (expressed in the units of the
        input coordinates), or None when ``coordinates`` is None.
    """
    # A null column value reaches a Python UDF as None; return a null area rather than
    # failing the whole Spark task.
    if coordinates is None:
        return None
    polygon = shape({"type": "Polygon", "coordinates": coordinates})
    return polygon.area
    
#define UDFs
def _polygon_as_struct(coordinates):
    """Return the polygon built from ``coordinates`` as a {"type", "coordinates"} dict.

    PySpark cannot convert a Shapely geometry object into a struct column, so the
    geometry is exported through its GeoJSON mapping (``__geo_interface__``), with the
    coordinate tuples turned into plain nested lists.
    Returns None when ``coordinates`` is None.
    """
    if coordinates is None:
        return None
    geo = array_to_polygon(coordinates).__geo_interface__
    return {
        "type": geo["type"],
        "coordinates": [[list(position) for position in ring] for ring in geo["coordinates"]],
    }


# The declared return type mirrors the dict produced above: the coordinates field is a
# three-level array of doubles, not a string.
array_to_polygon_udf = udf(
    _polygon_as_struct,
    StructType([
        StructField("type", StringType()),
        StructField("coordinates", ArrayType(ArrayType(ArrayType(DoubleType())))),
    ]),
)
polygon_area_udf = udf(polygon_area, DoubleType())

# Add an 'area' column holding the value polygon_area_udf computes from each
# feature's geometry coordinates
enriched_geojson = geojson_df.withColumn(
    "area",
    polygon_area_udf(col("geometry.coordinates")),
)

# The schema now ends with the new 'area' double column
enriched_geojson.printSchema()

# Order by borough code, and within each borough from the largest area to the smallest
sorted_enriched_geojson = enriched_geojson.orderBy(
    col("properties.boroughCode").asc(),
    col("area").desc(),
)

# Display the sorted rows
sorted_enriched_geojson.show()

root
 |-- type: string (nullable = true)
 |-- id: long (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- boroughCode: long (nullable = true)
 |    |-- borough: string (nullable = true)
 |    |-- @id: string (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)
 |-- area: double (nullable = true)

+-------+---+--------------------+--------------------+--------------------+
|   type| id|          properties|            geometry|                area|
+-------+---+--------------------+--------------------+--------------------+
|Feature| 72|{1, Manhattan, ht...|{Polygon, [[[-73....|0.005859077996035753|
|Feature| 71|{1, Manhattan, ht...|{Polygon, [[[-73....|2.327165585676201...|
|Feature| 51|{1, Manhattan, ht...

In [5]:
from pyspark.sql.functions import broadcast


# Bring the polygon rows to the driver so the lookup UDF can capture them as a plain list.
# Collect the *sorted* frame (borough code, then area descending): get_borough_name scans
# the list in order and stops at the first polygon containing the point, so the ordering
# built earlier is only useful if it is the one collected here.
geojson_iterable = sorted_enriched_geojson.collect()


# Define a UDF to get the borough name
def get_borough_name(lat, lon, data):
    """Return the borough of the first polygon in ``data`` that contains the point.

    Args:
        lat: Latitude of the point, or None.
        lon: Longitude of the point, or None.
        data: Iterable of rows; each row is read through
            ``row["geometry"]["coordinates"]`` and ``row["properties"]["borough"]``.

    Returns:
        The ``borough`` value of the first matching row, or None when either
        coordinate is None or no polygon contains the point.
    """
    # Null latitude/longitude values reach the UDF as None; Point() cannot be built
    # from them, so treat such points as having no borough.
    if lat is None or lon is None:
        return None

    # Shapely points are (x, y), i.e. (longitude, latitude).
    point = Point(lon, lat)
    for row in data:
        coordinates = row["geometry"]["coordinates"]
        # Skip rows whose polygon coordinates are missing instead of letting shape() raise.
        if coordinates is None:
            continue
        polygon = shape({"type": "Polygon", "coordinates": coordinates})
        if polygon.contains(point):
            return row["properties"]["borough"]
    return None

# Wrap the lookup as a Spark UDF returning a string; the lambda supplies the collected
# polygon rows (geojson_iterable) as the third argument of get_borough_name.
get_borough_name_udf = udf(
    lambda lat, lon: get_borough_name(lat, lon, geojson_iterable),
    returnType=StringType(),
)

    

In [6]:
# Read the sample CSV: the first line is a header, column types are inferred from the data
sample_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("sample.csv")
)

# Print the inferred schema, then display the first rows
sample_df.printSchema()
sample_df.show()

root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)

+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+
|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|    pickup_datetime

In [7]:
# Enrich the taxi rides: look up the borough of the pickup point first ...
enriched_sample_data = sample_df.withColumn(
    "pickup_borough",
    get_borough_name_udf(col("pickup_latitude"), col("pickup_longitude")),
)
# ... and then the borough of the dropoff point
enriched_sample_data = enriched_sample_data.withColumn(
    "dropoff_borough",
    get_borough_name_udf(col("dropoff_latitude"), col("dropoff_longitude")),
)

# Print the extended schema and display the enriched rows
enriched_sample_data.printSchema()
enriched_sample_data.show()


root
 |-- medallion: string (nullable = true)
 |-- hack_license: string (nullable = true)
 |-- vendor_id: string (nullable = true)
 |-- rate_code: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_time_in_secs: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- pickup_borough: string (nullable = true)
 |-- dropoff_borough: string (nullable = true)

+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+---------

In [10]:
from pyspark.sql.functions import unix_timestamp

# Express pickup and dropoff as epoch milliseconds (unix_timestamp yields seconds),
# then derive the ride duration as their difference
enriched_sample_data = (
    enriched_sample_data
    .withColumn("pickup_ts_ms", unix_timestamp(col("pickup_datetime")) * 1000)
    .withColumn("dropoff_ts_ms", unix_timestamp(col("dropoff_datetime")) * 1000)
    .withColumn("duration_ms", col("dropoff_ts_ms") - col("pickup_ts_ms"))
)

# Longest ride we keep: 4 hours, in milliseconds
threshold_duration_ms = 4 * 60 * 60 * 1000

# Keep rides whose duration lies in [0, threshold] (both bounds inclusive) and
# discard the helper timestamp columns, which are no longer needed
cleaned_sample_data = (
    enriched_sample_data
    .filter(col("duration_ms").between(0, threshold_duration_ms))
    .drop("pickup_ts_ms", "dropoff_ts_ms")
)

In [11]:
# Display the cleaned rides: rows kept by the duration filter, with the borough columns
# and duration_ms, and without the temporary *_ts_ms columns.
cleaned_sample_data.show()

+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+--------------+---------------+-----------+
|           medallion|        hack_license|vendor_id|rate_code|store_and_fwd_flag|    pickup_datetime|   dropoff_datetime|passenger_count|trip_time_in_secs|trip_distance|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|pickup_borough|dropoff_borough|duration_ms|
+--------------------+--------------------+---------+---------+------------------+-------------------+-------------------+---------------+-----------------+-------------+----------------+---------------+-----------------+----------------+--------------+---------------+-----------+
|89D227B655E5C82AE...|BA96DE419E711691B...|      CMT|        1|                 N|2013-01-01 15:11:48|2013-01-01 15:18:10|              4|              38