In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, ArrayType, LongType

spark = SparkSession.builder \
    .appName("Read GeoJSON with PySpark") \
    .getOrCreate()


# Define the custom schema
custom_schema = StructType([
    StructField("type", StringType(), nullable=True),
    StructField("id", LongType(), nullable=True),
    StructField("properties", StructType([
        StructField("boroughCode", LongType(), nullable=True),
        StructField("borough", StringType(), nullable=True),
        StructField("@id", StringType(), nullable=True)
    ]), nullable=True),
    StructField("geometry", StructType([
        StructField("type", StringType(), nullable=True),
        StructField("coordinates", ArrayType(ArrayType(ArrayType(DoubleType()))), nullable=True)
    ]), nullable=True)
])

# Read GeoJSON file with custom schema
geojson_df = spark.read.schema(custom_schema).json("nyc-boroughs.geojson")



In [27]:
# View the schema
geojson_df.printSchema()

# Show the data
geojson_df.show()
df_no_null = geojson_df.na.drop()

# Show the resulting DataFrame
df_no_null.show()

num_rows = df_no_null.count()
print(num_rows)

root
 |-- type: string (nullable = true)
 |-- id: long (nullable = true)
 |-- properties: struct (nullable = true)
 |    |-- boroughCode: long (nullable = true)
 |    |-- borough: string (nullable = true)
 |    |-- @id: string (nullable = true)
 |-- geometry: struct (nullable = true)
 |    |-- type: string (nullable = true)
 |    |-- coordinates: array (nullable = true)
 |    |    |-- element: array (containsNull = true)
 |    |    |    |-- element: array (containsNull = true)
 |    |    |    |    |-- element: double (containsNull = true)

+-------+----+--------------------+--------------------+
|   type|  id|          properties|            geometry|
+-------+----+--------------------+--------------------+
|   NULL|NULL|                NULL|                NULL|
|   NULL|NULL|                NULL|                NULL|
|   NULL|NULL|                NULL|                NULL|
|Feature|   0|{5, Staten Island...|{Polygon, [[[-74....|
|   NULL|NULL|                NULL|                NULL