In [0]:
import requests
url = "https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime=2014-04-15&endtime=2014-06-01"

# Fetch data from API
response = requests.get(url)
data = response.json()  # Convert response to JSON

eq_data_features=data["features"]
print(eq_data_features)



[{'type': 'Feature', 'properties': {'mag': 4.8, 'place': '37 km ENE of Cortes, Philippines', 'time': 1401580599250, 'updated': 1409016068000, 'tz': None, 'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/usc000r9n5', 'detail': 'https://earthquake.usgs.gov/fdsnws/event/1/query?eventid=usc000r9n5&format=geojson', 'felt': None, 'cdi': None, 'mmi': None, 'alert': None, 'status': 'reviewed', 'tsunami': 0, 'sig': 354, 'net': 'us', 'code': 'c000r9n5', 'ids': ',usc000r9n5,', 'sources': ',us,', 'types': ',cap,origin,phase-data,', 'nst': None, 'dmin': 2.499, 'rms': 0.89, 'gap': 108, 'magType': 'mb', 'type': 'earthquake', 'title': 'M 4.8 - 37 km ENE of Cortes, Philippines'}, 'geometry': {'type': 'Point', 'coordinates': [126.5072, 9.4091, 10]}, 'id': 'usc000r9n5'}, {'type': 'Feature', 'properties': {'mag': 0.8, 'place': '13km N of Borrego Springs, CA', 'time': 1401580576230, 'updated': 1457682658270, 'tz': None, 'url': 'https://earthquake.usgs.gov/earthquakes/eventpage/ci15507425', 'detail

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, LongType, IntegerType, ArrayType

eq_properties_schema = StructType([
    StructField("mag", StringType(),True),
    StructField("place", StringType(), True),
    StructField("time", LongType(), True),    
    StructField("updated", LongType(), True),  
    StructField("tz", StringType(), True),      
    StructField("url", StringType(), True),  
    StructField("detail", StringType(), True),  
    StructField("felt", StringType(), True), 
    StructField("cdi", StringType(), True),
    StructField("mmi", StringType(), True), 
    StructField("alert", StringType(), True),  
    StructField("status", StringType(), True),  
    StructField("tsunami", IntegerType(), True),  # Updated from BooleanType
    StructField("sig", StringType(), True),  # Updated from DoubleType
    StructField("net", StringType(), True), 
    StructField("code", StringType(), True),  # Updated from DoubleType
    StructField("sources", StringType(), True), 
    StructField("types", StringType(), True),
    StructField("nst", StringType(), True),  # Updated from DoubleType
    StructField("dmin", StringType(), True),
    StructField("rms", StringType(), True), 
    StructField("gap", StringType(), True),  # Gap was missing
    StructField("magType", StringType(), True),
    StructField("type", StringType(), True), 
    StructField("title", StringType(), True)
])

eq_geometry_schema = StructType([
    StructField("type", StringType(), True),
    StructField("coordinates", ArrayType(StringType()), True)
])


eq_schema = StructType([
    StructField("type", StringType(), True),
    StructField("properties", eq_properties_schema, True),
    StructField("geometry", eq_geometry_schema, True),
    StructField("id", StringType(), True)
])

In [0]:
eq_bronze_df = spark.createDataFrame(eq_data_features, schema=eq_schema)

eq_bronze_df.describe()

Out[77]: DataFrame[summary: string, type: string, id: string]

In [0]:
from pyspark.sql.functions import explode

for col_name in array_columns:
    eq_bronze_df = eq_bronze_df.withColumn(col_name, explode(eq_bronze_df[col_name]))

eq_bronze_df.show()


+-------+--------------------+--------------------+-----------+
|   type|          properties|            geometry|         id|
+-------+--------------------+--------------------+-----------+
|Feature|{1.29, 10km SSW o...|{Point, [-116.777...| ci11408890|
|Feature|{1.1, Central Ala...|{Point, [-151.645...|ak01421ig3u|
|Feature|{1.2, 7 km SSW of...|{Point, [-150.016...|ak01421i2zj|
|Feature|{1.4, 32 km N of ...|{Point, [-150.827...|ak01421heui|
|Feature|{4, 28 km WNW of ...|{Point, [-71.621,...| usc000mnnn|
|Feature|{0.53, 5 km WSW o...|{Point, [-122.783...| nc72134466|
|Feature|{0.7, 6 km E of M...|{Point, [-118.894...| nc72134461|
|Feature|{0.47, 11km ESE o...|{Point, [-116.562...| ci11408882|
|Feature|{0.6, 14 km ENE o...|{Point, [-118.464...| nn00557597|
|Feature|{4.2, south of th...|{Point, [-176.820...| usb000m2w9|
|Feature|{1.8, 74 km SW of...|{Point, [-167.302...|ak01421dpqr|
|Feature|{4.2, 272 km S of...|{Point, [128.3539...| usb000m2wa|
|Feature|{0.64, 8 km ESE o...|{Point, [-

In [0]:
from pyspark.sql.functions import col

eq_bronze_exploded = eq_bronze_df.select(
    "type",
    "properties.mag",
    "properties.place",
    "properties.time",
    "properties.updated",
    "properties.tz",
    "properties.url",
    "properties.detail",
    "properties.felt",
    "properties.cdi",
    "properties.mmi",
    "properties.alert",
    "properties.status",
    "properties.tsunami",
    "properties.sig",
    "properties.net",
    "properties.code",
    "properties.sources",
    "properties.types",
    "properties.nst",
    "properties.dmin",
    "properties.rms",
    "properties.gap",
    "properties.magType",
    col("properties.type").alias("properties_type"),
    "properties.title",
    col("geometry.type").alias("geometry_type"),
    col("geometry.coordinates").getItem(0).alias("latitude"),
    col("geometry.coordinates").getItem(0).alias("longitude"),
    col("geometry.coordinates").getItem(0).alias("depth"),
    col("id")
)


eq_bronze_exploded.show()

+-------+----+--------------------+-------------+-------------+----+--------------------+--------------------+----+----+----+-----+---------+-------+---+---+----------+-------+--------------------+----+----------+----+-----+-------+---------------+--------------------+-------------+------------+------------+------------+------------+
|   type| mag|               place|         time|      updated|  tz|                 url|              detail|felt| cdi| mmi|alert|   status|tsunami|sig|net|      code|sources|               types| nst|      dmin| rms|  gap|magType|properties_type|               title|geometry_type|    latitude|   longitude|       depth|          id|
+-------+----+--------------------+-------------+-------------+----+--------------------+--------------------+----+----+----+-----+---------+-------+---+---+----------+-------+--------------------+----+----------+----+-----+-------+---------------+--------------------+-------------+------------+------------+------------+------

In [0]:
eq_bronze_exploded.describe()

Out[79]: DataFrame[summary: string, type: string, mag: string, place: string, time: string, updated: string, tz: string, url: string, detail: string, felt: string, cdi: string, mmi: string, alert: string, status: string, tsunami: string, sig: string, net: string, code: string, sources: string, types: string, nst: string, dmin: string, rms: string, gap: string, magType: string, properties_type: string, title: string, geometry_type: string, latitude: string, longitude: string, depth: string, id: string]

In [0]:
eq_bronze_exploded.filter(col("mag")>5).show()




+-------+---+--------------------+-------------+-------------+----+--------------------+--------------------+----+----+-----+------+--------+-------+---+---+--------+--------------------+--------------------+----+------+----+----+-------+---------------+--------------------+-------------+---------+---------+---------+----------+
|   type|mag|               place|         time|      updated|  tz|                 url|              detail|felt| cdi|  mmi| alert|  status|tsunami|sig|net|    code|             sources|               types| nst|  dmin| rms| gap|magType|properties_type|               title|geometry_type| latitude|longitude|    depth|        id|
+-------+---+--------------------+-------------+-------------+----+--------------------+--------------------+----+----+-----+------+--------+-------+---+---+--------+--------------------+--------------------+----+------+----+----+-------+---------------+--------------------+-------------+---------+---------+---------+----------+
|Featur