### Installing

Ensure that you have Homebrew installed on your macOS system.

```bash
brew install openjdk@17
brew link --force --overwrite openjdk@17
```

Modify your shell configuration file (e.g., `~/.bash_profile`, `~/.zshrc`, etc.) to set the environment variables:
```bash
export JAVA_HOME=$(/usr/libexec/java_home -v 17)
export PATH=$JAVA_HOME/bin:$PATH
```

Install AWS dependencies:
```bash
pyspark --packages org.apache.hadoop:hadoop-aws:3.3.2
```

In [1]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
from sedona.spark import SedonaContext
import os
from pyspark.sql.functions import col, explode
from datetime import datetime, date
from pyspark import StorageLevel
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType, MapType
from tqdm import tqdm
from sedona.sql.types import GeometryType
load_dotenv()

True

In [2]:
!spark-submit --version

25/05/30 09:56:50 WARN Utils: Your hostname, PeteCastle.local resolves to a loopback address: 127.0.0.1; using 192.168.45.216 instead (on interface en0)
25/05/30 09:56:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.5.4
      /_/
                        
Using Scala version 2.12.18, Java HotSpot(TM) 64-Bit Server VM, 1.8.0_451
Branch HEAD
Compiled by user yangjie01 on 2024-12-17T04:51:46Z
Revision a6f220d951742f4074b37772485ee0ec7a774e7d
Url https://github.com/apache/spark
Type --help for more information.


In [3]:
sedona = SedonaContext.builder() \
    .appName("BDCCFinalExam") \
    .master("local[*]") \
    .config("spark.driver.memory", "7g") \
    .config("spark.executor.memory", "7g") \
    .config("spark.storage.memoryFraction","0.4") \
    .config("spark.memory.fraction","0.6") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider') \
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("AWS_ACCESS_KEY_ID")) \
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("AWS_SECRET_ACCESS_KEY")) \
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com") \
    .config("spark.hadoop.fs.s3a.fast.upload", "true") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true") \
    .config("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true") \
    .config("spark.jars.packages",
            'org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1,'
            "org.apache.hadoop:hadoop-aws:3.2.0,"\
            'org.datasyslab:geotools-wrapper:1.7.1-28.5,'\
            "com.amazonaws:aws-java-sdk-bundle:1.11.375") \
    .getOrCreate()

25/05/30 09:56:51 WARN Utils: Your hostname, PeteCastle.local resolves to a loopback address: 127.0.0.1; using 192.168.45.216 instead (on interface en0)
25/05/30 09:56:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/francismarkcayco/.ivy2/cache
The jars for the packages stored in: /Users/francismarkcayco/.ivy2/jars
org.apache.sedona#sedona-spark-shaded-3.3_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
org.datasyslab#geotools-wrapper added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7d02c370-b66b-4f32-bff6-34bc1f6441ff;1.0
	confs: [default]
	found org.apache.sedona#sedona-spark-shaded-3.3_2.12;1.7.1 in central


:: loading settings :: url = jar:file:/opt/anaconda3/envs/bdcc-final-exam/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hadoop#hadoop-aws;3.2.0 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.375 in central
	found org.datasyslab#geotools-wrapper;1.7.1-28.5 in central
:: resolution report :: resolve 106ms :: artifacts dl 4ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.375 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.0 from central in [default]
	org.apache.sedona#sedona-spark-shaded-3.3_2.12;1.7.1 from central in [default]
	org.datasyslab#geotools-wrapper;1.7.1-28.5 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   4   |   0   |   0   |   0   ||   4   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark

In [4]:
from sedona.register import SedonaRegistrator
from pyspark.sql.functions import input_file_name


# Register Sedona UDTs and functions
SedonaRegistrator.registerAll(sedona)

  SedonaRegistrator.registerAll(sedona)


True

In [5]:
snapshot_dates = [
    date(year, month, 1)
    for year in range(2025, 2017, -1)
    for month in (1, 4, 7, 10)
]

# filter out snapshot dates that are in future
snapshot_dates = [d for d in snapshot_dates if d <= date.today()]

print("Snapshot dates:")
for snapshot_date in snapshot_dates:
    print(snapshot_date.strftime("%Y-%m-%d"))

Snapshot dates:
2025-01-01
2025-04-01
2024-01-01
2024-04-01
2024-07-01
2024-10-01
2023-01-01
2023-04-01
2023-07-01
2023-10-01
2022-01-01
2022-04-01
2022-07-01
2022-10-01
2021-01-01
2021-04-01
2021-07-01
2021-10-01
2020-01-01
2020-04-01
2020-07-01
2020-10-01
2019-01-01
2019-04-01
2019-07-01
2019-10-01
2018-01-01
2018-04-01
2018-07-01
2018-10-01


In [6]:
FEATURES_TO_RETAIN = ['building','amenity','leisure','public_transport','office','shop','tourism']

schema = StructType([
    StructField("crs", StructType([
        StructField("properties", StructType([
            StructField("name", StringType(), True)
        ]), True),
        StructField("type", StringType(), True)
    ]), True),

    StructField("features", ArrayType(
        StructType([
            StructField("geometry", GeometryType(), True), 
            StructField("properties", StructType([
                StructField("amenity", StringType(), True),
                StructField("building", StringType(), True),
                StructField("element", StringType(), True),
                StructField("id", LongType(), True),
                StructField("leisure", StringType(), True),
                StructField("name", StringType(), True),
                StructField("office", StringType(), True),
                StructField("province", StringType(), True),
                StructField("public_transport", StringType(), True),
                StructField("region", StringType(), True),
                StructField("shop", StringType(), True),
                StructField("tourism", StringType(), True)
            ]), True),
            StructField("type", StringType(), True)
        ])
    ), True),

    StructField("name", StringType(), True),
    StructField("type", StringType(), True)
])

In [None]:
unique_features = {}
for snapshot_date in tqdm(snapshot_dates, desc="Processing snapshot dates"):
    snapshot_date_str = snapshot_date.strftime("%Y-%m-%d")

    df = sedona.read.format("geojson") \
        .option("multiLine", "true") \
        .schema(schema) \
        .load(f"s3a://amenities-dataset/amenities_v2/date={snapshot_date_str}/")

    exploded_df = df.select(explode("features").alias("feature")).repartition(8).persist(StorageLevel.MEMORY_AND_DISK)

    # Select all relevant properties in one pass
    values_df = exploded_df.select(
        *[col(f"feature.properties.{feature}").alias(feature) for feature in FEATURES_TO_RETAIN]
    )

    for feature in FEATURES_TO_RETAIN:
        distinct_values = (
            values_df
            .select(feature)
            # .filter(col(feature).isNotNull())
            .distinct()
            .rdd.flatMap(lambda x: x)
            .collect()
        )

        unique_features.setdefault(feature, set()).update(distinct_values)

    exploded_df.unpersist() 

Processing snapshot dates:   0%|          | 0/30 [00:00<?, ?it/s]25/05/30 09:56:56 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


CodeCache: size=131072Kb used=30600Kb max_used=30604Kb free=100472Kb
 bounds [0x000000010a9f8000, 0x000000010c808000, 0x00000001129f8000]
 total_blobs=11689 nmethods=10623 adapters=977
 compilation: disabled (not enough contiguous free space left)




In [None]:
import json

serializable_features = {k: list(v) for k, v in unique_features.items()}

with open("unique_features.json", "w") as f:
    json.dump(serializable_features, f, indent=4)