### Installing

Ensure that Homebrew is installed on your macOS system, then install and link OpenJDK 17:

```bash
brew install openjdk@17
brew link --force --overwrite openjdk@17
```

Add the following environment variables to your shell configuration file (e.g., `~/.bash_profile` or `~/.zshrc`), then restart the shell or `source` the file so they take effect:
```bash
export JAVA_HOME=$(/usr/libexec/java_home -v 17)
export PATH=$JAVA_HOME/bin:$PATH
```

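Download the AWS (S3A) connector for Spark. This command starts a PySpark shell after resolving the `hadoop-aws` package; exit the shell once it has started: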
```bash
pyspark --packages org.apache.hadoop:hadoop-aws:3.3.2
```

In [1]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
from sedona.spark import SedonaContext
import os
from pyspark.sql.functions import col, explode
from datetime import datetime, date
from pyspark import StorageLevel
from pyspark.sql.types import StructType, StructField, StringType, LongType, ArrayType, MapType
from tqdm import tqdm
from sedona.sql.types import GeometryType
from pathlib import Path
# Load variables from a .env file (if one is found) into the process
# environment. The AWS keys read with os.getenv() when the Spark session is
# built must be available from here or from the shell environment.
load_dotenv()

True

In [None]:
# Sanity check: print the version banner of the `spark-submit` found on PATH.
!spark-submit --version

In [None]:
# Working directories, all located one level above the current working
# directory. DATASET_DIR receives the exported unique_features.json.
# NOTE(review): CACHE_DIR and LOGS_DIR are not used in the cells shown here —
# presumably consumed further down; confirm.
CACHE_DIR, LOGS_DIR, DATASET_DIR = (
    Path('..') / dir_name for dir_name in ('_cache', '_logs', 'datasets')
)

In [None]:
# Build the local Spark session (with the Sedona and S3A jars on the classpath)
# that every later cell uses through the name `sedona`.

# The S3A connector is configured with SimpleAWSCredentialsProvider, which needs
# an explicit access key and secret. Fail fast with a clear message when either
# is missing instead of handing a non-credential to Spark and getting an opaque
# S3 error on the first read.
_aws_credentials = {
    name: os.getenv(name)
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
}
_missing_credentials = [name for name, value in _aws_credentials.items() if not value]
if _missing_credentials:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Define them in the shell environment or in the .env file."
    )

# Maven coordinates resolved at session start-up.
# NOTE(review): the install notes at the top of this notebook pull
# hadoop-aws 3.3.2, while 3.2.0 is pinned here. hadoop-aws is expected to match
# the Hadoop version bundled with Spark (and aws-java-sdk-bundle the version
# that hadoop-aws release was built against) — confirm which pair is correct.
_spark_packages = ",".join([
    "org.apache.sedona:sedona-spark-shaded-3.3_2.12:1.7.1",
    "org.apache.hadoop:hadoop-aws:3.2.0",
    "org.datasyslab:geotools-wrapper:1.7.1-28.5",
    "com.amazonaws:aws-java-sdk-bundle:1.11.375",
])

# The legacy `spark.storage.memoryFraction` key was dropped: it belongs to the
# pre-unified memory manager and is not read alongside `spark.memory.fraction`.
sedona = (
    SedonaContext.builder()
    .appName("BDCCFinalExam")
    .master("local[*]")
    .config("spark.driver.memory", "7g")
    .config("spark.executor.memory", "7g")
    .config("spark.memory.fraction", "0.6")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config("spark.hadoop.fs.s3a.access.key", _aws_credentials["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", _aws_credentials["AWS_SECRET_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
    .config("spark.hadoop.fs.s3a.fast.upload", "true")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.executor.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
    .config("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
    .config("spark.jars.packages", _spark_packages)
    .getOrCreate()
)

In [None]:
from sedona.register import SedonaRegistrator
from pyspark.sql.functions import input_file_name


# Register Sedona UDTs and SQL functions on the session. SedonaContext.create
# is the supported entry point (SedonaRegistrator.registerAll is deprecated);
# it returns the Spark session with Sedona registered, so `sedona` is rebound
# to its result.
sedona = SedonaContext.create(sedona)

In [None]:
# Quarterly snapshot dates (the 1st of January, April, July and October) for
# the years 2025 down to 2018. Years run newest-first, months ascending within
# each year. Dates later than today are left out.
today = date.today()
snapshot_dates = []
for year in range(2025, 2017, -1):
    for month in (1, 4, 7, 10):
        quarter_start = date(year, month, 1)
        if quarter_start <= today:
            snapshot_dates.append(quarter_start)

print("Snapshot dates:")
for snapshot_date in snapshot_dates:
    print(snapshot_date.strftime("%Y-%m-%d"))

In [None]:
# Property keys whose distinct values are collected from every snapshot.
FEATURES_TO_RETAIN = [
    'building',
    'amenity',
    'leisure',
    'public_transport',
    'office',
    'shop',
    'tourism',
]

# Explicit read schema, assembled bottom-up. Every field is nullable.

# Top-level "crs" object: {"properties": {"name": ...}, "type": ...}
_crs_type = (
    StructType()
    .add("properties", StructType().add("name", StringType(), True), True)
    .add("type", StringType(), True)
)

# Per-feature "properties" object; all strings except the numeric "id".
_properties_type = StructType()
for _field_name, _field_type in (
    ("amenity", StringType()),
    ("building", StringType()),
    ("element", StringType()),
    ("id", LongType()),
    ("leisure", StringType()),
    ("name", StringType()),
    ("office", StringType()),
    ("province", StringType()),
    ("public_transport", StringType()),
    ("region", StringType()),
    ("shop", StringType()),
    ("tourism", StringType()),
):
    _properties_type.add(_field_name, _field_type, True)

# One element of the "features" array.
_feature_type = (
    StructType()
    .add("geometry", GeometryType(), True)
    .add("properties", _properties_type, True)
    .add("type", StringType(), True)
)

schema = (
    StructType()
    .add("crs", _crs_type, True)
    .add("features", ArrayType(_feature_type), True)
    .add("name", StringType(), True)
    .add("type", StringType(), True)
)

In [None]:
# For every snapshot, read the GeoJSON files from S3 and accumulate, per
# retained property, the set of distinct values seen across all snapshots.
# Null (missing) values are not filtered out, so a set may contain None.
unique_features = {}
for snapshot_date in tqdm(snapshot_dates, desc="Processing snapshot dates"):
    snapshot_date_str = snapshot_date.strftime("%Y-%m-%d")

    df = sedona.read.format("geojson") \
        .option("multiLine", "true") \
        .schema(schema) \
        .load(f"s3a://amenities-dataset/amenities_v2/date={snapshot_date_str}/")

    # One row per element of the "features" array.
    exploded_df = df.select(explode("features").alias("feature"))

    # Project down to the retained properties BEFORE caching, so geometries and
    # unused attributes are neither shuffled nor held in the cache. The cached
    # frame is reused by each per-feature distinct() job below.
    values_df = exploded_df.select(
        *[col(f"feature.properties.{feature}").alias(feature) for feature in FEATURES_TO_RETAIN]
    ).repartition(8).persist(StorageLevel.MEMORY_AND_DISK)

    try:
        for feature in FEATURES_TO_RETAIN:
            distinct_values = [
                row[0]
                for row in values_df.select(feature).distinct().collect()
            ]
            unique_features.setdefault(feature, set()).update(distinct_values)
    finally:
        # Release the cache even when a job fails, so re-running the cell in
        # the same kernel does not pile up stale cached snapshots.
        values_df.unpersist()

In [None]:
import json

# Sets are unordered and not JSON-serializable, so convert each one to a list.
# Sorting keeps the exported file stable across runs. None (a missing value)
# cannot be compared with str, so the key orders it ahead of all strings.
serializable_features = {
    k: sorted(v, key=lambda value: (value is not None, value or ""))
    for k, v in unique_features.items()
}

# Create the output directory on first run instead of failing inside open().
DATASET_DIR.mkdir(parents=True, exist_ok=True)

with open(DATASET_DIR / "unique_features.json", "w") as f:
    json.dump(serializable_features, f, indent=4)