In [0]:
# Load the "old" snapshot revision used as the left-hand side of the comparison.
# The commented-out lines inside the query are alternative revisions kept for reference.
old_snapshot_query = """
    SELECT *
    FROM 
    my_database.apt_snapshot_revision_35318151_14533 ---APR_snapshot
    -- my_database.apt_snapshot_revision_32248273_14533 ----Latest snaphot
    --from my_database.apt_snapshot_revision_19280027_14533 ----OLD snapsht
"""
apt_snapshot_old = spark.sql(old_snapshot_query)

In [0]:
from pyspark.sql import functions as F

def convert_to_unsigned(high, low, layer_id=14533):
    """Build a string identifier from the two signed 64-bit halves of an id.

    Each half is masked to 64 bits, which reinterprets a negative value as its
    unsigned two's-complement equivalent (e.g. -1 -> 18446744073709551615).

    Args:
        high: Upper half of the id; any value accepted by ``int()``, or None.
        low: Lower half of the id; any value accepted by ``int()``, or None.
        layer_id: Prefix placed in front of the two halves. Defaults to 14533,
            the value that used to be hard-coded in the body.

    Returns:
        The string ``"<layer_id>_<unsigned high>_<unsigned low>"``, or None
        when either half is None (null in, null out) instead of raising
        ``TypeError`` from ``int(None)``.
    """
    if high is None or low is None:
        return None

    mask_64 = (1 << 64) - 1
    return f"{layer_id}_{int(high) & mask_64}_{int(low) & mask_64}"


# Add the string id built from id.high / id.low to the old snapshot, then keep
# only rows having at least one tag whose tagKey.key is 'metadata:apa:improvement'.
unsigned_id_udf = F.udf(convert_to_unsigned, "string")
apt_snapshot_old = apt_snapshot_old.withColumn(
    "unsigned_id",
    unsigned_id_udf("id.high", "id.low"),
)
apt_snapshot_old_apa = apt_snapshot_old.where(
    F.expr("exists(tags, tag -> tag.tagKey.key = 'metadata:apa:improvement')")
)



In [0]:
# Load the "new" snapshot revision and attach the string id built from id.high / id.low.
new_snapshot_query = """
    SELECT *
    FROM 
    my_database.apt_snapshot_revision_35390657_14533 
"""
apt_snapshot_new = spark.sql(new_snapshot_query).withColumn(
    "unsigned_id",
    F.udf(convert_to_unsigned, "string")("id.high", "id.low"),
)

# Keep only rows having at least one tag whose tagKey.key is 'metadata:apa:improvement'.
apt_snapshot_new_apa = apt_snapshot_new.where(
    F.expr("exists(tags, tag -> tag.tagKey.key = 'metadata:apa:improvement')")
)


In [0]:
# Row counts of each full snapshot, followed by the counts of their
# 'metadata:apa:improvement' subsets. Each count is computed right before it is printed.
old_total = apt_snapshot_old.count()
print(f"apt old count : {old_total}")

new_total = apt_snapshot_new.count()
print(f"apt new count : {new_total}")

old_apa_total = apt_snapshot_old_apa.count()
print(f"apa improvement old count : {old_apa_total}")

new_apa_total = apt_snapshot_new_apa.count()
print(f"apa improvement new count : {new_apa_total}")

In [0]:
## Relocated APTs that exist in the old snapshot but not in the new one

# Left anti join: keep the tagged old rows whose unsigned_id has no match
# among the tagged rows of the new snapshot.
delta_apa = apt_snapshot_old_apa.join(
    apt_snapshot_new_apa,
    on="unsigned_id",
    how="leftanti",
)

display(delta_apa)

In [0]:
## Count of distinct unsigned_id values in delta_apa (the missing relocated APTs)

# Previously recorded result, kept for reference:
# apt_snapshot_revision_35042875_14533 vs apt_snapshot_revision_35318151_14533 difference : 2300
# NOTE(review): the revision pair named above differs from the revisions queried in the
# visible cells of this notebook — TODO confirm whether this figure is still relevant.

delta_apa.select("unsigned_id").distinct().count()

In [0]:
# Number of distinct (lat, lng) pairs among the rows of delta_apa.
delta_apa.select("lat", "lng").distinct().count()

In [0]:
# Left join on unsigned_id: every tagged old row is kept, and the two sides are
# aliased so their columns can be referenced as old.<col> / new.<col>.
old_side = apt_snapshot_old_apa.alias("old")
new_side = apt_snapshot_new_apa.alias("new")
matching_apa = old_side.join(new_side, on="unsigned_id", how="left")
display(matching_apa)

In [0]:
from pyspark.sql import functions as F
import math

# Define a UDF to calculate distance using the Haversine formula
def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance between two points using the Haversine formula.

    Args:
        lon1: Longitude of the first point, in degrees (or None).
        lat1: Latitude of the first point, in degrees (or None).
        lon2: Longitude of the second point, in degrees (or None).
        lat2: Latitude of the second point, in degrees (or None).

    Returns:
        The distance in meters as a float, based on an Earth radius of
        6,371,000 m, or None when any coordinate is None.
    """
    # Null coordinates (e.g. the unmatched side of an outer join) would make
    # math.radians raise TypeError; propagate the null instead.
    if lon1 is None or lat1 is None or lon2 is None or lat2 is None:
        return None

    R = 6371000  # Earth radius in meters
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    a = math.sin(delta_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1], which
    # would make math.sqrt(1 - a) raise ValueError; clamp it first.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return R * c

# Register the UDF with an explicit double return type. F.udf defaults to a string
# return type, which would turn distance_meters into a string column and make any
# sort on it lexicographic (e.g. "99.5" ranked above "1000.2") instead of numeric.
haversine_udf = F.udf(haversine, "double")

# For every joined row add:
#   lat_lon_match   - whether old and new coordinates are identical
#   old_wkt/new_wkt - the old / new position as a WKT "POINT(lng lat)" string
#   distance_meters - Haversine distance between the old and new position
not_matched_apt = (
    matching_apa
    .withColumn("lat_lon_match", F.expr("old.lat = new.lat AND old.lng = new.lng"))
    .withColumn(
        "old_wkt",
        F.concat(F.lit("POINT("), F.col("old.lng"), F.lit(" "), F.col("old.lat"), F.lit(")")),
    )
    .withColumn(
        "new_wkt",
        F.concat(F.lit("POINT("), F.col("new.lng"), F.lit(" "), F.col("new.lat"), F.lit(")")),
    )
    .withColumn("distance_meters", haversine_udf("old.lng", "old.lat", "new.lng", "new.lat"))
)

display(not_matched_apt)

In [0]:
# Keep only the rows flagged lat_lon_match = false and sort them by
# distance_meters in descending order.
not_matched_apt = (
    not_matched_apt
    .where(F.expr("lat_lon_match = false"))
    .orderBy(F.desc("distance_meters"))
)

display(not_matched_apt)

In [0]:
# Number of rows currently held in not_matched_apt.
not_matched_apt.count()

In [0]:
# Inspect a single row of not_matched_apt, selected by its unsigned_id.
# NOTE(review): this comment previously named unsigned_id
# 14533_14594831958515712000_14839047867195262075, which is not the id filtered on below.

not_matched_apt.filter(F.expr("unsigned_id = '14533_11517816655427207168_15349482073535523071'")).display()