In [14]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import radians, cos, sin, col
from pyspark.ml.feature import VectorAssembler, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors


# Load the staged Gaia table and keep only the identifier and position columns.
df = spark.read.format("delta").table("stg_gaia").select(
    ["source_id","ra","dec","distance","distance_gspphot"]
)

# 3. Convert spherical (ra, dec, distance_gspphot) to Cartesian (x, y, z).
#    ra and dec are passed through radians() before the trig functions.
# NOTE(review): rows where distance_gspphot is null would yield null x/y/z --
# confirm whether stg_gaia is already filtered upstream.
_ra = radians(col("ra"))
_dec = radians(col("dec"))
_dist = col("distance_gspphot")

df_coordinates = df.select(
    col("source_id"),
    (_dist * cos(_dec) * cos(_ra)).alias("x"),
    (_dist * cos(_dec) * sin(_ra)).alias("y"),
    (_dist * sin(_dec)).alias("z"),
)

# 4. Pack the x, y, z columns into a single vector column named "features",
#    then keep only the identifier alongside that vector.
assembler = VectorAssembler(outputCol="features", inputCols=["x","y","z"])
df_vec = assembler.transform(df_coordinates)
df_vec = df_vec.select("source_id","features")

# Printing the DataFrame object shows its column names and types.
print(df_vec)
# 5. Configure the LSH estimator and fit it on the feature vectors.
#    bucketLength sets the width of each hash bucket;
#    numHashTables sets how many hash tables are built.
lsh = (
    BucketedRandomProjectionLSH()
    .setInputCol("features")
    .setOutputCol("hashes")
    .setBucketLength(5.0)
    .setNumHashTables(3)
)
# Per the original author's note, fitting "trains" by picking random
# projection vectors.
model = lsh.fit(df_vec)

# OneLake location (Files area) where the fitted LSH model is stored.
model_path = "abfss://b3979a08-4bbe-4d35-b864-4da7d2c8b2b4@onelake.dfs.fabric.microsoft.com/608b9b1d-a537-4410-abb1-240bd8dc87ff/Files/NNmodel"

# Save the fitted model, replacing any model already stored at model_path.
# A plain save() raises when the path exists; the writer's overwrite() mode
# makes Spark replace the existing directory itself. This avoids the previous
# manual ls/rm cleanup, whose broad `except Exception` reported every failure
# (permissions, network, a failed delete) as "folder does not exist" and then
# let save() fail on the still-existing path.
model.write().overwrite().save(model_path)

# Persist the Cartesian coordinates as the Delta table "stg_coordinates",
# replacing any previous contents of that table.
(
    df_coordinates.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("stg_coordinates")
)



StatementMeta(, a03e3afc-2989-4863-9ed6-1a491ff96cfc, 16, Finished, Available, Finished)

DataFrame[source_id: bigint, features: vector]


in
