In [45]:
import numpy as np
from scipy.linalg import eigh
import pandas as pd
from pyspark.sql import SparkSession
from scipy.spatial import KDTree
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [20]:
def read_csv_point_cloud(csv_file):
    data = pd.read_csv(csv_file, header=None)
    return data

In [21]:
csv_file = "../../Impl/Data/Vaihingen/area2_cov_multi.csv"
points = read_csv_point_cloud(csv_file)

In [22]:
points

Unnamed: 0,0
0,0.00138718 1.10728 0.0171147 0 0.112454 0.8875...
1,0.00138718 1.10974 0.0171147 0 0.0746411 0.925...
2,0.00154131 1.11221 0.0171147 0 0.0746411 0.925...
3,0.00154131 1.11221 0.0171147 0 0.0746411 0.925...
4,0.00354501 1.11714 0.0171147 0 0.493422 0.5065...
...,...
266670,1.06612 1.76079 0.0214057 0.30497 0.385871 0.3...
266671,1.06628 1.76079 0.0171147 0.270976 0.440006 0....
266672,1.06643 1.76326 0.0171147 0.29536 0.318112 0.3...
266673,1.06643 1.76572 0.0171147 0.233511 0.364512 0....


In [23]:
points[['x', 'y', 'z', 'cl', 'cs', 'cp', 'label']] = points[0].str.split(' ', expand=True)
points

Unnamed: 0,0,x,y,z,cl,cs,cp,label
0,0.00138718 1.10728 0.0171147 0 0.112454 0.8875...,0.00138718,1.10728,0.0171147,0,0.112454,0.887546,2
1,0.00138718 1.10974 0.0171147 0 0.0746411 0.925...,0.00138718,1.10974,0.0171147,0,0.0746411,0.925359,2
2,0.00154131 1.11221 0.0171147 0 0.0746411 0.925...,0.00154131,1.11221,0.0171147,0,0.0746411,0.925359,2
3,0.00154131 1.11221 0.0171147 0 0.0746411 0.925...,0.00154131,1.11221,0.0171147,0,0.0746411,0.925359,2
4,0.00354501 1.11714 0.0171147 0 0.493422 0.5065...,0.00354501,1.11714,0.0171147,0,0.493422,0.506578,2
...,...,...,...,...,...,...,...,...
266670,1.06612 1.76079 0.0214057 0.30497 0.385871 0.3...,1.06612,1.76079,0.0214057,0.30497,0.385871,0.309159,5
266671,1.06628 1.76079 0.0171147 0.270976 0.440006 0....,1.06628,1.76079,0.0171147,0.270976,0.440006,0.289018,8
266672,1.06643 1.76326 0.0171147 0.29536 0.318112 0.3...,1.06643,1.76326,0.0171147,0.29536,0.318112,0.386527,8
266673,1.06643 1.76572 0.0171147 0.233511 0.364512 0....,1.06643,1.76572,0.0171147,0.233511,0.364512,0.401977,8


In [24]:
points.drop(columns=[0], inplace=True)

In [25]:
points

Unnamed: 0,x,y,z,cl,cs,cp,label
0,0.00138718,1.10728,0.0171147,0,0.112454,0.887546,2
1,0.00138718,1.10974,0.0171147,0,0.0746411,0.925359,2
2,0.00154131,1.11221,0.0171147,0,0.0746411,0.925359,2
3,0.00154131,1.11221,0.0171147,0,0.0746411,0.925359,2
4,0.00354501,1.11714,0.0171147,0,0.493422,0.506578,2
...,...,...,...,...,...,...,...
266670,1.06612,1.76079,0.0214057,0.30497,0.385871,0.309159,5
266671,1.06628,1.76079,0.0171147,0.270976,0.440006,0.289018,8
266672,1.06643,1.76326,0.0171147,0.29536,0.318112,0.386527,8
266673,1.06643,1.76572,0.0171147,0.233511,0.364512,0.401977,8


In [26]:
points_arr = points[['x', 'y', 'z']].to_numpy(dtype=np.float64)
points_arr

array([[1.38718e-03, 1.10728e+00, 1.71147e-02],
       [1.38718e-03, 1.10974e+00, 1.71147e-02],
       [1.54131e-03, 1.11221e+00, 1.71147e-02],
       ...,
       [1.06643e+00, 1.76326e+00, 1.71147e-02],
       [1.06643e+00, 1.76572e+00, 1.71147e-02],
       [1.06967e+00, 1.76326e+00, 1.71147e-02]])

In [27]:
def compute_geometric_features(points_arr):
    cov_matrix =np.cov(points_arr, rowvar=False)
    eigenvalues, eigenvectors = eigh(cov_matrix)
    idx = eigenvalues.argsort()[::-1]
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:, idx]
    omnivariance = np.prod(eigenvalues) ** (1/3)
    eigenentropy = -np.sum(eigenvalues * np.log(eigenvalues))
    anisotropy = (eigenvalues[0] - eigenvalues[2]) / eigenvalues[0]
    linearity = (eigenvalues[0] - eigenvalues[1]) / eigenvalues[0]
    planarity = (eigenvalues[1] - eigenvalues[2]) / eigenvalues[0]
    scattering = eigenvalues[2] / eigenvalues[0]
    sum_of_eigenvalues = np.sum(eigenvalues)
    change_of_curvature = eigenvalues[2] / np.sum(eigenvalues)
    return omnivariance, eigenentropy, anisotropy, linearity, planarity, scattering, sum_of_eigenvalues, change_of_curvature


In [28]:
print(compute_geometric_features(points_arr))

(0.014169021351177946, 0.3410513744379214, 0.9885157805234319, 0.3244220972197599, 0.664093683303672, 0.011484219476568159, 0.12074763212830705, 0.006807229754649193)


In [30]:
def find_k_closest_points(points, query_point, k):
    kdtree = KDTree(points)
    _, indices = kdtree.query(query_point, k) # Returns distances and indices of the k-closest points to the query points.

    return indices

In [31]:
def find_points_within_radius(points, query_point, r):
    kdtree = KDTree(points)
    indices = kdtree.query_ball_point(query_point, r) # Returns indices of all points within distance r from the query point.
    return indices

In [32]:
len(find_points_within_radius(points_arr, points_arr[3222], 0.2))

27747

In [35]:
import scipy

In [38]:
import os
import sys

os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

In [54]:
from pyspark.sql.functions import udf
from pyspark.sql.types import *

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
# Convert numpy array to PySpark DataFrame
points_df = spark.createDataFrame(points)
# Define UDF to compute geometric features
@udf(returnType=ArrayType(DoubleType()))
def compute_geometric_features_udf(point, neighbors):
    # Compute geometric features using point and neighbors
    # Replace this with your actual implementation
    return [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]
# Find neighbors within radius 0.25 for each point
neighbors_df = points_df.rdd.map(lambda row: (row, find_points_within_radius(points_arr, row, 0.25))).toDF(["point", "neighbors"])
# Apply UDF to compute geometric features using neighbors
geometric_features_df = neighbors_df.withColumn("geometric_features",
    compute_geometric_features_udf(neighbors_df["point"], neighbors_df["neighbors"]))
# Collect the results as a new DataFrame
result_df = geometric_features_df.select("point", "geometric_features")

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 15.0 failed 1 times, most recent failure: Lost task 0.0 in stage 15.0 (TID 225) (Vidhish-HP executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\Vidhish17\Desktop\RE\gvcl\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1227, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Vidhish17\Desktop\RE\gvcl\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 90, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Vidhish17\Desktop\RE\gvcl\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 174, in _read_with_length
    return self.loads(obj)
           ^^^^^^^^^^^^^^^
  File "C:\Users\Vidhish17\Desktop\RE\gvcl\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 472, in loads
    return cloudpickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'scipy'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:181)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "C:\Users\Vidhish17\Desktop\RE\gvcl\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 1227, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Vidhish17\Desktop\RE\gvcl\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 90, in read_command
    command = serializer._read_with_length(file)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Vidhish17\Desktop\RE\gvcl\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 174, in _read_with_length
    return self.loads(obj)
           ^^^^^^^^^^^^^^^
  File "C:\Users\Vidhish17\Desktop\RE\gvcl\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 472, in loads
    return cloudpickle.loads(obj, encoding=encoding)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ModuleNotFoundError: No module named 'scipy'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:181)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2438)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
