In [3]:
import os
import geopandas as gpd
from shapely.geometry import MultiLineString, LineString
import geopandas as gpd
import cuspatial
import cudf
import time

In [4]:

# Directory that holds the input vector files.
folder_path = r'Data'
# NOTE(review): os.listdir returns entries in arbitrary order, so which file
# ends up as lines / points_fine / points_coarse depends on the directory
# listing — confirm the mapping against the actual file names.
file_names = os.listdir(folder_path)

# Read the first three directory entries: line layer, fine points, coarse points.
lines, points_fine, points_coarse = [
    gpd.read_file(f'Data/{file_names[idx]}') for idx in range(3)
]


In [14]:
# Show the first rows of each layer, one block per layer.
print(*(layer.head() for layer in (lines, points_fine, points_coarse)), sep="\n")

                                            geometry
0  LINESTRING (698700.023 6752939.839, 698657.573...
1  LINESTRING (698657.573 6752977.403, 698700.897...
2  LINESTRING (699053.068 6753243.991, 699124.000...
3  LINESTRING (699420.273 6753430.681, 699488.419...
4  LINESTRING (699420.273 6753430.681, 699422.060...
                   geometry
0  POINT (3.01190 47.87969)
1  POINT (3.01190 47.87970)
2  POINT (3.01190 47.87971)
3  POINT (3.01190 47.87971)
4  POINT (3.01191 47.87969)
                   geometry
0  POINT (3.01191 47.87970)
1  POINT (3.01177 47.87951)
2  POINT (3.01177 47.87955)
3  POINT (3.01178 47.87958)
4  POINT (3.01180 47.87951)


In [2]:

class LineToMultiLineConversion:
    """
    Merge the LineStrings of a GeoPandas DataFrame into one MultiLineString,
    optionally repeating that MultiLineString over several rows.

    Attributes:
        geopandasDataFrame (GeoDataFrame): The input GeoPandas DataFrame. It is
            reprojected IN PLACE to ``EPSG`` when the converter is created.
        EPSG (int): The EPSG code for the coordinate reference system.
    """

    def __init__(self, geopandasDataFrame, EPSG=3857):
        """
        Stores the GeoDataFrame and reprojects it in place to the given EPSG code.

        Inputs:
            geopandasDataFrame (GeoDataFrame): The input GeoPandas DataFrame.
                Note that the caller's object is modified (``to_crs(..., inplace=True)``).
            EPSG (int, optional): The EPSG code for the coordinate reference system. Defaults to 3857.

        Raises:
            TypeError: If EPSG is not an integer (booleans are rejected as well).
            ValueError: If EPSG is zero or negative.
        """
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(EPSG, bool) or not isinstance(EPSG, int):
            raise TypeError("The EPSG must be an integer")

        # Zero is rejected too, so the message must say "positive".
        if EPSG <= 0:
            raise ValueError("The EPSG must be a positive integer")

        self.geopandasDataFrame = geopandasDataFrame
        self.EPSG = EPSG
        # In-place reprojection is kept on purpose: existing callers rely on the
        # input frame being in the target CRS afterwards.
        self.geopandasDataFrame.to_crs(EPSG, inplace=True)

    def convertToMultiline(self, duplicate=False, n_times=10):
        """
        Converts the LineStrings in the GeoDataFrame to a MultiLineString.
        Optionally duplicates the MultiLineString a specified number of times.

        Inputs:
            duplicate (bool, optional): If True, the result has n_times rows, each
                holding the same MultiLineString. Defaults to False.
            n_times (int, optional): Number of rows to produce when duplicate is True.
                Ignored otherwise. Defaults to 10.

        Outputs:
            GeoDataFrame: A new GeoDataFrame with a single 'geometry' column holding
                          the MultiLineString (once, or n_times times). The CRS is
                          set to self.EPSG in both cases.
        """
        multiline = MultiLineString(self.geopandasDataFrame.geometry.tolist())

        # All rows reference the same geometry object; no copies are made here.
        copies = n_times if duplicate else 1

        # The CRS is attached for the single-row result too, so both variants
        # carry the same CRS.
        return gpd.GeoDataFrame({'geometry': [multiline] * copies}, crs=self.EPSG)

class PointLinesDistanceCalculation:
    """
    A class to calculate the distance between points and lines using cuSpatial,
    processing the points in batches.

    Attributes:
        gpdDataFramePoints (GeoDataFrame): The input GeoDataFrame containing points
            (reprojected in place to EPSG).
        gpdDataFrameMultiLines (GeoDataFrame): One row per point, each row holding
            the MultiLineString built from all input lines.
        n (int): The number of points.
        EPSG (int): The EPSG code for the coordinate reference system.
    """

    def __init__(self, gpdDataFramePoints, gpdDataFrameLines, EPSG=3857):
        """
        Initializes the PointLinesDistanceCalculation class with the given GeoDataFrames
        and EPSG code.

        Both input frames are reprojected in place to EPSG.

        Inputs:
            gpdDataFramePoints (GeoDataFrame): The input GeoDataFrame containing points.
            gpdDataFrameLines (GeoDataFrame): The input GeoDataFrame containing lines.
            EPSG (int, optional): The EPSG code for the coordinate reference system. Defaults to 3857.
        """
        # Initialize the LineToMultiLineConversion with the lines GeoDataFrame and EPSG
        converter = LineToMultiLineConversion(gpdDataFrameLines, EPSG)

        # Store the number of points
        self.n = len(gpdDataFramePoints)
        self.EPSG = EPSG

        # Convert the lines to MultiLineString, duplicated to match the number of points
        # (the pairwise distance call below pairs row k of the points with row k of the lines)
        self.gpdDataFrameMultiLines = converter.convertToMultiline(duplicate=True, n_times=self.n)

        # Convert the points GeoDataFrame to the specified CRS
        gpdDataFramePoints.to_crs(self.EPSG, inplace=True)
        self.gpdDataFramePoints = gpdDataFramePoints

    def calculateDistance(self, batch_size=5000, max_batches=1):
        """
        Calculates the distance between points and lines in batches using cuSpatial.

        Inputs:
            batch_size (int, optional): The number of points and lines to process in each batch. Defaults to 5000.
            max_batches (int or None, optional): Upper bound on the number of batches
                that are processed. Defaults to 1, which reproduces the previous
                behaviour of stopping after the first batch (used to probe batch
                sizes, as large datasets lead to crashes). Pass None to process
                every batch.

        Outputs:
            list: One entry per processed batch, each being the object returned by
                  cuspatial.pairwise_point_linestring_distance for that batch.

        Raises:
            ValueError: If batch_size or max_batches is zero or negative.
        """
        # A negative step would make range() empty and silently return [].
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")
        if max_batches is not None and max_batches <= 0:
            raise ValueError("max_batches must be a positive integer or None")

        distances = []
        batch_starts = range(0, len(self.gpdDataFramePoints), batch_size)

        # Iterate over the points in batches
        for x, i in enumerate(batch_starts, start=1):
            if max_batches is not None and x > max_batches:
                break

            # Print progress information
            print(f"\n___________THIS IS RUN NR. {x} ___________")
            print(f"___________Number of Samples: ~{x * batch_size} ___________")

            # Get the current batch of points and the matching rows of lines
            batch_points = self.gpdDataFramePoints.iloc[i:i + batch_size]
            batch_lines = self.gpdDataFrameMultiLines.iloc[i:i + batch_size]

            print("\n Starting the conversion...")
            # Convert the batches to cuSpatial objects
            points_cu_gdf = cuspatial.from_geopandas(batch_points.geometry)
            lines_cu_gdf = cuspatial.from_geopandas(batch_lines.geometry)
            print("\nConversion finished\n")

            print("\nStarting the distance calculation, please wait...")
            # Only the distance call itself is timed, not the conversion above.
            start_time = time.time()
            batch_distances = cuspatial.pairwise_point_linestring_distance(points_cu_gdf, lines_cu_gdf)
            elapsed_time = time.time() - start_time
            print(f"Elapsed time: {elapsed_time} seconds")

            # Append the distances of the current batch to the distances list
            distances.append(batch_distances)

        return distances
       
class PointLinesDistanceCalculationAlternative:
    """
    Point-to-line distance calculation with cuSpatial in one pass (no batching).

    Attributes:
        gpdDataFramePoints (GeoDataFrame): The point layer, reprojected in place to EPSG.
        gpdDataFrameMultiLines (GeoDataFrame): One row per point, every row holding
            the MultiLineString produced by LineToMultiLineConversion.
        n (int): Number of rows in the point layer.
        EPSG (int): The EPSG code for the coordinate reference system.
    """

    def __init__(self, gpdDataFramePoints, gpdDataFrameLines, EPSG=3857):
        """
        Prepares the point and line layers for the distance calculation.

        Inputs:
            gpdDataFramePoints (GeoDataFrame): The input GeoDataFrame containing points.
            gpdDataFrameLines (GeoDataFrame): The input GeoDataFrame containing lines.
            EPSG (int, optional): The EPSG code for the coordinate reference system. Defaults to 3857.
        """
        # The converter validates EPSG and reprojects the line layer.
        line_merger = LineToMultiLineConversion(gpdDataFrameLines, EPSG)

        self.n = len(gpdDataFramePoints)
        self.EPSG = EPSG

        # Repeat the merged geometry once per point so both layers have equal length.
        self.gpdDataFrameMultiLines = line_merger.convertToMultiline(
            duplicate=True,
            n_times=self.n,
        )

        # Reproject the caller's point layer in place, then keep a reference to it.
        gpdDataFramePoints.to_crs(self.EPSG, inplace=True)
        self.gpdDataFramePoints = gpdDataFramePoints

    def calculateDistance(self):
        """
        Calculates the pairwise distance between all points and the line rows
        with a single cuSpatial call and prints the time that call took.

        Outputs:
            The object returned by cuspatial.pairwise_point_linestring_distance.
        """
        print("\n Starting the conversion...")
        # Hand both geometry columns over to cuSpatial.
        gpu_points = cuspatial.from_geopandas(self.gpdDataFramePoints.geometry)
        gpu_multilines = cuspatial.from_geopandas(self.gpdDataFrameMultiLines.geometry)
        print("\nConversion finished\n")

        # The timer starts before the announcement is printed.
        started_at = time.time()
        print("\nStarting the distance calculation, please wait...")
        result = cuspatial.pairwise_point_linestring_distance(gpu_points, gpu_multilines)
        duration = time.time() - started_at
        print(f"Elapsed time: {duration} seconds")

        return result




In [16]:
# Unbatched cuSpatial run: coarse points against the merged line geometry.
# Constructing the calculator reprojects points_coarse and lines in place (default EPSG 3857).
calculator = PointLinesDistanceCalculationAlternative(points_coarse, lines)
distances = calculator.calculateDistance()


 Starting the conversion...

Conversion finished


Starting the distance calculation, please wait...
Elapsed time: 0.18236207962036133 seconds


In [17]:
# Batched cuSpatial run on the fine points, batch_size=20000.
# Overwrites `distances` from the previous cell.
calculato_batch = PointLinesDistanceCalculation(points_fine, lines)
distances = calculato_batch.calculateDistance(batch_size=20000)


___________THIS IS RUN NR. 1 ___________
___________Number of Samples: ~20000 ___________

 Starting the conversion...

Conversion finished


Starting the distance calculation, please wait...
Elapsed time: 0.27135539054870605 seconds


In [18]:
# Same benchmark with batch_size=40000; `calculato_batch` and `distances` are rebound.
calculato_batch = PointLinesDistanceCalculation(points_fine, lines)
distances = calculato_batch.calculateDistance(batch_size=40000)


___________THIS IS RUN NR. 1 ___________
___________Number of Samples: ~40000 ___________

 Starting the conversion...

Conversion finished


Starting the distance calculation, please wait...
Elapsed time: 0.18142008781433105 seconds


In [7]:
# Same benchmark with batch_size=50000; `calculato_batch` and `distances` are rebound.
calculato_batch = PointLinesDistanceCalculation(points_fine, lines)
distances = calculato_batch.calculateDistance(batch_size=50000)


___________THIS IS RUN NR. 1 ___________
___________Number of Samples: ~50000 ___________

 Starting the conversion...

Conversion finished


Starting the distance calculation, please wait...
Elapsed time: 0.21275687217712402 seconds


In [19]:
# Same benchmark with batch_size=60000; `calculato_batch` and `distances` are rebound.
calculato_batch = PointLinesDistanceCalculation(points_fine, lines)
distances = calculato_batch.calculateDistance(batch_size=60000)


___________THIS IS RUN NR. 1 ___________
___________Number of Samples: ~60000 ___________

 Starting the conversion...

Conversion finished


Starting the distance calculation, please wait...
Elapsed time: 0.7506148815155029 seconds


In [5]:
# Same benchmark with batch_size=65000; `calculato_batch` and `distances` are rebound.
calculato_batch = PointLinesDistanceCalculation(points_fine, lines)
distances = calculato_batch.calculateDistance(batch_size=65000)


___________THIS IS RUN NR. 1 ___________
___________Number of Samples: ~65000 ___________

 Starting the conversion...

Conversion finished


Starting the distance calculation, please wait...
Elapsed time: 3.269395589828491 seconds


In [17]:


def cpuDistanceCalculationTest(points, lines, batch_size, EPSG=3857):
    """
    CPU (GeoPandas) counterpart of the cuSpatial benchmark: measures how long
    the row-wise distance between one batch of points and the merged line
    geometry takes.

    Inputs:
        points (GeoDataFrame): The point layer. Not modified; a reprojected copy
            is used when its CRS differs from the CRS of the merged lines.
        lines (GeoDataFrame): The line layer. Not modified; the conversion works
            on a copy.
        batch_size (int): Number of points per batch.
        EPSG (int, optional): EPSG code both layers are brought into before the
            distance is computed. Defaults to 3857.

    Outputs:
        The result of ``batch_points.distance(...)`` for the first batch, or
        None when ``points`` is empty.

    Raises:
        ValueError: If batch_size is zero or negative.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Loop-invariant preparation, done once.
    # LineToMultiLineConversion reprojects its input in place, so it gets a copy.
    lines_projected = lines.copy()
    converter = LineToMultiLineConversion(lines_projected, EPSG)
    multilines = converter.convertToMultiline(duplicate=True, n_times=batch_size)

    # The converter always puts the lines into EPSG, so the points have to
    # follow; otherwise the distance mixes two coordinate systems.
    if points.crs != multilines.crs:
        points = points.to_crs(multilines.crs)

    # Defined up front so an empty point layer returns None instead of
    # failing with an unbound local.
    distance = None

    for x, i in enumerate(range(0, len(points), batch_size), start=1):
        # Print progress information
        print(f"\n___________THIS IS RUN NR. {x} ___________")
        print(f"___________Number of Samples: ~{x * batch_size} ___________")

        # Get the current batch of points and exactly as many line rows.
        # `multilines` only has batch_size rows, so it is sliced from the start
        # and trimmed to the (possibly shorter) point batch to keep indices aligned.
        batch_points = points.iloc[i:i + batch_size].reset_index(drop=True)
        batch_multiLines = multilines.iloc[:len(batch_points)].reset_index(drop=True)

        # Check the indices of the batches
        print(f"Indices of batch_points: {batch_points.index}")
        print(f"Indices of batch_multiLines: {batch_multiLines.index}")

        print("\nStarting the distance calculation, please wait...")
        start_time = time.time()
        distance = batch_points.distance(other=batch_multiLines)
        elapsed_time = time.time() - start_time
        print(f"Elapsed time: {elapsed_time} seconds")

        break  # This is here to test different batch sizes and performance as large datasets lead to crashes

    return distance


In [19]:
batch_sizes = [20000, 40000, 50000, 60000, 65000]

# The benchmark runs on points_fine, so that is the layer that has to be in
# EPSG:3857 (the default CRS LineToMultiLineConversion projects the lines to).
# The reprojection does not depend on the batch size, so it is done once here.
points_fine.to_crs(3857, inplace=True)
# Kept from the original cell so points_coarse stays in EPSG:3857 as well.
points_coarse.to_crs(3857, inplace=True)

for batch_size in batch_sizes:
    cpuDistanceCalculationTest(points_fine, lines, batch_size)


___________THIS IS RUN NR. 1 ___________
___________Number of Samples: ~20000 ___________
Indices of batch_points: RangeIndex(start=0, stop=20000, step=1)
Indices of batch_multiLines: RangeIndex(start=0, stop=20000, step=1)

Starting the distance calculation, please wait...
Elapsed time: 0.2357192039489746 seconds

___________THIS IS RUN NR. 1 ___________
___________Number of Samples: ~40000 ___________
Indices of batch_points: RangeIndex(start=0, stop=40000, step=1)
Indices of batch_multiLines: RangeIndex(start=0, stop=40000, step=1)

Starting the distance calculation, please wait...
Elapsed time: 0.405712366104126 seconds

___________THIS IS RUN NR. 1 ___________
___________Number of Samples: ~50000 ___________
Indices of batch_points: RangeIndex(start=0, stop=50000, step=1)
Indices of batch_multiLines: RangeIndex(start=0, stop=50000, step=1)

Starting the distance calculation, please wait...
Elapsed time: 0.5567817687988281 seconds

___________THIS IS RUN NR. 1 ___________
________