In [2]:
import geopandas as gpd
import pandas as pd
import sys
import os

In [3]:
# Start the search from the notebook's current working directory
current_dir = os.path.abspath('')

# Walk up the directory hierarchy until a folder containing 'constants.py' is found
project_root = current_dir
while not os.path.isfile(os.path.join(project_root, 'constants.py')):
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # os.path.dirname() of the filesystem root returns the root itself, so without
        # this guard the loop would spin forever when 'constants.py' does not exist.
        raise FileNotFoundError(
            f"Could not find 'constants.py' in '{current_dir}' or any of its parent directories"
        )
    project_root = parent_dir

# Add the project root to the Python path (only once, so re-running the cell
# does not keep appending duplicate entries)
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
# Import LUP_13 (the dataset source passed to gpd.read_file) from constants
from constants import LUP_13

In [5]:
# Load the dataset
# LUP_13 is imported from the project's constants module and handed to gpd.read_file.
# NOTE(review): the distance_threshold=5000 default used by select_validation_set is
# presumably meant in metres, which requires a projected CRS - confirm gdf.crs.
gdf = gpd.read_file(LUP_13)

In [6]:
# Function to select properties for validation set based on distance constraint
def select_validation_set(gdf, distance_threshold=5000, validation_fraction=0.1, random_state=None):
    """Randomly pick a spatially spread-out validation subset of ``gdf``.

    Rows are drawn one at a time from a candidate pool that starts as all of
    ``gdf``. After each draw, every candidate whose centroid is not farther
    than ``distance_threshold`` from the drawn row's centroid is removed from
    the pool (for a non-negative threshold this includes the drawn row itself),
    so the selected rows are pairwise separated by more than the threshold.
    Only the candidate pool is thinned; ``gdf`` is not modified.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Properties to sample from.
    distance_threshold : float, default 5000
        Minimum centroid-to-centroid separation between selected rows,
        expressed in the units of ``gdf``'s coordinate reference system.
    validation_fraction : float, default 0.1
        Selection stops once the number of selected rows reaches
        ``validation_fraction * len(gdf)`` (or the pool runs out first).
    random_state : int or array-like, optional
        Seed for a ``numpy.random.RandomState`` shared by all draws, making
        the selection reproducible. ``None`` (default) keeps the previous
        behavior of drawing from NumPy's global random state.

    Returns
    -------
    geopandas.GeoDataFrame
        The selected rows, keeping their original index labels from ``gdf``
        so that callers can exclude them with
        ``gdf[~gdf.index.isin(result.index)]``. Empty when nothing could be
        selected.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    # One generator for the whole run; re-seeding on every draw would correlate the draws.
    rng = None if random_state is None else np.random.RandomState(random_state)

    target_size = validation_fraction * len(gdf)
    validation_set = []
    # Filtering below always creates new frames, so no defensive copy is needed.
    remaining_set = gdf

    while len(validation_set) < target_size and len(remaining_set) > 0:
        # Randomly select a property
        selected_property = remaining_set.sample(1, random_state=rng)
        validation_set.append(selected_property)

        # Distance from the selected property's centroid to every remaining centroid
        selected_centroid = selected_property.centroid.iloc[0]
        distances = remaining_set.centroid.distance(selected_centroid)

        # Keep only candidates strictly farther away than the threshold
        remaining_set = remaining_set[distances > distance_threshold]

    if not validation_set:
        # pd.concat([]) raises; return an empty frame with the same columns instead.
        return gdf.iloc[0:0].copy()

    # Do NOT reset the index: resetting relabels the rows 0..k-1, and an index-based
    # train/validation split would then drop the first k rows of gdf instead of the
    # rows that were actually sampled.
    return gpd.GeoDataFrame(pd.concat(validation_set))

# Draw the spatially separated validation sample from the full dataset
validation_gdf = select_validation_set(gdf)

# Rows whose index label does not appear in the validation sample form the training set.
# NOTE(review): this is only a true complement if validation_gdf still carries gdf's
# original index labels - confirm against what select_validation_set returns.
in_validation = gdf.index.isin(validation_gdf.index)
train_gdf = gdf.loc[~in_validation]

# Report how many rows ended up in each split
print(f"Training set size: {len(train_gdf)}")
print(f"Validation set size: {len(validation_gdf)}")

Training set size: 1661
Validation set size: 185


In [7]:
# Write each split to its own GeoPackage file (training first, then validation)
for split_gdf, gpkg_name in (
    (train_gdf, "training_set.gpkg"),
    (validation_gdf, "validation_set.gpkg"),
):
    split_gdf.to_file(gpkg_name, driver="GPKG")


In [9]:
def select_validation_set(gdf, distance_threshold=5000, validation_fraction=0.1, random_state=None):
    """Randomly pick a validation subset of ``gdf`` using boundary-to-boundary distances.

    NOTE: this redefines the centroid-based ``select_validation_set`` from the
    earlier cell; once this cell has run, only this boundary-based version is
    callable under that name.

    Rows are drawn one at a time from a candidate pool that starts as all of
    ``gdf``. After each draw, every candidate whose boundary is not farther
    than ``distance_threshold`` from the drawn row's boundary is removed from
    the pool (for a non-negative threshold this includes the drawn row itself).
    Only the candidate pool is thinned; ``gdf`` is not modified.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Properties to sample from.
    distance_threshold : float, default 5000
        Minimum boundary-to-boundary separation between selected rows,
        expressed in the units of ``gdf``'s coordinate reference system.
    validation_fraction : float, default 0.1
        Selection stops once the number of selected rows reaches
        ``validation_fraction * len(gdf)`` (or the pool runs out first).
    random_state : int or array-like, optional
        Seed for a ``numpy.random.RandomState`` shared by all draws, making
        the selection reproducible. ``None`` (default) keeps the previous
        behavior of drawing from NumPy's global random state.

    Returns
    -------
    geopandas.GeoDataFrame
        The selected rows, keeping their original index labels from ``gdf``
        so that callers can exclude them with
        ``gdf[~gdf.index.isin(result.index)]``. Empty when nothing could be
        selected.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    # One generator for the whole run; re-seeding on every draw would correlate the draws.
    rng = None if random_state is None else np.random.RandomState(random_state)

    target_size = validation_fraction * len(gdf)
    validation_set = []
    # Filtering below always creates new frames, so no defensive copy is needed.
    remaining_set = gdf

    while len(validation_set) < target_size and len(remaining_set) > 0:
        # Randomly select a property
        selected_property = remaining_set.sample(1, random_state=rng)
        validation_set.append(selected_property)

        # Distance from the selected property's boundary to every remaining boundary
        selected_boundary = selected_property.boundary.iloc[0]
        distances = remaining_set.boundary.distance(selected_boundary)

        # Keep only candidates strictly farther away than the threshold
        remaining_set = remaining_set[distances > distance_threshold]

    if not validation_set:
        # pd.concat([]) raises; return an empty frame with the same columns instead.
        return gdf.iloc[0:0].copy()

    # Do NOT reset the index: resetting relabels the rows 0..k-1, and an index-based
    # train/validation split would then drop the first k rows of gdf instead of the
    # rows that were actually sampled.
    return gpd.GeoDataFrame(pd.concat(validation_set))


In [10]:
# Draw the validation sample with the currently defined select_validation_set
validation_gdf = select_validation_set(gdf)

# Rows whose index label does not appear in the validation sample form the training set.
# NOTE(review): this is only a true complement if validation_gdf still carries gdf's
# original index labels - confirm against what select_validation_set returns.
in_validation = gdf.index.isin(validation_gdf.index)
train_gdf = gdf.loc[~in_validation]

# Write each split to its own GeoPackage file (training first, then validation)
for split_gdf, gpkg_name in (
    (train_gdf, "training_setb.gpkg"),
    (validation_gdf, "validation_setb.gpkg"),
):
    split_gdf.to_file(gpkg_name, driver="GPKG")

In [11]:
# Report the number of rows in the training and validation splits produced above
print(f"Training set size: {len(train_gdf)}")
print(f"Validation set size: {len(validation_gdf)}")

Training set size: 1661
Validation set size: 185
