In [1]:
import geopandas as gpd
import pandas as pd
import sys
import os

In [2]:
# Get the current working directory
current_dir = os.path.abspath('')

# Search for the 'constants.py' file starting from the current directory and moving up the hierarchy
project_root = current_dir
while not os.path.isfile(os.path.join(project_root, 'constants.py')):
    project_root = os.path.dirname(project_root)

# Add the project root to the Python path
sys.path.append(project_root)

In [8]:
from constants import DATA_PATH

In [3]:
dissolved_putid = r"C:\Users\bsf31\Documents\post-meds\data\policy-data\processing\dissolved_clean_putid.gpkg"

In [4]:
# Load the dataset
gdf = gpd.read_file(dissolved_putid)

In [7]:
gdf.geometry = gdf.buffer(25, join_style= 2)
gdf.geometry = gdf.buffer(-25, join_style= 2)


In [9]:
# For Visual Check in Qgis

'''
output_path = os.path.join(DATA_PATH,'processing')


# Create the directory if it doesn't exist
if not os.path.exists(output_path):
    os.makedirs(output_path)
    # Save the GeoDataFrame as a GeoPackage
# Define the filename for the GeoPackage

filename = os.path.join(output_path, "clean_dissolved_clean_putid.gpkg")
gdf.to_file(filename, driver="GPKG")'''

In [14]:
def select_validation_set(gdf, distance_threshold=5000, validation_fraction=0.1):
    # Initialize an empty GeoDataFrame for the validation set
    validation_set = gpd.GeoDataFrame(columns=gdf.columns)
    # Make a copy of the original GeoDataFrame to work as the remaining set
    remaining_set = gdf.copy()
    
    # Calculate the target size of the validation set based on the specified fraction
    target_size = validation_fraction * len(gdf)

    while len(validation_set) < target_size:
        # Randomly select a property from the remaining set
        selected_property = remaining_set.sample(1)

        # Append the selected property to the validation set
        validation_set = pd.concat([validation_set, selected_property], ignore_index=True)
        
        # Calculate the distance from the selected property to all properties in the remaining set
        distances = remaining_set.distance(selected_property.geometry.squeeze())
        
        # Remove properties within the distance threshold from the remaining set
        remaining_set = remaining_set.loc[distances > distance_threshold]

        # If the remaining set is empty, break the loop to prevent infinite iterations
        if remaining_set.empty:
            break

    return validation_set, remaining_set


In [15]:
validation_set, remaining_set = select_validation_set(gdf)


In [None]:
# Save training set to a GeoPackage file
remaining_set.to_file("training_set.gpkg", driver="GPKG")

# Save validation set to a GeoPackage file
validation_set.to_file("validation_set.gpkg", driver="GPKG")

In [13]:
# Get validation set
validation_gdf = select_validation_set(gdf)

# Get training set by excluding validation set
train_gdf = gdf[~gdf.index.isin(validation_gdf.index)]

# Save training set to a GeoPackage file
train_gdf.to_file("training_setb.gpkg", driver="GPKG")

# Save validation set to a GeoPackage file
validation_gdf.to_file("validation_setb.gpkg", driver="GPKG")

In [12]:
print(f"Training set size: {len(train_gdf)}")
print(f"Validation set size: {len(validation_gdf)}")

Training set size: 1570
Validation set size: 175
