In [11]:
# --------------------------------------------------------------------
# 1. SETUP AND CONFIGURATION
# --------------------------------------------------------------------
import geopandas as gpd
import pandas as pd
from pathlib import Path
import glob

print("Starting ICB Integration and Validation Process...")

Starting ICB Integration and Validation Process...


In [13]:
# --- Configuration ---

# Define the target Coordinate Reference System (CRS) for the final output
TARGET_CRS = "EPSG:4326"

# Define the path where the raw ICB source files are located.
# This should point to the directory containing your seven source files.
RAW_DATA_DIR = Path("/Users/rosstaylor/Downloads/Research Project/Code Folder/Research Project - Geospatial Health Demand/data/processed/icb_split_enriched/audit_outputs")

# Define the canonical column name mapping for attribute harmonisation
RENAME_COLS = {
    "lsoa21cd":  "lsoa",
    "lsoa21nm": "lsoa_name",
    "msoa21cd":  "msoa",
    "msoa21nm":  "msoa_name",
    "lad22cd":   "lad",
    "lad22nm":   "lad_name",
    "ruc21nm":   "urban_rural",
    "imd_rank":  "imd_rank",
}


# Define the output directory for the final processed files.
PROCESSED_DATA_DIR = Path("/Users/rosstaylor/Downloads/Research Project/Code Folder/Research Project - Geospatial Health Demand/data/processed/")
PROCESSED_DATA_DIR.mkdir(exist_ok=True) # Ensure the directory exists


In [14]:
# --------------------------------------------------------------------
# 2. LOAD, HARMONIZE, AND CONCATENATE ICB FILES
# --------------------------------------------------------------------
print(f"\nLooking for source files in: {RAW_DATA_DIR}")

# Find all GeoPackage files in the source directory
icb_files = list(RAW_DATA_DIR.glob("*.gpkg"))

if not icb_files:
    raise FileNotFoundError(f"ERROR: No source files found in {RAW_DATA_DIR}. Please check the path.")

print(f"Found {len(icb_files)} source files to process.")

harmonized_gdfs = []
for file_path in icb_files:
    print(f"  - Processing {file_path.name}...")
    gdf = gpd.read_file(file_path)

    # **UPDATED**: Add a column to track the origin file for better debugging
    gdf['source_file'] = file_path.name

    # Attribute Harmonisation
    gdf.columns = [c.lower() for c in gdf.columns]
    cols_to_rename = {k: v for k, v in RENAME_COLS.items() if k in gdf.columns}
    gdf = gdf.rename(columns=cols_to_rename)

    # CRS Uniformity
    if gdf.crs != TARGET_CRS:
        print(f"    - Reprojecting from {gdf.crs} to {TARGET_CRS}")
        gdf = gdf.to_crs(TARGET_CRS)

    harmonized_gdfs.append(gdf)

# Concatenate into a single regional layer
print("\nConcatenating all harmonized files...")
lsoa_sw_raw = pd.concat(harmonized_gdfs, ignore_index=True)
print(f"✓ Raw concatenated layer created with {len(lsoa_sw_raw)} total records.")




Looking for source files in: /Users/rosstaylor/Downloads/Research Project/Code Folder/Research Project - Geospatial Health Demand/data/processed/icb_split_enriched/audit_outputs
Found 7 source files to process.
  - Processing NHS_Dorset_Integrated_Care_Board_with_geom_enriched_tidy.gpkg...
  - Processing NHS_Bristol_North_Somerset_and_South_Gloucestershire_Integrated_Care_Board_with_geom_enriched_tidy.gpkg...
  - Processing NHS_Gloucestershire_Integrated_Care_Board_with_geom_enriched_tidy.gpkg...
  - Processing NHS_Bath_and_North_East_Somerset_Swindon_and_Wiltshire_Integrated_Care_Board_with_geom_enriched_tidy.gpkg...
  - Processing NHS_Cornwall_and_the_Isles_of_Scilly_Integrated_Care_Board_with_geom_enriched_tidy.gpkg...
  - Processing NHS_Somerset_Integrated_Care_Board_with_geom_enriched_tidy.gpkg...
  - Processing NHS_Devon_Integrated_Care_Board_with_geom_enriched_tidy.gpkg...

Concatenating all harmonized files...
✓ Raw concatenated layer created with 3410 total records.


In [15]:
# --------------------------------------------------------------------
# 3. INITIAL DATA VALIDATION
# --------------------------------------------------------------------
print("\nPerforming initial data validation checks on raw data...")

# --- Hierarchy Integrity ---
lsoa_in_msoa_check = lsoa_sw_raw.groupby('msoa')['lsoa'].nunique().sum()
total_lsoas = lsoa_sw_raw['lsoa'].nunique()
if lsoa_in_msoa_check == total_lsoas: print("✓ Hierarchy Test (LSOA->MSOA): PASSED")
else: print(f"✗ Hierarchy Test (LSOA->MSOA): FAILED")

msoa_in_lad_check = lsoa_sw_raw.groupby('lad')['msoa'].nunique().sum()
total_msoas = lsoa_sw_raw['msoa'].nunique()
if msoa_in_lad_check == total_msoas: print("✓ Hierarchy Test (MSOA->LAD): PASSED")
else: print(f"✗ Hierarchy Test (MSOA->LAD): FAILED")

# --- Edge Alignment ---
duplicate_lsoa_count = lsoa_sw_raw['lsoa'].duplicated().sum()
if duplicate_lsoa_count == 0: print("✓ Edge Alignment Test: PASSED")
else: print(f"✗ Edge Alignment Test: FAILED. Found {duplicate_lsoa_count} duplicate LSOA records.")




Performing initial data validation checks on raw data...
✓ Hierarchy Test (LSOA->MSOA): PASSED
✓ Hierarchy Test (MSOA->LAD): PASSED
✗ Edge Alignment Test: FAILED. Found 3 duplicate LSOA records.


In [16]:
# --------------------------------------------------------------------
# 4. **NEW**: DETECT AND FIX EDGE ALIGNMENT FAILURES
# --------------------------------------------------------------------
if duplicate_lsoa_count > 0:
    print("\nAttempting to fix edge alignment failures...")

    # Isolate and report on the codes causing issues
    duplicate_codes = lsoa_sw_raw[lsoa_sw_raw.lsoa.duplicated(keep=False)]['lsoa'].unique()
    print(f"  - Found {len(duplicate_codes)} unique LSOA codes with split geometries: {list(duplicate_codes)}")

    # The dissolve operation merges the geometries for each LSOA code.
    # 'aggfunc'='first' takes the attribute data from the first row encountered for each LSOA.
    print("  - Merging split geometries by dissolving on 'lsoa' code...")
    lsoa_sw_fixed = lsoa_sw_raw.dissolve(by='lsoa', aggfunc='first').reset_index()

    print("\n--- Post-Fix Re-Validation ---")
    final_duplicates = lsoa_sw_fixed['lsoa'].duplicated().sum()
    if final_duplicates == 0:
        print("✓ SUCCESS: Edge Alignment Test now PASSES.")
        print(f"  - Original row count: {len(lsoa_sw_raw)}")
        print(f"  - Corrected row count: {len(lsoa_sw_fixed)}")
    else:
        raise Exception("FATAL: Fixing duplicates failed. Manual review required.")
else:
    print("\nNo edge alignment issues found. No correction needed.")
    # If no fix was needed, the 'fixed' version is just a copy of the raw version
    lsoa_sw_fixed = lsoa_sw_raw.copy()


Attempting to fix edge alignment failures...
  - Found 3 unique LSOA codes with split geometries: ['E01034378', 'E01034858', 'E01035107']
  - Merging split geometries by dissolving on 'lsoa' code...

--- Post-Fix Re-Validation ---
✓ SUCCESS: Edge Alignment Test now PASSES.
  - Original row count: 3410
  - Corrected row count: 3407


In [18]:
# --------------------------------------------------------------------
# 5. CREATE AND SAVE FINAL CLEAN LAYERS
# --------------------------------------------------------------------
print("\nCreating final dissolved layers from clean, corrected data...")

# Use the 'lsoa_sw_fixed' dataframe as the source of truth
# This is the corrected line for 08c
final_cols = [
    'lsoa', 'lsoa_name', 'msoa', 'msoa_name', 'lad', 'lad_name',
    'urban_rural', 'imd_rank', 'geometry'
]
final_cols_exist = [c for c in final_cols if c in lsoa_sw_fixed.columns]
lsoa_final = lsoa_sw_fixed[final_cols_exist].sort_values('lsoa')

# Dissolve LSOA -> MSOA
msoa_final = (
    lsoa_final
    .dissolve(by="msoa", as_index=False, aggfunc="first")
    .sort_values("msoa")
)

# Dissolve LSOA -> LAD
lad_final = (
    lsoa_final
    .dissolve(by="lad", as_index=False, aggfunc="first")
    .sort_values("lad")
)

print(f"Final layer shapes: LSOA({lsoa_final.shape}), MSOA({msoa_final.shape}), LAD({lad_final.shape})")

# --- Save to GeoPackage Files ---
print("\nSaving final layers to separate GeoPackage files...")
lsoa_out_path = PROCESSED_DATA_DIR / "southwest_lsoa.gpkg"
msoa_out_path = PROCESSED_DATA_DIR / "southwest_msoa.gpkg"
lad_out_path = PROCESSED_DATA_DIR / "southwest_lad.gpkg"

lsoa_final.to_file(lsoa_out_path, driver="GPKG")
msoa_final.to_file(msoa_out_path, driver="GPKG")
lad_final.to_file(lad_out_path, driver="GPKG")

print(f"✓ Saved LSOA layer to: {lsoa_out_path}")
print(f"✓ Saved MSOA layer to: {msoa_out_path}")
print(f"✓ Saved LAD layer to: {lad_out_path}")
print("\nProcess complete.")


Creating final dissolved layers from clean, corrected data...
Final layer shapes: LSOA((3407, 8)), MSOA((711, 8)), LAD((28, 8))

Saving final layers to separate GeoPackage files...
✓ Saved LSOA layer to: /Users/rosstaylor/Downloads/Research Project/Code Folder/Research Project - Geospatial Health Demand/data/processed/southwest_lsoa.gpkg
✓ Saved MSOA layer to: /Users/rosstaylor/Downloads/Research Project/Code Folder/Research Project - Geospatial Health Demand/data/processed/southwest_msoa.gpkg
✓ Saved LAD layer to: /Users/rosstaylor/Downloads/Research Project/Code Folder/Research Project - Geospatial Health Demand/data/processed/southwest_lad.gpkg

Process complete.
