In [None]:
# Geographic Crosswalk Creator for Japanese Administrative Boundaries
#
# This notebook creates geographic crosswalks between different years of Japanese administrative 
# boundaries, implementing a spatial standardization procedure using 2000 boundaries as the 
# reference unit. Following Eckert et al. (2020), it constructs precise geographic crosswalks
# by intersecting historical district boundaries with the reference map.
#
# Author: Shizuka  Inoue
# Date: 2025/02/24

import os
import pandas as pd
import geopandas as gpd
import logging
from pathlib import Path

# Configure logging: INFO level and above, each record prefixed with a timestamp and its level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Set up directory paths.
# All paths are anchored on the current working directory, so the notebook is expected to be
# run from the folder it lives in; the data and output folders are siblings of that folder.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
base_dir = project_root / 'Data'          # input shapefiles, one 'jpn<year>' sub-folder per year
export_dir = project_root / 'Crosswalk'   # destination of the exported crosswalk files

# Set the base year for the crosswalk (the reference boundaries every other year is mapped onto).
base_year = 2000

# Minimum share of a reporting unit's area that an intersection piece must cover to be kept.
# Smaller pieces are treated as slivers that are too small to be meaningful and are dropped.
# NOTE: the name is misspelled ("threshhold") but is kept as-is because later code refers to it.
threshhold = 0.0001

# Path of the reference-year shapefile: <Data>/jpn<base_year>/jpn<base_year>geo.shp
base_path = os.path.join(base_dir, f'jpn{base_year}', f'jpn{base_year}geo.shp')

# The attribute table is read with Shift-JIS encoding.
base_gdf = gpd.read_file(base_path, encoding='shift_jis')

# A layer without any CRS cannot be reprojected by to_crs(). If the file does not declare one,
# assume geographic coordinates (EPSG:4326), which is also what the yearly layers are assigned.
# A CRS that the file does declare is left untouched.
if base_gdf.crs is None:
    base_gdf = base_gdf.set_crs("EPSG:4326")

# Create a dissolve key. We do this to dissolve the polygons into a single polygon for each
# administrative unit, since the same unit code (N03_007) can appear on several polygons.
base_gdf['dissolve_key'] = base_gdf['N03_007']

base_gdf = base_gdf.dissolve(by="dissolve_key")

# dissolve() moves the key into the index; bring it back as an ordinary column.
base_gdf.reset_index(inplace=True)

# Convert base_gdf from geographic coordinates to a projected (cylindrical equal-area) CRS.
# We do this to compute the area of the polygons in square meters.
base_gdf = base_gdf.to_crs({'proj': 'cea'})

# Zero-width buffer: a common way to repair invalid polygons (e.g. self-intersections).
base_gdf['geometry'] = base_gdf['geometry'].buffer(0)

base_gdf['Area (Reference Unit)'] = base_gdf.area / 10**6  # Convert the area to square kilometers.

# Keep only the identifying columns, the geometry and the reference area.
base_gdf = base_gdf[['geometry', 'PREF', 'GUN', 'CITY', 'N03_007', 'Area (Reference Unit)']]

# Set the years for which to create crosswalks.
years = [1980]

# For every year: load and clean that year's boundaries, intersect them with the reference
# boundaries, turn intersection areas into weights (share of the reporting unit's area that falls
# into each reference unit) and export the result as one spreadsheet per year.
for year in years:
    print(f"\nProcessing year {year}")
    new_path = os.path.join(base_dir, f'jpn{year}', f'jpn{year}geo.shp')

    new_gdf = gpd.read_file(new_path, encoding='shift_jis')
    # allow_override=True replaces whatever CRS the file declares with EPSG:4326.
    # NOTE(review): presumably the historical files carry a missing or wrong CRS - confirm.
    new_gdf = new_gdf.set_crs("EPSG:4326", allow_override=True)
    # Same equal-area projection as the reference layer, so areas are comparable.
    new_gdf = new_gdf.to_crs({'proj': 'cea'})

    # Fix invalid geometries
    new_gdf['geometry'] = new_gdf['geometry'].buffer(0).make_valid()
    print("Invalid geometries in new_gdf:", sum(~new_gdf.geometry.is_valid))

    # Create a unique key for each area
    new_gdf['dissolve_key'] = new_gdf['N03_007']

    # Dissolve (one geometry per unit code) and reset index
    new_gdf = new_gdf.dissolve(by='dissolve_key')
    new_gdf.reset_index(inplace=True)

    # Fix geometries again (dissolving can produce new invalid shapes) and calculate areas in km^2
    new_gdf['geometry'] = new_gdf['geometry'].buffer(0).make_valid()
    new_gdf['Area (Reporting Unit)'] = new_gdf.area / 10**6

    # Print the bounds of the base and new geodataframes to check if they are in the same coordinate system.
    print("Base GDF bounds:", base_gdf.total_bounds)
    print("New GDF bounds:", new_gdf.total_bounds)

    # Intersect the reference layer with this year's layer. Columns present in both layers come
    # back suffixed with _1 (reference) and _2 (this year). If the overlay fails, the year is
    # skipped so that the remaining years are still processed.
    try:
        intersect = gpd.overlay(base_gdf, new_gdf, how='intersection', keep_geom_type=False)
    except Exception as e:
        print(f"Overlay failed for year {year}: {str(e)}")
        continue
    print(f"Overlay successful for year {year}")

    # Calculate intersection areas (km^2). Non-polygon pieces kept by keep_geom_type=False have
    # zero area, so they receive a zero weight and are removed by the threshold filter below.
    intersect['intersect_area'] = intersect.geometry.area / 10**6

    # Calculate weights relative to reporting unit areas
    intersect['weight'] = intersect['intersect_area'] / intersect["Area (Reporting Unit)"]

    # Check whether the weights sum to 1 for each area.
    print(f"\nWeight statistics for {year}:")
    weight_sums = intersect.groupby(['N03_007_2'])['weight'].sum()
    print(f"Mean weight sum: {weight_sums.mean():.3f}")
    print(f"Min weight sum: {weight_sums.min():.3f}")
    print(f"Max weight sum: {weight_sums.max():.3f}")
    print(f"Areas with weights < 0.95: {sum(weight_sums < 0.95)}")

    # Filter by threshold and remove exact duplicate rows BEFORE normalizing, so that the
    # normalized weights of the rows that are actually exported sum to 1. The explicit copy
    # makes the result an independent frame (no SettingWithCopyWarning on the assignment below).
    intersect = intersect[intersect['weight'] > threshhold].drop_duplicates().copy()

    # Normalize weights to sum to 1 for each reporting unit
    group_totals = intersect.groupby(['N03_007_2'])['weight'].transform('sum')
    intersect['weight'] = intersect['weight'] / group_totals

    # Rename columns: *_1 columns belong to the reference year, *_2 columns to the current year.
    # NOTE: this assumes year != base_year; otherwise both sets would map to the same names.
    intersect = intersect.rename(columns={
        "PREF_1": f"PREF{base_year}",
        "GUN_1": f"GUN{base_year}",
        "CITY_1": f"CITY{base_year}",
        "N03_007_1": f"CITY_CODE{base_year}",
        'PREF_2': f"PREF{year}",
        'GUN_2': f"GUN{year}",
        "CITY_2": f"CITY{year}",
        "N03_007_2": f"CITY_CODE{year}",
    })

    # Verify final weights
    final_weights = intersect.groupby([f"CITY_CODE{year}"])['weight'].sum()
    print("\nAfter normalization:")
    print(f"Mean final weight sum: {final_weights.mean():.3f}")
    print(f"Min final weight sum: {final_weights.min():.3f}")
    print(f"Max final weight sum: {final_weights.max():.3f}")

    # Export results. The output folder is created on demand because to_excel does not create it.
    # NOTE(review): the frame still carries the geometry column - confirm it is wanted in the file.
    os.makedirs(export_dir, exist_ok=True)
    export_path = os.path.join(export_dir, f'Crosswalk_{base_year}_{year}.xlsx')
    intersect.to_excel(export_path, index=False)
    print(f"Successfully exported crosswalk for year {year}")




Processing year 1980
Invalid geometries in new_gdf: 0
Base GDF bounds: [13684894.6214349   2583224.77590051 17141256.5835307   4533543.53047737]
New GDF bounds: [13684894.6214349   2583224.77590051 17141256.5835307   4533543.53047737]
Overlay successful for year 1980

Weight statistics for 1980:
Mean weight sum: 1.000
Min weight sum: 0.722
Max weight sum: 1.000
Areas with weights < 0.95: 1

After normalization:
Mean final weight sum: 1.000
Min final weight sum: 1.000
Max final weight sum: 1.000
Successfully exported crosswalk for year 1980
