In [13]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import BallTree

In [35]:
# Directory holding the house CSV and the POI workbooks (change if needed).
# Written as a raw string: the Windows path contains "\j", "\e" and "\h", which
# are invalid escape sequences in a normal string literal and trigger a
# SyntaxWarning on recent Python versions. The resulting value is unchanged.
FOLDER = r"E:\jupyternoteBookWorkPath\erp\house_eco_poi1"

# Input table of houses; must contain longitude/latitude columns.
HOUSE_FILE = os.path.join(FOLDER, "shanghai_house_price_9.CSV")

# Maps each output column name to the POI workbook used to compute it.
FACILITY_FILES = {
    "dist_to_bank_m":               os.path.join(FOLDER, "bank_poi.xlsx"),
    "dist_to_primary_school_m":     os.path.join(FOLDER, "primary_school_poi.xlsx"),
    "dist_to_middle_school_m":      os.path.join(FOLDER, "middle_school_poi.xlsx"),
    "dist_to_shopping_center_m":    os.path.join(FOLDER, "shopping_center_poi.xlsx"),
    "dist_to_top_tier_hospital_m":  os.path.join(FOLDER, "top_tier_hospital_poi.xlsx"),
    "dist_to_scenic_spot_m":        os.path.join(FOLDER, "scenic_spot_poi.xlsx"),
}

# Destination of the house table enriched with the distance columns.
OUTPUT_FILE = os.path.join(FOLDER, "shanghai_house_price_9_with_distances.csv")

# Converts haversine distances (radians) into meters.
EARTH_RADIUS_M = 6371008.8  # IUGG mean earth radius in meters

  FOLDER = "E:\jupyternoteBookWorkPath\erp\house_eco_poi1"  # change if needed


In [37]:
def _find_lon_lat_columns(df):
    """
    Locate the longitude and latitude columns of ``df``.

    Column labels are matched case-insensitively and with surrounding
    whitespace ignored. Non-string labels (e.g. integer headers) are tolerated
    and simply never match.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are searched.

    Returns
    -------
    tuple
        ``(lon_col, lat_col)`` as the original column labels of ``df``.

    Raises
    ------
    ValueError
        If no longitude column or no latitude column can be found.
    """
    # Aliases in priority order: the first alias present in df wins.
    lon_aliases = ("longitude", "lon", "lng")
    lat_aliases = ("latitude", "lat")

    # Normalized label -> original label. str() keeps non-string labels from
    # raising AttributeError; strip() tolerates padded spreadsheet headers.
    normalized = {str(label).strip().lower(): label for label in df.columns}

    lon_col = next((normalized[a] for a in lon_aliases if a in normalized), None)
    lat_col = next((normalized[a] for a in lat_aliases if a in normalized), None)

    if lon_col is None or lat_col is None:
        raise ValueError(
            f"Could not find longitude/latitude columns. Available columns: {list(df.columns)}"
        )
    return lon_col, lat_col

def _to_radians_coords(df):
    """
    Build a radians coordinate array from the lon/lat columns of ``df``.

    Both columns are coerced to numeric (unparseable entries become NaN) and
    rows where either value is NaN are left out of the array.

    Returns
    -------
    (coords_rad, mask)
        ``coords_rad`` is an (N, 2) array of ``[lat, lon]`` in radians for the
        valid rows; ``mask`` is the boolean Series marking which rows of ``df``
        were kept.
    """
    lon_name, lat_name = _find_lon_lat_columns(df)

    lon_values = pd.to_numeric(df[lon_name], errors="coerce")
    lat_values = pd.to_numeric(df[lat_name], errors="coerce")

    # A row is usable only when both coordinates parsed successfully.
    valid = lon_values.notna() & lat_values.notna()

    # Latitude first, longitude second, one row per valid record (degrees).
    lat_lon_deg = np.column_stack(
        (lat_values[valid].to_numpy(), lon_values[valid].to_numpy())
    )
    return np.deg2rad(lat_lon_deg), valid

def build_balltree_from_poi(poi_path):
    """
    Read a POI Excel workbook and index its coordinates in a haversine BallTree.

    Parameters
    ----------
    poi_path : path to the POI Excel file.

    Returns
    -------
    (tree, coords_rad)
        The BallTree and the (N, 2) ``[lat, lon]`` radians array it was built
        from, or ``(None, None)`` when the workbook has no valid coordinates.
    """
    poi_frame = pd.read_excel(poi_path)
    poi_coords_rad, _ = _to_radians_coords(poi_frame)

    has_points = poi_coords_rad.shape[0] > 0
    if not has_points:
        return None, None

    # The haversine metric expects [lat, lon] pairs expressed in radians.
    return BallTree(poi_coords_rad, metric="haversine"), poi_coords_rad

def nearest_distance_meters(house_coords_rad, tree):
    """
    Compute, for every house, the haversine distance in meters to its nearest POI.

    Parameters
    ----------
    house_coords_rad : (N, 2) array of ``[lat, lon]`` in radians.
    tree : BallTree built on POI coordinates in radians.

    Returns
    -------
    1D numpy array of length N holding distances in meters.
    """
    # With k=1 the query yields an (N, 1) array; for the haversine metric the
    # values are distances in radians.
    angular_dist, _nearest_idx = tree.query(house_coords_rad, k=1)
    nearest_angular = angular_dist[:, 0]
    # Scale by the earth radius to turn radians into meters.
    return nearest_angular * EARTH_RADIUS_M

def main():
    """
    Add one nearest-facility distance column per POI file to the house table.

    Reads HOUSE_FILE, computes for each entry of FACILITY_FILES the distance in
    meters from every house to its nearest POI, and writes the enriched table
    to OUTPUT_FILE. Houses without valid coordinates get NaN in every distance
    column; a POI file without valid coordinates yields an all-NaN column.

    Raises
    ------
    RuntimeError
        If the house file contains no valid (longitude, latitude) pair.
    """
    print("Loading house data...")
    houses = pd.read_csv(HOUSE_FILE)
    # Work on a copy so the loaded frame stays untouched.
    enriched = houses.copy()

    print("Preparing house coordinates...")
    house_coords_rad, house_mask = _to_radians_coords(houses)
    if int(house_mask.sum()) == 0:
        raise RuntimeError("No valid (longitude, latitude) found in the house file.")

    # Boolean row selector used to scatter results back to full length.
    valid_rows = house_mask.to_numpy()

    for out_col, poi_path in FACILITY_FILES.items():
        print(f"Processing: {out_col} from {os.path.basename(poi_path)}")
        tree, _ = build_balltree_from_poi(poi_path)

        # Start from an all-NaN column; only rows with valid coords get filled.
        distances = np.full(len(enriched), np.nan, dtype=float)
        if tree is None:
            print(f"Warning: {poi_path} has no valid coordinates; filling {out_col} with NaN.")
        else:
            distances[valid_rows] = nearest_distance_meters(house_coords_rad, tree)
        enriched[out_col] = distances

    print(f"Writing output to: {OUTPUT_FILE}")
    enriched.to_csv(OUTPUT_FILE, index=False)
    print("Done.")

In [39]:
# Entry point: run the full distance pipeline when this cell/script is
# executed directly (not when the code is imported as a module).
if __name__ == "__main__":
    main()

Loading house data...
Preparing house coordinates...
Processing: dist_to_bank_m from bank_poi.xlsx
Processing: dist_to_primary_school_m from primary_school_poi.xlsx
Processing: dist_to_middle_school_m from middle_school_poi.xlsx
Processing: dist_to_shopping_center_m from shopping_center_poi.xlsx
Processing: dist_to_top_tier_hospital_m from top_tier_hospital_poi.xlsx
Processing: dist_to_scenic_spot_m from scenic_spot_poi.xlsx
Writing output to: E:\jupyternoteBookWorkPath\erp\house_eco_poi1\shanghai_house_price_9_with_distances.csv
Done.
