# master.py

Split into smaller cells for readability; converted from `backend/Data/master.py`.

Note: This code references `__file__` when building paths. In a notebook, `__file__` may be undefined unless you execute these cells in a way that defines it.


## Imports & constants

Describes the modules and global configuration (paths, category weights, defaults) shared by all later cells.

In [3]:
# Imports & constants

#!/usr/bin/env python3

"""
Generate POI-influenced metrics for cafes and write a master CSV.

Creates: backend/Data/master_cafes_metrics.csv
"""

import os
import sys
import math
import argparse
from pathlib import Path
from collections import defaultdict
from typing import Optional, Tuple, List, Dict

import numpy as np
import pandas as pd


def _resolve_data_dir() -> str:
    anchor = (
        Path(__file__).parent
        if "__file__" in globals()
        else Path.cwd()
    ).resolve()
    bases = {anchor, anchor.parent, anchor.parent.parent}
    suffixes = [
        Path(""),
        Path("CSV"),
        Path("CSV_Reference"),
        Path("Data"),
        Path("Data") / "CSV",
        Path("Data") / "CSV_Reference",
    ]
    for base in bases:
        for suffix in suffixes:
            candidate = (base / suffix).resolve()
            if (candidate / "cafes.csv").is_file():
                return str(candidate)
    raise FileNotFoundError(
        f"Could not locate cafes.csv near {anchor}. Ensure the CSV directory is checked in."
    )


# Per-cafe POI metrics

# Explains how each cafe aggregates nearby POIs into counts, weights, and distance features.

In [4]:
# Per-cafe POI metrics
def compute_poi_metrics_for_cafes(
    cafes: pd.DataFrame,
    poi: pd.DataFrame,
    poi_name: str,
    category_weight: float,
    radius_m: float = 1000.0,
    road_network: Optional["RoadNetwork"] = None,
    snap_tolerance_m: float = ROAD_SNAP_TOLERANCE_M,
    decay_scale_m: float = 1000.0,
) -> pd.DataFrame:
    """Compute per-cafe aggregated POI metrics.
    If `decay_scale_m` is provided, per-POI weights are decayed exponentially
    by distance (meters) using the factor exp(-distance / decay_scale_m).
    Road-network shortest-path distances are used when `road_network` is
    available; otherwise a haversine (straight-line) distance is used.
    """
    latlon = detect_latlon(poi)
    if latlon is None:
        # no coordinates in POI, return zeros
        cafes[f"{poi_name}_count_1km"] = 0
        cafes[f"{poi_name}_weight_1km"] = 0.0
        cafes[f"{poi_name}_min_dist_m"] = np.nan
        return cafes

    poi_lat_col, poi_lon_col = latlon
    poi_lats = pd.to_numeric(poi[poi_lat_col], errors="coerce").to_numpy(dtype=float)
    poi_lons = pd.to_numeric(poi[poi_lon_col], errors="coerce").to_numpy(dtype=float)

    weight_col = detect_weight_col(poi)
    # if a precomputed weight column exists (created by helper), use it
    if "_computed_weight" in poi.columns:
        poi_weights = pd.to_numeric(poi["_computed_weight"], errors="coerce").fillna(1.0).to_numpy().astype(float)
    else:
        # gather optional rating/review columns
        def _detect_rating_col(df: pd.DataFrame) -> Optional[str]:
            for c in RATING_COL_CANDS:
                if c in df.columns:
                    return c
            return None

        def _detect_reviews_col(df: pd.DataFrame) -> Optional[str]:
            for c in REVIEWS_COL_CANDS:
                if c in df.columns:
                    return c
            return None

        rating_col = _detect_rating_col(poi)
        reviews_col = _detect_reviews_col(poi)

        if weight_col is not None:
            raw_vals = pd.to_numeric(poi[weight_col], errors="coerce")
            # base score from rank-like column (lower is better)
            if "rank" in weight_col.lower():
                maxr = raw_vals.max(skipna=True)
                if pd.isna(maxr) or maxr <= 0:
                    maxr = 1.0
                fill_val = float(maxr) + 1.0
                ranks = raw_vals.fillna(fill_val).replace(0.0, fill_val).astype(float)
                inv = 1.0 / (ranks + 1e-9)
                base_norm = inv / float(inv.max()) if inv.max() > 0 else pd.Series(np.ones(len(ranks)), index=ranks.index)
            else:
                # treat column as higher-is-better; normalize by max
                v = raw_vals.fillna(0.0).astype(float)
                base_norm = v / float(v.max()) if v.max() > 0 else pd.Series(np.zeros(len(v)), index=v.index)

            # optional rating and reviews influence
            rating_vals = pd.to_numeric(poi[rating_col], errors="coerce") if rating_col is not None else None
            reviews_vals = pd.to_numeric(poi[reviews_col], errors="coerce") if reviews_col is not None else None

            # Use base/rank and (optionally) reviews only â€” do NOT use rating or weekly-hours
            comps = [base_norm]
            if reviews_vals is not None:
                rv = reviews_vals.fillna(0.0).astype(float)
                maskr = rv > 0
                if maskr.any():
                    # log-scale normalize by max
                    norm_rev = pd.Series(0.0, index=rv.index)
                    maxlog = float(np.log1p(rv[maskr]).max())
                    if maxlog > 0:
                        norm_rev[maskr] = np.log1p(rv[maskr]) / maxlog
                    comps.append(norm_rev)

            # final per-POI weight is mean of available components
            stacked = np.vstack([c.to_numpy() for c in comps])
            poi_weights = np.nanmean(stacked, axis=0).astype(float)
        else:
            # no explicit weight column: use reviews only (do NOT use rating or weekly-hours)
            reviews_vals = pd.to_numeric(poi[reviews_col], errors="coerce") if reviews_col is not None else None
            if reviews_vals is not None:
                rv = reviews_vals.fillna(0.0).astype(float)
                maskr = rv > 0
                norm_rev = pd.Series(0.0, index=rv.index)
                if maskr.any():
                    maxlog = float(np.log1p(rv[maskr]).max())
                    if maxlog > 0:
                        norm_rev[maskr] = np.log1p(rv[maskr]) / maxlog
                    poi_weights = norm_rev.to_numpy().astype(float)
                else:
                    poi_weights = np.ones_like(poi_lats, dtype=float)
            else:
                poi_weights = np.ones_like(poi_lats, dtype=float)

    poi_weights = np.asarray(poi_weights, dtype=float)

    # Prepare result columns (use suffix based on radius)
    try:
        suffix = f"_{int(radius_m/1000)}km"
    except Exception:
        suffix = "_1km"
    counts = []
    weight_sums = []
    min_dists = []

    # detect cafe lat/lon columns
    cafe_latlon = detect_latlon(cafes)
    if cafe_latlon is None:
        raise ValueError("Could not detect lat/lon in cafes CSV")

    cafe_lat_col, cafe_lon_col = cafe_latlon
    cafe_lats = pd.to_numeric(cafes[cafe_lat_col], errors="coerce").to_numpy(dtype=float)
    cafe_lons = pd.to_numeric(cafes[cafe_lon_col], errors="coerce").to_numpy(dtype=float)

    use_network = bool(road_network) and getattr(road_network, "node_count", 0) > 0
    node_to_poi: Dict[int, List[int]] = defaultdict(list)
    poi_snap_offsets: List[float] = []
    if use_network:
        poi_nodes, poi_snap_offsets = road_network.snap_points(poi_lats, poi_lons, max_snap_m=snap_tolerance_m)
        for idx, node_id in enumerate(poi_nodes):
            if node_id is not None and math.isfinite(poi_snap_offsets[idx]):
                node_to_poi[int(node_id)].append(idx)
        if not node_to_poi:
            use_network = False

    def _network_stats(cafe_node: int, cafe_offset: float) -> Optional[Tuple[int, float, float]]:
        if not use_network:
            return None
        lengths = road_network.shortest_paths_from(cafe_node, cutoff=radius_m)
        if not lengths:
            return None
        total_count = 0
        total_weight = 0.0
        min_dist = None
        for node_id, path_dist in lengths.items():
            poi_indices = node_to_poi.get(node_id)
            if not poi_indices:
                continue
            for poi_idx in poi_indices:
                total_dist = path_dist + cafe_offset + poi_snap_offsets[poi_idx]
                if total_dist <= radius_m:
                    total_count += 1
                    # apply exponential decay to per-POI weight based on path distance (meters)
                    try:
                        decayed = float(poi_weights[poi_idx]) * math.exp(-(float(total_dist) / float(decay_scale_m)))
                    except Exception:
                        decayed = float(poi_weights[poi_idx])
                    total_weight += decayed
                    if min_dist is None or total_dist < min_dist:
                        min_dist = total_dist
        if total_count == 0 or min_dist is None:
            return None
        return total_count, total_weight, float(min_dist)

    # Iterate cafes and compute distances
    for i in range(len(cafes)):
        lat = cafe_lats[i]
        lon = cafe_lons[i]
        if not math.isfinite(lat) or not math.isfinite(lon):
            counts.append(0)
            weight_sums.append(0.0)
            min_dists.append(float(np.nan))
            continue
        if use_network:
            cafe_node, cafe_offset = road_network.snap_point(lat, lon, max_snap_m=snap_tolerance_m)
            if cafe_node is not None:
                cafe_offset = float(cafe_offset or 0.0)
                net_stats = _network_stats(cafe_node, cafe_offset)
                if net_stats is not None:
                    cnt, wsum, mind = net_stats
                    counts.append(int(cnt))
                    weight_sums.append(float(wsum))
                    min_dists.append(float(mind))
                    continue
        dists = haversine_m(lat, lon, poi_lats, poi_lons)  # meters
        within_mask = dists <= radius_m
        counts.append(int(np.count_nonzero(within_mask)))
        if np.any(within_mask):
            # apply exponential decay to haversine distances as well
            try:
                ds = dists[within_mask].astype(float)
                pws = poi_weights[within_mask].astype(float)
                decayed_arr = pws * np.exp(-(ds / float(decay_scale_m)))
                weight_sum = float(np.sum(decayed_arr))
            except Exception:
                weight_sum = float(np.sum(poi_weights[within_mask]))
            weight_sums.append(weight_sum)
            min_dists.append(float(np.min(dists[within_mask])))
        else:
            weight_sums.append(0.0)
            min_dists.append(float(np.nan))

    cafes[f"{poi_name}_count{suffix}"] = counts
    cafes[f"{poi_name}_weight{suffix}"] = weight_sums
    cafes[f"{poi_name}_min_dist_m"] = min_dists

    # Also store category weight so downstream composite score can use it
    cafes[f"{poi_name}_category_weight"] = category_weight

    return cafes


NameError: name 'ROAD_SNAP_TOLERANCE_M' is not defined