In [2]:
from pathlib import Path
import sys
import os

# Make the project root importable from this notebook: it is taken to be two
# directory levels above the current working directory (adjust if the notebook moves).
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir))
# Only prepend when missing so re-running the cell does not stack duplicates.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [3]:

from config import PROJECT_ROOT

In [3]:
# Manually specified correspondences, one table per city / time scope:
#   CP component label -> 1-based Tucker factor indices (origin, destination, time).
# Insertion order is kept because it determines the row order of the comparison table.
mapping_utrecht_weekday = dict(C3=(4, 4, 3))

mapping_utrecht_weekend = dict(C3=(4, 4, 3))

mapping_rotterdam_weekday = dict(
    C1=(1, 3, 2),
    C4=(3, 2, 1),
    C3=(4, 4, 3),
)

mapping_rotterdam_weekend = dict(C1=(1, 2, 3))

In [4]:
# Locations of the final CP and Tucker factor files (.npz), one pair per
# city / time scope. Each path is PROJECT_ROOT followed by a fixed relative part;
# the adjacent string literals below are concatenated into a single string.

# --- Utrecht, weekday ---
decomposition_path_weekday_utrecht_cp = (
    f"{PROJECT_ROOT}/data/results/decompositions/CP/final/peak hours normalized/utrecht"
    "/odt_no_same_od_no_rare_od_fixed_thresh_normalizedPeaks/odt_utrecht_hourly_weekday"
    "/run_20250610_001409/rank_5/odt_processed_utrecht_hourly_weekday_factors.npz"
)
decomposition_path_weekday_utrecht_tucker = (
    f"{PROJECT_ROOT}/data/results/decompositions/tucker/hourly_analysis"
    "/odt_utrecht_hourly_weekday/MU/run_20250610_123349/rank_5_5_4"
    "/odt_utrecht_hourly_weekday_factors.npz"
)

# --- Utrecht, weekend ---
decomposition_path_weekend_utrecht_cp = (
    f"{PROJECT_ROOT}/data/results/decompositions/CP/final/peak hours normalized/utrecht"
    "/odt_no_same_od_no_rare_od_fixed_thresh_normalizedPeaks/odt_utrecht_hourly_weekend"
    "/run_20250610_001322/rank_4/odt_processed_utrecht_hourly_weekend_factors.npz"
)
decomposition_path_weekend_utrecht_tucker = (
    f"{PROJECT_ROOT}/data/results/decompositions/tucker/hourly_analysis"
    "/odt_utrecht_hourly_weekend/MU/run_20250610_123349/rank_4_4_5"
    "/odt_utrecht_hourly_weekend_factors.npz"
)

# --- Rotterdam, weekday ---
decomposition_path_weekday_rotterdam_cp = (
    f"{PROJECT_ROOT}/data/results/decompositions/CP/final/peak hours normalized/rotterdam"
    "/odt_no_same_od_no_rare_od_fixed_thresh_normalizedPeaks/odt_rotterdam_hourly_weekday"
    "/run_20250610_001727/rank_6/odt_processed_rotterdam_hourly_weekday_factors.npz"
)
decomposition_path_weekday_rotterdam_tucker = (
    f"{PROJECT_ROOT}/data/results/decompositions/tucker/hourly_analysis"
    "/odt_rotterdam_hourly_weekday/MU/run_20250610_123349/rank_4_4_3"
    "/odt_rotterdam_hourly_weekday_factors.npz"
)

# --- Rotterdam, weekend ---
decomposition_path_weekend_rotterdam_cp = (
    f"{PROJECT_ROOT}/data/results/decompositions/CP/final/peak hours normalized/rotterdam"
    "/odt_no_same_od_no_rare_od_fixed_thresh_normalizedPeaks/odt_rotterdam_hourly_weekend"
    "/run_20250610_001652/rank_6/odt_processed_rotterdam_hourly_weekend_factors.npz"
)
decomposition_path_weekend_rotterdam_tucker = (
    f"{PROJECT_ROOT}/data/results/decompositions/tucker/hourly_analysis"
    "/odt_rotterdam_hourly_weekend/MU/run_20250610_123349/rank_4_4_3"
    "/odt_rotterdam_hourly_weekend_factors.npz"
)

In [5]:
# Sanity check: display one of the resolved paths (a bare expression at the end
# of a notebook cell is shown as the cell's output).
decomposition_path_weekend_rotterdam_tucker

'/Users/peterfalterbaum/Documents/Nova/thesis local/implementation/public_implementation/data/results/decompositions/tucker/hourly_analysis/odt_rotterdam_hourly_weekend/MU/run_20250610_123349/rank_4_4_3/odt_rotterdam_hourly_weekend_factors.npz'

In [6]:
def get_decomposition_data(decomposition_path, granularity, return_core=False):
    """
    Load the factor matrices of a decomposition run together with its index mappings.

    The run directory is taken to be two levels above ``decomposition_path``;
    its ``run_summary.json`` names the tensor file, and the index-mapping JSON
    is looked up next to that tensor file.

    Parameters
    ----------
    decomposition_path : str or Path
        Path to the .npz file containing factor matrices (and possibly 'core').
    granularity : str
        'Weekday' or 'Weekend' (lower-cased and used to pick the index mapping file).
    return_core : bool, optional
        If True, also return the Tucker core tensor.

    Returns
    -------
    origin_factor : np.ndarray
    destination_factor : np.ndarray
    time_factor : np.ndarray
    core : np.ndarray (only if return_core=True)
    idx_to_origins : dict
    idx_to_destinations : dict

    Raises
    ------
    FileNotFoundError
        If ``run_summary.json`` or the index-mapping file does not exist.
    KeyError
        If ``return_core`` is True but the archive has no 'core' entry.
    """
    # 1. Determine run directory and read summary
    decomposition_path = Path(decomposition_path)
    run_dir = decomposition_path.parent.parent
    run_summary_path = run_dir / "run_summary.json"
    if not run_summary_path.exists():
        raise FileNotFoundError(
            f"Missing run_summary.json at {run_summary_path}")
    with open(run_summary_path, "r") as f:
        run_summary = json.load(f)

    # 2. Load the decomposition .npz
    # The archive is opened as a context manager so its file handle is released
    # as soon as the arrays have been read.
    # NOTE(security): allow_pickle=True unpickles stored objects and can execute
    # arbitrary code; only load decomposition files from a trusted source.
    with np.load(decomposition_path, allow_pickle=True) as data:
        # Every data[...] access re-reads the entry from the archive, so read
        # 'factors' once and slice the in-memory result.
        factors = data["factors"]
        origin_factor = factors[0]
        destination_factor = factors[1]
        time_factor = factors[2]

        # 3. Optionally extract the Tucker core
        core = None
        if return_core:
            if "core" not in data:
                raise KeyError(
                    "No 'core' array found in decomposition file for Tucker.")
            core = data["core"]

    # 4. Load index mappings
    tensor_file = Path(run_summary["tensor_info"]["file"])
    tensor_dir = tensor_file.parent
    # Any tensor file name that does not contain "utrecht" is treated as rotterdam.
    city = "utrecht" if "utrecht" in tensor_file.name.lower() else "rotterdam"
    idx_map_file = tensor_dir / \
        f"index_mappings_{city}_{granularity.lower()}_hourly.json"
    if not idx_map_file.exists():
        raise FileNotFoundError(f"Index mapping not found: {idx_map_file}")
    with open(idx_map_file, "r") as f:
        idx_maps = json.load(f)

    idx_to_origins = idx_maps["idx_to_origins"]
    idx_to_destinations = idx_maps["idx_to_destinations"]

    # 5. Return (the core sits between the factors and the mappings when requested)
    if return_core:
        return origin_factor, destination_factor, time_factor, core, idx_to_origins, idx_to_destinations
    return origin_factor, destination_factor, time_factor, idx_to_origins, idx_to_destinations

In [7]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import os
import json
from pathlib import Path


def compare_cp_vs_tucker(
    cp_path: str,
    tk_path: str,
    mapping: dict[str, tuple[int, int, int]],
    granularity: str
) -> pd.DataFrame:
    """
    Correlate manually paired CP components and Tucker factor columns.

    For each CP-component label (e.g. "C3") in `mapping`, compute
    Pearson-r between:
      - CP origin[:,k] vs Tucker origin[:,r_o-1],
      - CP dest   [:,k] vs Tucker dest   [:,r_d-1],
      - CP time   [:,k] vs Tucker time   [:,r_t-1],
    where k is the number after the first character of the label, minus one,
    and (r_o, r_d, r_t) is the 1-based triplet stored for that label.

    Parameters
    ----------
    cp_path, tk_path : str
        Paths to the CP and Tucker factor files, passed to `get_decomposition_data`.
    mapping : dict
        CP label -> 1-based Tucker indices (origin, destination, time).
    granularity : str
        Forwarded to `get_decomposition_data`.

    Returns
    -------
    pd.DataFrame
        Indexed by CP label with columns ['origin_r','destination_r','time_r'];
        empty (with those columns) when `mapping` is empty.

    Raises
    ------
    IndexError
        If any 1-based component index is smaller than 1 or larger than the
        number of columns of the factor it selects from.
    """
    # Only the factor matrices are needed; the Tucker core is not loaded.
    A_cp, B_cp, C_cp, _, _ = get_decomposition_data(
        cp_path, granularity, return_core=False)
    A_tk, B_tk, C_tk, _, _ = get_decomposition_data(
        tk_path, granularity, return_core=False)

    def _column(factor, one_based, what):
        # Reject 0 / negative values explicitly: after the -1 shift they would
        # otherwise wrap around and silently select a column from the end.
        n_cols = factor.shape[1]
        if not 1 <= one_based <= n_cols:
            raise IndexError(
                f"{what} index {one_based} is out of range 1..{n_cols}")
        return factor[:, one_based - 1]

    rows = []
    for cp_label, (ro, rd, rt) in mapping.items():
        k = int(cp_label[1:])  # 1-based CP component number taken from the label

        rows.append({
            'cp_label':      cp_label,
            'origin_r':      pearsonr(_column(A_cp, k, "CP origin"),
                                      _column(A_tk, ro, "Tucker origin"))[0],
            'destination_r': pearsonr(_column(B_cp, k, "CP destination"),
                                      _column(B_tk, rd, "Tucker destination"))[0],
            'time_r':        pearsonr(_column(C_cp, k, "CP time"),
                                      _column(C_tk, rt, "Tucker time"))[0],
        })

    # Naming the columns keeps set_index working when `rows` is empty.
    df = pd.DataFrame(
        rows, columns=['cp_label', 'origin_r', 'destination_r', 'time_r'])
    return df.set_index('cp_label')

In [8]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr


def best_cp_tk_matches(cp_path, tk_path, granularity):
    """
    Early draft: loads the CP factors and the Tucker factors plus core, then
    returns None. Later cells in this notebook redefine this name.
    """
    # CP side: only the three factor matrices are kept.
    A_cp, B_cp, C_cp, *_cp_rest = get_decomposition_data(
        cp_path, granularity, return_core=False)
    # Tucker side: three factor matrices followed by the core tensor.
    A_tk, B_tk, C_tk, G, *_tk_rest = get_decomposition_data(
        tk_path, granularity, return_core=True)

In [9]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr


def best_cp_tk_matches(cp_path: str,
                       tk_path: str,
                       granularity: str,
                       tcore: float = 0.8) -> pd.DataFrame:
    """
    Match every CP component to its most similar Tucker index triplet and
    report where that triplet sits in the core-energy ranking.

    For each CP component k, finds the Tucker triplet (r_o,r_d,r_t)
    that maximizes the average Pearson-r across origin/destination/time.
    Triplets whose average is NaN (e.g. a constant factor column) are never
    preferred over a triplet with a finite average.

    Parameters
    ----------
    cp_path, tk_path : str
        Paths to the CP and Tucker factor files, passed to `get_decomposition_data`.
    granularity : str
        Forwarded to `get_decomposition_data`.
    tcore : float, optional
        Fraction (not percent) of the total squared-core energy that the
        "top" set of core entries must cover. Default 0.8.

    Returns
    -------
    pd.DataFrame
        Indexed by 'cp_component' ("C1", "C2", ...) with columns:
          - tucker_triplet: 1-based (r_o, r_d, r_t)
          - r_origin, r_destination, r_time, r_mean
          - in_top80: whether the triplet is among the highest-energy core
            entries that together reach `tcore` of the total energy (the column
            keeps this name whatever `tcore` is)
          - energy_rank: 1-based position of the triplet when core entries are
            sorted by squared value, largest first
    """
    # Load CP factors, and Tucker factors together with the core tensor.
    A_cp, B_cp, C_cp, *_ = get_decomposition_data(
        cp_path, granularity, return_core=False)
    A_tk, B_tk, C_tk, G_tk, *_ = get_decomposition_data(
        tk_path, granularity, return_core=True)

    K = A_cp.shape[1]
    R1, R2, R3 = G_tk.shape

    # Rank core entries by squared magnitude ("energy"), largest first.
    flat = (G_tk ** 2).ravel()
    flat_idx = np.argsort(flat)[::-1]
    cum_energy = np.cumsum(flat[flat_idx])
    total_energy = cum_energy[-1]
    # Number of leading entries needed for the cumulative energy to reach tcore * total.
    M = int(np.searchsorted(cum_energy, tcore * total_energy)) + 1
    sel_set = {int(fi) for fi in flat_idx[:M]}
    # Flat core index -> 1-based rank in the energy ordering.
    rank_map = {int(fi): rank for rank, fi in enumerate(flat_idx, start=1)}

    def _corr_table(cp_factor, tk_factor, n_tk):
        # Pearson r of every CP column against every Tucker column of one mode.
        table = np.empty((K, n_tk))
        for k_cp in range(K):
            for r in range(n_tk):
                table[k_cp, r] = pearsonr(
                    cp_factor[:, k_cp], tk_factor[:, r])[0]
        return table

    # Each pairwise correlation is computed once per mode instead of once per triplet.
    corr_origin = _corr_table(A_cp, A_tk, R1)
    corr_dest = _corr_table(B_cp, B_tk, R2)
    corr_time = _corr_table(C_cp, C_tk, R3)

    # S[k, m] is the mean correlation of CP component k with triplet m, where m
    # is the row-major flat index of (i_ro, i_rd, i_rt), i.e. the same index
    # that addresses G_tk.ravel().
    S = ((corr_origin[:, :, None, None]
          + corr_dest[:, None, :, None]
          + corr_time[:, None, None, :]) / 3.0).reshape(K, R1 * R2 * R3)
    # argmax returns the first NaN if one is present; rank NaN below everything.
    S_select = np.where(np.isnan(S), -np.inf, S)

    rows = []
    for k_cp in range(K):
        best_m = int(S_select[k_cp].argmax())
        i_ro, i_rd, i_rt = (
            int(v) for v in np.unravel_index(best_m, (R1, R2, R3)))

        rows.append({
            'cp_component':    f"C{k_cp+1}",
            'tucker_triplet':  (i_ro+1, i_rd+1, i_rt+1),
            'r_origin':        corr_origin[k_cp, i_ro],
            'r_destination':   corr_dest[k_cp, i_rd],
            'r_time':          corr_time[k_cp, i_rt],
            'r_mean':          S[k_cp, best_m],
            'in_top80':        best_m in sel_set,
            'energy_rank':     rank_map[best_m]
        })

    # Naming the columns keeps set_index working when there are no CP components.
    columns = ['cp_component', 'tucker_triplet', 'r_origin', 'r_destination',
               'r_time', 'r_mean', 'in_top80', 'energy_rank']
    return pd.DataFrame(rows, columns=columns).set_index('cp_component')

In [10]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr


def best_cp_tk_matches(cp_path: str,
                       tk_path: str,
                       granularity: str) -> pd.DataFrame:
    """
    Match every CP component to its most similar Tucker index triplet.

    For each CP component k, finds the Tucker triplet (r_o,r_d,r_t)
    that maximizes the average Pearson-r across origin/destination/time,
    and returns a DataFrame with all four correlation scores.
    Triplets whose average is NaN (e.g. a constant factor column) are never
    preferred over a triplet with a finite average.

    Parameters
    ----------
    cp_path, tk_path : str
        Paths to the CP and Tucker factor files, passed to `get_decomposition_data`.
    granularity : str
        Forwarded to `get_decomposition_data`.

    Returns
    -------
    pd.DataFrame
        Indexed by 'cp_component' ("C1", "C2", ...) with columns
        ['tucker_triplet', 'r_origin', 'r_destination', 'r_time', 'r_mean'],
        where 'tucker_triplet' is the 1-based (r_o, r_d, r_t).
    """
    # NOTE: this notebook defines `best_cp_tk_matches` in earlier cells as well;
    # running this cell replaces those definitions.

    # Load factors; the Tucker core is requested only for its shape.
    A_cp, B_cp, C_cp, *_ = get_decomposition_data(
        cp_path, granularity, return_core=False)
    A_tk, B_tk, C_tk, G_tk, *_ = get_decomposition_data(
        tk_path, granularity, return_core=True)

    K = A_cp.shape[1]
    R1, R2, R3 = G_tk.shape

    def _corr_table(cp_factor, tk_factor, n_tk):
        # Pearson r of every CP column against every Tucker column of one mode.
        table = np.empty((K, n_tk))
        for k_cp in range(K):
            for r in range(n_tk):
                table[k_cp, r] = pearsonr(
                    cp_factor[:, k_cp], tk_factor[:, r])[0]
        return table

    # Each pairwise correlation is computed once per mode instead of once per triplet.
    corr_origin = _corr_table(A_cp, A_tk, R1)
    corr_dest = _corr_table(B_cp, B_tk, R2)
    corr_time = _corr_table(C_cp, C_tk, R3)

    # S[k, m] is the mean correlation of CP component k with triplet m, where m
    # is the row-major flat index of the zero-based triplet (i_ro, i_rd, i_rt).
    S = ((corr_origin[:, :, None, None]
          + corr_dest[:, None, :, None]
          + corr_time[:, None, None, :]) / 3.0).reshape(K, R1 * R2 * R3)
    # argmax returns the first NaN if one is present; rank NaN below everything.
    S_select = np.where(np.isnan(S), -np.inf, S)

    rows = []
    for k_cp in range(K):
        best_m = int(S_select[k_cp].argmax())
        i_ro, i_rd, i_rt = (
            int(v) for v in np.unravel_index(best_m, (R1, R2, R3)))

        rows.append({
            'cp_component':   f"C{k_cp+1}",
            'tucker_triplet': (i_ro+1, i_rd+1, i_rt+1),
            'r_origin':       corr_origin[k_cp, i_ro],
            'r_destination':  corr_dest[k_cp, i_rd],
            'r_time':         corr_time[k_cp, i_rt],
            'r_mean':         S[k_cp, best_m]
        })

    # Naming the columns keeps set_index working when there are no CP components.
    columns = ['cp_component', 'tucker_triplet', 'r_origin',
               'r_destination', 'r_time', 'r_mean']
    return pd.DataFrame(rows, columns=columns).set_index('cp_component')

In [11]:
# Choose one city / time scope; the matching path and mapping variables
# defined in earlier cells are then looked up by their constructed names.
city: str = "rotterdam"
time_scope: str = "weekday"

cp_path: str = globals()[
    "decomposition_path_" + time_scope + "_" + city + "_cp"]
tk_path: str = globals()[
    "decomposition_path_" + time_scope + "_" + city + "_tucker"]
mapping: dict[str, tuple[int, int, int]] = globals()[
    "mapping_" + city + "_" + time_scope]

# Correlate the manually mapped CP components with their Tucker counterparts.
df = compare_cp_vs_tucker(cp_path, tk_path, mapping, granularity=time_scope)
print(df)

          origin_r  destination_r    time_r
cp_label                                   
C1        0.999627       0.999494  0.993979
C4        0.994307       0.987728  0.969292
C3        0.924466       0.921360  0.646027


In [12]:
# Run the automatic CP -> Tucker matching for every city / time-scope pair.
# Note: this rebinds the module-level city, time_scope, cp_path, tk_path,
# mapping and df variables on every iteration.
for city in ("utrecht", "rotterdam"):
    for time_scope in ("weekday", "weekend"):
        cp_path: str = globals()[
            "decomposition_path_" + time_scope + "_" + city + "_cp"]
        tk_path: str = globals()[
            "decomposition_path_" + time_scope + "_" + city + "_tucker"]
        # Looked up for parity with the single-scenario cell; not passed to the call below.
        mapping: dict[str, tuple[int, int, int]] = globals()[
            "mapping_" + city + "_" + time_scope]

        df = best_cp_tk_matches(cp_path, tk_path, granularity=time_scope)
        # One print call with a newline separator emits the same text as
        # printing the three items one after another.
        print(f"City: {city}, Time scope: {time_scope}", df, "\n\n", sep="\n")

City: utrecht, Time scope: weekday
             tucker_triplet  r_origin  r_destination    r_time    r_mean
cp_component                                                            
C1                (1, 3, 3)  0.994522       0.990345  0.886916  0.957261
C2                (2, 5, 1)  0.770017       0.806487  0.871061  0.815855
C3                (4, 1, 4)  0.972348       0.967990  0.957775  0.966038
C4                (3, 1, 3)  0.862260       0.924068  0.756835  0.847721
C5                (3, 4, 2)  0.572917       0.898973  0.901884  0.791258



City: utrecht, Time scope: weekend
             tucker_triplet  r_origin  r_destination    r_time    r_mean
cp_component                                                            
C1                (3, 1, 4)  0.849633       0.991449  0.644595  0.828559
C2                (4, 3, 5)  0.993408       0.975974  0.970928  0.980104
C3                (2, 2, 2)  0.944764       0.957638  0.896753  0.933052
C4                (2, 2, 1)  0.332182       0.40495