In [1]:
import pandas as pd
import random
import numpy as np
import ast
import json
import os
import folium
import h3
import matplotlib.pyplot as plt
import matplotlib.pylab as pl
import matplotlib.patches as patches

from typing import Dict, Set, List, Optional, Tuple

In [21]:
from utils.h3hierarchy import create_h3_hierarchical_tree
from utils.metrics import compute_discernability_and_cavg_sparse

In [16]:
from utils.visualization import GeneralizedH3Visualizer

## Data Preparation

In [3]:
# Trip records and the individuals dataset they refer to.
df_gps = pd.read_csv("data/processed_trips.csv")
df_people = pd.read_csv("data/individuals_dataset.csv")

# Keep only the individuals whose GPS_RECORD flag is True.
has_gps_record = df_people['GPS_RECORD'] == True
df_people = df_people[has_gps_record]

# Attach each individual's weight to their trips (inner join on ID), then
# rename the origin/destination coordinate columns to start_*/end_*.
person_weights = df_people[['ID', 'WEIGHT_INDIV']]
df_merged = df_gps.merge(person_weights, left_on='ID', right_on='ID', how='inner')
coordinate_renames = {
    "ori_lat": "start_lat",
    "ori_lon": "start_lon",
    "dst_lat": 'end_lat',
    "dst_lon": 'end_lon',
}
df_merged = df_merged.rename(columns=coordinate_renames)

In [4]:
h3_resolution = 10

# Index both trip endpoints at `h3_resolution`. Iterating over the two
# coordinate columns directly avoids DataFrame.apply(axis=1), which builds a
# full Series object for every trip just to read two values from it.
df_merged['start_h3'] = [
    h3.latlng_to_cell(lat, lon, h3_resolution)
    for lat, lon in zip(df_merged['start_lat'], df_merged['start_lon'])
]
df_merged['end_h3'] = [
    h3.latlng_to_cell(lat, lon, h3_resolution)
    for lat, lon in zip(df_merged['end_lat'], df_merged['end_lon'])
]

In [5]:
# One row per (origin cell, destination cell): summed individual weight and
# number of trips.
od_matrix_first = (
    df_merged
    .groupby(['start_h3', 'end_h3'], as_index=False)
    .agg(
        total_weight=('WEIGHT_INDIV', 'sum'),
        count=('WEIGHT_INDIV', 'count'),
    )
)

In [None]:
parent_hexes = ["861fb4667ffffff", "861fb4677ffffff", "861fb466fffffff", "861fb4647ffffff", "861fb475fffffff"]

target_resolution = 10

# Every cell at `target_resolution` that descends from one of the parent
# hexagons; origins and destinations are restricted to the same cells.
start_valid_h3 = {
    child
    for parent in parent_hexes
    for child in h3.cell_to_children(parent, target_resolution)
}
end_valid_h3 = set(start_valid_h3)

# Keep only the OD pairs whose two endpoints both fall inside that area.
origin_in_area = od_matrix_first["start_h3"].isin(start_valid_h3)
destination_in_area = od_matrix_first["end_h3"].isin(end_valid_h3)
mask = origin_in_area & destination_in_area
od_matrix_first = od_matrix_first[mask].copy()
print(f"Number of filtered rows: {len(od_matrix_first):,}")

Number of filtered rows: 23,264


In [None]:
# Working copy of the study-area OD matrix used by every later cell; the bare
# expression on the last line displays the whole frame as the cell output.
od_matrix = od_matrix_first.copy()
od_matrix

In [9]:
# Trips whose (start_h3, end_h3) pair is present in the filtered OD matrix.
retained_pairs = od_matrix[['start_h3', 'end_h3']]
filtered_df = pd.merge(
    df_merged,
    retained_pairs,
    how='inner',
    on=['start_h3', 'end_h3'],
)

In [10]:
# Hierarchy over the origin cells (column 'start_h3'), built by the project
# helper imported from utils.h3hierarchy. Later cells read `.nodes` and
# `.min_resolution` from the returned object.
tree_start = create_h3_hierarchical_tree(od_matrix, target_resolution=10, hex_column='start_h3')

Resolution 0: 1 nodes
Resolution 1: 1 nodes
Resolution 2: 1 nodes
Resolution 3: 1 nodes
Resolution 4: 1 nodes
Resolution 5: 2 nodes
Resolution 6: 5 nodes
Resolution 7: 35 nodes
Resolution 8: 244 nodes
Resolution 9: 1522 nodes
Resolution 10: 6624 nodes
OPTIMIZED TREE STATISTICS (START_H3)
{'total_nodes': 8433, 'root_resolution': 4, 'min_resolution': 4, 'target_resolution': 10, 'total_weight': 72646323, 'nodes_by_resolution': defaultdict(<class 'int'>, {10: 6624, 9: 1522, 8: 244, 7: 35, 6: 5, 5: 2, 4: 1}), 'resolution_range': '4 → 10'}
OPTIMIZATIONS:
Saved resolutions: 4
Tree efficiency: 7/11 livelli utilizzati


In [11]:
# Hierarchy over the destination cells (column 'end_h3'), built by the project
# helper imported from utils.h3hierarchy. Later cells read `.nodes` and
# `.min_resolution` from the returned object.
tree_end = create_h3_hierarchical_tree(od_matrix, target_resolution=10, hex_column='end_h3')

Resolution 0: 1 nodes
Resolution 1: 1 nodes
Resolution 2: 1 nodes
Resolution 3: 1 nodes
Resolution 4: 1 nodes
Resolution 5: 2 nodes
Resolution 6: 5 nodes
Resolution 7: 35 nodes
Resolution 8: 245 nodes
Resolution 9: 1509 nodes
Resolution 10: 6651 nodes
OPTIMIZED TREE STATISTICS (END_H3)
{'total_nodes': 8448, 'root_resolution': 4, 'min_resolution': 4, 'target_resolution': 10, 'total_weight': 72646296, 'nodes_by_resolution': defaultdict(<class 'int'>, {10: 6651, 9: 1509, 8: 245, 7: 35, 6: 5, 5: 2, 4: 1}), 'resolution_range': '4 → 10'}
OPTIMIZATIONS:
Saved resolutions: 4
Tree efficiency: 7/11 livelli utilizzati


## OIGH algorithm

In [None]:
class Lattice2DCount:
    """
    Two-dimensional generalization lattice for OIGH on H3 hierarchical trees.

    A lattice node is a pair (origin resolution, destination resolution). The
    root is the pair of the highest resolutions found in the two trees; each
    node's children lower exactly one of the two resolutions by one, down to
    the trees' ``min_resolution``.

    Attributes set here and updated by the nodes during the search:
        max_level_found: resolution sum of the currently retained solution
            (``np.inf`` until one is found).
        od_matrix_agg: aggregated OD matrix of the retained solution.
        best_avg_class_size: score of the retained solution, used to break
            ties between nodes with the same resolution sum.
    """
    def __init__(self, od_matrix, tree_start, tree_end, k, S):
        self.od_matrix = od_matrix.copy()
        self.tree_start, self.tree_end = tree_start, tree_end
        self.k, self.S = k, S
        self.total_vol = self.od_matrix['count'].sum()

        # Highest resolution present in each tree.
        self.L_start = max(n.resolution for n in tree_start.nodes.values())
        self.L_end = max(n.resolution for n in tree_end.nodes.values())

        # Build the whole lattice recursively, starting from the root pair.
        self.nodes = {}
        self.add_node(self.L_start, self.L_end, parents=[])
        self.max_level_found = np.inf
        self.od_matrix_agg = None
        self.best_avg_class_size = np.inf

    def add_node(self, lvlo, lvld, parents):
        """
        Create node (lvlo, lvld) together with all its descendants, or, when
        the node already exists, just register the additional ``parents``.
        """
        key = (lvlo, lvld)
        if key in self.nodes:
            # Reached again through another path: only record the new parent(s).
            self.nodes[key].parents += parents
            return

        node = LatticeNodeCount(lvlo, lvld, parents=parents, lattice=self)
        self.nodes[key] = node

        # Origin-side child first, then destination-side child; this order
        # fixes the order of `node.children`.
        child_keys = []
        if lvlo > self.tree_start.min_resolution:
            child_keys.append((lvlo - 1, lvld))
        if lvld > self.tree_end.min_resolution:
            child_keys.append((lvlo, lvld - 1))

        for child_lvlo, child_lvld in child_keys:
            self.add_node(child_lvlo, child_lvld, [node])
            node.children.append(self.nodes[(child_lvlo, child_lvld)])


class LatticeNodeCount:
    """
    One node of the OIGH lattice: a pair of resolutions, ``lvlo`` for origin
    cells and ``lvld`` for destination cells.

    ``parents`` are the nodes that hold this node in their ``children`` list.
    ``anonymous`` is tri-state: ``None`` while undecided, ``True`` when the
    node satisfies the suppression budget, ``False`` when it does not.
    ``avg_class_size`` and ``suppr_vol`` are ``None`` until `evaluate`
    computes them for this node.
    """
    def __init__(self, lvlo, lvld, parents, lattice):
        self.lattice = lattice
        self.lvlo = lvlo
        self.lvld = lvld
        self.parents = parents
        self.children = []
        self.anonymous = None
        self.visited = False
        # Filled in by evaluate(); initialised so that reading them on a node
        # that was pruned or tagged by propagation does not raise.
        self.avg_class_size = None
        self.suppr_vol = None

    def evaluate(self):
        """
        Depth-first evaluation: first child, then this node, then second child.

        A node is accepted when the volume of flows with ``count < k`` does
        not exceed ``lattice.S``. Accepted nodes update the solution stored on
        the lattice (``max_level_found``, ``od_matrix_agg``,
        ``best_avg_class_size``).
        """
        self.visited = True
        # NOTE(review): this test is made once, on entry, against the value of
        # max_level_found at that moment, and any accepted node whose sum
        # differs from max_level_found overwrites it below. H3 resolutions
        # grow with precision, so `<=` prunes the nodes that are *finer* than
        # the current solution — confirm this is the intended search direction.
        if self.lvlo + self.lvld <= self.lattice.max_level_found:

            if self.children and not self.children[0].visited:
                self.children[0].evaluate()

            # Skip the aggregation when a child/parent already decided for us.
            if self.anonymous is None:
                od_matrix_agg = self.get_aggregation()

                self.avg_class_size = self.get_mean_agg_level(od_matrix_agg)
                self.suppr_vol = od_matrix_agg[od_matrix_agg['count'] < self.lattice.k]['count'].sum()

                if self.suppr_vol > self.lattice.S:
                    self.tag_unanonymous()
                else:
                    if self.lvlo + self.lvld == self.lattice.max_level_found:
                        # Same resolution sum as the stored solution: keep the
                        # one with the smaller score.
                        if (self.lattice.od_matrix_agg is None) or (self.avg_class_size < self.lattice.best_avg_class_size):
                            self.lattice.od_matrix_agg = od_matrix_agg
                            self.lattice.best_avg_class_size = self.avg_class_size
                    else:
                        self.lattice.max_level_found = self.lvlo + self.lvld
                        self.lattice.od_matrix_agg = od_matrix_agg
                        self.lattice.best_avg_class_size = self.avg_class_size
                    self.tag_anonymous()

            if len(self.children) > 1 and not self.children[1].visited:
                self.children[1].evaluate()

    def tag_anonymous(self):
        """Mark this node and, recursively, all undecided children as anonymous."""
        if self.anonymous is None:
            self.anonymous = True
            for c in self.children:
                c.tag_anonymous()

    def tag_unanonymous(self):
        """Mark this node and, recursively, all undecided parents as not anonymous."""
        if self.anonymous is None:
            self.anonymous = False
            for p in self.parents:
                p.tag_unanonymous()

    def map_to_level(self, h, target_res, tree):
        """
        Return the id of the ancestor of cell ``h`` at ``target_res`` in ``tree``.

        ``h`` itself is returned when it is not in the tree or when the walk
        runs past the root before reaching ``target_res``.
        """
        node = tree.nodes.get(h)
        if node is None:
            return h
        while node and node.resolution > target_res:
            node = node.parent
        return node.h3_id if node else h

    def get_aggregation(self):
        """
        Aggregate the lattice's OD matrix at this node's resolutions.

        Returns a new frame with columns ``start_gen``, ``end_gen``, ``count``
        and ``total_weight`` (the last two summed per generalized pair). The
        lattice's OD matrix is not modified.
        """
        od = self.lattice.od_matrix

        # Walk the tree once per distinct cell instead of once per OD row:
        # the same cell appears in many rows and always maps to the same
        # ancestor.
        start_map = {
            h: self.map_to_level(h, self.lvlo, self.lattice.tree_start)
            for h in od['start_h3'].unique()
        }
        end_map = {
            h: self.map_to_level(h, self.lvld, self.lattice.tree_end)
            for h in od['end_h3'].unique()
        }
        df = od.assign(
            start_gen=od['start_h3'].map(start_map),
            end_gen=od['end_h3'].map(end_map),
        )

        agg = df.groupby(['start_gen', 'end_gen']).agg({
            'count': 'sum',
            'total_weight': 'sum'
        }).reset_index()

        return agg

    def get_mean_agg_level(self, od_matrix_agg):
        """
        Count-weighted mean of (origin resolution + destination resolution)
        over the flows with ``count >= k``; ``np.inf`` when no flow qualifies.

        Side effect: adds the helper columns ``res_o``, ``res_d``, ``w_res_o``
        and ``w_res_d`` to ``od_matrix_agg`` in place.
        """
        def weighted_res(h3_col, tree):
            # Resolution of each generalized cell; cells missing from the tree
            # are counted at the tree's minimum resolution.
            levels = []
            for h in h3_col:
                node = tree.nodes.get(h)
                levels.append(node.resolution if node else tree.min_resolution)
            return levels

        od_matrix_agg['res_o'] = weighted_res(od_matrix_agg['start_gen'], self.lattice.tree_start)
        od_matrix_agg['res_d'] = weighted_res(od_matrix_agg['end_gen'], self.lattice.tree_end)

        od_matrix_agg['w_res_o'] = od_matrix_agg['res_o'] * od_matrix_agg['count']
        od_matrix_agg['w_res_d'] = od_matrix_agg['res_d'] * od_matrix_agg['count']

        mean_vals = od_matrix_agg[od_matrix_agg['count'] >= self.lattice.k].agg({
            'w_res_o':'sum', 'w_res_d':'sum', 'count':'sum'
        })

        if mean_vals['count'] == 0:
            return np.inf

        return (mean_vals['w_res_o'] + mean_vals['w_res_d']) / mean_vals['count']


def oigh(od_matrix, tree_start, tree_end, k, S):
    """
    Run the OIGH lattice search and return the aggregated OD matrix it retains.

    Builds a `Lattice2DCount` from the arguments, evaluates it starting from
    the root node (the pair of highest resolutions of the two trees) and
    returns ``lattice.od_matrix_agg``, which is ``None`` if no node was
    accepted.
    """
    lattice = Lattice2DCount(od_matrix, tree_start, tree_end, k, S)
    root_key = (lattice.L_start, lattice.L_end)
    root = lattice.nodes[root_key]
    root.evaluate()
    return lattice.od_matrix_agg

In [None]:
# Run OIGH with k=10 and S=0: a lattice node is accepted only when the total
# count of flows below k does not exceed S, i.e. here only when it is zero.
od_matrix_generalized = oigh(od_matrix, tree_start, tree_end, k=10, S=0)
od_matrix_generalized.head()

## Results and visualization

In [20]:
# Map of the generalized OD matrix, rendered by the project helper from
# utils.visualization; the bare `mappa` expression displays it as cell output.
# NOTE(review): `max_hexagons` presumably caps how many cells are drawn —
# confirm against GeneralizedH3Visualizer.create_map.
visualizer = GeneralizedH3Visualizer(od_matrix_generalized)
mappa = visualizer.create_map(max_hexagons=2000000)
mappa

In [None]:
# NOTE(review): `compute_discernability_and_cavg` is neither imported nor
# defined before this cell — the import cell only brings in
# `compute_discernability_and_cavg_sparse`, and a function with this exact name
# is defined only further down in the notebook. On a fresh kernel
# (Restart & Run All) this line raises NameError; confirm which function is
# intended here.
metrics = compute_discernability_and_cavg(od_matrix_generalized, k=10, suppressed_count=0)

In [27]:
metrics

{'C_DM': np.int64(36396727),
 'C_AVG': np.float64(80.58),
 'total_records': np.int64(28203),
 'total_equivalence_classes': 35,
 'k': 10}

In [None]:
class GeneralizationMetric:
    """
    Average generalization error of a generalized OD matrix:

        Ḡ = (1/V+) × Σ(|o| + |d|) × v_{o→d}

    The sum runs over the generalized flows whose value is at least
    ``k_threshold``; ``v_{o→d}`` is the flow value, ``|o|`` and ``|d|`` are
    the numbers of distinct original cells covered by the generalized origin
    and destination cell, and ``V+`` is the total value of those flows.

    Args:
        k_threshold: minimum flow value for a flow to be included.
        value_column: column of the generalized matrix holding the flow value
            (``"count"`` by default).
    """
    def __init__(self, k_threshold: int = 10, value_column: str = "count"):
        self.k_threshold = k_threshold
        self.value_column = value_column

    def calculate_generalization_error(self, od_matrix_generalized: pd.DataFrame, od_matrix: pd.DataFrame) -> float:
        """
        Compute Ḡ for ``od_matrix_generalized`` (columns ``start_gen``,
        ``end_gen`` and the value column) against the original ``od_matrix``
        (columns ``start_h3``, ``end_h3``).

        Returns 0.0 when no generalized flow reaches ``k_threshold``.
        """
        # generalized cell -> number of original cells it covers
        origin_counts = self._build_hexagon_counts(
            od_matrix_generalized, od_matrix, column_gen="start_gen", column_orig="start_h3"
        )
        destination_counts = self._build_hexagon_counts(
            od_matrix_generalized, od_matrix, column_gen="end_gen", column_orig="end_h3"
        )

        total_volume_anonymous = 0
        weighted_count_sum = 0

        # Iterate over the three needed columns directly; iterrows() would
        # build a Series per row only to read these values.
        flows = zip(
            od_matrix_generalized["start_gen"],
            od_matrix_generalized["end_gen"],
            od_matrix_generalized[self.value_column],
        )
        for origin_h3, dest_h3, flow_value in flows:
            if flow_value >= self.k_threshold:
                origin_count = origin_counts.get(origin_h3, 1)
                dest_count   = destination_counts.get(dest_h3, 1)

                total_volume_anonymous += flow_value
                weighted_count_sum += (origin_count + dest_count) * flow_value

        return weighted_count_sum / total_volume_anonymous if total_volume_anonymous > 0 else 0.0

    def _build_hexagon_counts(
        self, od_matrix_generalized: pd.DataFrame, od_matrix: pd.DataFrame, 
        column_gen: str, column_orig: str
    ) -> dict:
        """
        Count how many distinct original hexagons belong to each generalized
        hexagon. A generalized hexagon covering no original hexagon gets 1.
        """
        generalized_hexagons = od_matrix_generalized[column_gen].unique()
        original_hexagons = od_matrix[column_orig].unique()

        # resolution -> {parent cell at that resolution: number of originals}.
        # The parents of the originals only depend on the target resolution,
        # so they are computed once per resolution rather than once per
        # generalized hexagon.
        tally_by_resolution = {}

        counts = {}
        for gen_hex in generalized_hexagons:
            target_res = h3.get_resolution(gen_hex)

            if target_res not in tally_by_resolution:
                tally = {}
                for h in original_hexagons:
                    parent = h3.cell_to_parent(h, target_res)
                    tally[parent] = tally.get(parent, 0) + 1
                tally_by_resolution[target_res] = tally

            count = tally_by_resolution[target_res].get(gen_hex, 0)
            counts[gen_hex] = max(count, 1)  # fallback to 1

        return counts

In [None]:
# Average generalization error of the OIGH result, using the same threshold
# (10) that was passed to oigh() as k.
metric = GeneralizationMetric(k_threshold=10)
error = metric.calculate_generalization_error(od_matrix_generalized, od_matrix)
print(f"Average generalization error Ḡ: {error:.3f}")

In [None]:
def fast_reconstruction_loss(original_od_df: pd.DataFrame,
                             od_matrix_generalized: pd.DataFrame,
                             value_column: str = "count") -> float:
    """
    Reconstruction loss of a generalized OD matrix:

        E = (1/V) * Σ |ṽ_o→d - v_o→d|

    For every original flow, ``v`` is its value and ``ṽ`` is the value of the
    generalized flow whose origin and destination cells contain the original
    ones (0 when that generalized pair does not exist). Original flows whose
    origin or destination has no generalized ancestor are skipped. ``V`` is
    the total value of the original flows that were not skipped.

    Args:
        original_od_df: original matrix with ``start_h3``, ``end_h3`` and the
            value column.
        od_matrix_generalized: generalized matrix with ``start_gen``,
            ``end_gen`` and the value column.
        value_column: column holding the flow value in both frames.

    Returns:
        The loss, or 0.0 when no original flow could be matched.
    """
    # (generalized origin, generalized destination) -> generalized flow value
    generalized_flows = dict(zip(
        zip(od_matrix_generalized['start_gen'], od_matrix_generalized['end_gen']),
        od_matrix_generalized[value_column],
    ))

    total_volume = 0
    total_abs_error = 0

    gen_start_hexes = od_matrix_generalized['start_gen'].unique()
    gen_end_hexes   = od_matrix_generalized['end_gen'].unique()

    # The generalized ancestor of a cell does not depend on the row, so it is
    # looked up once per distinct cell instead of once per OD row.
    start_parent = {}
    end_parent = {}

    original_flows = zip(
        original_od_df['start_h3'],
        original_od_df['end_h3'],
        original_od_df[value_column],
    )
    for start_h3, end_h3, true_count in original_flows:
        # Find the generalized hexagons containing the two endpoints
        if start_h3 not in start_parent:
            start_parent[start_h3] = _find_generalized_parent(start_h3, gen_start_hexes)
        if end_h3 not in end_parent:
            end_parent[end_h3] = _find_generalized_parent(end_h3, gen_end_hexes)
        gen_start = start_parent[start_h3]
        gen_end   = end_parent[end_h3]

        if gen_start is None or gen_end is None:
            continue

        # Search for the corresponding generalized flow
        gen_count = generalized_flows.get((gen_start, gen_end), 0)

        # Accumulate the absolute error and the matched volume
        total_abs_error += abs(gen_count - true_count)
        total_volume += true_count

    return total_abs_error / total_volume if total_volume > 0 else 0.0


def _find_generalized_parent(original_h3: str, generalized_hexagons: list) -> str:
    """
    Find the generalized hexagon that contains the original hexagon.
    """
    original_res = h3.get_resolution(original_h3)
    
    for gen_hex in generalized_hexagons:
        gen_res = h3.get_resolution(gen_hex)
        
        if gen_res <= original_res:
            parent = h3.cell_to_parent(original_h3, gen_res)
            if parent == gen_hex:
                return gen_hex
    
    return None

In [16]:
# Reconstruction loss of the OIGH result against the original (filtered) OD matrix.
loss = fast_reconstruction_loss(
    original_od_df=od_matrix,
    od_matrix_generalized=od_matrix_generalized
)
print(f"Reconstruction Loss: {loss:.6f}")

### Metrics with weights

In [11]:
def compute_discernability_and_cavg(df: pd.DataFrame, k: int, suppressed_count: int = 0) -> dict:
    """
    Weighted discernability (C_DM) and normalized average class size (C_AVG).

    Each row of ``df`` is one equivalence class whose size is read from the
    ``total_weight`` column.

    Args:
        df: DataFrame with a ``total_weight`` column.
        k: k-anonymity threshold; only classes with ``total_weight >= k``
            contribute their squared size to C_DM.
        suppressed_count: suppressed amount (default 0). It is added to both
            the record total and the class count, and adds
            ``suppressed_count * sum(total_weight)`` to C_DM.

    Returns:
        dict with keys ``C_DM``, ``C_AVG``, ``total_records``,
        ``total_equivalence_classes`` and ``k``. ``C_AVG`` is
        ``(total_records / total_equivalence_classes) / k``, or ``inf`` when
        there are no classes.
    """
    class_sizes = df['total_weight'].values
    observed_total = class_sizes.sum()

    n_records = observed_total + suppressed_count
    n_classes = len(class_sizes) + suppressed_count

    # Squared sizes of the k-anonymous classes, plus the suppression penalty.
    retained = class_sizes[class_sizes >= k]
    discernability = np.sum(retained**2) + suppressed_count * observed_total

    if n_classes > 0:
        average_size = (n_records / n_classes) / k
    else:
        average_size = float('inf')

    return {
        'C_DM': discernability,
        'C_AVG': average_size,
        'total_records': n_records,
        'total_equivalence_classes': n_classes,
        'k': k
    }

In [None]:
# NOTE(review): `media_peso` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. Presumably it is the mean
# weight per trip, used to rescale k=10 to the weighted scale — confirm and
# define it in an earlier cell.
metrics = compute_discernability_and_cavg(od_matrix_generalized, k=10*media_peso, suppressed_count=0)

In [None]:
class GeneralizationMetric:
    """
    Weighted average generalization error of a generalized OD matrix:

        Ḡ = (1/V+) × Σ(|o| + |d|) × v_{o→d}

    The sum runs over the generalized flows whose value is at least
    ``k_threshold``; ``v_{o→d}`` is the flow value, ``|o|`` and ``|d|`` are
    the numbers of distinct original cells covered by the generalized origin
    and destination cell, and ``V+`` is the total value of those flows.

    Args:
        k_threshold: minimum flow value for a flow to be included.
        value_column: column of the generalized matrix holding the flow value
            (``"total_weight"`` by default).
    """
    def __init__(self, k_threshold: int = 10, value_column: str = "total_weight"):
        self.k_threshold = k_threshold
        self.value_column = value_column

    def calculate_generalization_error(self, od_matrix_generalized: pd.DataFrame, od_matrix: pd.DataFrame) -> float:
        """
        Compute Ḡ for ``od_matrix_generalized`` (columns ``start_gen``,
        ``end_gen`` and the value column) against the original ``od_matrix``
        (columns ``start_h3``, ``end_h3``).

        Returns 0.0 when no generalized flow reaches ``k_threshold``.
        """
        # generalized cell -> number of original cells it covers
        origin_counts = self._build_hexagon_counts(
            od_matrix_generalized, od_matrix, column_gen="start_gen", column_orig="start_h3"
        )
        destination_counts = self._build_hexagon_counts(
            od_matrix_generalized, od_matrix, column_gen="end_gen", column_orig="end_h3"
        )

        total_volume_anonymous = 0
        weighted_count_sum = 0

        # Iterate over the three needed columns directly; iterrows() would
        # build a Series per row only to read these values.
        flows = zip(
            od_matrix_generalized["start_gen"],
            od_matrix_generalized["end_gen"],
            od_matrix_generalized[self.value_column],
        )
        for origin_h3, dest_h3, flow_value in flows:
            if flow_value >= self.k_threshold:
                origin_count = origin_counts.get(origin_h3, 1)
                dest_count   = destination_counts.get(dest_h3, 1)

                total_volume_anonymous += flow_value
                weighted_count_sum += (origin_count + dest_count) * flow_value

        return weighted_count_sum / total_volume_anonymous if total_volume_anonymous > 0 else 0.0

    def _build_hexagon_counts(
        self, od_matrix_generalized: pd.DataFrame, od_matrix: pd.DataFrame, 
        column_gen: str, column_orig: str
    ) -> dict:
        """
        Count how many distinct original hexagons belong to each generalized
        hexagon. A generalized hexagon covering no original hexagon gets 1.
        """
        generalized_hexagons = od_matrix_generalized[column_gen].unique()
        original_hexagons = od_matrix[column_orig].unique()

        # resolution -> {parent cell at that resolution: number of originals}.
        # The parents of the originals only depend on the target resolution,
        # so they are computed once per resolution rather than once per
        # generalized hexagon.
        tally_by_resolution = {}

        counts = {}
        for gen_hex in generalized_hexagons:
            target_res = h3.get_resolution(gen_hex)

            if target_res not in tally_by_resolution:
                tally = {}
                for h in original_hexagons:
                    parent = h3.cell_to_parent(h, target_res)
                    tally[parent] = tally.get(parent, 0) + 1
                tally_by_resolution[target_res] = tally

            count = tally_by_resolution[target_res].get(gen_hex, 0)
            counts[gen_hex] = max(count, 1)  # fallback to 1

        return counts

In [None]:
# NOTE(review): `media_peso` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. Presumably it is the mean
# weight per trip, used to rescale k=10 to the weighted scale — confirm and
# define it in an earlier cell. The result `error` is computed but not shown.
metric = GeneralizationMetric(k_threshold=10*media_peso)
error = metric.calculate_generalization_error(od_matrix_generalized, od_matrix)

In [None]:
def fast_reconstruction_loss(original_od_df: pd.DataFrame,
                             od_matrix_generalized: pd.DataFrame,
                             value_column: str = "total_weight") -> float:
    """
    Weighted reconstruction loss of a generalized OD matrix:

        E = (1/V) * Σ |ṽ_o→d - v_o→d|

    For every original flow, ``v`` is its value and ``ṽ`` is the value of the
    generalized flow whose origin and destination cells contain the original
    ones (0 when that generalized pair does not exist). Original flows whose
    origin or destination has no generalized ancestor are skipped. ``V`` is
    the total value of the original flows that were not skipped.

    Args:
        original_od_df: original matrix with ``start_h3``, ``end_h3`` and the
            value column.
        od_matrix_generalized: generalized matrix with ``start_gen``,
            ``end_gen`` and the value column.
        value_column: column holding the flow value in both frames.

    Returns:
        The loss, or 0.0 when no original flow could be matched.
    """
    # (generalized origin, generalized destination) -> generalized flow value
    generalized_flows = dict(zip(
        zip(od_matrix_generalized['start_gen'], od_matrix_generalized['end_gen']),
        od_matrix_generalized[value_column],
    ))

    total_volume = 0
    total_abs_error = 0

    gen_start_hexes = od_matrix_generalized['start_gen'].unique()
    gen_end_hexes   = od_matrix_generalized['end_gen'].unique()

    # The generalized ancestor of a cell does not depend on the row, so it is
    # looked up once per distinct cell instead of once per OD row.
    start_parent = {}
    end_parent = {}

    original_flows = zip(
        original_od_df['start_h3'],
        original_od_df['end_h3'],
        original_od_df[value_column],
    )
    for start_h3, end_h3, true_count in original_flows:
        # Find the corresponding generalized hexagons
        if start_h3 not in start_parent:
            start_parent[start_h3] = _find_generalized_parent(start_h3, gen_start_hexes)
        if end_h3 not in end_parent:
            end_parent[end_h3] = _find_generalized_parent(end_h3, gen_end_hexes)
        gen_start = start_parent[start_h3]
        gen_end   = end_parent[end_h3]

        if gen_start is None or gen_end is None:
            continue

        # Search for the corresponding generalized flow
        gen_count = generalized_flows.get((gen_start, gen_end), 0)

        # Accumulate the absolute error and the matched volume
        total_abs_error += abs(gen_count - true_count)
        total_volume += true_count

    return total_abs_error / total_volume if total_volume > 0 else 0.0


def _find_generalized_parent(original_h3: str, generalized_hexagons: list) -> str:
    """
    Find the generalized hexagon that contains the original hexagon.
    """
    original_res = h3.get_resolution(original_h3)
    
    for gen_hex in generalized_hexagons:
        gen_res = h3.get_resolution(gen_hex)
        
        if gen_res <= original_res:
            parent = h3.cell_to_parent(original_h3, gen_res)
            if parent == gen_hex:
                return gen_hex
    
    return None

In [None]:
# Weighted reconstruction loss: at this point `fast_reconstruction_loss` is the
# redefinition from the cell above, which reads 'total_weight' instead of
# 'count'. The result is stored in `loss` but not displayed.
loss = fast_reconstruction_loss(
    original_od_df=od_matrix,
    od_matrix_generalized=od_matrix_generalized
)