In [10]:
# -*- coding: utf-8 -*-
"""
Demonstrates loading and accessing the facility location problem data.
Handles potential nested dictionary structure and verifies structure across all settings.
Also inspects agent weights alongside peaks and misreports.
"""

import pickle
import numpy as np
import os

# --- Configuration ---
DATA_DIR = 'data'
TRAIN_DATA_FILE = os.path.join(DATA_DIR, 'all_data_train.pkl')
TEST_DATA_FILE = os.path.join(DATA_DIR, 'all_data_test.pkl')

# --- Load Data ---
print(f"Attempting to load training data from: {TRAIN_DATA_FILE}")
print(f"Attempting to load test data from: {TEST_DATA_FILE}")

data_train = None
data_test = None

# Load training data
try:
    if not os.path.exists(DATA_DIR):
        print(f"WARNING: Data directory '{DATA_DIR}' not found. Creating it.")
        os.makedirs(DATA_DIR)
        print(f"Please place '{os.path.basename(TRAIN_DATA_FILE)}' and '{os.path.basename(TEST_DATA_FILE)}' in '{DATA_DIR}'.")
    with open(TRAIN_DATA_FILE, 'rb') as f:
        data_train = pickle.load(f)
    print("Training data loaded successfully.")
except FileNotFoundError:
    print(f"ERROR: Training data file not found at {TRAIN_DATA_FILE}")
except Exception as e:
    print(f"ERROR: Could not load training data. {e}")

# Load test data
try:
    with open(TEST_DATA_FILE, 'rb') as f:
        data_test = pickle.load(f)
    print("Test data loaded successfully.")
except FileNotFoundError:
    print(f"ERROR: Test data file not found at {TEST_DATA_FILE}")
except Exception as e:
    print(f"ERROR: Could not load test data. {e}")

# --- Verify Structure Across All Settings ---
print("\n--- Verifying Structure Across All Settings ---")
if data_train:
    print("\nTraining Data Settings:")
    for setting, content in data_train.items():
        if isinstance(content, dict):
            print(f"  {setting}: keys = {list(content.keys())}")
        else:
            print(f"  {setting}: unexpected type {type(content)}")
else:
    print("No training data.")

if data_test:
    print("\nTest Data Settings:")
    for setting, content in data_test.items():
        if isinstance(content, dict):
            print(f"  {setting}: keys = {list(content.keys())}")
        else:
            print(f"  {setting}: unexpected type {type(content)}")
else:
    print("No test data.")

# --- Inspect a Specific Setting: training ---
if data_train:
    example_setting = ('uniform', 5)
    print(f"\n--- Inspect Training Setting {example_setting} ---")
    if example_setting in data_train:
        setting_data = data_train[example_setting]
        print("Inner keys:", list(setting_data.keys()))

        # Peaks
        peaks = setting_data.get('peaks', None)
        if isinstance(peaks, np.ndarray):
            print("Peaks shape:", peaks.shape)
            print("First 3 peak samples:\n", peaks[:3])
        else:
            print("Peaks not found or wrong type.")

        # Weights
        weights = setting_data.get('weights', None)
        if isinstance(weights, np.ndarray):
            print("Weights shape:", weights.shape)
            if weights.ndim == 1:
                print("Agent weights (constant):", weights)
            elif weights.ndim == 2:
                print("First 3 samples' weights:\n", weights[:3])
        else:
            print("Weights not found or wrong type.")

        # Misreports
        misr = setting_data.get('misreports', None)
        if isinstance(misr, np.ndarray):
            print("Misreports shape:", misr.shape)
            if misr.ndim >= 3:
                print("Misreports for agent 0 of sample 0:", misr[0, 0, :])
        else:
            print("Misreports not found or wrong type.")
    else:
        print("Setting not found in training data.")

# --- Inspect a Specific Setting: test ---
if data_test:
    example_test = ('normal', 10)
    print(f"\n--- Inspect Test Setting {example_test} ---")
    if example_test in data_test:
        setting_data = data_test[example_test]
        print("Inner keys:", list(setting_data.keys()))

        # Peaks
        peaks = setting_data.get('peaks', None)
        if isinstance(peaks, np.ndarray):
            print("Test peaks shape:", peaks.shape)
            print("First 3 test peak samples:\n", peaks[:3])
        else:
            print("Test peaks not found or wrong type.")

        # Weights
        weights = setting_data.get('weights', None)
        if isinstance(weights, np.ndarray):
            print("Test weights shape:", weights.shape)
            if weights.ndim == 1:
                print("Test agent weights (constant):", weights)
            elif weights.ndim == 2:
                print("First 3 test samples' weights:\n", weights[:3])
        else:
            print("Test weights not found or wrong type.")
    else:
        print("Setting not found in test data.")
else:
    print("No test data to inspect.")

print("\n--- End of Data Exploration ---")



Attempting to load training data from: data/all_data_train.pkl
Attempting to load test data from: data/all_data_test.pkl
Training data loaded successfully.
Test data loaded successfully.

--- Verifying Structure Across All Settings ---

Training Data Settings:
  ('uniform', 5): keys = ['peaks', 'misreports']
  ('normal', 5): keys = ['peaks', 'misreports']
  ('beta1', 5): keys = ['peaks', 'misreports']
  ('beta2', 5): keys = ['peaks', 'misreports']
  ('uniform', 9): keys = ['peaks', 'misreports']
  ('normal', 9): keys = ['peaks', 'misreports']
  ('beta1', 9): keys = ['peaks', 'misreports']
  ('beta2', 9): keys = ['peaks', 'misreports']
  ('uniform', 10): keys = ['peaks', 'misreports']
  ('normal', 10): keys = ['peaks', 'misreports']
  ('beta1', 10): keys = ['peaks', 'misreports']
  ('beta2', 10): keys = ['peaks', 'misreports']
  ('uniform', 100): keys = ['peaks', 'misreports']
  ('normal', 100): keys = ['peaks', 'misreports']
  ('beta1', 100): keys = ['peaks', 'misreports']
  ('beta2', 

In [8]:
# -*- coding: utf-8 -*-
"""
Implements and evaluates baseline mechanisms for the facility location problem.
"""

import pickle
import numpy as np
import os
import time

# --- Configuration ---
DATA_DIR = 'data'
TRAIN_DATA_FILE = os.path.join(DATA_DIR, 'all_data_train.pkl')
TEST_DATA_FILE = os.path.join(DATA_DIR, 'all_data_test.pkl')
PEAK_DATA_KEY = 'peaks' # Key for accessing peak data in the loaded dictionary
MISREPORT_DATA_KEY = 'misreports' # Key for accessing misreport data

# --- Evaluation Setting ---
# Choose the problem setting and number of facilities (K) to evaluate
DISTRIBUTION = 'uniform'
NUM_AGENTS = 5
K_FACILITIES = 2 # Number of facilities to locate

# Define agent weights (uniform weights for this example)
# For weighted scenarios from the paper, use e.g., [5, 1, 1, 1, 1] for n=5
AGENT_WEIGHTS = np.ones(NUM_AGENTS)
# Normalize weights
AGENT_WEIGHTS = AGENT_WEIGHTS / np.sum(AGENT_WEIGHTS)

# Regret calculation parameters
# Assume misreports structure: (num_samples, num_agents, num_misreports_per_agent)
# If structure is different, adjust calculate_regret function
NUM_MISREPORTS_PER_AGENT = 10 # From LLMMech paper Section 4.1

# --- Load Data ---
print(f"Loading data for setting: ({DISTRIBUTION}, {NUM_AGENTS})")

data_train = None
data_test = None
train_peaks = None
train_misreports = None
test_peaks = None
test_misreports = None

try:
    with open(TRAIN_DATA_FILE, 'rb') as f:
        data_train = pickle.load(f)
    setting_key = (DISTRIBUTION, NUM_AGENTS)
    if setting_key in data_train and isinstance(data_train[setting_key], dict):
        train_peaks = data_train[setting_key].get(PEAK_DATA_KEY)
        train_misreports = data_train[setting_key].get(MISREPORT_DATA_KEY)
        if train_peaks is None:
             print(f"ERROR: Key '{PEAK_DATA_KEY}' not found in training data for {setting_key}")
        # Add check for misreports if needed for training phase (e.g., finding best constant)
    else:
        print(f"ERROR: Setting {setting_key} not found or not a dict in training data.")

except FileNotFoundError:
    print(f"ERROR: Training data file not found at {TRAIN_DATA_FILE}")
except Exception as e:
    print(f"ERROR: Could not load training data. {e}")

try:
    with open(TEST_DATA_FILE, 'rb') as f:
        data_test = pickle.load(f)
    setting_key = (DISTRIBUTION, NUM_AGENTS)
    if setting_key in data_test and isinstance(data_test[setting_key], dict):
        test_peaks = data_test[setting_key].get(PEAK_DATA_KEY)
        test_misreports = data_test[setting_key].get(MISREPORT_DATA_KEY)
        if test_peaks is None:
             print(f"ERROR: Key '{PEAK_DATA_KEY}' not found in test data for {setting_key}")
        if test_misreports is None:
             print(f"ERROR: Key '{MISREPORT_DATA_KEY}' not found in test data for {setting_key}")
             print("Regret calculation will not be possible.")
    else:
        print(f"ERROR: Setting {setting_key} not found or not a dict in test data.")

except FileNotFoundError:
    print(f"ERROR: Test data file not found at {TEST_DATA_FILE}")
except Exception as e:
    print(f"ERROR: Could not load test data. {e}")

# Proceed only if test data is loaded
if test_peaks is not None:
    print(f"\nSuccessfully loaded test peaks. Shape: {test_peaks.shape}")
    if test_misreports is not None:
         print(f"Successfully loaded test misreports. Shape: {test_misreports.shape}") # Print shape to verify structure
         # Expected shape e.g., (1000, 5, 10) for 1000 samples, 5 agents, 10 misreports each
         # Adjust NUM_MISREPORTS_PER_AGENT if the shape differs

    # --- Helper Functions ---

    def calculate_social_cost(agent_peaks, facility_locations, weights):
        """Calculates the weighted social cost for one instance."""
        if not isinstance(facility_locations, (list, np.ndarray)) or len(facility_locations) == 0:
            return np.inf # Invalid locations
        if not isinstance(agent_peaks, (list, np.ndarray)):
             return np.inf # Invalid peaks

        agent_peaks = np.array(agent_peaks)
        facility_locations = np.array(facility_locations)
        weights = np.array(weights)

        # Calculate distance from each agent to their nearest facility
        # Expand dims for broadcasting: agent_peaks (n, 1), facility_locations (k,) -> distances (n, k)
        distances = np.abs(agent_peaks[:, np.newaxis] - facility_locations)
        min_distances = np.min(distances, axis=1) # Shape (n,)

        # Calculate weighted average cost
        weighted_cost = np.sum(min_distances * weights) # Uses normalized weights implicitly
        # If weights were not normalized, divide by np.sum(weights)
        return weighted_cost

    def calculate_regret(mechanism_func, instance_idx, true_peaks_all, misreports_all, weights, k_facilities):
        """
        Calculates the maximum regret for a given mechanism on a specific instance.
        Assumes misreports_all shape is (num_samples, num_agents, num_misreports)
        """
        if misreports_all is None:
            # print("Misreport data not available, skipping regret calculation.")
            return 0.0 # Cannot calculate regret

        true_peaks_instance = true_peaks_all[instance_idx]
        misreports_instance = misreports_all[instance_idx] # Shape (num_agents, num_misreports)

        # 1. Calculate cost with true reports
        true_locations = mechanism_func(true_peaks_instance, k_facilities, weights)
        true_cost_per_agent = np.min(np.abs(true_peaks_instance[:, np.newaxis] - np.array(true_locations)), axis=1)

        max_regret_instance = 0.0

        # 2. Iterate through each agent potentially misreporting
        for agent_i in range(len(true_peaks_instance)):
            max_gain_agent_i = 0.0
            # 3. Iterate through each possible misreport for that agent
            # Ensure misreports_instance has the expected dimensions
            if misreports_instance.ndim == 2 and misreports_instance.shape[0] == len(true_peaks_instance):
                 for misreport_j in range(misreports_instance.shape[1]):
                    misreported_peak = misreports_instance[agent_i, misreport_j]
                    # Create the peak profile with agent_i misreporting
                    misreport_profile = np.copy(true_peaks_instance)
                    misreport_profile[agent_i] = misreported_peak

                    # 4. Run mechanism with the misreported profile
                    misreport_locations = mechanism_func(misreport_profile, k_facilities, weights)

                    # 5. Calculate agent_i's cost *using their true peak* but with the new locations
                    cost_with_misreport = np.min(np.abs(true_peaks_instance[agent_i] - np.array(misreport_locations)))

                    # 6. Calculate gain (true cost - cost when misreporting)
                    gain = true_cost_per_agent[agent_i] - cost_with_misreport

                    # 7. Track max gain for this agent
                    if gain > max_gain_agent_i:
                        max_gain_agent_i = gain
            else:
                 # Handle cases where misreport structure might be different or missing
                 # print(f"Warning: Unexpected misreport structure for instance {instance_idx}, agent {agent_i}. Shape: {misreports_instance.shape}")
                 pass # Skip regret calculation for this agent/instance if structure is wrong

            # 8. Track max regret across all agents for this instance
            if max_gain_agent_i > max_regret_instance:
                max_regret_instance = max_gain_agent_i

        return max_regret_instance


    # --- Baseline Mechanism Implementations ---

    def percentile_rule(agent_peaks, k_facilities, weights=None):
        """Implements a fixed percentile rule (median-based partitioning)."""
        n_agents = len(agent_peaks)
        sorted_peaks = np.sort(agent_peaks)

        if k_facilities == 0:
            return []
        if k_facilities == 1:
            return [np.median(sorted_peaks)]
        if k_facilities >= n_agents: # Place facility at each unique peak
             return sorted(list(np.unique(sorted_peaks)))[:k_facilities]

        # Partition agents into K groups and find median of each
        locations = []
        indices = np.linspace(0, n_agents, k_facilities + 1, dtype=int)
        for i in range(k_facilities):
            group = sorted_peaks[indices[i]:indices[i+1]]
            if len(group) > 0:
                locations.append(np.median(group))
            # Handle potential empty groups if k is large (though covered by k>=n check)
            # else: locations.append(np.median(sorted_peaks)) # Fallback, unlikely needed

        # Ensure K locations are returned, handle edge cases if partitioning failed
        while len(locations) < k_facilities and len(locations) > 0:
             locations.append(locations[-1]) # Duplicate last location
        if not locations: # If K>0 but partitioning failed completely
             locations = [np.median(sorted_peaks)] * k_facilities


        return sorted(locations) # Return sorted locations

    def dictatorial_rule(agent_peaks, k_facilities, weights, dictator_index):
        """Places all K facilities at the dictator's peak."""
        if dictator_index < 0 or dictator_index >= len(agent_peaks):
            print(f"Warning: Invalid dictator index {dictator_index}")
            return [np.median(agent_peaks)] * k_facilities # Fallback
        return [agent_peaks[dictator_index]] * k_facilities

    def best_dictatorial_rule(agent_peaks, k_facilities, weights):
        """Finds the best dictator for a specific instance."""
        n_agents = len(agent_peaks)
        best_cost = np.inf
        best_locations = []

        for i in range(n_agents):
            locations = dictatorial_rule(agent_peaks, k_facilities, weights, i)
            cost = calculate_social_cost(agent_peaks, locations, weights)
            if cost < best_cost:
                best_cost = cost
                best_locations = locations

        # Handle case where no valid dictator found (shouldn't happen if peaks are valid)
        if not best_locations:
             median_loc = np.median(agent_peaks)
             best_locations = [median_loc] * k_facilities

        return best_locations

    def constant_rule(agent_peaks, k_facilities, weights=None):
        """Uses fixed, evenly spaced locations."""
        if k_facilities == 0:
            return []
        # Simple fixed locations: 1/(K+1), 2/(K+1), ..., K/(K+1)
        locations = [ (i + 1.0) / (k_facilities + 1.0) for i in range(k_facilities) ]
        return locations

    # --- Evaluation Loop ---
    print(f"\n--- Evaluating Baselines on Test Set ---")
    print(f"Setting: Distribution={DISTRIBUTION}, Agents={NUM_AGENTS}, Facilities={K_FACILITIES}")

    baseline_results = {
        "Percentile (Median Partition)": {"costs": [], "regrets": []},
        "Best Dictatorial": {"costs": [], "regrets": []},
        "Constant (Evenly Spaced)": {"costs": [], "regrets": []},
    }

    num_test_samples = test_peaks.shape[0]
    start_time = time.time()

    for i in range(num_test_samples):
        instance_peaks = test_peaks[i]

        # Evaluate Percentile Rule
        perc_locs = percentile_rule(instance_peaks, K_FACILITIES)
        perc_cost = calculate_social_cost(instance_peaks, perc_locs, AGENT_WEIGHTS)
        perc_regret = calculate_regret(percentile_rule, i, test_peaks, test_misreports, AGENT_WEIGHTS, K_FACILITIES)
        baseline_results["Percentile (Median Partition)"]["costs"].append(perc_cost)
        baseline_results["Percentile (Median Partition)"]["regrets"].append(perc_regret)

        # Evaluate Best Dictatorial Rule
        bdict_locs = best_dictatorial_rule(instance_peaks, K_FACILITIES, AGENT_WEIGHTS)
        bdict_cost = calculate_social_cost(instance_peaks, bdict_locs, AGENT_WEIGHTS)
        # Regret for best dictatorial is tricky: which dictator rule was chosen *for this instance*?
        # We need to pass the *specific* best dictator index found for this instance to calculate_regret,
        # or recalculate regret based on the *best_dictatorial_rule* function itself.
        # For simplicity here, we recalculate based on the function.
        bdict_regret = calculate_regret(best_dictatorial_rule, i, test_peaks, test_misreports, AGENT_WEIGHTS, K_FACILITIES)
        baseline_results["Best Dictatorial"]["costs"].append(bdict_cost)
        baseline_results["Best Dictatorial"]["regrets"].append(bdict_regret)

        # Evaluate Constant Rule
        const_locs = constant_rule(instance_peaks, K_FACILITIES)
        const_cost = calculate_social_cost(instance_peaks, const_locs, AGENT_WEIGHTS)
        const_regret = calculate_regret(constant_rule, i, test_peaks, test_misreports, AGENT_WEIGHTS, K_FACILITIES)
        baseline_results["Constant (Evenly Spaced)"]["costs"].append(const_cost)
        baseline_results["Constant (Evenly Spaced)"]["regrets"].append(const_regret)

        if (i + 1) % 100 == 0:
            print(f"Processed {i+1}/{num_test_samples} instances...")

    end_time = time.time()
    print(f"\nEvaluation finished in {end_time - start_time:.2f} seconds.")

    # --- Report Results ---
    print("\n--- Average Results on Test Set ---")
    for name, results in baseline_results.items():
        avg_cost = np.mean(results["costs"]) if results["costs"] else np.nan
        avg_regret = np.mean(results["regrets"]) if results["regrets"] else np.nan
        print(f"{name}:")
        print(f"  Avg. Social Cost: {avg_cost:.6f}")
        if test_misreports is not None:
             print(f"  Avg. Max Regret:  {avg_regret:.6f}")
        else:
             print("  Avg. Max Regret:  Not calculated (misreports data missing)")

else:
    print("\nCannot proceed with evaluation as test data failed to load.")

print("\n--- End of Baseline Evaluation ---")


Loading data for setting: (uniform, 5)

Successfully loaded test peaks. Shape: (1000, 5)
Successfully loaded test misreports. Shape: (10000, 5)

--- Evaluating Baselines on Test Set ---
Setting: Distribution=uniform, Agents=5, Facilities=2
Processed 100/1000 instances...
Processed 200/1000 instances...
Processed 300/1000 instances...
Processed 400/1000 instances...
Processed 500/1000 instances...
Processed 600/1000 instances...
Processed 700/1000 instances...
Processed 800/1000 instances...
Processed 900/1000 instances...
Processed 1000/1000 instances...

Evaluation finished in 0.27 seconds.

--- Average Results on Test Set ---
Percentile (Median Partition):
  Avg. Social Cost: 0.088011
  Avg. Max Regret:  0.000000
Best Dictatorial:
  Avg. Social Cost: 0.200382
  Avg. Max Regret:  0.000000
Constant (Evenly Spaced):
  Avg. Social Cost: 0.138031
  Avg. Max Regret:  0.000000

--- End of Baseline Evaluation ---
