# VRP Raw Data Exploration

In [1]:
import json
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import numpy as np

# Get the directory of the current notebook
notebook_dir = os.path.dirname(os.path.abspath("__file__"))
# Set the working directory to the project root (one level up from the notebook directory)
analysis_dir = os.path.join(notebook_dir, os.pardir)
project_root = os.path.join(analysis_dir, os.pardir)
os.chdir(project_root)

# You can verify the new working directory
print(f"Current Working Directory: {os.getcwd()}")

from notebooks.analysis.common_analysis_functions import (
    load_benchmark_data, parse_json_columns, filter_optimal_solutions,
    plot_objective_distribution, plot_solve_time_distribution,
    get_baseline_objective, read_baseline_log
)

# Configuration
VRP_MODEL_FILE_PATH = 'models/VRP/vrp_model.py'
VRP_MODEL_DATA_PATH = 'models/VRP/data/vrp_data_10cust_2veh.json'

# Extract data configuration string (e.g., '10cust_2veh')
data_config_match = re.search(r'vrp_data_([\w_]+)\.json', VRP_MODEL_DATA_PATH)
data_config_str = data_config_match.group(1) if data_config_match else 'default'

OUTPUT_PLOTS_DIR = Path(f'results/vrp_raw_data_exploration_plots/{data_config_str}')
OUTPUT_PLOTS_DIR.mkdir(parents=True, exist_ok=True)

BASELINE_LOG_FILEPATH = Path(f'notebooks/baseline_vrp_log_{data_config_str}.csv')

print(f"Baseline log will be saved to: {BASELINE_LOG_FILEPATH}")
print(f"Plots will be saved to: {OUTPUT_PLOTS_DIR}")

Current Working Directory: /home/timpi/Projects/thesis/multi_agent_supply_chain_optimization
Baseline log will be saved to: notebooks/baseline_vrp_log_10cust_2veh.csv
Plots will be saved to: results/vrp_raw_data_exploration_plots/10cust_2veh


## 1. Load Raw Data

In [2]:
try:
    with open(VRP_MODEL_DATA_PATH, 'r') as f:
        vrp_raw_data = json.load(f)
    print(f"Successfully loaded raw VRP data from {VRP_MODEL_DATA_PATH}")
    print("Keys available:", vrp_raw_data.keys())
except FileNotFoundError:
    print(f"Error: Data file not found at {VRP_MODEL_DATA_PATH}")
    vrp_raw_data = {}
except json.JSONDecodeError as e:
    print(f"Error decoding JSON from {VRP_MODEL_DATA_PATH}: {e}")
    vrp_raw_data = {}

# Get baseline objective
baseline_obj_value = get_baseline_objective(VRP_MODEL_FILE_PATH, VRP_MODEL_DATA_PATH, baseline_log_filepath=str(BASELINE_LOG_FILEPATH))
baseline_df = read_baseline_log(BASELINE_LOG_FILEPATH)

Successfully loaded raw VRP data from models/VRP/data/vrp_data_10cust_2veh.json
Keys available: dict_keys(['coords', 'distance', 'demand', 'vehicle_capacity', 'num_vehicles', 'depot'])
Running baseline model from models/VRP/vrp_model.py with data models/VRP/data/vrp_data_10cust_2veh.json...

log - Running optimization model...

log - Executing model source code...

log - Model execution completed.

log - Extracting optimization results...

log - Optimization results extracted.
Logging run 'baseline_run' to 'notebooks/baseline_vrp_log_10cust_2veh.csv'...
Successfully logged 1 runs to notebooks/baseline_vrp_log_10cust_2veh.csv

log - Optimization Completed.
Baseline objective value: 330.1
Baseline log file loaded from notebooks/baseline_vrp_log_10cust_2veh.csv.


## 2. Explore Data Structure and Contents

In [3]:
if vrp_raw_data:
    distance_matrix = vrp_raw_data.get('distance', [])
    demands = vrp_raw_data.get('demand', [])
    vehicle_capacity = vrp_raw_data.get('vehicle_capacity')
    num_vehicles = vrp_raw_data.get('num_vehicles')
    depot_idx = vrp_raw_data.get('depot', 0)

    num_nodes = len(distance_matrix)
    num_customers = len([d for i, d in enumerate(demands) if i != depot_idx])

    print(f"\nNumber of Nodes (Customers + Depot): {num_nodes}")
    print(f"Number of Customers: {num_customers}")
    print(f"Number of Vehicles: {num_vehicles}")
    print(f"Vehicle Capacity: {vehicle_capacity}")
    print(f"Depot Index: {depot_idx}")

    print("\n--- Demands ---")
    print(f"Count: {len(demands)}")
    if demands: 
        customer_demands = [d for i, d in enumerate(demands) if i != depot_idx]
        if customer_demands: print(f"Min: {np.min(customer_demands):.2f}, Max: {np.max(customer_demands):.2f}, Avg: {np.mean(customer_demands):.2f}, Std: {np.std(customer_demands):.2f}")
    print("First 5 values (including depot if present):", demands[:5])

    print("\n--- Distance Matrix ---")
    print(f"Matrix Shape: ({len(distance_matrix)}x{len(distance_matrix[0]) if distance_matrix else 0})")
    if distance_matrix:
        print("First 3x3 sub-matrix:")
        for i, row in enumerate(distance_matrix):
            if i < 3:
                print([f'{x:.2f}' for x in row[:3]])
            else:
                break
        # Flatten for statistics
        flat_dist = [item for sublist in distance_matrix for item in sublist]
        if flat_dist: print(f"Min: {np.min(flat_dist):.2f}, Max: {np.max(flat_dist):.2f}, Avg: {np.mean(flat_dist):.2f}, Std: {np.std(flat_dist):.2f}")
else:
    print("No raw data loaded for exploration.")


Number of Nodes (Customers + Depot): 11
Number of Customers: 10
Number of Vehicles: 2
Vehicle Capacity: 50
Depot Index: 0

--- Demands ---
Count: 11
Min: 3.00, Max: 9.00, Avg: 5.50, Std: 2.25
First 5 values (including depot if present): [0, 3, 7, 4, 9]

--- Distance Matrix ---
Matrix Shape: (11x11)
First 3x3 sub-matrix:
['0.00', '50.17', '82.42']
['50.17', '0.00', '72.64']
['82.42', '72.64', '0.00']
Min: 0.00, Max: 111.04, Avg: 50.85, Std: 27.76


## 3. Basic Data Visualizations

In [4]:
if vrp_raw_data:
    demands = vrp_raw_data.get('demand', [])
    distance_matrix = vrp_raw_data.get('distance', [])
    depot_idx = vrp_raw_data.get('depot', 0)

    # Demands distribution (excluding depot)
    customer_demands = [d for i, d in enumerate(demands) if i != depot_idx]
    if customer_demands:
        plt.figure(figsize=(8, 5))
        sns.histplot(customer_demands, kde=True)
        plt.title('Distribution of Customer Demands')
        plt.xlabel('Demand Value')
        plt.ylabel('Frequency')
        plt.savefig(OUTPUT_PLOTS_DIR / 'customer_demands_distribution.png')
        plt.close()
        print(f"Plot saved: {OUTPUT_PLOTS_DIR / 'customer_demands_distribution.png'}")

    # Distance matrix heatmap (if not too large)
    if distance_matrix:
        dist_df = pd.DataFrame(distance_matrix)
        if dist_df.shape[0] <= 20 and dist_df.shape[1] <= 20: # Limit for readability
            plt.figure(figsize=(10, 8))
            sns.heatmap(dist_df, annot=True, cmap='viridis', fmt=".1f")
            plt.title('Distance Matrix Heatmap')
            plt.xlabel('Node Index')
            plt.ylabel('Node Index')
            plt.savefig(OUTPUT_PLOTS_DIR / 'distance_matrix_heatmap.png')
            plt.close()
            print(f"Plot saved: {OUTPUT_PLOTS_DIR / 'distance_matrix_heatmap.png'}")
        else:
            print("Distance matrix too large for heatmap visualization. Plotting distribution instead.")
            # Plot distribution of flattened distance values
            flat_dist = [item for sublist in distance_matrix for item in sublist]
            if flat_dist:
                plt.figure(figsize=(8, 5))
                sns.histplot(flat_dist, kde=True, color='purple')
                plt.title('Distribution of Distance Values')
                plt.xlabel('Distance Value')
                plt.ylabel('Frequency')
                plt.savefig(OUTPUT_PLOTS_DIR / 'distance_values_distribution.png')
                plt.close()
                print(f"Plot saved: {OUTPUT_PLOTS_DIR / 'distance_values_distribution.png'}")
else:
    print("No raw data loaded for visualization.")

Plot saved: results/vrp_raw_data_exploration_plots/10cust_2veh/customer_demands_distribution.png
Plot saved: results/vrp_raw_data_exploration_plots/10cust_2veh/distance_matrix_heatmap.png


## 4. Route Visualization (Example from Benchmark Results)

In [5]:
def reconstruct_vrp_routes(variables_dict, data_coords, data_num_vehicles, data_depot):
    """
    Reconstructs vehicle routes from a dictionary of solved PuLP variables.
    Assumes 'x_i_j' variable naming for flow from node i to node j.
    """
    succ = {}
    # Extract all unique node indices from variable names to initialize succ dictionary
    nodes = sorted({int(var_name.split('_')[1]) for var_name in variables_dict if var_name.startswith('x_')})
    succ = {i: [] for i in nodes}

    for var_name, var_value in variables_dict.items():
        if var_name.startswith('x_') and var_value > 0.5: # Check if arc is used
            parts = var_name.split('_')
            if len(parts) == 3: # x_i_j
                _, i, j = parts
            elif len(parts) == 4: # x_i_j_k (fallback for robustness, though user implies x_i_j)
                _, i, j, _ = parts # Ignore vehicle index
            else:
                continue # Skip malformed variable names
            succ[int(i)].append(int(j))

    routes = []
    # This set will track all customers visited across all routes
    all_visited_customers = set()

    for v_idx in range(data_num_vehicles):
        route = [data_depot] # Start with depot
        cur = data_depot
        
        # Keep track of nodes visited in the current route to prevent infinite loops on cycles
        # and to ensure customers are not revisited by the same vehicle in a single route
        visited_in_current_route = {data_depot}

        # Find the starting arc from depot for this vehicle
        next_node_from_depot = None
        for nxt in succ.get(data_depot, []):
            if nxt != data_depot and nxt not in all_visited_customers:
                next_node_from_depot = nxt
                break
        
        if next_node_from_depot is None:
            # No unvisited customer reachable from depot for a new route
            break

        route.append(next_node_from_depot)
        visited_in_current_route.add(next_node_from_depot)
        cur = next_node_from_depot

        while True:
            if cur not in succ or not succ[cur]:
                break # No outgoing arcs from current node

            next_node_found = False
            for nxt in succ[cur]:
                # Ensure next node is not already visited in this route (for cycles)
                # and if it's a customer, ensure it hasn't been visited by *any* vehicle yet
                if nxt not in visited_in_current_route and (nxt == data_depot or nxt not in all_visited_customers):
                    route.append(nxt)
                    visited_in_current_route.add(nxt)
                    cur = nxt
                    next_node_found = True
                    break
            
            if not next_node_found:
                break # No valid next node found, break to avoid infinite loop

            if cur == data_depot:
                # Route completed by returning to depot
                if len(route) > 1: # Ensure it's not just [depot, depot]
                    routes.append(route)
                    # Add all customers from this route to the globally visited set
                    for node in route:
                        if node != data_depot:
                            all_visited_customers.add(node)
                break # Route complete, move to next vehicle
    return routes

def plot_routes(data_coords, data_depot, data_num_vehicles, routes, plot_filename):
    """
    Plot the VRP solution: nodes and vehicle routes.
    """
    coords = np.array(data_coords)
    depot = data_depot
    num_vehicles = data_num_vehicles
    total_nodes = len(coords)

    plt.figure(figsize=(10, 10))
    # Plot customers and depot
    plt.scatter(coords[1:, 0], coords[1:, 1], c='blue', label='Customers', zorder=2)
    plt.scatter(coords[depot, 0], coords[depot, 1], c='red', marker='s', s=150, label='Depot', zorder=3)

    # Add labels for each node
    for i in range(total_nodes):
        plt.annotate(str(i), (coords[i, 0], coords[i, 1]),
                     textcoords="offset points", xytext=(5,5), ha='center', fontsize=9)

    # Define a color palette for routes
    colors = sns.color_palette('tab10', n_colors=num_vehicles)

    # Plot each route
    for idx, route in enumerate(routes):
        pts = coords[route]
        plt.plot(pts[:, 0], pts[:, 1], linestyle='-', marker='o', markersize=5, color=colors[idx % len(colors)], label=f'Vehicle {idx+1}', linewidth=1.5, zorder=1)

    plt.title(f'Vehicle Routing Solution ({num_nodes-1} Customers, {num_vehicles} Vehicles)')
    plt.xlabel('X-coordinate')
    plt.ylabel('Y-coordinate')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    plt.grid(True)
    plt.tight_layout()
    plot_path = OUTPUT_PLOTS_DIR / plot_filename
    plt.savefig(plot_path)
    plt.close()
    print(f"Plot saved: {plot_path}")

# --- Example Route Visualization ---
print("\n--- Example Route Visualization ---")

vrp_baseline_df = baseline_df
vrp_baseline_df = parse_json_columns(vrp_baseline_df, ['parameters', 'constraints', 'variables'])

if not vrp_baseline_df.empty:
    sample_run = vrp_baseline_df.iloc[0] # Take the first optimal run
    print(f"Analyzing sample run_id: {sample_run['run_id']}")


    # Try to get data from baseline_df's parameters, fallback to raw data
    data_coords = sample_run.get('parameters', {}).get('coords', [])
    data_num_vehicles = sample_run.get('parameters', {}).get('num_vehicles')
    data_depot = sample_run.get('parameters', {}).get('depot', 0)
    
    # If not found in parameters, try vrp_raw_data (which might be empty)
    if not data_coords and vrp_raw_data:
        data_coords = vrp_raw_data.get('coords', [])
    if data_num_vehicles is None and vrp_raw_data:
        data_num_vehicles = vrp_raw_data.get('num_vehicles')
    if data_depot == 0 and vrp_raw_data: # Assuming 0 is default and might be overridden
        data_depot = vrp_raw_data.get('depot', 0)

    variables_dict = sample_run.get('variables', {})

    # Ensure data_num_vehicles is not None before proceeding
    if data_num_vehicles is None:
        print("Warning: Number of vehicles not found in baseline data or raw data. Defaulting to 1.")
        data_num_vehicles = 1 # Default to 1 if not found

    if data_coords and variables_dict:
            reconstructed_routes = reconstruct_vrp_routes(variables_dict, data_coords, data_num_vehicles, data_depot)
            print(f"Reconstructed {len(reconstructed_routes)} routes.")
            for i, route in enumerate(reconstructed_routes):
                print(f"  Route {i+1}: {route}")
            
            plot_routes(data_coords, data_depot, data_num_vehicles, reconstructed_routes, 'sample_vrp_route.png')



--- Example Route Visualization ---
Analyzing sample run_id: baseline_run
Reconstructed 0 routes.
Plot saved: results/vrp_raw_data_exploration_plots/10cust_2veh/sample_vrp_route.png


## 5. Conclusion

In [6]:
print("VRP raw data exploration complete. Plots saved to:", OUTPUT_PLOTS_DIR)

VRP raw data exploration complete. Plots saved to: results/vrp_raw_data_exploration_plots/10cust_2veh
