## Final code

In [1]:
import ast
import logging
import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Union, Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

@dataclass
class VisualizationConfig:
    """Settings for the critical-route analysis and its plots.

    Raises:
        ValueError: if ``support_weight + utilization_weight`` is not
            (approximately) 1.
    """
    # Root directory; input CSVs are read from it and results are written
    # into sub-directories beneath it.
    output_dir: str
    # Road-history CSV file names, relative to ``output_dir``.
    data_files: List[str]
    # Day category (e.g. 'workday' or 'holiday'); used in output paths.
    day_type: str
    # Path prefix of the frequent-path CSVs; the time slot and ".csv" are
    # appended to it.
    frequent_path: str
    # Time slots to process, one full set of outputs per slot.
    time_slots: List[int]
    # Weights of support and normalised utilization in the composite score.
    support_weight: float = 0.5
    utilization_weight: float = 0.5
    # Path lengths that get their own plot; None selects [2, 3, 4, 5].
    path_lengths_to_analyze: Optional[List[int]] = None
    # Number of top routes shown in the composite-score plot.
    k_critical_path: int = 20
    # True: reuse previously saved critical-route CSVs instead of computing.
    has_critical_path: bool = False
    # Maximum number of routes shown in each per-length plot.
    top_k: int = 10
    # Font size applied to axis labels and ticks.
    text_size: int = 12
    # Default figure size in inches (width, height).
    fig_size: Tuple[int, int] = (15, 10)

    def __post_init__(self):
        if self.path_lengths_to_analyze is None:
            # A fresh list per instance, so instances never share the default.
            self.path_lengths_to_analyze = [2, 3, 4, 5]
        # Explicit check instead of ``assert``: asserts are removed when
        # Python runs with -O, which would silently skip the validation.
        if not np.isclose(self.support_weight + self.utilization_weight, 1.0):
            raise ValueError("Weights must sum to 1")

class PathAnalyzer:
    """Scores frequent paths by combining support with road utilization."""

    # Exceptions ast.literal_eval is documented to raise on malformed input.
    _PARSE_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    @staticmethod
    def get_path_length(items: Union[str, List, Any]) -> int:
        """Return the number of roads in a path.

        Args:
            items: a list of road ids, or the string form of such a list
                (as read back from a CSV file).

        Returns:
            ``len(items)`` when *items* is (or parses to) a list, otherwise 1.
        """
        if isinstance(items, str):
            try:
                # literal_eval only accepts Python literals, so text coming
                # from a file can never execute code (unlike eval).
                items = ast.literal_eval(items)
            except PathAnalyzer._PARSE_ERRORS:
                return 1
        return len(items) if isinstance(items, list) else 1

    @staticmethod
    def calculate_path_utilization(road_ids: Union[str, List], road_utilization: pd.DataFrame) -> float:
        """Return the mean utilization of the roads that make up a path.

        Args:
            road_ids: a list of road ids, the string form of such a list, or
                a single road id.
            road_utilization: frame with ``road_id`` and ``utilization``
                columns.

        Returns:
            Mean ``utilization`` of the rows whose ``road_id`` is in the path,
            or 0.0 when no row matches.
        """
        if isinstance(road_ids, str):
            raw_text = road_ids
            try:
                parsed = ast.literal_eval(raw_text)
            except PathAnalyzer._PARSE_ERRORS:
                parsed = None
            # Text that is not a container literal (unparseable, or a bare
            # scalar such as "6252") is treated as one road id. Passing a
            # parsed scalar straight to isin() would raise TypeError.
            if isinstance(parsed, (list, tuple, set, dict)):
                road_ids = parsed
            else:
                road_ids = [raw_text]
        elif not isinstance(road_ids, list):
            road_ids = [road_ids]

        on_path = road_utilization['road_id'].isin(road_ids)
        path_utils = road_utilization.loc[on_path, 'utilization']
        return path_utils.mean() if not path_utils.empty else 0.0

    def calculate_critical_routes(
        self,
        freq_itemsets: pd.DataFrame,
        road_utilization: pd.DataFrame,
        support_weight: float,
        utilization_weight: float,
        total_transactions: int
    ) -> pd.DataFrame:
        """Rank frequent paths by a weighted support/utilization score.

        Args:
            freq_itemsets: frame with ``items`` (path) and ``freq`` columns.
            road_utilization: frame with ``road_id`` and ``utilization``
                columns.
            support_weight: weight of ``support`` in the composite score.
            utilization_weight: weight of the min-max normalised average
                utilization in the composite score.
            total_transactions: divisor that turns ``freq`` into ``support``.

        Returns:
            Frame sorted by descending composite score with columns
            ``route_id`` (1-based rank), ``items``, ``freq``, ``support``,
            ``avg_utilization`` and ``composite_score``; the last three are
            rounded to two decimals. The input frame is not modified.
        """
        analysis_df = freq_itemsets.copy()

        analysis_df['support'] = analysis_df['freq'] / total_transactions

        analysis_df['avg_utilization'] = analysis_df['items'].apply(
            lambda x: self.calculate_path_utilization(x, road_utilization)
        )

        # Min-max normalisation; when every path has the same utilization the
        # range is zero, so the normalised value is defined as 0.
        min_util = analysis_df['avg_utilization'].min()
        max_util = analysis_df['avg_utilization'].max()
        analysis_df['normalized_utilization'] = (
            (analysis_df['avg_utilization'] - min_util) / (max_util - min_util)
            if max_util > min_util else 0
        )

        analysis_df['composite_score'] = (
            support_weight * analysis_df['support'] +
            utilization_weight * analysis_df['normalized_utilization']
        )

        # Sort on the unrounded score so rounding cannot change the ranking.
        result = (analysis_df
                  .sort_values('composite_score', ascending=False)
                  .reset_index(drop=True))

        result['route_id'] = result.index + 1

        for col in ['support', 'avg_utilization', 'composite_score']:
            result[col] = result[col].round(2)

        return result[['route_id', 'items', 'freq', 'support', 'avg_utilization', 'composite_score']]

class PathVisualizer:
    """Builds the matplotlib figures used to report critical routes.

    Font size, default figure size and ``top_k`` are read from the
    ``VisualizationConfig`` given at construction.
    """

    def __init__(self, config: VisualizationConfig):
        self.config = config
        self.path_analyzer = PathAnalyzer()

    def _with_path_length(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with a ``path_length`` column computed from ``items``."""
        analysis_df = df.copy()
        # One shared implementation instead of a private copy per plot method.
        analysis_df['path_length'] = analysis_df['items'].apply(
            self.path_analyzer.get_path_length
        )
        return analysis_df

    def visualize_critical_routes(self, critical_routes: pd.DataFrame, k_critical_path: int) -> plt.Figure:
        """Bar chart of the composite score of the top-k routes.

        Args:
            critical_routes: frame with a ``composite_score`` column.
            k_critical_path: maximum number of routes to show.

        Returns:
            The created figure; the caller saves and closes it.
        """
        fig, ax = plt.subplots(figsize=self.config.fig_size)

        top_k_routes = critical_routes.nlargest(
            k_critical_path, 'composite_score'
        )
        # nlargest returns fewer than k rows when the frame is short, so the
        # x positions must follow the actual row count or bar() raises on
        # the length mismatch.
        ranks = range(1, len(top_k_routes) + 1)

        ax.bar(ranks, top_k_routes['composite_score'],
               color='#66b3ff', edgecolor='black', width=0.7)

        # The label follows k instead of a hard-coded 20.
        ax.set_xlabel(f'Top {k_critical_path} most utilized paths', fontsize=self.config.text_size)
        ax.set_ylabel('Composite Priority Score (CPS)', fontsize=self.config.text_size)

        ax.tick_params(axis='x', labelsize=self.config.text_size, rotation=45)
        ax.tick_params(axis='y', labelsize=self.config.text_size)

        ax.grid(True, linestyle='--', alpha=0.6, axis='y', linewidth=0.7)

        # Integer ticks on the x-axis. The locator set on the second line
        # takes over from the fixed ticks set on the first.
        ax.set_xticks(ranks)
        ax.xaxis.set_major_locator(plt.MaxNLocator(integer=True))

        # Two-decimal float labels on the y-axis.
        ax.yaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))

        plt.tight_layout()

        return fig

    def plot_length_distribution(self, critical_path_df: pd.DataFrame) -> plt.Figure:
        """
        Plot how many routes there are of each path length.

        Parameters:
        -----------
        critical_path_df : pd.DataFrame
            DataFrame with an 'items' column holding each route's road list

        Returns:
        --------
        plt.Figure
            Figure object containing the visualization
        """
        analysis_df = self._with_path_length(critical_path_df)

        fig, ax = plt.subplots(figsize=self.config.fig_size)

        length_dist = analysis_df['path_length'].value_counts().sort_index()
        if length_dist.empty:
            # pandas refuses to bar-plot an empty series; show a note instead.
            ax.text(0.5, 0.5, 'No paths found', ha='center', va='center')
        else:
            length_dist.plot(kind='bar', ax=ax, color='#66b3ff', edgecolor='black', width=0.7)

        ax.set_xlabel('Route length', fontsize=self.config.text_size, labelpad=15)
        ax.set_ylabel('Frequency', fontsize=self.config.text_size, labelpad=15)

        ax.tick_params(axis='x', labelsize=self.config.text_size, rotation=45)
        ax.tick_params(axis='y', labelsize=self.config.text_size)
        ax.grid(True, alpha=0.3, axis='y', linestyle='--', linewidth=0.5)

        plt.tight_layout()
        return fig

    def plot_specific_length_paths(
        self, df: pd.DataFrame, path_length: int, custome_fig_size: Tuple[int, int]
    ) -> Tuple[plt.Figure, pd.DataFrame]:
        """
        Plot the top routes of one length: composite score and frequency side by side.

        Parameters:
        -----------
        df : pd.DataFrame
            DataFrame with 'items', 'freq' and 'composite_score' columns
        path_length : int
            Length of paths to visualize
        custome_fig_size : Tuple[int, int]
            Figure size in inches (width, height)

        Returns:
        --------
        Tuple[plt.Figure, pd.DataFrame]
            The figure, and a frame with columns 'Path' and 'CPS' listing the
            plotted routes (empty when no route has the requested length)
        """
        analysis_df = self._with_path_length(df)

        fig, ax1 = plt.subplots(figsize=custome_fig_size)
        ax2 = ax1.twinx()  # second y-axis for the frequency bars

        specific_paths = analysis_df[analysis_df['path_length'] == path_length].nlargest(
            self.config.top_k, 'composite_score'
        )
        # Each route is labelled with the text of its road list.
        x_labels = [f'{items}' for items in specific_paths['items']]

        if not specific_paths.empty:
            x_space_pos = np.arange(len(x_labels))
            width = 0.40

            ax1.bar(x_space_pos - width/2, specific_paths['composite_score'],
                    width, color='#66b3ff', label='Composite Priority Score (CPS)')

            ax2.bar(x_space_pos + width/2, specific_paths['freq'],
                    width, color='#ff6347', label='Frequency')

            ax1.set_xlabel('Route', fontsize=self.config.text_size)
            ax1.set_ylabel('Composite Priority Score (CPS)', fontsize=self.config.text_size)
            ax1.tick_params(axis='y', labelsize=self.config.text_size)

            ax2.set_ylabel('Frequency', fontsize=self.config.text_size)
            ax2.tick_params(axis='y', labelsize=self.config.text_size)

            ax1.set_xticks(x_space_pos)
            ax1.set_xticklabels(x_labels, rotation=90, ha='right', fontsize=self.config.text_size)

            ax1.grid(True, axis='x', linestyle='--', alpha=0.6)

            lines1, labels1 = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()

            # One combined legend for both axes, placed near the top of the figure.
            fig.legend(lines1 + lines2, labels1 + labels2,
                       loc='center',
                       bbox_to_anchor=(0.5, .9),
                       ncol=2,
                       fontsize=self.config.text_size)
        else:
            ax1.text(0.5, 0.5, f'No paths of length {path_length} found',
                     ha='center', va='center')

        # Leave room above the axes for the legend.
        fig.subplots_adjust(top=0.85)

        # Tabular copy of what was plotted, for the caller to export.
        output_df = pd.DataFrame({
            'Path': x_labels,
            'CPS': specific_paths['composite_score'].values
        })

        return fig, output_df

    def plot_max_length_paths(self, df: pd.DataFrame) -> plt.Figure:
        """
        Plot the top routes of maximum length with three metrics per route.

        Parameters:
        -----------
        df : pd.DataFrame
            DataFrame with 'items', 'freq', 'composite_score',
            'avg_utilization' and 'route_id' columns

        Returns:
        --------
        plt.Figure
            Figure object containing the visualization
        """
        analysis_df = self._with_path_length(df)

        fig, ax1 = plt.subplots(figsize=self.config.fig_size)

        ax2 = ax1.twinx()  # frequency
        ax3 = ax1.twinx()  # average utilization

        # Push the third axis outwards so it does not sit on top of the second.
        ax3.spines['right'].set_position(('outward', 60))

        max_length = analysis_df['path_length'].max()
        max_paths = analysis_df[analysis_df['path_length'] == max_length].nlargest(
            self.config.top_k, 'composite_score'
        )

        if not max_paths.empty:
            # Evenly spaced positions rather than the frame's index values:
            # the index is an arbitrary subset after filtering, which spread
            # the groups unevenly and let neighbouring bars overlap.
            positions = np.arange(len(max_paths))
            width = 0.25  # three bars per route fit inside one unit

            ax1.bar(positions, max_paths['composite_score'],
                    width, color='lightblue', label='Composite Score')

            ax2.bar(positions - width, max_paths['freq'],
                    width, color='lightgreen', label='Frequency')

            ax3.bar(positions + width, max_paths['avg_utilization'],
                    width, color='purple', label='Avg Utilization')

            ax1.set_xlabel('Route index', fontsize=self.config.text_size)

            ax1.set_ylabel('Composite Score', color='blue', fontsize=self.config.text_size)
            ax2.set_ylabel('Frequency', color='green', fontsize=self.config.text_size)
            ax3.set_ylabel('Avg. Utilization', color='purple', fontsize=self.config.text_size)

            ax1.tick_params(axis='y', labelcolor='blue', labelsize=self.config.text_size)
            ax2.tick_params(axis='y', labelcolor='green', labelsize=self.config.text_size)
            ax3.tick_params(axis='y', labelcolor='purple', labelsize=self.config.text_size)

            # Label every group with its route_id.
            ax1.set_xticks(positions)
            ax1.set_xticklabels(max_paths['route_id'], fontsize=self.config.text_size)

            ax1.grid(True, alpha=0.3)

            # One legend that covers all three axes.
            lines1, labels1 = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            lines3, labels3 = ax3.get_legend_handles_labels()
            ax1.legend(lines1 + lines2 + lines3,
                       labels1 + labels2 + labels3,
                       loc='upper right',
                       bbox_to_anchor=(1.4, 1))
        else:
            ax1.text(0.5, 0.5, f'No paths of length {max_length} found',
                     ha='center', va='center', fontsize=self.config.text_size-2)

        plt.tight_layout()
        return fig

    def _add_path_details_table(self, df: pd.DataFrame, fig: plt.Figure, ax: plt.Axes) -> None:
        """
        Add a two-column table (path, composite score) below the plot.

        Parameters:
        -----------
        df : pd.DataFrame
            DataFrame with 'items' and 'composite_score' columns
        fig : plt.Figure
            Figure whose layout is adjusted to make room for the table
        ax : plt.Axes
            Axes the table is attached to
        """
        table_data = [
            [str(row['items']),
             f"{row['composite_score']:.3f}"]  # score with 3 decimal places
            for _, row in df.iterrows()
        ]

        if table_data:
            table = ax.table(
                cellText=table_data,
                colLabels=['Path', 'Composite Score'],
                loc='bottom',
                bbox=[0, -0.50, 1, 0.3]
            )

            table.auto_set_font_size(False)
            table.set_fontsize(self.config.text_size)
            table.scale(1, 1.5)

            # Cells are keyed by (row, column). The Path text lives in
            # column 0, so that is the column that needs wrapping.
            for (_, col), cell in table.get_celld().items():
                if col == 0:
                    cell.set_text_props(wrap=True)

            # The table has exactly two columns.
            table.auto_set_column_width([0, 1])

            # Reserve the bottom quarter of the figure for the table.
            fig.subplots_adjust(bottom=0.25)
            fig.tight_layout(rect=[0, 0.25, 1, 1])

class GraphDataLoader:
    """Reads road-history CSV files and slices them by time slot."""

    @staticmethod
    def load_road_history(output_dir: str, data_files: List[str]) -> pd.DataFrame:
        """Load and stack the road-history files found under *output_dir*.

        Only the ``time``, ``road_id``, ``utilization`` and
        ``inv_utilization`` columns of each file are kept.

        Raises:
            ValueError: when *data_files* yields no frame at all.
            Exception: any error raised while reading a file is logged and
                re-raised unchanged.
        """
        wanted = ['time', 'road_id', 'utilization', 'inv_utilization']
        frames = []

        for file_name in data_files:
            try:
                source = Path(output_dir) / file_name
                frames.append(pd.read_csv(source)[wanted])
                logger.info(f"Successfully loaded {file_name}")
            except Exception as e:
                logger.error(f"Error loading {file_name}: {str(e)}")
                raise

        # Guard clause: nothing was loaded, so there is nothing to combine.
        if not frames:
            raise ValueError("No data files were successfully loaded")

        combined_df = pd.concat(frames, ignore_index=True)
        logger.info(f"Combined {len(frames)} dataframes with {len(combined_df)} total rows")
        return combined_df

    @staticmethod
    def get_road_utility(road_history, time_slot=48):
        """Return ``road_id``/``utilization`` for one time slot.

        ``road_id`` is converted to ``str`` and the result gets a fresh
        0-based index; *road_history* itself is left untouched.
        """
        in_slot = road_history['time'] == time_slot
        selected = road_history.loc[in_slot, ['road_id', 'utilization']].copy()
        selected['road_id'] = selected['road_id'].astype(str)
        return selected.reset_index(drop=True)

def main(config: VisualizationConfig):
    """Compute (or reload) critical routes and render every plot per time slot.

    For each slot in ``config.time_slots`` this:
      1. obtains the critical routes: read from the saved CSV when
         ``config.has_critical_path`` is true, otherwise computed from the
         road history and the frequent-path file and then saved;
      2. saves the path-length distribution plot;
      3. saves one plot per length in ``config.path_lengths_to_analyze`` and
         a CSV listing the plotted paths;
      4. saves the composite-score plot of the top ``config.k_critical_path``
         routes.

    Outputs go under ``<output_dir>/critical_routes/<day_type>`` and
    ``<output_dir>/visualizations/<day_type>``.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    # File-name suffixes carried by some workday frequent-path files.
    freq_file_suffix = {
        'workday': {48: '_sup0.0815', 78: '_sup0.085', 108: '_sup0.082'},
    }
    # Transaction totals per day type and time slot (node size 500).
    transaction_stats = {
        'workday': {48: 224195, 78: 228404, 108: 224193},
        'holiday': {48: 210230, 78: 218566, 108: 212580},
    }

    try:
        data_loader = GraphDataLoader()
        path_analyzer = PathAnalyzer()
        path_visualizer = PathVisualizer(config)

        output_root = Path(config.output_dir)
        routes_dir = output_root / f'critical_routes/{config.day_type}'
        vis_dir = output_root / f'visualizations/{config.day_type}'

        # The road history is only needed when routes must be computed.
        if not config.has_critical_path:
            road_history = data_loader.load_road_history(
                config.output_dir,
                config.data_files
            )

        for time_slot in config.time_slots:
            logger.info(f"\nProcessing time slot: {time_slot}")

            critical_routes_path_file = (
                routes_dir / f'{config.day_type}_critical_routes_t{time_slot}.csv'
            )

            if config.has_critical_path:
                # Reuse the routes saved by an earlier run.
                critical_routes = pd.read_csv(critical_routes_path_file)
            else:
                road_utilization_df = data_loader.get_road_utility(road_history, time_slot)

                ext = freq_file_suffix.get(config.day_type, {}).get(time_slot, '')
                freq_path_file = f"{config.frequent_path}{time_slot}{ext}.csv"
                freq_itemsets_df = pd.read_csv(freq_path_file)

                total_transactions = transaction_stats[config.day_type][time_slot]

                critical_routes = path_analyzer.calculate_critical_routes(
                    freq_itemsets_df,
                    road_utilization_df,
                    config.support_weight,
                    config.utilization_weight,
                    total_transactions
                )

                routes_dir.mkdir(parents=True, exist_ok=True)
                critical_routes.to_csv(critical_routes_path_file, index=False)

            vis_dir.mkdir(parents=True, exist_ok=True)

            # Path-length distribution. Saving through the figure object
            # avoids depending on pyplot's implicit "current figure".
            fig = path_visualizer.plot_length_distribution(critical_routes)
            fig.savefig(
                vis_dir / f'critical_paths_distribution_{config.day_type}_t{time_slot}.png',
                bbox_inches='tight', dpi=300
            )
            plt.close(fig)

            # One plot per requested path length, plus a combined table.
            custome_fig_size = (15, 18)
            per_length_frames = []
            for path_length in config.path_lengths_to_analyze:
                fig, path_df = path_visualizer.plot_specific_length_paths(
                    critical_routes, path_length, custome_fig_size
                )
                fig.savefig(
                    vis_dir / f'{config.day_type}_utilize_paths_len{path_length}_t{time_slot}.png',
                    bbox_inches='tight', dpi=300
                )
                plt.close(fig)

                path_df['length'] = path_length
                # Empty frames add no rows; skipping them keeps pandas from
                # warning about concatenating empty entries.
                if not path_df.empty:
                    per_length_frames.append(path_df)

            # Concatenate once, instead of growing an initially empty frame
            # on every iteration (the source of the pandas FutureWarning).
            if per_length_frames:
                total_path_df = pd.concat(per_length_frames, ignore_index=True)
            else:
                total_path_df = pd.DataFrame(columns=['Path', 'CPS', 'length'])

            filename = vis_dir / f'{config.day_type}_utilize_paths_t{time_slot}.csv'
            total_path_df.to_csv(filename, index=False)

            # NOTE: PathVisualizer.plot_max_length_paths is not rendered here.

            # Composite score of the top-k routes.
            fig = path_visualizer.visualize_critical_routes(critical_routes, config.k_critical_path)
            fig.savefig(
                vis_dir / f'{config.day_type}_composite_scores_t{time_slot}.png',
                bbox_inches='tight'
            )
            plt.close(fig)

            logger.info(f"Completed processing time slot {time_slot}")

        logger.info("All visualizations completed successfully")

    except Exception as e:
        logger.error(f"Error in main execution: {str(e)}")
        raise

if __name__ == "__main__":
    # Earlier (non-AHP) workday inputs, kept for reference:
    #   data_files=[
    #       'level1_road_history_workday_utilization_tune.csv',
    #       'level2_road_history_workday_utilization_tune.csv'
    #   ]
    config = VisualizationConfig(
        # Alternative root: "D:/Thesis/files_output_dir/output_utilization/"
        output_dir="C:/Users/mamun_pc/Dropbox/Thesis/data/output_data/output_utilization/tune_utilization/",

        # --- workday inputs: uncomment these and comment out the holiday ones ---
        # data_files=[
        #     'level1_road_history_workday_utilization_tune_AHP.csv',
        #     'level2_road_history_workday_utilization_tune_AHP.csv'
        # ],
        # day_type='workday',
        # # frequent_path='D:/Thesis/files_output_dir/output_utilization/frequent_paths/node_size_500/frequent_path_workday_t',
        # frequent_path='C:/Users/mamun_pc/Dropbox/Thesis/data/output_data/output_utilization/tune_utilization/frequent_path/size_500/frequent_path_workday_t',

        # --- holiday inputs (active) ---
        day_type='holiday',
        data_files=[
            'level1_road_history_holiday_utilization_tune_AHP.csv',
            'level2_road_history_holiday_utilization_tune_AHP.csv',
        ],
        frequent_path='C:/Users/mamun_pc/Dropbox/Thesis/data/output_data/output_utilization/tune_utilization/frequent_path/size_500/frequent_path_holiday_t',

        # --- analysis settings ---
        time_slots=[108],  # available: 48, 78, 108
        support_weight=0.7,
        utilization_weight=0.3,
        path_lengths_to_analyze=[8, 9, 10, 11, 12],  # alternative: 2, 3, 4, 5
        k_critical_path=20,
        top_k=10,
        has_critical_path=True,

        # --- plot styling ---
        text_size=28,
    )

    main(config)
    

2025-04-09 22:29:06,699 - INFO - 
Processing time slot: 108
  total_path_df = pd.concat([total_path_df, path_df], ignore_index=True)
2025-04-09 22:39:12,756 - INFO - Completed processing time slot 108
2025-04-09 22:39:12,758 - INFO - All visualizations completed successfully


In [33]:
# Transaction totals keyed by day type, then by time slot.
transaction_stats = {
    'workday': {48: 224195, 78: 228404, 108: 224193},
    'holiday': {48: 210230, 78: 218566, 108: 212580},
}
total_transactions = transaction_stats['holiday'][78]
total_transactions

218566

In [7]:
import pandas as pd
from pathlib import Path

# Parameters of this cell.
k_critical_path = 20
time_slot = 108

output_dir = "C:/Users/mamun_pc/Dropbox/Thesis/data/output_data/output_utilization/tune_utilization/"

# Holiday inputs (active).
day_type = 'holiday'
data_files = [
    'level1_road_history_holiday_utilization_tune_AHP.csv',
    'level2_road_history_holiday_utilization_tune_AHP.csv',
]

# Workday inputs: swap these in to analyse workdays instead.
# data_files = [
#     'level1_road_history_workday_utilization_tune_AHP.csv',
#     'level2_road_history_workday_utilization_tune_AHP.csv',
# ]
# day_type = 'workday'

def get_road_utility(road_history, time_slot=48):
    """Return a copy of the road_id/utilization columns for one time slot.

    Rows keep their original index labels and ``road_id`` keeps its dtype.
    """
    in_slot = road_history['time'] == time_slot
    return road_history.loc[in_slot, ['road_id', 'utilization']].copy()

# Read the needed columns of every history file, then concatenate once.
# Growing a DataFrame with concat inside the loop re-copies all earlier rows
# on every iteration.
columns = ['time', 'road_id', 'utilization', 'inv_utilization']
history_frames = []
for file_name in data_files:
    file_path = Path(output_dir) / file_name
    df = pd.read_csv(file_path)
    history_frames.append(df[columns])

# NOTE: despite "workday" in the name, this holds the history of whichever
# files are listed in data_files.
road_history_workday_utilization = (
    pd.concat(history_frames, ignore_index=True) if history_frames else pd.DataFrame()
)

# Previously saved critical routes for this day type and time slot.
critical_routes_path_file = Path(output_dir) / f'critical_routes/{day_type}/{day_type}_critical_routes_t{time_slot}.csv'
critical_routes = pd.read_csv(critical_routes_path_file)

top_k_routes = critical_routes.nlargest(k_critical_path, 'composite_score')

# Per-road utilization at the slot under analysis.
road_utilization = get_road_utility(road_history_workday_utilization, time_slot=time_slot)

import ast  # local to this cell, so it runs in a fresh kernel too

# For each top route: list its roads with their utilization, print a summary
# line, and collect the details for export.
route_details = []
for _, row in top_k_routes.iterrows():
    # literal_eval parses the stored list text without executing code.
    road_ids = [int(rid) for rid in ast.literal_eval(row["items"])]
    route_utils = road_utilization[road_utilization['road_id'].isin(road_ids)]

    if route_utils.empty:
        # No road of this route has utilization data at this time slot.
        print(f'Route {row["route_id"]}: No utilization data found for this route.')
        route_details.append({
            'route_id': row['route_id'],
            'route_roads': [],
            'route_utilizations': []
        })
        continue

    # "road_id (utilization)" pairs, comma separated.
    route_util_str = ', '.join(
        f'{road_id} ({util:.2f})'
        for road_id, util in zip(route_utils['road_id'], route_utils['utilization'])
    )
    # Double quotes inside the replacement fields: reusing the outer quote
    # character there is a SyntaxError before Python 3.12.
    print(f'Route {row["route_id"]}: {route_util_str}, support: {row["support"]}, avg_utilization: {row["avg_utilization"]}, composite_score: {row["composite_score"]}')

    route_details.append({
        'route_id': row['route_id'],
        'route_roads': list(route_utils['road_id']),
        'route_utilizations': list(route_utils['utilization']),
        'support': row['support'],
        'avg_utilization': row['avg_utilization'],
        'composite_score': row['composite_score']
    })

route_details_df = pd.DataFrame(route_details)

file_path = Path(output_dir) / f'visualizations/{day_type}/{day_type}_top_routes_t{time_slot}.csv'
route_details_df.to_csv(file_path)
route_details_df



Route 1: 6252 (0.38), support: 0.16, avg_utilization: 0.38, composite_score: 0.41
Route 2: 6252 (0.38), 6350 (0.36), support: 0.16, avg_utilization: 0.37, composite_score: 0.4
Route 3: 6350 (0.36), support: 0.16, avg_utilization: 0.36, composite_score: 0.39
Route 4: 5332 (0.34), support: 0.19, avg_utilization: 0.34, composite_score: 0.39
Route 5: 6251 (0.33), 6252 (0.38), support: 0.16, avg_utilization: 0.35, composite_score: 0.39
Route 6: 6251 (0.33), 6252 (0.38), 6350 (0.36), support: 0.16, avg_utilization: 0.35, composite_score: 0.38
Route 7: 6252 (0.38), 6350 (0.36), 6351 (0.32), support: 0.16, avg_utilization: 0.35, composite_score: 0.38
Route 8: 2396 (0.34), support: 0.17, avg_utilization: 0.34, composite_score: 0.38
Route 9: 6252 (0.38), 6342 (0.33), support: 0.15, avg_utilization: 0.36, composite_score: 0.38
Route 10: 2732 (0.34), support: 0.17, avg_utilization: 0.34, composite_score: 0.38
Route 11: 6252 (0.38), 6342 (0.33), 6350 (0.36), support: 0.15, avg_utilization: 0.36, co

Unnamed: 0,route_id,route_roads,route_utilizations,support,avg_utilization,composite_score
0,1,[6252],[0.3776288707120788],0.16,0.38,0.41
1,2,"[6252, 6350]","[0.3776288707120788, 0.3554324353926731]",0.16,0.37,0.4
2,3,[6350],[0.3554324353926731],0.16,0.36,0.39
3,4,[5332],[0.338703970788673],0.19,0.34,0.39
4,5,"[6251, 6252]","[0.3264222680577252, 0.3776288707120788]",0.16,0.35,0.39
5,6,"[6251, 6252, 6350]","[0.3264222680577252, 0.3776288707120788, 0.355...",0.16,0.35,0.38
6,7,"[6252, 6350, 6351]","[0.3776288707120788, 0.3554324353926731, 0.316...",0.16,0.35,0.38
7,8,[2396],[0.344626446650965],0.17,0.34,0.38
8,9,"[6252, 6342]","[0.3776288707120788, 0.3328964931110548]",0.15,0.36,0.38
9,10,[2732],[0.3401711180889334],0.17,0.34,0.38


## Holiday

In [3]:
import pandas as pd
from pathlib import Path

# Parameters of this cell (holiday data).
day_type = 'holiday'
output_dir = "D:/Thesis/files_output_dir/output_utilization/"
k_critical_path = 20
time_slot = 108

# Holiday road-history files, relative to output_dir.
data_files = [
    'level1_road_history_holiday_utilization_tune.csv',
    'level2_road_history_holiday_utilization_tune.csv',
]

def get_road_utility(road_history, time_slot=48):
    """Return the per-road utilization recorded at a single time slot.

    Args:
        road_history: DataFrame with at least the columns ``time``,
            ``road_id`` and ``utilization``.
        time_slot: Value of ``time`` to select (defaults to 48).

    Returns:
        An independent copy holding only the ``road_id`` and ``utilization``
        columns of the rows whose ``time`` equals ``time_slot``. The rows keep
        their original index labels and ``road_id`` keeps its original dtype.
    """
    in_slot = road_history['time'] == time_slot
    return road_history.loc[in_slot, ['road_id', 'utilization']].copy()

# Load the required columns from every utilization file, then concatenate once.
# Calling pd.concat inside the loop re-copies all previously loaded rows on each
# pass, and it needs an empty seed frame (recent pandas versions may warn when
# empty frames take part in a concat).
columns = ['time', 'road_id', 'utilization', 'inv_utilization']
utilization_frames = []
for file_name in data_files:
    file_path = Path(output_dir) / file_name  # Path handles the separator
    df = pd.read_csv(file_path)
    utilization_frames.append(df[columns])

if utilization_frames:
    road_history_utilization = pd.concat(utilization_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame that
    # still exposes the expected columns.
    road_history_utilization = pd.DataFrame(columns=columns)

# Critical routes stored for this day type and time slot.
critical_routes_path_file = Path(output_dir).joinpath(
    f'frequent_paths/{day_type}/{day_type}_critical_routes_t{time_slot}.csv'
)
critical_routes = pd.read_csv(critical_routes_path_file)

# Keep the k_critical_path routes with the highest composite score.
top_k_routes = critical_routes.nlargest(n=k_critical_path, columns='composite_score')

# Per-road utilization at the slot under analysis, not the function's default slot.
road_utilization = get_road_utility(road_history_utilization, time_slot)

import ast  # local to this cell: used for the safe literal parsing below

# Build one record per top-ranked route: its roads, their utilization in the
# selected time slot, and the route-level scores read from critical_routes.
route_details = []
for _, row in top_k_routes.iterrows():
    # `items` stores the route as the text of a Python list, e.g.
    # "['12051', '12047']". ast.literal_eval parses that text without
    # executing it, which eval would do.
    road_ids = [int(road_id) for road_id in ast.literal_eval(row["items"])]
    # isin() is a membership mask, so the matched rows come out in the row
    # order of road_utilization, not in the order the roads appear in the route.
    route_utils = road_utilization[road_utilization['road_id'].isin(road_ids)]

    # Handle routes for which no road has utilization data in this slot.
    if route_utils.empty:
        print(f'Route {row["route_id"]}: No utilization data found for this route.')
        # Carry the route-level scores here too, so every record has the same keys.
        route_details.append({
            'route_id': row['route_id'],
            'route_roads': [],
            'route_utilizations': [],
            'support': row['support'],
            'avg_utilization': row['avg_utilization'],
            'composite_score': row['composite_score']
        })
        continue  # Skip to the next route

    # Human-readable "road_id (utilization)" list for the log line.
    route_util_str = ', '.join(
        f'{road_id} ({util:.2f})'
        for road_id, util in zip(route_utils['road_id'], route_utils['utilization'])
    )
    # Double quotes inside the single-quoted f-strings: reusing the outer quote
    # character in a replacement field is a SyntaxError before Python 3.12.
    print(
        f'Route {row["route_id"]}: {route_util_str}, '
        f'support: {row["support"]}, '
        f'avg_utilization: {row["avg_utilization"]}, '
        f'composite_score: {row["composite_score"]}'
    )

    route_details.append({
        'route_id': row['route_id'],
        'route_roads': list(route_utils['road_id']),
        'route_utilizations': list(route_utils['utilization']),
        'support': row['support'],
        'avg_utilization': row['avg_utilization'],
        'composite_score': row['composite_score']
    })

route_details_df = pd.DataFrame(route_details)

file_path = Path(output_dir) / f'visualizations/{day_type}/{day_type}_route_details_df_t{time_slot}.csv'
# Make sure the target folder exists; to_csv does not create missing directories.
file_path.parent.mkdir(parents=True, exist_ok=True)
# The DataFrame index is written as an unnamed first column (to_csv default).
route_details_df.to_csv(file_path)
route_details_df


Route 1: 6252 (0.55), support: 0.17, avg_utilization: 0.55, composite_score: 0.42
Route 2: 6252 (0.55), 6350 (0.54), support: 0.17, avg_utilization: 0.54, composite_score: 0.41
Route 3: 6350 (0.54), support: 0.17, avg_utilization: 0.54, composite_score: 0.41
Route 4: 6251 (0.51), 6252 (0.55), 6350 (0.54), support: 0.17, avg_utilization: 0.53, composite_score: 0.4
Route 5: 6251 (0.51), 6252 (0.55), support: 0.17, avg_utilization: 0.53, composite_score: 0.4
Route 6: 5332 (0.52), support: 0.18, avg_utilization: 0.52, composite_score: 0.4
Route 7: 6252 (0.55), 6350 (0.54), 6351 (0.50), support: 0.17, avg_utilization: 0.53, composite_score: 0.4
Route 8: 6252 (0.55), 6342 (0.52), support: 0.16, avg_utilization: 0.54, composite_score: 0.4
Route 9: 6252 (0.55), 6342 (0.52), 6350 (0.54), support: 0.16, avg_utilization: 0.54, composite_score: 0.4
Route 10: 6252 (0.55), 6351 (0.50), support: 0.17, avg_utilization: 0.52, composite_score: 0.4
Route 11: 2396 (0.53), support: 0.16, avg_utilization: 0

Unnamed: 0,route_id,route_roads,route_utilizations,support,avg_utilization,composite_score
0,1,[6252],[0.5479440109655924],0.17,0.55,0.42
1,2,"[6252, 6350]","[0.5479440109655924, 0.5359416861619392]",0.17,0.54,0.41
2,3,[6350],[0.5359416861619392],0.17,0.54,0.41
3,4,"[6251, 6252, 6350]","[0.509702947409311, 0.5479440109655924, 0.5359...",0.17,0.53,0.4
4,5,"[6251, 6252]","[0.509702947409311, 0.5479440109655924]",0.17,0.53,0.4
5,6,[5332],[0.5212053764979008],0.18,0.52,0.4
6,7,"[6252, 6350, 6351]","[0.5479440109655924, 0.5359416861619392, 0.500...",0.17,0.53,0.4
7,8,"[6252, 6342]","[0.5479440109655924, 0.5224831896369657]",0.16,0.54,0.4
8,9,"[6252, 6342, 6350]","[0.5479440109655924, 0.5224831896369657, 0.535...",0.16,0.54,0.4
9,10,"[6252, 6351]","[0.5479440109655924, 0.5003549143928484]",0.17,0.52,0.4


In [13]:
import ast  # local to this cell: used for the safe literal parsing below

# Debug pass over the top-ranked routes: print each route's raw `items` text.
for _, row in top_k_routes.iterrows():
    # Parse the stored list text without executing it, which eval would do.
    road_ids = [int(road_id) for road_id in ast.literal_eval(row["items"])]
    # Kept so it can be inspected through the commented-out prints below.
    route_utils = road_utilization[road_utilization['road_id'].isin(road_ids)]
    print(row["items"])
    # print(road_ids)
    # print(route_utils)

# print(road_utilization.head())
# road_utilization[road_utilization['road_id']==130]['utilization']

['12062']
['12047']
['12051', '12047']
['12047', '13262']
['12052', '12047']
['12051', '12052', '12047']
['12051']
['12051', '12047', '13262']
['12051', '12052', '12047', '13262']
['12052', '12047', '13262']
['12051', '12052']
['12051', '13262']
['12051', '12052', '13262']
['12052']
['13262']
['12052', '13262']
['12051', '12052', '12047', '13262', '12108']
['12051', '12052', '12043', '12047', '13262']
['12051', '12052', '12047', '12108']
['12051', '12052', '12043', '12047']


In [14]:
# Parameters for the path-length drill-down.
path_length = 2  # number of roads a route must contain to be selected
top_k = 10       # how many of the highest-scoring routes of that length to keep

def get_path_length(items):
    """Return the number of roads in a path.

    Args:
        items: A list of road ids, the text of such a list as stored in the
            CSV (for example ``"['12051', '12047']"``), or any other value.

    Returns:
        ``len(items)`` when ``items`` is, or parses to, a list; ``1`` for
        everything else, including text that cannot be parsed.
    """
    import ast  # function-scope import so the function works on its own

    if isinstance(items, str):
        try:
            # literal_eval accepts only Python literals, so text read from a
            # file is parsed without being executed, which eval would do.
            items = ast.literal_eval(items)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a literal: count the raw string as a single-road path.
            return 1
    return len(items) if isinstance(items, list) else 1
            
# Copy of critical_routes with an extra column holding each route's road count.
analysis_df = critical_routes.assign(
    path_length=critical_routes['items'].apply(get_path_length)
)

# Routes with exactly `path_length` roads, reduced to the top_k by composite score.
specific_paths = (
    analysis_df
    .loc[analysis_df['path_length'] == path_length]
    .nlargest(top_k, 'composite_score')
)

specific_paths.head()

Unnamed: 0,route_id,items,freq,support,avg_utilization,composite_score,path_length
2,3,"['12051', '12047']",24984,0.11,0.69,0.35,2
3,4,"['12047', '13262']",25460,0.11,0.68,0.35,2
4,5,"['12052', '12047']",24984,0.11,0.68,0.35,2
10,11,"['12051', '12052']",24984,0.11,0.68,0.34,2
11,12,"['12051', '13262']",24984,0.11,0.68,0.34,2


## test case

In [23]:
time_slot = 48
# Frequent-path CSV for the holiday data at this time slot.
# Written as a plain f-string; only time_slot needs a replacement field.
freq_path_file = f"D:/Thesis/files_output_dir/output_utilization/frequent_paths/frequent_path_holiday_t{time_slot}.csv"

# Read the frequent-path CSV for this time slot.
freq_itemsets_df = pd.read_csv(freq_path_file)
# Independent working copy, so freq_itemsets_df stays exactly as loaded.
analysis_df = freq_itemsets_df.copy(deep=True)
        
# # Calculate support
# total_paths = analysis_df['freq'].sum()
# analysis_df['support'] = analysis_df['freq'] / total_paths

In [None]:
# Support of each itemset = its frequency divided by the denominator below.
# NOTE(review): the denominator is the number of rows (itemsets) in
# analysis_df, not a transaction count and not the highest frequency.
# Confirm this is the intended definition of support.
total_transactions = analysis_df.shape[0]

analysis_df['support'] = analysis_df['freq'].div(total_transactions)

# Show the first ten itemsets with their frequency and support.
print(analysis_df.loc[:, ['items', 'freq', 'support']].head(10))

                                              items   freq   support
0                                          ['2565']  34847  0.015638
1                                  ['2565', '2914']  31644  0.014201
2                          ['2565', '2914', '3197']  31294  0.014044
3                  ['2565', '2914', '3197', '2458']  29663  0.013312
4                  ['2565', '2914', '3197', '2389']  31119  0.013965
5          ['2565', '2914', '3197', '2389', '2458']  29663  0.013312
6          ['2565', '2914', '3197', '2389', '4353']  31119  0.013965
7  ['2565', '2914', '3197', '2389', '4353', '2458']  29663  0.013312
8                  ['2565', '2914', '3197', '4353']  31294  0.014044
9          ['2565', '2914', '3197', '4353', '2458']  29663  0.013312


In [24]:
# Preview the first five rows of the working frame.
analysis_df.head(n=5)

Unnamed: 0,items,freq
0,['2565'],34847
1,"['2565', '2914']",31644
2,"['2565', '2914', '3197']",31294
3,"['2565', '2914', '3197', '2458']",29663
4,"['2565', '2914', '3197', '2389']",31119


In [26]:
# Highest itemset frequency. Series.max() reduces in native code and skips
# NaN, whereas the builtin max() iterates every row at Python level.
analysis_df['freq'].max()

42211

In [29]:
# Number of rows in analysis_df (one row per itemset).
# NOTE(review): despite its name, this is a row count, not a transaction count.
total_transactions = analysis_df.shape[0]
total_transactions

2228319