In [1]:
# Process Mining Visualizations with Plotly
# ========================================

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import networkx as nx
from datetime import datetime, timedelta

# Read the event log and parse the event timestamp column into pandas
# datetimes in a single chained expression.
df = pd.read_csv('input/BPI2020_DomesticDeclarations.csv').assign(
    **{'time:timestamp': lambda events: pd.to_datetime(events['time:timestamp'])}
)

# 1. Process Map Visualization
# ---------------------------



In [2]:
# Order events chronologically within each case so that consecutive rows of a
# case are consecutive steps of its trace.
df_sorted = df.sort_values(['case:id', 'time:timestamp'])

# Pair each event with the activity that follows it in the same case. The last
# event of a case has no successor (NaN) and is dropped from the transitions.
df_sorted['next_activity'] = df_sorted.groupby('case:id')['concept:name'].shift(-1)
transition_df = df_sorted.dropna(subset=['next_activity'])

# Count how often each (activity -> next activity) transition occurs.
transitions = (
    transition_df.groupby(['concept:name', 'next_activity'])
    .size()
    .reset_index(name='weight')
    .sort_values('weight', ascending=False)
)

# Build a DIRECTED graph. The default undirected Graph would merge A -> B and
# B -> A into a single edge, keeping only one of the two counts and losing the
# direction that the hover text reports.
G = nx.from_pandas_edgelist(
    transitions,
    'concept:name',
    'next_activity',
    edge_attr='weight',
    create_using=nx.DiGraph
)

# Calculate node positions using spring layout (seeded for a stable picture)
pos = nx.spring_layout(G, seed=42)

# Node degree on a DiGraph is in-degree + out-degree; it drives marker size
# and color.
node_degrees = dict(G.degree())
node_sizes = [degree * 10 for degree in node_degrees.values()]

# The largest transition count is the normalisation reference for every edge,
# so compute it once instead of on every loop iteration.
max_weight = transitions['weight'].max()

# Create one trace per edge so each edge can carry its own width and opacity
edge_traces = []
for source, target, attrs in G.edges(data=True):
    x0, y0 = pos[source]
    x1, y1 = pos[target]
    weight = attrs['weight']

    # Normalize weight for better visualization:
    # width in (1, 6], opacity in (0.5, 1.0]
    share = weight / max_weight
    width = 1 + 5 * share
    opacity = 0.5 + 0.5 * share

    edge_traces.append(go.Scatter(
        x=[x0, x1, None],
        y=[y0, y1, None],
        line=dict(width=width, color='rgba(150,150,150,{})'.format(opacity)),
        hoverinfo='text',
        text=f'{source} → {target}<br>Count: {weight}',
        mode='lines'
    ))

# Create nodes
node_names = list(G.nodes())
node_x = [pos[node][0] for node in node_names]
node_y = [pos[node][1] for node in node_names]

node_trace = go.Scatter(
    x=node_x, y=node_y,
    mode='markers+text',
    text=node_names,
    textposition="bottom center",
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        size=node_sizes,
        color=list(node_degrees.values()),
        colorbar=dict(
            thickness=15,
            title='Node Connections',
            xanchor='left'
        ),
        line_width=2
    ),
    hoverinfo='text',
    hovertext=[f'{node}<br>Connections: {node_degrees[node]}' for node in node_names]
)

# Create figure: edges first so the node markers are drawn on top of them
fig = go.Figure(data=edge_traces + [node_trace],
                layout=go.Layout(
                    title='Process Flow Network',
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20, l=5, r=5, t=40),
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    height=800,
                    width=1000
                ))
fig.show()



In [3]:
# 2. Case Variants Sunburst Chart
# ------------------------------

# Get the first 5 steps of each case
variant_steps = 5
case_variants = df_sorted.groupby('case:id')['concept:name'].apply(lambda x: list(x)[:variant_steps]).reset_index()

root_label = 'All Cases'

# One record per (case, trace prefix). Each record carries the prefix's full
# path (used as a unique node id), the label to display (the last activity of
# the prefix) and the path of its parent prefix.
sunburst_data = []
for _, row in case_variants.iterrows():
    parent_path = root_label
    for depth, activity in enumerate(row['concept:name'], 1):
        path = f'{parent_path}/{activity}'
        sunburst_data.append({
            'case': row['case:id'],
            'path': path,
            'label': activity,
            'parent': parent_path,
            'depth': depth
        })
        parent_path = path

# Count how many cases share each prefix
path_counts = (
    pd.DataFrame(sunburst_data, columns=['case', 'path', 'label', 'parent', 'depth'])
    .groupby(['path', 'label', 'parent'])
    .size()
    .reset_index(name='count')
)

# Prepare ids, labels, parents and values; the root node comes first and
# represents every case.
ids = [root_label] + path_counts['path'].tolist()
labels = [root_label] + path_counts['label'].tolist()
parents = [''] + path_counts['parent'].tolist()
values = [len(case_variants)] + path_counts['count'].tolist()

# Create sunburst chart.
# `ids` is required here: `parents` holds full paths, while the same activity
# label appears in many branches, so labels alone cannot identify a node.
# `maxdepth` counts the root ring as well, hence variant_steps + 1 to show all
# `variant_steps` activity levels.
fig = go.Figure(go.Sunburst(
    ids=ids,
    labels=labels,
    parents=parents,
    values=values,
    branchvalues="total",
    maxdepth=variant_steps + 1,
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percentage: %{percentEntry:.2%}<extra></extra>'
))

fig.update_layout(
    title='Case Variants Sunburst (First {} Activities)'.format(variant_steps),
    margin=dict(t=30, l=0, r=0, b=0),
    height=700
)
fig.show()



In [4]:
# 3. Performance Analysis Heatmap
# ------------------------------

# Calculate case durations from the first and last event of every case
case_start = df.groupby('case:id')['time:timestamp'].min().reset_index()
case_end = df.groupby('case:id')['time:timestamp'].max().reset_index()

case_duration = pd.merge(case_start, case_end, on='case:id', suffixes=('_start', '_end'))
case_duration['duration_days'] = (
    case_duration['time:timestamp_end'] - case_duration['time:timestamp_start']
).dt.total_seconds() / (60*60*24)

# Add amount information (first amount value recorded for the case)
case_amount = df.groupby('case:id')['case:Amount'].first().reset_index()
case_analysis = pd.merge(case_duration, case_amount, on='case:id')

# Add activity count (number of events per case)
activity_count = df.groupby('case:id').size().reset_index(name='activity_count')
case_analysis = pd.merge(case_analysis, activity_count, on='case:id')

# Create binned columns for heatmap. `labels=False` yields integer bin codes;
# `duplicates='drop'` merges identical quantile edges, so a column may end up
# with FEWER than 10 bins.
case_analysis['duration_bin'] = pd.qcut(case_analysis['duration_days'], 10, labels=False, duplicates='drop')
case_analysis['amount_bin'] = pd.qcut(case_analysis['case:Amount'], 10, labels=False, duplicates='drop')
case_analysis['activity_bin'] = pd.qcut(case_analysis['activity_count'], 10, labels=False, duplicates='drop')

# Create pivot table for heatmap: number of cases per (duration bin, amount bin)
heatmap_pivot = pd.pivot_table(
    case_analysis,
    values='case:id',
    index='duration_bin',
    columns='amount_bin',
    aggfunc='count'
).fillna(0)

# Derive the axis labels from the bins that actually exist rather than
# assuming ten per axis; a hard-coded list of ten labels no longer matches the
# matrix shape once qcut has dropped duplicate edges.
amount_labels = [f'D{int(code) + 1}' for code in heatmap_pivot.columns]
duration_labels = [f'D{int(code) + 1}' for code in heatmap_pivot.index]

# Create heatmap
fig = px.imshow(
    heatmap_pivot,
    labels=dict(x="Amount (Decile)", y="Duration (Decile)", color="Number of Cases"),
    x=amount_labels,
    y=duration_labels,
    color_continuous_scale="Viridis",
    title="Relationship Between Case Duration and Declaration Amount"
)
fig.update_layout(
    xaxis_title="Declaration Amount (Increasing →)",
    yaxis_title="Case Duration (Increasing →)"
)
fig.show()



In [5]:
# 4. Activity Timeline
# ------------------

# Create a timeline of activities for a few sample cases
sample_cases = df['case:id'].unique()[:10]  # Take first 10 cases

# One record per event: when it happened relative to the first event of its
# case (in days), plus who performed it.
gantt_data = []

for case_id in sample_cases:
    case_df = df[df['case:id'] == case_id].sort_values('time:timestamp')

    start_time = case_df['time:timestamp'].min()

    for _, row in case_df.iterrows():
        relative_day = (row['time:timestamp'] - start_time).total_seconds() / (60*60*24)

        gantt_data.append({
            'Case': case_id,
            'Activity': row['concept:name'],
            'Start': relative_day,
            'Duration': 0.2,  # Fixed small duration for visibility
            'Resource': row['org:resource'],
            'Role': row['org:role']
        })

# Convert to DataFrame
gantt_df = pd.DataFrame(gantt_data)

# Draw the Gantt chart as horizontal bars on a NUMERIC axis: each bar starts at
# `Start` (via `base`) and is `Duration` days long. px.timeline is not usable
# here because it converts x_start/x_end to datetimes, which would turn these
# day offsets into timestamps next to the 1970 epoch.
fig = px.bar(
    gantt_df,
    base='Start',
    x='Duration',
    y='Case',
    orientation='h',
    color='Activity',
    hover_data=['Resource', 'Role'],
    barmode='overlay',  # bars are positioned by `base`, never stacked
    title="Activity Timeline for Sample Cases (Days from Case Start)",
    height=600
)

fig.update_xaxes(title_text="Days from Case Start")
# Treat case ids as categories and show the first sampled case at the top
fig.update_yaxes(type="category", autorange="reversed")
fig.show()



In [6]:
# 5. Resource Allocation Analysis
# -----------------------------

# Resource activity over time: number of events per (calendar month, resource).
# The month key is built directly as a 'YYYY-MM' string, which avoids the
# pandas frequency alias 'M' (deprecated in favour of 'ME') and therefore works
# unchanged on both older and newer pandas releases.
event_month = df['time:timestamp'].dt.strftime('%Y-%m').rename('month')
resource_time = df.groupby([event_month, 'org:resource']).size().reset_index(name='count')

# Pivot the data for the heatmap: months as rows, resources as columns;
# combinations with no events become 0.
resource_pivot = resource_time.pivot(index='month', columns='org:resource', values='count').fillna(0)

# Create heatmap
fig = px.imshow(
    resource_pivot,
    labels=dict(x="Resource", y="Month", color="Activity Count"),
    color_continuous_scale="Viridis",
    title="Resource Workload Over Time"
)
fig.update_layout(
    xaxis_title="Resource",
    yaxis_title="Month",
    height=600
)
fig.show()




'M' is deprecated and will be removed in a future version, please use 'ME' instead.



In [7]:
# 6. Statistical Process Control Chart
# ----------------------------------

# Tag every case with the 'YYYY-MM' month in which it started
monthly_stats = case_duration.assign(
    month=case_duration['time:timestamp_start'].dt.strftime('%Y-%m')
)

# Per-month mean, standard deviation and number of case durations,
# ordered chronologically
monthly_duration = (
    monthly_stats.groupby('month')['duration_days']
    .agg(['mean', 'std', 'count'])
    .reset_index()
    .sort_values('month')
)

# 3-sigma control limits. The centre is the mean of the monthly means and the
# sigma is the mean of the monthly standard deviations of individual case
# durations (i.e. not the standard error of a monthly mean).
overall_mean = monthly_duration['mean'].mean()
overall_std = monthly_duration['std'].mean()

monthly_duration = monthly_duration.assign(
    ucl=overall_mean + 3 * overall_std,
    lcl=np.maximum(0, overall_mean - 3 * overall_std)  # lower limit is clamped at zero
)

# Describe the four lines of the control chart, in drawing order
months = monthly_duration['month']
control_chart_traces = [
    dict(
        y=monthly_duration['mean'],
        mode='lines+markers',
        name='Average Duration',
        line=dict(color='blue'),
        marker=dict(size=8)
    ),
    dict(
        y=monthly_duration['ucl'],
        mode='lines',
        name='Upper Control Limit (UCL)',
        line=dict(color='red', dash='dash')
    ),
    dict(
        y=monthly_duration['lcl'],
        mode='lines',
        name='Lower Control Limit (LCL)',
        line=dict(color='red', dash='dash')
    ),
    dict(
        y=[overall_mean] * len(monthly_duration),
        mode='lines',
        name='Process Average',
        line=dict(color='green', dash='dot')
    ),
]

# Create control chart
fig = go.Figure()
for trace_kwargs in control_chart_traces:
    fig.add_trace(go.Scatter(x=months, **trace_kwargs))

fig.update_layout(
    title='Statistical Process Control: Monthly Average Case Duration',
    xaxis_title='Month',
    yaxis_title='Case Duration (Days)',
    hovermode='closest',
    height=500
)
fig.show()



In [8]:
# 7. Advanced Metrics: Bottleneck Analysis
# --------------------------------------

# For every (case, activity) pair: first and last time the activity occurs in
# the case, and the span between the two in hours
activity_instances = (
    df.groupby(['case:id', 'concept:name'])['time:timestamp']
    .agg(['min', 'max'])
    .reset_index()
)
instance_span = activity_instances['max'] - activity_instances['min']
activity_instances['duration_hours'] = instance_span.dt.total_seconds() / 3600

# Mean span and number of instances per activity, longest first
activity_duration = (
    activity_instances.groupby('concept:name')['duration_hours']
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('mean', ascending=False)
)

# "Waiting time" of an event = hours elapsed since the previous event of the
# same case (undefined for the first event of a case)
df_sorted['prev_timestamp'] = df_sorted.groupby('case:id')['time:timestamp'].shift(1)
elapsed_since_prev = df_sorted['time:timestamp'] - df_sorted['prev_timestamp']
df_sorted['waiting_hours'] = elapsed_since_prev.dt.total_seconds() / 3600

# Mean waiting time per activity, longest first
waiting_by_activity = (
    df_sorted.dropna(subset=['waiting_hours'])
    .groupby('concept:name')['waiting_hours']
    .mean()
    .reset_index()
    .sort_values('waiting_hours', ascending=False)
)

# Combine both measures; an activity missing from one side gets 0 there
bottleneck_analysis = activity_duration.merge(
    waiting_by_activity,
    on='concept:name',
    how='outer'
).fillna(0)

# Rank by combined time and keep the ten largest
bottleneck_analysis['total_time'] = bottleneck_analysis['mean'] + bottleneck_analysis['waiting_hours']
bottleneck_analysis = bottleneck_analysis.sort_values('total_time', ascending=False).head(10)

# Create stacked bar chart: one bar series per time component
bar_series = [
    ('mean', 'Processing Time', 'rgb(55, 83, 109)'),
    ('waiting_hours', 'Waiting Time', 'rgb(219, 64, 82)'),
]

fig = go.Figure()
for column, series_name, series_color in bar_series:
    fig.add_trace(go.Bar(
        x=bottleneck_analysis['concept:name'],
        y=bottleneck_analysis[column],
        name=series_name,
        marker_color=series_color
    ))

fig.update_layout(
    title='Top 10 Process Bottlenecks: Processing vs. Waiting Time',
    xaxis_title='Activity',
    yaxis_title='Time (Hours)',
    barmode='stack',
    xaxis_tickangle=-45,
    height=600
)
fig.show()



In [9]:
# 8. Process Conformance Analysis
# -----------------------------

# Define the expected process (simplified for this example)
expected_sequence = [
    'Declaration SUBMITTED by EMPLOYEE',
    'Declaration APPROVED by ADMINISTRATION',
    'Declaration APPROVED by SUPERVISOR',
    'Request Payment',
    'Payment Handled'
]

# Build the chronological activity list of every case in ONE pass over the
# data. Filtering and sorting the full frame once per case scales with
# (number of cases x number of events) and is needlessly slow on a real log.
# A stable sort keeps the file order for events with identical timestamps.
case_traces = (
    df.sort_values('time:timestamp', kind='stable')
    .groupby('case:id', sort=False)['concept:name']
    .agg(list)
    .to_dict()
)

# A case conforms when the expected activities occur in its trace in the given
# order; other activities may occur in between (subsequence match).
conformance_results = []

for case_id in df['case:id'].unique():
    case_activities = case_traces.get(case_id, [])

    # Walk the trace, advancing through the expected sequence on each match
    expected_idx = 0
    for activity in case_activities:
        if activity == expected_sequence[expected_idx]:
            expected_idx += 1
            if expected_idx == len(expected_sequence):
                break  # every expected step was found

    conformance_results.append({
        'case_id': case_id,
        'conforms': expected_idx == len(expected_sequence),
        # Expected steps from the first unmatched one onward; empty if conformant
        'missing_activities': expected_sequence[expected_idx:]
    })

# Convert to DataFrame (explicit columns keep the frame well-formed even when
# there are no cases)
conformance_df = pd.DataFrame(
    conformance_results,
    columns=['case_id', 'conforms', 'missing_activities']
)

# Calculate conformance rate (percentage of conformant cases)
conformance_rate = conformance_df['conforms'].mean() * 100

# Create pie chart for conformance
conformant_count = conformance_df['conforms'].sum()
labels = ['Conformant', 'Non-conformant']
values = [
    conformant_count,
    len(conformance_df) - conformant_count
]

fig = go.Figure(data=[go.Pie(
    labels=labels,
    values=values,
    hole=.4,
    marker_colors=['rgb(133, 189, 140)', 'rgb(219, 64, 82)']
)])

fig.update_layout(
    title_text=f'Process Conformance Analysis: {conformance_rate:.1f}% Conformant Cases',
    annotations=[dict(text=f'{conformance_rate:.1f}%', x=0.5, y=0.5, font_size=20, showarrow=False)]
)
fig.show()



In [10]:
# 9. Advanced Visualization: 3D Process Cube
# ----------------------------------------

# Create a 3D visualization of the process cube (time, activity, resource)
# Sample to reduce data points for better visualization; the fixed
# random_state makes the same sample come back on every run.
sampled_df = df.sample(min(5000, len(df)), random_state=42)

# Place activities and resources on integer positions (one per distinct value,
# in sorted order). Python's built-in hash() of a string is salted per process,
# so hash-based positions change between kernel restarts, can collide, and
# leave the axes without readable labels.
activity_codes, activity_names = pd.factorize(sampled_df['concept:name'], sort=True)
resource_codes, resource_names = pd.factorize(sampled_df['org:resource'], sort=True)

# Create figure
fig = go.Figure(data=[go.Scatter3d(
    x=sampled_df['time:timestamp'].astype(np.int64) // 10**9,  # Convert to Unix timestamp
    y=activity_codes,
    z=resource_codes,
    mode='markers',
    marker=dict(
        size=5,
        color=sampled_df['case:Amount'],
        colorscale='Viridis',
        opacity=0.8,
        colorbar=dict(title='Amount')
    ),
    text=[f"Case: {case}<br>Activity: {activity}<br>Resource: {resource}<br>Amount: {amount}"
          for case, activity, resource, amount in zip(
              sampled_df['case:id'],
              sampled_df['concept:name'],
              sampled_df['org:resource'],
              sampled_df['case:Amount']
          )],
    hoverinfo='text'
)])

# Look shared by the three scene axes
axis_style = dict(
    gridcolor='rgb(255, 255, 255)',
    zerolinecolor='rgb(255, 255, 255)',
    showbackground=True,
    backgroundcolor='rgb(230, 230,230)'
)

# Update layout; the categorical axes show the real names at their positions
fig.update_layout(
    title='3D Process Cube: Time × Activity × Resource',
    scene=dict(
        xaxis=dict(title=dict(text='Time'), **axis_style),
        yaxis=dict(
            title=dict(text='Activity'),
            tickmode='array',
            tickvals=list(range(len(activity_names))),
            ticktext=[str(name) for name in activity_names],
            **axis_style
        ),
        zaxis=dict(
            title=dict(text='Resource'),
            tickmode='array',
            tickvals=list(range(len(resource_names))),
            ticktext=[str(name) for name in resource_names],
            **axis_style
        )
    ),
    height=700
)
fig.show()



In [11]:
# 10. Process Performance Dashboard
# -------------------------------

# 2x2 grid: line chart and bar chart on top, box plot and pie chart below
panel_titles = (
    'Average Case Duration by Month',
    'Number of Cases by Month',
    'Declaration Amount Statistics',
    'Activity Distribution'
)
panel_specs = [
    [{"type": "scatter"}, {"type": "bar"}],
    [{"type": "box"}, {"type": "pie"}]
]
fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)

# --- Data for the four panels -------------------------------------------

# Mean case duration per start month, in chronological order
monthly_duration_pivot = (
    monthly_stats.groupby('month')['duration_days']
    .mean()
    .reset_index()
    .sort_values('month')
)

# Number of cases per start month, in chronological order
monthly_cases = (
    monthly_stats.groupby('month')
    .size()
    .reset_index(name='count')
    .sort_values('month')
)

# The five most frequent activities and their event counts
top_activities = (
    df['concept:name']
    .value_counts()
    .head(5)
    .rename_axis('Activity')
    .reset_index(name='Count')
)

# --- Traces, each paired with its grid position ---------------------------
dashboard_panels = [
    # 1. Average Case Duration by Month
    (go.Scatter(
        x=monthly_duration_pivot['month'],
        y=monthly_duration_pivot['duration_days'],
        mode='lines+markers',
        marker=dict(color='blue'),
        name='Avg Duration'
    ), 1, 1),
    # 2. Number of Cases by Month
    (go.Bar(
        x=monthly_cases['month'],
        y=monthly_cases['count'],
        marker=dict(color='green'),
        name='Case Count'
    ), 1, 2),
    # 3. Declaration Amount Statistics
    (go.Box(
        y=case_analysis['case:Amount'],
        name='Amount',
        marker=dict(color='red'),
        boxmean=True
    ), 2, 1),
    # 4. Activity Distribution
    (go.Pie(
        labels=top_activities['Activity'],
        values=top_activities['Count'],
        textinfo='label+percent',
        hole=.3
    ), 2, 2),
]

for trace, grid_row, grid_col in dashboard_panels:
    fig.add_trace(trace, row=grid_row, col=grid_col)

# Update layout
fig.update_layout(
    title_text="Process Performance Dashboard",
    height=800,
    showlegend=False
)

# Axis titles: both top-row panels share a month x-axis with slanted ticks
for grid_col, y_title in ((1, "Duration (Days)"), (2, "Number of Cases")):
    fig.update_xaxes(title_text="Month", row=1, col=grid_col, tickangle=45)
    fig.update_yaxes(title_text=y_title, row=1, col=grid_col)
fig.update_yaxes(title_text="Declaration Amount", row=2, col=1)

fig.show()