In [1]:
import pandas as pd# BPI2020 Domestic Declarations - Comprehensive EDA
# ================================================

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import datetime as dt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# 1. Load and prepare the dataset
# ==============================
print("1. Loading and preparing the dataset...")

df = pd.read_csv('input/BPI2020_DomesticDeclarations.csv')
print(f"Dataset shape: {df.shape}")

# Display first few rows
display(df.head())



1. Loading and preparing the dataset...
Dataset shape: (56437, 10)


Unnamed: 0,id,org:resource,concept:name,time:timestamp,org:role,case:id,case:concept:name,case:BudgetNumber,case:DeclarationNumber,case:Amount
0,st_step 86794_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 09:49:50+00:00,EMPLOYEE,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205
1,st_step 86793_0,STAFF MEMBER,Declaration FINAL_APPROVED by SUPERVISOR,2017-01-09 11:27:48+00:00,SUPERVISOR,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205
2,dd_declaration 86791_19,SYSTEM,Request Payment,2017-01-10 09:34:44+00:00,UNDEFINED,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205
3,dd_declaration 86791_20,SYSTEM,Payment Handled,2017-01-12 17:31:22+00:00,UNDEFINED,declaration 86791,declaration 86791,budget 86566,declaration number 86792,26.851205
4,st_step 86798_0,STAFF MEMBER,Declaration SUBMITTED by EMPLOYEE,2017-01-09 10:26:14+00:00,EMPLOYEE,declaration 86795,declaration 86795,budget 86566,declaration number 86796,182.464172


In [2]:
# Convert timestamp to datetime
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'])

# Extract time components for temporal analysis
df['date'] = df['time:timestamp'].dt.date
df['year'] = df['time:timestamp'].dt.year
df['month'] = df['time:timestamp'].dt.month
df['day'] = df['time:timestamp'].dt.day
df['day_of_week'] = df['time:timestamp'].dt.dayofweek
df['hour'] = df['time:timestamp'].dt.hour
df['week'] = df['time:timestamp'].dt.isocalendar().week
df['month_year'] = df['time:timestamp'].dt.strftime('%Y-%m')

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_values[missing_values > 0] if not missing_values.empty else "No missing values")




Missing values in each column:
Series([], dtype: int64)


In [3]:
# 2. Basic statistics and distribution analysis
# ============================================
print("\n2. Analyzing basic statistics and distributions...")

# 2.1 Key metrics
num_cases = df['case:id'].nunique()
num_resources = df['org:resource'].nunique()
num_activities = df['concept:name'].nunique()
num_roles = df['org:role'].nunique()

print(f"Number of cases: {num_cases}")
print(f"Number of unique resources: {num_resources}")
print(f"Number of unique activities: {num_activities}")
print(f"Number of unique roles: {num_roles}")




2. Analyzing basic statistics and distributions...
Number of cases: 10500
Number of unique resources: 2
Number of unique activities: 17
Number of unique roles: 7


In [4]:
# 2.2 Time range
min_date = df['time:timestamp'].min()
max_date = df['time:timestamp'].max()
date_range = (max_date - min_date).days

print(f"Time range: {min_date.date()} to {max_date.date()} ({date_range} days)")

# 2.3 Amount statistics
amount_stats = df['case:Amount'].describe()
print("\nDeclaration amount statistics:")
print(amount_stats)



Time range: 2017-01-09 to 2019-06-17 (889 days)

Declaration amount statistics:
count    56437.000000
mean        93.915101
std        159.169281
min          0.000000
25%         23.681716
50%         43.425108
75%         94.342744
max       3292.536991
Name: case:Amount, dtype: float64


In [5]:
# 3. Activities analysis
# =====================
print("\n3. Analyzing activities...")

# 3.1 Activity frequency
activity_counts = df['concept:name'].value_counts().reset_index()
activity_counts.columns = ['Activity', 'Count']
activity_counts['Percentage'] = (activity_counts['Count'] / len(df) * 100).round(2)

# Display top activities
print("\nTop activities by frequency:")
display(activity_counts.head(10))




3. Analyzing activities...

Top activities by frequency:


Unnamed: 0,Activity,Count,Percentage
0,Declaration SUBMITTED by EMPLOYEE,11531,20.43
1,Declaration FINAL_APPROVED by SUPERVISOR,10131,17.95
2,Payment Handled,10044,17.8
3,Request Payment,10040,17.79
4,Declaration APPROVED by ADMINISTRATION,8202,14.53
5,Declaration APPROVED by BUDGET OWNER,2820,5.0
6,Declaration REJECTED by EMPLOYEE,1365,2.42
7,Declaration REJECTED by ADMINISTRATION,952,1.69
8,Declaration APPROVED by PRE_APPROVER,685,1.21
9,Declaration REJECTED by SUPERVISOR,293,0.52


In [6]:
# 3.2 Visualize activity distribution
fig = px.bar(
    activity_counts, 
    x='Activity', 
    y='Count',
    title='Activity Distribution',
    color='Count',
    text='Percentage',
    color_continuous_scale='Viridis',
    height=600
)
fig.update_layout(xaxis_tickangle=-45)
fig.update_traces(texttemplate='%{text}%', textposition='outside')
fig.show()



In [7]:
# 3.3 Activities by role
activity_role = df.groupby(['concept:name', 'org:role']).size().reset_index(name='Count')
activity_role_pivot = activity_role.pivot(index='concept:name', columns='org:role', values='Count').fillna(0)

# Visualize activities by role
fig = px.imshow(
    activity_role_pivot,
    labels=dict(x="Role", y="Activity", color="Count"),
    x=activity_role_pivot.columns,
    y=activity_role_pivot.index,
    color_continuous_scale='Viridis',
    title='Activities Performed by Each Role',
    height=800
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()



In [8]:
# 4. Resource and role analysis
# ============================
print("\n4. Analyzing resources and roles...")

# 4.1 Resource activity counts
resource_counts = df['org:resource'].value_counts().reset_index()
resource_counts.columns = ['Resource', 'Count']

fig = px.pie(
    resource_counts, 
    values='Count', 
    names='Resource',
    title='Distribution of Activities by Resource',
    hole=0.4,
    color_discrete_sequence=px.colors.qualitative.Plotly
)
fig.show()




4. Analyzing resources and roles...


In [9]:
# 4.2 Role distribution
role_counts = df['org:role'].value_counts().reset_index()
role_counts.columns = ['Role', 'Count']
role_counts['Percentage'] = (role_counts['Count'] / len(df) * 100).round(2)

fig = px.pie(
    role_counts, 
    values='Count', 
    names='Role',
    title='Distribution of Roles',
    color_discrete_sequence=px.colors.sequential.Viridis,
    labels={'Count':'Number of Activities'}
)
fig.update_traces(textinfo='label+percent')
fig.show()



In [10]:
# 5. Temporal analysis
# ===================
print("\n5. Performing temporal analysis...")

# 5.1 Activities over time
activities_by_month = df.groupby('month_year').size().reset_index(name='Count')
activities_by_month = activities_by_month.sort_values('month_year')

fig = px.line(
    activities_by_month, 
    x='month_year', 
    y='Count',
    title='Number of Activities Over Time (Monthly)',
    markers=True,
    labels={'month_year': 'Month', 'Count': 'Number of Activities'}
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()




5. Performing temporal analysis...


In [11]:
# 5.2 Activities by day of week
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
activities_by_dow = df.groupby('day_of_week').size().reset_index(name='Count')
activities_by_dow['Day'] = activities_by_dow['day_of_week'].apply(lambda x: days[x])

fig = px.bar(
    activities_by_dow, 
    x='Day', 
    y='Count',
    title='Activities by Day of Week',
    color='Count',
    color_continuous_scale='Viridis',
    category_orders={"Day": days}
)
fig.show()



In [12]:
# 5.3 Activities by hour
activities_by_hour = df.groupby('hour').size().reset_index(name='Count')

fig = px.bar(
    activities_by_hour, 
    x='hour', 
    y='Count',
    title='Activities by Hour of Day',
    color='Count',
    color_continuous_scale='Viridis',
    labels={'hour': 'Hour of Day', 'Count': 'Number of Activities'}
)
fig.show()



In [13]:
# 5.4 Heatmap of activities by hour and day of week
heatmap_data = df.groupby(['day_of_week', 'hour']).size().reset_index(name='Count')
heatmap_pivot = heatmap_data.pivot(index='day_of_week', columns='hour', values='Count').fillna(0)

fig = px.imshow(
    heatmap_pivot,
    labels=dict(x="Hour of Day", y="Day of Week", color="Count"),
    x=heatmap_pivot.columns,
    y=[days[i] for i in range(7)],
    color_continuous_scale='Viridis',
    title='Heatmap of Activities by Day and Hour'
)
fig.show()



In [14]:
# 6. Amount analysis
# =================
print("\n6. Analyzing declaration amounts...")

# 6.1 Distribution of declaration amounts
fig = px.histogram(
    df, 
    x='case:Amount',
    nbins=50,
    title='Distribution of Declaration Amounts',
    marginal='box',
    color_discrete_sequence=['darkblue']
)
fig.show()




6. Analyzing declaration amounts...


In [15]:
# 6.2 Box plot of amounts by activity
fig = px.box(
    df,
    x='concept:name',
    y='case:Amount',
    title='Distribution of Declaration Amounts by Activity',
    color='concept:name',
    height=700
)
fig.update_layout(xaxis_tickangle=-45, showlegend=False)
fig.show()



In [16]:
# 6.3 Amount statistics by role
amount_by_role = df.groupby('org:role')['case:Amount'].agg(['count', 'mean', 'median', 'sum']).reset_index()
amount_by_role = amount_by_role.sort_values('sum', ascending=False)

fig = px.bar(
    amount_by_role,
    x='org:role',
    y='sum',
    title='Total Declaration Amount by Role',
    text='mean',
    hover_data=['count', 'median'],
    color='sum',
    color_continuous_scale='Viridis',
    labels={'sum': 'Total Amount', 'org:role': 'Role', 'mean': 'Average Amount'}
)
fig.update_traces(texttemplate='Avg: €%.2f', textposition='outside')
fig.update_layout(xaxis_tickangle=-45)
fig.show()



In [17]:
# 6.4 Monthly trend in declaration amounts
monthly_amounts = df.groupby('month_year')['case:Amount'].agg(['sum', 'mean', 'count']).reset_index()
monthly_amounts = monthly_amounts.sort_values('month_year')

fig = make_subplots(rows=2, cols=1, 
                    subplot_titles=('Total Monthly Declaration Amount', 'Average Declaration Amount'))

fig.add_trace(
    go.Bar(x=monthly_amounts['month_year'], y=monthly_amounts['sum'], name='Total Amount'),
    row=1, col=1
)

fig.add_trace(
    go.Line(x=monthly_amounts['month_year'], y=monthly_amounts['mean'], name='Average Amount', 
            line=dict(color='darkred'), mode='lines+markers'),
    row=2, col=1
)

fig.update_layout(height=800, title_text="Declaration Amount Trends by Month")
fig.update_xaxes(tickangle=-45)
fig.show()




plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




In [18]:
# 7. Case analysis
# ===============
print("\n7. Analyzing case properties...")

# 7.1 Case duration
# Calculate start and end time for each case
case_start = df.groupby('case:id')['time:timestamp'].min().reset_index()
case_end = df.groupby('case:id')['time:timestamp'].max().reset_index()
case_duration = pd.merge(case_start, case_end, on='case:id', suffixes=('_start', '_end'))
case_duration['duration_days'] = (case_duration['time:timestamp_end'] - case_duration['time:timestamp_start']).dt.total_seconds() / (60*60*24)
case_duration['duration_hours'] = (case_duration['time:timestamp_end'] - case_duration['time:timestamp_start']).dt.total_seconds() / 3600

# Visualize case duration distribution
fig = px.histogram(
    case_duration,
    x='duration_days',
    nbins=50,
    title='Distribution of Case Duration (Days)',
    marginal='box',
    color_discrete_sequence=['darkgreen']
)
fig.show()




7. Analyzing case properties...


In [19]:
# 7.2 Number of activities per case
activities_per_case = df.groupby('case:id').size().reset_index(name='num_activities')

fig = px.histogram(
    activities_per_case,
    x='num_activities',
    nbins=30,
    title='Number of Activities per Case',
    marginal='box',
    color_discrete_sequence=['darkred']
)
fig.show()



In [20]:
# 7.3 Scatter plot of case duration vs. declaration amount
# Add information about amounts to case_duration dataframe
case_amounts = df.groupby('case:id')['case:Amount'].first().reset_index()  # Taking first amount since amount is the same for a case
case_analysis = pd.merge(case_duration, case_amounts, on='case:id')

fig = px.scatter(
    case_analysis,
    x='duration_days',
    y='case:Amount',
    title='Case Duration vs. Declaration Amount',
    trendline='ols',
    labels={'duration_days': 'Case Duration (Days)', 'case:Amount': 'Declaration Amount'},
    opacity=0.7
)
fig.show()



In [21]:
# 7.4 Case durations by activity count
case_analysis = pd.merge(case_duration, activities_per_case, on='case:id')

fig = px.scatter(
    case_analysis,
    x='num_activities',
    y='duration_days',
    title='Case Duration vs. Number of Activities',
    trendline='ols',
    labels={'duration_days': 'Case Duration (Days)', 'num_activities': 'Number of Activities'},
    opacity=0.7
)
fig.show()



In [22]:
# 8. Process flow analysis
# =======================
print("\n8. Analyzing process flow...")

# 8.1 Prepare data for Sankey diagram of process flow
df_sorted = df.sort_values(['case:id', 'time:timestamp'])
df_sorted['next_activity'] = df_sorted.groupby('case:id')['concept:name'].shift(-1)

# Remove rows where next_activity is NaN (end of case)
transition_df = df_sorted.dropna(subset=['next_activity'])

# Count transitions
transitions = transition_df.groupby(['concept:name', 'next_activity']).size().reset_index(name='count')
transitions = transitions.sort_values('count', ascending=False)

# Get top 20 transitions for better visualization
top_transitions = transitions.head(20)

# Create Sankey diagram
source_indices = {activity: idx for idx, activity in enumerate(pd.unique(top_transitions[['concept:name', 'next_activity']].values.ravel('K')))}

sources = [source_indices[row['concept:name']] for _, row in top_transitions.iterrows()]
targets = [source_indices[row['next_activity']] for _, row in top_transitions.iterrows()]
values = top_transitions['count'].tolist()
labels = list(source_indices.keys())

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=15,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=labels
    ),
    link=dict(
        source=sources,
        target=targets,
        value=values
    )
)])
fig.update_layout(title_text="Top 20 Process Transitions (Activity Flow)", height=800)
fig.show()




8. Analyzing process flow...


In [23]:
# 8.2 Case variants
# Create sequences of activities for each case
case_sequences = df_sorted.groupby('case:id')['concept:name'].apply(list).reset_index()
case_sequences['sequence_str'] = case_sequences['concept:name'].apply(lambda x: ' -> '.join(x))
variant_counts = case_sequences['sequence_str'].value_counts().reset_index()
variant_counts.columns = ['Variant', 'Count']
variant_counts['Percentage'] = (variant_counts['Count'] / variant_counts['Count'].sum() * 100).round(2)

# Display top variants
print("\nTop 10 case variants:")
display(variant_counts.head(10))

# Visualize top case variants
fig = px.bar(
    variant_counts.head(10), 
    x='Count', 
    y='Variant',
    title='Top 10 Case Variants',
    text='Percentage',
    color='Count',
    color_continuous_scale='Viridis',
    orientation='h',
    height=500,
    labels={'Count': 'Number of Cases', 'Variant': 'Process Variant'}
)
fig.update_traces(texttemplate='%{text}%', textposition='outside')
fig.show()




Top 10 case variants:


Unnamed: 0,Variant,Count,Percentage
0,Declaration SUBMITTED by EMPLOYEE -> Declarati...,4618,43.98
1,Declaration SUBMITTED by EMPLOYEE -> Declarati...,2473,23.55
2,Declaration SUBMITTED by EMPLOYEE -> Declarati...,1392,13.26
3,Declaration SUBMITTED by EMPLOYEE -> Declarati...,575,5.48
4,Declaration SUBMITTED by EMPLOYEE -> Declarati...,345,3.29
5,Declaration SUBMITTED by EMPLOYEE -> Declarati...,188,1.79
6,Declaration SUBMITTED by EMPLOYEE -> Declarati...,174,1.66
7,Declaration SAVED by EMPLOYEE,134,1.28
8,Declaration SUBMITTED by EMPLOYEE -> Declarati...,77,0.73
9,Declaration SUBMITTED by EMPLOYEE -> Declarati...,57,0.54


In [24]:
# 9. Statistical tests and correlations
# ===================================
print("\n9. Performing statistical tests and correlation analysis...")

# 9.1 Correlation between numerical variables
numerical_cols = ['case:Amount', 'duration_days', 'num_activities']
case_correlation = pd.merge(
    case_analysis, 
    df.groupby('case:id')['case:Amount'].first().reset_index(),
    on='case:id'
)

# Calculate correlation matrix
correlation = case_correlation[numerical_cols].corr()

# Visualize correlation matrix
fig = go.Figure(data=go.Heatmap(
    z=correlation.values,
    x=correlation.columns,
    y=correlation.index,
    colorscale='Viridis',
    zmin=-1,
    zmax=1
))
fig.update_layout(title='Correlation Between Case Properties')
fig.show()




9. Performing statistical tests and correlation analysis...


In [25]:
# 9.2 ANOVA: Compare declaration amounts across different roles
roles = df['org:role'].unique()
if len(roles) > 2:
    from scipy.stats import f_oneway
    
    role_groups = [df[df['org:role'] == role]['case:Amount'].values for role in roles]
    f_statistic, p_value = f_oneway(*role_groups)
    
    print("\nANOVA Test for Declaration Amounts across Roles:")
    print(f"F-statistic: {f_statistic:.4f}, p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("There is a significant difference in declaration amounts across different roles.")
    else:
        print("There is no significant difference in declaration amounts across different roles.")




ANOVA Test for Declaration Amounts across Roles:
F-statistic: 6.8130, p-value: 0.0000
There is a significant difference in declaration amounts across different roles.


In [26]:
# 9.3 Chi-square test for independence between activity and role
from scipy.stats import chi2_contingency

contingency_table = pd.crosstab(df['concept:name'], df['org:role'])
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-square Test for Independence between Activity and Role:")
print(f"Chi-square statistic: {chi2:.4f}, p-value: {p:.4f}, degrees of freedom: {dof}")
if p < 0.05:
    print("There is a significant association between the type of activity and the role performing it.")
else:
    print("There is no significant association between the type of activity and the role performing it.")




Chi-square Test for Independence between Activity and Role:
Chi-square statistic: 338622.0000, p-value: 0.0000, degrees of freedom: 96
There is a significant association between the type of activity and the role performing it.


In [27]:
# 10. Advanced process metrics
# ==========================
print("\n10. Calculating advanced process metrics...")

# 10.1 Activity throughput times (how long activities take)
# Group by case and activity, find min and max timestamp for each activity instance
activity_instances = df.groupby(['case:id', 'concept:name'])['time:timestamp'].agg(['min', 'max']).reset_index()
activity_instances['duration_hours'] = (activity_instances['max'] - activity_instances['min']).dt.total_seconds() / 3600

# Summary statistics for activity durations
activity_duration_summary = activity_instances.groupby('concept:name')['duration_hours'].agg(['mean', 'median', 'min', 'max', 'count']).reset_index()
activity_duration_summary = activity_duration_summary.sort_values('mean', ascending=False)

# Visualize average activity durations
fig = px.bar(
    activity_duration_summary, 
    x='concept:name', 
    y='mean',
    error_y=activity_duration_summary['max'] - activity_duration_summary['mean'],
    title='Average Duration of Activities (Hours)',
    color='mean',
    color_continuous_scale='Viridis',
    labels={'mean': 'Average Duration (Hours)', 'concept:name': 'Activity'},
    text='count'
)
fig.update_layout(xaxis_tickangle=-45)
fig.update_traces(texttemplate='%{text} instances', textposition='outside')
fig.show()




10. Calculating advanced process metrics...


In [28]:
# 10.2 Rework analysis (repeated activities in cases)
rework_analysis = df.groupby(['case:id', 'concept:name']).size().reset_index(name='frequency')
rework_cases = rework_analysis[rework_analysis['frequency'] > 1]

# Count cases with rework by activity
rework_by_activity = rework_cases.groupby('concept:name')['case:id'].nunique().reset_index()
rework_by_activity.columns = ['Activity', 'Cases with Rework']
rework_by_activity = rework_by_activity.sort_values('Cases with Rework', ascending=False)

# Visualize rework by activity
fig = px.bar(
    rework_by_activity,
    x='Activity',
    y='Cases with Rework',
    title='Number of Cases with Rework by Activity',
    color='Cases with Rework',
    color_continuous_scale='Viridis'
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()



In [29]:
# 10.3 Throughput efficiency over time
# Calculate monthly case throughput (average duration)
case_duration['month_year'] = case_duration['time:timestamp_start'].dt.strftime('%Y-%m')
monthly_throughput = case_duration.groupby('month_year')['duration_days'].mean().reset_index()
monthly_throughput = monthly_throughput.sort_values('month_year')

fig = px.line(
    monthly_throughput,
    x='month_year',
    y='duration_days',
    title='Average Case Duration by Month',
    markers=True,
    labels={'duration_days': 'Average Duration (Days)', 'month_year': 'Month'}
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()


In [30]:
# 11. Outlier detection and analysis
# ================================
print("\n11. Detecting and analyzing outliers...")

# 11.1 Identify outliers using z-score for amounts
from scipy import stats

df['amount_zscore'] = stats.zscore(df['case:Amount'])
amount_outliers = df[np.abs(df['amount_zscore']) > 3]

print(f"\nNumber of outliers detected in amounts (using |Z-score| > 3): {len(amount_outliers)}")
print(f"Percentage of outliers: {len(amount_outliers) / len(df) * 100:.2f}%")




11. Detecting and analyzing outliers...

Number of outliers detected in amounts (using |Z-score| > 3): 1072
Percentage of outliers: 1.90%


In [31]:
# 11.2 Visualize outliers
fig = px.scatter(
    df,
    x='time:timestamp',
    y='case:Amount',
    color=np.abs(df['amount_zscore']) > 3,
    title='Declaration Amounts Over Time (Outliers Highlighted)',
    color_discrete_sequence=['blue', 'red'],
    labels={'color': 'Is Outlier'}
)
fig.show()



In [32]:
# 11.3 Case duration outliers
case_duration['duration_zscore'] = stats.zscore(case_duration['duration_days'])
duration_outliers = case_duration[np.abs(case_duration['duration_zscore']) > 3]

print(f"Number of outliers detected in case durations (using |Z-score| > 3): {len(duration_outliers)}")
print(f"Percentage of outliers: {len(duration_outliers) / len(case_duration) * 100:.2f}%")

# Visualize case duration outliers
fig = px.scatter(
    case_duration,
    x='time:timestamp_start',
    y='duration_days',
    color=np.abs(case_duration['duration_zscore']) > 3,
    title='Case Durations Over Time (Outliers Highlighted)',
    color_discrete_sequence=['blue', 'red'],
    labels={'color': 'Is Outlier', 'time:timestamp_start': 'Case Start Date', 'duration_days': 'Duration (Days)'}
)
fig.show()



Number of outliers detected in case durations (using |Z-score| > 3): 160
Percentage of outliers: 1.52%


In [33]:
# 12. Key insights and conclusions
# ===============================
print("\n12. Key insights and conclusions")

print("\n=== Summary of Process Metrics ===")
print(f"Total number of cases: {num_cases}")
print(f"Total number of activities: {len(df)}")
print(f"Average activities per case: {len(df) / num_cases:.2f}")
print(f"Average case duration: {case_duration['duration_days'].mean():.2f} days")
print(f"Median case duration: {case_duration['duration_days'].median():.2f} days")
print(f"Average declaration amount: €{df['case:Amount'].mean():.2f}")
print(f"Total declaration amount: €{df['case:Amount'].sum():.2f}")
print(f"Number of cases with rework: {rework_cases['case:id'].nunique()}")
print(f"Percentage of cases with rework: {rework_cases['case:id'].nunique() / num_cases * 100:.2f}%")

# Most common process variants
print(f"\nMost common process variant covers {variant_counts['Percentage'].iloc[0]:.2f}% of cases")
print(f"Top 3 variants cover {variant_counts['Percentage'].iloc[:3].sum():.2f}% of cases")

print("\n=== Key Performance Bottlenecks ===")
print("Activities with longest average duration:")
display(activity_duration_summary[['concept:name', 'mean', 'median', 'count']].head(5))

print("\nActivities with most rework:")
display(rework_by_activity.head(5))

print("\n=== Main Process Insights ===")
print("1. Most common activities and their frequencies")
print("2. Typical process flow and common variants")
print("3. Distribution of case durations and influencing factors")
print("4. Declaration amount patterns and relationship with process characteristics")
print("5. Resource and role distribution in the process")
print("6. Temporal patterns in process execution")
print("7. Critical bottlenecks and areas for improvement")

# End of analysis
print("\nEnd of Exploratory Data Analysis")


12. Key insights and conclusions

=== Summary of Process Metrics ===
Total number of cases: 10500
Total number of activities: 56437
Average activities per case: 5.37
Average case duration: 11.53 days
Median case duration: 7.33 days
Average declaration amount: €93.92
Total declaration amount: €5300286.55
Number of cases with rework: 1019
Percentage of cases with rework: 9.70%

Most common process variant covers 43.98% of cases
Top 3 variants cover 80.79% of cases

=== Key Performance Bottlenecks ===
Activities with longest average duration:


Unnamed: 0,concept:name,mean,median,count
9,Declaration REJECTED by EMPLOYEE,36.888599,0.0,1212
14,Declaration SUBMITTED by EMPLOYEE,32.973853,0.0,10365
7,Declaration REJECTED by ADMINISTRATION,29.178346,0.0,846
11,Declaration REJECTED by PRE_APPROVER,17.550031,0.0,81
2,Declaration APPROVED by PRE_APPROVER,9.439548,0.0,666



Activities with most rework:


Unnamed: 0,Activity,Cases with Rework
10,Declaration SUBMITTED by EMPLOYEE,1019
0,Declaration APPROVED by ADMINISTRATION,210
6,Declaration REJECTED by EMPLOYEE,139
4,Declaration REJECTED by ADMINISTRATION,93
3,Declaration FINAL_APPROVED by SUPERVISOR,58



=== Main Process Insights ===
1. Most common activities and their frequencies
2. Typical process flow and common variants
3. Distribution of case durations and influencing factors
4. Declaration amount patterns and relationship with process characteristics
5. Resource and role distribution in the process
6. Temporal patterns in process execution
7. Critical bottlenecks and areas for improvement

End of Exploratory Data Analysis
