# Road Traffic Fine Management

## Setup

In [None]:
import pandas as pd
import pm4py
import numpy as np

In [None]:
# Load the event log from the XES file with pm4py.
log_raw = pm4py.read_xes("Road_Traffic_Fine_Management_Process.xes")

In [None]:
# Pass the log through pm4py.format_dataframe, declaring which columns hold
# the case id, the activity name and the timestamp.
log_raw = pm4py.format_dataframe(
    log_raw,
    case_id='case:concept:name',
    activity_key='concept:name',
    timestamp_key='time:timestamp',
)
# Make sure the timestamp column is a pandas datetime column.
log_raw['time:timestamp'] = pd.to_datetime(log_raw['time:timestamp'])

log_raw.head(7)

In [None]:
# Raw log analysis

# One row per event; one distinct case id per case.
num_events = log_raw.shape[0]
num_cases = log_raw['case:concept:name'].nunique(dropna=False)
print(f"Number of events: {num_events}\nNumber of cases: {num_cases}")

# Activities that open / close a case, and every activity seen in the log.
start_activities = pm4py.get_start_activities(log_raw)
end_activities = pm4py.get_end_activities(log_raw)
all_activities = log_raw["concept:name"].unique().tolist()
print(
    f"Start activities: {start_activities}\n"
    f"End activities: {end_activities}\n"
    f"All activites: {all_activities}"
)

## Data Cleaning

### NaN values

Check which columns have NaN values

In [None]:
# Work on a copy so the raw log stays untouched.
log_df = log_raw.copy()

# For each column, report whether at least one value is missing.
has_missing = log_df.isna().any()
for col in log_df:
    verdict = "missing values" if has_missing[col] else "clean"
    print(f"{col.ljust(22, ' ')}: {verdict}")

Change NaN values to _zero_ in columns _amount_, _paymentAmount_, _totalPaymentAmount_ and _expense_

In [None]:
# Replace missing values with 0 in each of the monetary columns.
for money_col in ("amount", "paymentAmount", "totalPaymentAmount", "expense"):
    log_df[money_col] = log_df[money_col].fillna(0)

log_df.head()

### Remove matricola column

Remove attribute matricola because it's always either NaN or 0, so it's not useful

In [None]:
# Show the distinct values found in the matricola column.
print(log_df["matricola"].unique())

In [None]:
# Remove the matricola column from the dataframe in place.
log_df.drop(columns=["matricola"], inplace=True)

log_df.head()

### Rename columns

Rename the columns to improve the readability of the log

In [None]:
# log_df.rename(columns={"amount" : "amount",
#                         "expense" : "extraAmount",
#                         "paymentAmount": "paymentAmount",
#                         "totalPaymentAmount" : "totalAmount"}, inplace=True)

# log_df.head()

### Fix _amount_ column

Collapse _amount_, _expense_ and _paymentAmount_ in the _amount_ column to make the datalog more readable

In [None]:
def correctAmount(log_row):
    """Return the monetary value that belongs to the event's activity.

    The value is read from a different column depending on the activity:
    "amount" for "Create Fine" and "Add Penalty", "expense" for "Send Fine",
    "paymentAmount" for "Payment". Any other activity yields 0.

    Args:
        log_row: A mapping-like event row exposing "concept:name" and the
            three monetary columns listed above.

    Returns:
        The value of the selected column, or 0 when the activity carries
        no amount.
    """
    source_column = {
        "Create Fine": "amount",
        "Add Penalty": "amount",
        "Send Fine": "expense",
        "Payment": "paymentAmount",
    }.get(log_row["concept:name"])

    if source_column is None:
        return 0
    return log_row[source_column]

# Overwrite "amount" with the value correctAmount selects for each event
# (evaluated row by row).
log_df["amount"] = log_df.apply(correctAmount, axis="columns")

log_df.head()

### Add _dueAmount_ column

To keep track more easily of how much money is owed in a case, add a _dueAmount_ column that holds the running sum of _amount_

In [None]:
# incr_amount = 0
# last_case = None
# def incrementalDueAmount(log_row):
#     global incr_amount, last_case
#     if last_case == None or log_row["case:concept:name"] != last_case:
#         last_case = log_row["case:concept:name"]
#         incr_amount = log_row["amount"]
#     elif log_row["concept:name"] != "Payment":
#         incr_amount += log_row["amount"]
#     return incr_amount

# log_df["dueAmount"] = log_df.apply(incrementalDueAmount, axis="columns")

# log_df.head(10)

### Add _elapsed_ column

In [None]:
# last_timestamp = None
# last_case = None
# def incrementalElapsed(log_row):
#     global last_timestamp, last_case
#     if last_case == None or log_row["case:concept:name"] != last_case:
#         last_case = log_row["case:concept:name"]
#         last_timestamp = log_row["time:timestamp"]
#         return 0
#     else:
#         elapsed_seconds = log_row["time:timestamp"] - last_timestamp
#         last_timestamp = log_row["time:timestamp"]
#         return int(elapsed_seconds.total_seconds())

# log_df["elapsed"] = log_df.apply(incrementalElapsed, axis="columns")

# log_df.head(10)

### Add _status_ and _completed_ columns

Add column _status_ with a more readable description of the _dismissal_ column

In [None]:
# Show the distinct dismissal codes present in the log.
print(log_df["dismissal"].unique())

def setStatus(log_row):
    """Translate the event's dismissal code into a readable status label.

    Args:
        log_row: A mapping-like event row exposing "dismissal".

    Returns:
        "Prefecture" for "#", "Judge" for "G", "Not Payed" for "NIL",
        "Unknown" when the code is missing, and the code itself for any
        other value.
    """
    readable = {"#": "Prefecture", "G": "Judge", "NIL": "Not Payed"}
    code = log_row["dismissal"]

    if isinstance(code, str) and code in readable:
        return readable[code]
    # Missing codes get an explicit label; unrecognised codes pass through.
    return "Unknown" if pd.isna(code) else code

# Compute the status label of every event (evaluated row by row).
log_df["status"] = log_df.apply(setStatus, axis="columns")

Add column _completed_, based on _status_ column, that shows if a process is completed

In [None]:
def setCompleted(log_row):
    """Derive the completion flag of an event from its status label.

    Args:
        log_row: A mapping-like event row exposing "status".

    Returns:
        "Yes" for the statuses "Prefecture" and "Judge", "No" for
        "Not Payed" and "Unknown", and "Unknown" for any other status.
    """
    completion_by_status = {
        "Prefecture": "Yes",
        "Judge": "Yes",
        "Not Payed": "No",
        "Unknown": "No",
    }
    return completion_by_status.get(log_row["status"], "Unknown")

# Compute the completion flag of every event from its status (row by row).
log_df["completed"] = log_df.apply(setCompleted, axis="columns")

log_df.head(10)

### Change column order

Change order of columns to make the dataframe more readable

In [None]:
# List the current column names before reordering them.
print(log_df.columns)

In [None]:
# Column order used for display, built from thematic groups.
general_columns = ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']
numeric_columns = ['amount', 'expense', 'points', 'paymentAmount']
# NOTE(review): presumably the running total paid by the offender - confirm
# against the dataset documentation.
total_columns = ['totalPaymentAmount']
status_columns = ['dismissal', 'status', "completed"]
other_columns = ['lastSent', 'vehicleClass', 'article', 'notificationType',
                 'lifecycle:transition', '@@index', '@@case_index']

columns = [
    *general_columns,
    *numeric_columns,
    *total_columns,
    *status_columns,
    *other_columns,
]

# Keep exactly these columns, in this order.
log_df = log_df.loc[:, columns]

log_df.head(10)

## Data Filtering

In [None]:
# Filter a deep copy so the cleaned log (log_df) stays available as the
# reference that the filtering cells compare against.
filtered_log = log_df.copy(deep=True)

# Size of the log before any filter is applied, measured on the dataframe that
# is actually being filtered rather than on the raw log.
num_events = len(filtered_log)
num_cases = len(filtered_log['case:concept:name'].unique())
print(f"Number of events: {num_events}\nNumber of cases: {num_cases}")

### Unknown-coded cases

Remove cases with "unknown" as _completed_ code

In [None]:
# Remove every case that contains at least one event whose "completed" code is
# "Unknown". Dropping only the matching rows would keep those cases in the log
# with part of their events missing.
unknown_case_ids = filtered_log.loc[
    filtered_log['completed'] == "Unknown", 'case:concept:name'
].unique()
filtered_log = filtered_log[~filtered_log['case:concept:name'].isin(unknown_case_ids)]

# Share of events that survive the filter, relative to the cleaned log.
legal_events = len(filtered_log)
all_events = len(log_df)
print(f"Filtered events: {legal_events}/{all_events} ({round((legal_events/all_events) * 100, 2)}%)")

### Zero duration cases

Calculate some statistics about raw logs data

In [None]:
# Case durations of the cleaned log (log_df), before any filter is applied.
case_durations = pm4py.get_all_case_durations(log_df)
min_raw, max_raw = min(case_durations), max(case_durations)
mean_raw = np.mean(case_durations)

print(
    f"Min Case Duration: {min_raw}\n"
    f"Max Case Duration: {max_raw}\n"
    f"Mean Case Duration: {mean_raw}"
)

Filter cases with duration 0

In [None]:
# Smallest strictly positive duration; using it as the lower bound of the
# performance filter leaves zero-duration cases outside the retained range.
min_not_zero = min(duration for duration in case_durations if duration > 0)
print(min_not_zero)
print(max_raw)
filtered_log = pm4py.filter_case_performance(filtered_log, min_not_zero, max_raw)

# Share of cases that survive, relative to the cleaned log.
legal_cases = filtered_log['case:concept:name'].nunique(dropna=False)
all_cases = log_df['case:concept:name'].nunique(dropna=False)
print(f"Filtered cases: {legal_cases}/{all_cases} ({round((legal_cases/all_cases) * 100, 2)}%)")

### Start/End activities

Remove cases with illegal start activities

In [None]:
# print(pm4py.get_start_activities(filtered_log))

# filtered_log = pm4py.filter_start_activities(filtered_log, ['Create Fine'])

# legal_start_cases = len(filtered_log['case:concept:name'].unique())
# all_cases = len(log_df['case:concept:name'].unique())
# print(f"Filtered events: {len(filtered_log)}")
# print(f"Filtered cases: {legal_start_cases}/{all_cases} ({round((legal_start_cases/all_cases) * 100, 2)}%)")

Remove cases with illegal end activities

In [None]:
print(pm4py.get_end_activities(filtered_log))

# End activities treated as legal; the log is filtered on this list.
allowed_end_activities = [
    'Payment',
    'Send for Credit Collection',
    'Send Appeal to Prefecture',
    'Appeal to Judge',
]
filtered_log = pm4py.filter_end_activities(filtered_log, allowed_end_activities)

# Remaining events, and share of remaining cases relative to the cleaned log.
legal_cases = filtered_log['case:concept:name'].nunique(dropna=False)
all_cases = log_df['case:concept:name'].nunique(dropna=False)
print(f"Filtered events: {len(filtered_log)}")
print(f"Filtered cases: {legal_cases}/{all_cases} ({round((legal_cases/all_cases) * 100, 2)}%)")

In [None]:
# Preview the first rows of the filtered log.
filtered_log.head(10)

## Knowledge Uplift Trail

The starting point of the project is the log, provided as an ```.xes``` file. The log is processed in two phases: cleaning and filtering. In the cleaning phase, the log is converted to a DataFrame compatible with the libraries used, and some parts of the log are removed (_matricola_), changed (_amount_) or added (_status_, _completed_, etc.). In the filtering phase, data that are not useful for the analysis are removed through a series of filters (time, values and activities).

After these two preliminary phases, statistical methods are used to do a general analysis of the data contained in the log.

## Statistical Analysis

In [None]:
import matplotlib.pyplot as plt
import time
import datetime

### General analysis

General case durations

In [None]:
# Case durations of the filtered log.
case_durations = pm4py.get_all_case_durations(filtered_log)

min_duration = min(case_durations)
max_duration = max(case_durations)
mean_duration = np.mean(case_durations)

# The same three figures as timedelta objects (values passed as seconds),
# printed next to the raw numbers.
min_time, max_time, mean_time = (
    datetime.timedelta(seconds=value)
    for value in (min_duration, max_duration, mean_duration)
)

for stat_name, raw_value, readable in (
    ("Min", min_duration, min_time),
    ("Max", max_duration, max_time),
    ("Mean", mean_duration, mean_time),
):
    print(f"{stat_name} Case Duration: {raw_value} -> {readable}")

In [None]:
# One row per case: earliest timestamp, and the span between the earliest and
# the latest timestamp of the case.
events_by_case = filtered_log.groupby('case:concept:name', as_index=False)
case_durations_df = events_by_case.agg(
    StartTime=('time:timestamp', 'min'),
    Duration=('time:timestamp', lambda stamps: stamps.max() - stamps.min()),
)

def format_duration(row):
    """Convert a duration to a whole number of seconds.

    Args:
        row: A timedelta-like value exposing ``total_seconds()`` (despite the
            name, this is a single value, not a dataframe row).

    Returns:
        The duration in seconds, truncated to an int.
    """
    return int(row.total_seconds())

# Replace each duration with its length in whole seconds.
case_durations_df['Duration'] = case_durations_df['Duration'].apply(format_duration)

def att(row):
    """Return the constant placeholder "att", whatever *row* is."""
    placeholder = "att"
    return placeholder

# Add a "concept:name" column filled with the constant placeholder from att().
# NOTE(review): presumably added so pm4py accepts this table as an event log -
# confirm.
case_durations_df["concept:name"] = case_durations_df.apply(att, axis="columns")

def pick_year(row):
    """Return the calendar year of the row's "StartTime".

    The value is read through ``pd.Timestamp`` instead of being formatted to a
    string and re-parsed with a fixed pattern, so timestamps with fractional
    seconds or without a UTC offset are handled as well.

    Args:
        row: A mapping-like row exposing "StartTime" (a timestamp, or anything
            ``pd.Timestamp`` accepts).

    Returns:
        The year as an int.
    """
    return pd.Timestamp(row["StartTime"]).year

def pick_month(row):
    """Return the calendar month (1-12) of the row's "StartTime".

    The value is read through ``pd.Timestamp`` instead of being formatted to a
    string and re-parsed with a fixed pattern, so timestamps with fractional
    seconds or without a UTC offset are handled as well.

    Args:
        row: A mapping-like row exposing "StartTime" (a timestamp, or anything
            ``pd.Timestamp`` accepts).

    Returns:
        The month number as an int.
    """
    return pd.Timestamp(row["StartTime"]).month

def pick_day(row):
    """Return the day of the month of the row's "StartTime".

    The value is read through ``pd.Timestamp`` instead of being formatted to a
    string and re-parsed with a fixed pattern, so timestamps with fractional
    seconds or without a UTC offset are handled as well.

    Args:
        row: A mapping-like row exposing "StartTime" (a timestamp, or anything
            ``pd.Timestamp`` accepts).

    Returns:
        The day of the month as an int.
    """
    return pd.Timestamp(row["StartTime"]).day

# Split each case's start time into year, month and day columns (row by row).
case_durations_df["StartYear"] = case_durations_df.apply(pick_year, axis="columns")
case_durations_df["StartMonth"] = case_durations_df.apply(pick_month, axis="columns")
case_durations_df["StartDay"] = case_durations_df.apply(pick_day, axis="columns")

case_durations_df.head()

Case frequency by duration

In [None]:
# Bucket labels and the upper bound (in seconds, inclusive) of every bucket
# except the last one. A "month" is counted as 4 weeks and "6 months" as
# 24 weeks.
xlabels = ["Less than 1 day",
            "Less than 1 week",
            "Less than 1 month",
            "Less than 6 months",
            "Less than 1 year",
            "Less than 2 years",
            "Less than 5 years",
            "Others"]
seconds_per_day = 60*60*24
upper_bounds = [seconds_per_day,
                seconds_per_day * 7,
                seconds_per_day*7 * 4,
                seconds_per_day*7*4 * 6,
                seconds_per_day * 365,
                seconds_per_day*365 * 2,
                seconds_per_day*365 * 5]

# Cumulative counts per bound, closed by the total number of cases.
durations_in_seconds = case_durations_df["Duration"]
total_cases = len(case_durations_df)
cumulative = [int((durations_in_seconds <= bound).sum()) for bound in upper_bounds]
cumulative.append(total_cases)

# Turn cumulative counts into per-bucket counts.
freqs = [cumulative[0]] + [upper - lower for lower, upper in zip(cumulative, cumulative[1:])]

for bucket_label, bucket_count in zip(xlabels, freqs):
    print(f"{bucket_label.ljust(20, ' ')}: {str(bucket_count).ljust(5, ' ')} ({round(bucket_count/total_cases * 100, 2)}%)")

plt.figure(figsize=(9, 6))
plt.bar(xlabels, freqs)
plt.xlabel("Duration")
plt.ylabel("Frequency")
plt.title("Case frequency by duration")

# Regular ticks every 5000 (the 5000 tick itself is left out), plus the
# smallest count among all buckets but the last and the largest count overall.
y_custom_ticks = sorted(
    [tick for tick in range(0, 50000, 5000) if tick != 5000]
    + [min(freqs[:-1]), max(freqs)]
)

plt.yticks(y_custom_ticks)
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Count of cases per end activity in the filtered log.
end_acts = pm4py.get_end_activities(filtered_log)
print(end_acts)

# Activity names and their counts as two parallel lists.
x_axis = list(end_acts.keys())
y_axis = [end_acts[activity_name] for activity_name in x_axis]

# Cases ending with anything other than a payment.
others_amount = (
    end_acts["Appeal to Judge"]
    + end_acts["Send for Credit Collection"]
    + end_acts["Send Appeal to Prefecture"]
)
print(f"Payments: {end_acts['Payment']}")
print(f"Others: {others_amount}")

In [None]:
# Split the filtered log on the end activity 'Payment': the first call keeps
# the matching cases, the second one (retain=False) keeps all the others.
paid_cases = pm4py.filter_end_activities(filtered_log, ['Payment'])
unpaid_cases = pm4py.filter_end_activities(filtered_log, ['Payment'], retain=False)

# Disabled: min / max / mean of the 'Create Fine' amount for paid cases.
# print(f"Paid fine min: {paid_cases[paid_cases['concept:name'] == 'Create Fine']['amount'].min()}")
# print(f"Paid fine max: {paid_cases[paid_cases['concept:name'] == 'Create Fine']['amount'].max()}")
# print(f"Paid fine mean: {paid_cases[paid_cases['concept:name'] == 'Create Fine']['amount'].mean()}")

# Disabled: min / max / mean of the 'Create Fine' amount for unpaid cases.
# print(f"Unpaid fine min: {unpaid_cases[unpaid_cases['concept:name'] == 'Create Fine']['amount'].min()}")
# print(f"Unpaid fine max: {unpaid_cases[unpaid_cases['concept:name'] == 'Create Fine']['amount'].max()}")
# print(f"Unpaid fine mean: {unpaid_cases[unpaid_cases['concept:name'] == 'Create Fine']['amount'].mean()}")

# paid_cases.head(10)

In [None]:
paid_ids = paid_cases["case:concept:name"].unique().tolist()
unpaid_ids = unpaid_cases["case:concept:name"].unique().tolist()

# case_durations_df is a per-case summary table, not an event log, so the rows
# are selected with a plain pandas membership test on the case id.
paid_durations = case_durations_df[case_durations_df["case:concept:name"].isin(paid_ids)]
unpaid_durations = case_durations_df[case_durations_df["case:concept:name"].isin(unpaid_ids)]

# Duration statistics (in seconds), each group printed under its own label.
for group_label, group_durations in (("Paid", paid_durations), ("Unpaid", unpaid_durations)):
    print(f"{group_label} min duration: {group_durations['Duration'].min()}")
    print(f"{group_label} max duration: {group_durations['Duration'].max()}")
    print(f"{group_label} mean duration: {group_durations['Duration'].mean()}")

# Mean duration of each group, converted from seconds to days for the chart.
seconds_per_day = 60*60*24
plt.figure(figsize=(9, 6))
plt.bar(
    ["Paid", "Others"],
    [paid_durations['Duration'].mean()/seconds_per_day, unpaid_durations['Duration'].mean()/seconds_per_day],
    width=0.4,
)
plt.ylabel("Duration (days)")
plt.show()

In [None]:
# Drop the variant in which the fine is never sent ('Create Fine' -> 'Payment'):
# it is not part of the correlation being investigated.
log_new_paid = pm4py.filter_variants(paid_cases, [('Create Fine', 'Payment')], retain=False)

# Events of the two activities of interest.
paid_activity = log_new_paid['concept:name']
create_fine_df = log_new_paid[paid_activity == 'Create Fine']
send_fine_df = log_new_paid[paid_activity == 'Send Fine']

# Pair the 'Create Fine' and 'Send Fine' events that share a case id.
merged_df = create_fine_df.merge(send_fine_df, on='case:concept:name', suffixes=('_create', '_send'))

# Time between creating and sending the fine, as timedeltas and whole seconds.
time_differences_paid = (merged_df['time:timestamp_send'] - merged_df['time:timestamp_create']).tolist()
paid_seconds = [int(delta.total_seconds()) for delta in time_differences_paid]

print(f"Paid timeliness min: {min(time_differences_paid)}")
print(f"Paid timeliness max: {max(time_differences_paid)}")
paid_mean = sum(paid_seconds)/len(paid_seconds)
print(f"Paid timeliness mean: {paid_mean}")

plt.figure(figsize=(9, 6))
plt.scatter(range(len(paid_seconds)), paid_seconds)
plt.xlabel('Case')
plt.ylabel('Notification timeliness')
plt.title('Scatter Plot of Notification timeliness (paid)')
plt.grid(True)
plt.show()

In [None]:
# Events of the two activities of interest among the unpaid cases.
unpaid_activity = unpaid_cases['concept:name']
create_fine_df = unpaid_cases[unpaid_activity == 'Create Fine']
send_fine_df = unpaid_cases[unpaid_activity == 'Send Fine']

# Pair the 'Create Fine' and 'Send Fine' events that share a case id.
merged_df = create_fine_df.merge(send_fine_df, on='case:concept:name', suffixes=('_create', '_send'))

# Time between creating and sending the fine, as timedeltas and whole seconds.
time_differences_unpaid = (merged_df['time:timestamp_send'] - merged_df['time:timestamp_create']).tolist()
unpaid_seconds = [int(delta.total_seconds()) for delta in time_differences_unpaid]

print(f"Unpaid timeliness min: {min(time_differences_unpaid)}")
print(f"Unpaid timeliness max: {max(time_differences_unpaid)}")
unpaid_mean = sum(unpaid_seconds)/len(unpaid_seconds)
print(f"Unpaid timeliness mean: {unpaid_mean}")

plt.figure(figsize=(9, 6))
plt.scatter(range(len(unpaid_seconds)), unpaid_seconds)
plt.xlabel('Case')
plt.ylabel('Notification timeliness')
plt.title('Scatter Plot of Notification timeliness (unpaid)')
plt.grid(True)
plt.show()

In [None]:
from scipy.stats import ttest_ind

# Both samples as whole seconds.
paid_sample = [int(delta.total_seconds()) for delta in time_differences_paid]
unpaid_sample = [int(delta.total_seconds()) for delta in time_differences_unpaid]

# Two-sample t-test on the notification timeliness of paid vs. unpaid cases;
# equal_var=False means the two variances are not assumed to be equal.
t_stat, p_val = ttest_ind(paid_sample, unpaid_sample, equal_var=False)

print(f"T-stat: {t_stat}\nP-value: {p_val}")

In [None]:
# Start months that actually occur in the data, in calendar order.
months = sorted(int(month) for month in case_durations_df["StartMonth"].unique().tolist())

# case_durations_df is a per-case summary table, not an event log, so the paid
# cases are selected with a plain pandas membership test on the case id.
paid_case_durations = case_durations_df[
    case_durations_df["case:concept:name"].isin(paid_cases["case:concept:name"].unique())
]

# Mean duration and number of paid cases for each start month.
means = []
amounts = []
for month in months:
    filtered_durations = paid_case_durations[paid_case_durations["StartMonth"] == month]
    means.append(filtered_durations["Duration"].mean())
    amounts.append(len(filtered_durations))

# Label only the months that are present, so the number of labels always
# matches the number of tick positions.
month_names = ("January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December")
month_labels = [month_names[month - 1] for month in months]

plt.figure(figsize=(9, 6))
plt.bar(months, amounts)
plt.xlabel("Month")
plt.ylabel("Amount")
plt.title("Paid cases by month")
plt.xticks(months, month_labels, rotation=45)
plt.grid(True)
plt.show()

In [None]:
# Start months that actually occur in the data, in calendar order.
months = sorted(int(month) for month in case_durations_df["StartMonth"].unique().tolist())

# case_durations_df is a per-case summary table, not an event log, so the
# unpaid cases are selected with a plain pandas membership test on the case id.
unpaid_case_durations = case_durations_df[
    case_durations_df["case:concept:name"].isin(unpaid_cases["case:concept:name"].unique())
]

# Mean duration and number of unpaid cases for each start month.
means = []
amounts = []
for month in months:
    filtered_durations = unpaid_case_durations[unpaid_case_durations["StartMonth"] == month]
    means.append(filtered_durations["Duration"].mean())
    amounts.append(len(filtered_durations))

# Label only the months that are present, so the number of labels always
# matches the number of tick positions.
month_names = ("January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December")
month_labels = [month_names[month - 1] for month in months]

plt.figure(figsize=(9, 6))
plt.bar(months, amounts)
plt.xlabel("Month")
plt.ylabel("Amount")
plt.title("Unpaid cases by month")
plt.xticks(months, month_labels, rotation=45)
plt.grid(True)
plt.show()

## Process discovery

In [None]:
# def print_top_variants_info(input_log, k=5):
#     filtered_log_top = pm4py.filter_variants_top_k(input_log, k)

#     num_events = len(filtered_log)
#     num_events_top = len(filtered_log_top)
#     num_cases = len(filtered_log['case:concept:name'].unique())
#     num_cases_top = len(filtered_log_top['case:concept:name'].unique())
#     print(f"Number of events: {num_events_top}/{num_events} ({round(num_events_top/num_events * 100, 2)}%)")
#     print(f"Number of cases: {num_cases_top}/{num_cases} ({round(num_cases_top/num_cases * 100, 2)}%)")

#     net, im, fm = pm4py.discover_petri_net_inductive(filtered_log_top)
#     pm4py.view_petri_net(net, im, fm, format='png')

In [None]:
# print_top_variants_info(filtered_log, 1)

In [None]:
# print_top_variants_info(filtered_log, 3)

In [None]:
# print_top_variants_info(filtered_log, 5)

In [None]:
# print_top_variants_info(filtered_log, 10)