# Road Traffic Fine Management

## Setup

In [None]:
import pandas as pd
import pm4py
import numpy as np

In [None]:
# Load the event log from the XES file located in the working directory.
log_raw = pm4py.read_xes("Road_Traffic_Fine_Management_Process.xes")

In [None]:
# Let pm4py format the dataframe, telling it which columns hold the case id,
# the activity name and the timestamp.
log_raw = pm4py.format_dataframe(log_raw, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
# Make sure the timestamp column holds pandas datetimes.
log_raw['time:timestamp'] = pd.to_datetime(log_raw['time:timestamp'])

# Preview the first 7 rows.
log_raw.head(7)

In [None]:
# Raw log analysis: size of the log and the activities it contains.

# Gather all figures first, then report them.
num_events = len(log_raw)
num_cases = len(pd.unique(log_raw['case:concept:name']))
start_activities = pm4py.get_start_activities(log_raw)
end_activities = pm4py.get_end_activities(log_raw)
all_activities = log_raw["concept:name"].unique().tolist()

print(f"Number of events: {num_events}\nNumber of cases: {num_cases}")
print(f"Start activities: {start_activities}\nEnd activities: {end_activities}\nAll activites: {all_activities}")

## Data Cleaning

### NaN values

Check which columns have NaN values

In [None]:
# Work on a copy so that log_raw is left untouched.
log_df = log_raw.copy()

# Report, column by column, whether at least one value is missing.
for column_name in log_df.columns:
    verdict = "missing values" if log_df[column_name].isna().any() else "clean"
    print(f"{column_name.ljust(22, ' ')}: {verdict}")

Change NaN values to _zero_ in columns _amount_, _paymentAmount_, _totalPaymentAmount_ and _expense_

In [None]:
# Replace NaN with 0 in each of the four monetary columns.
for money_column in ("amount", "paymentAmount", "totalPaymentAmount", "expense"):
    log_df[money_column] = log_df[money_column].fillna(0)

log_df.head()

### Remove matricola column

Remove attribute matricola because it's always either NaN or 0, so it's not useful

In [None]:
# Show the distinct values found in the matricola column.
print(log_df["matricola"].unique())

In [None]:
# Remove the matricola column from log_df in place.
log_df.drop(columns=["matricola"], inplace=True)

log_df.head()

### Rename columns

Rename columns to improve the readability of the log (this step is currently disabled: the code below is commented out)

In [None]:
# log_df.rename(columns={"amount" : "amount",
#                         "expense" : "extraAmount",
#                         "paymentAmount": "paymentAmount",
#                         "totalPaymentAmount" : "totalAmount"}, inplace=True)

# log_df.head()

### Fix _amount_ column

Collapse _amount_, _expense_ and _paymentAmount_ into the _amount_ column to make the log more readable

In [None]:
def correctAmount(log_row):
    """Pick the monetary value that belongs to the row's activity.

    Args:
        log_row: Row of the log; must provide "concept:name" and the
            column selected for its activity.

    Returns:
        The value of "amount" for "Create Fine" and "Add Penalty", of
        "expense" for "Send Fine", of "paymentAmount" for "Payment",
        and 0 for every other activity.
    """
    activity = log_row["concept:name"]

    # (activity name, column holding its value), checked in this order.
    value_column_by_activity = (
        ("Create Fine", "amount"),
        ("Add Penalty", "amount"),
        ("Send Fine", "expense"),
        ("Payment", "paymentAmount"),
    )
    for activity_name, value_column in value_column_by_activity:
        if activity == activity_name:
            return log_row[value_column]

    # Any other activity carries no amount.
    return 0

# Overwrite the amount column row by row with the value chosen by correctAmount.
log_df["amount"] = log_df.apply(correctAmount, axis="columns")

log_df.head()

### Add _dueAmount_ column

To keep track more easily of how much money is due in a case, add a _dueAmount_ column that holds the running (cumulative) sum of _amount_ within each case

In [None]:
# Preview the first 10 rows before the dueAmount column is added.
log_df.head(10)

In [None]:
# Running state shared between successive calls of incrementalDueAmount:
# the total accumulated so far and the case that total belongs to.
incr_amount = 0
last_case = None
def incrementalDueAmount(log_row):
    """Return the running sum of "amount" for the case of the given row.

    The function is stateful: it remembers the case id of the previous call
    in module-level globals. When the case id changes, the running total
    restarts from the current row's amount; otherwise the amount is added.
    Rows of the same case therefore have to be passed consecutively.

    Args:
        log_row: Row of the log providing "case:concept:name" and "amount".

    Returns:
        The accumulated amount of the current case, current row included.
    """
    global incr_amount, last_case
    case_id = log_row["case:concept:name"]
    # Identity check against None (rather than ==) to detect the very first call.
    if last_case is None or case_id != last_case:
        last_case = case_id
        incr_amount = log_row["amount"]
    else:
        incr_amount += log_row["amount"]
    return incr_amount

# Build dueAmount row by row. incrementalDueAmount keeps its running total in
# module-level globals, so the result depends on the order of the rows.
# NOTE(review): assumes all events of a case are contiguous in log_df — confirm the log is sorted by case.
log_df["dueAmount"] = log_df.apply(incrementalDueAmount, axis="columns")

log_df.head(10)

### Add _status_ and _completed_ columns

Add column _status_ with a more readable description of the _dismissal_ column

In [None]:
def setStatus(log_row):
    """Translate the row's "dismissal" code into a readable status.

    Args:
        log_row: Row of the log providing "dismissal".

    Returns:
        "Prefecture" for "#", "Judge" for "G", "Not Payed" for "NIL",
        "Unknown" when the value is missing, and the original code
        unchanged in every other case.
    """
    dismissal = log_row["dismissal"]

    # Known codes and their labels, checked in this order.
    known_codes = (
        ("#", "Prefecture"),
        ("G", "Judge"),
        ("NIL", "Not Payed"),
    )
    for code, label in known_codes:
        if dismissal == code:
            return label

    # Missing values get an explicit label; unrecognised codes pass through.
    return "Unknown" if pd.isna(dismissal) else dismissal

# Derive the status column row by row from the dismissal code.
log_df["status"] = log_df.apply(setStatus, axis="columns")

Add column _completed_, based on _status_ column, that shows if a process is completed

In [None]:
def setCompleted(log_row):
    """Classify the row as completed or not, based on its "status".

    Args:
        log_row: Row of the log providing "status".

    Returns:
        "Yes" for the statuses "Prefecture" and "Judge", "No" for
        "Not Payed" and "Unknown", and "Unknown" for any other status.
    """
    status = log_row["status"]

    yes_statuses = ["Prefecture", "Judge"]
    no_statuses = ["Not Payed", "Unknown"]

    if status in yes_statuses:
        return "Yes"
    # Every status outside the two known groups is reported as "Unknown".
    return "No" if status in no_statuses else "Unknown"

# Derive the completed column row by row from the status column.
log_df["completed"] = log_df.apply(setCompleted, axis="columns")

log_df.head(10)

### Change column order

Change order of columns to make the dataframe more readable

In [None]:
# List the current columns before reordering them.
print(log_df.columns)

In [None]:
# Desired column order, assembled group by group.
columns = (
    # general attributes
    ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp']
    # number attributes
    + ['amount', 'expense', 'points', 'paymentAmount']
    # current total amount paid by offender and total due
    # (sum of fines, expenses and penalties)
    + ['totalPaymentAmount', 'dueAmount']
    # status code
    + ['dismissal', 'status', "completed"]
    # remaining attributes
    + ['lastSent', 'vehicleClass', 'article', 'notificationType']
    + ['lifecycle:transition', '@@index', '@@case_index']
)

# Keep exactly these columns, in this order.
log_df = log_df[columns]

log_df.head(10)

## Data Filtering

In [None]:
# Filtering starts from an independent deep copy of the cleaned log.
filtered_log = log_df.copy(deep=True)

# Baseline figures, computed on the raw log.
num_events, num_cases = len(log_raw), len(pd.unique(log_raw['case:concept:name']))
print(f"Number of events: {num_events}\nNumber of cases: {num_cases}")

### Unknown-coded cases

Remove cases with "unknown" as _completed_ code

In [None]:
# Remove every case that contains at least one event whose completed code is
# "Unknown". Dropping only the matching rows would delete single events and
# leave the rest of the trace behind as a truncated case, so the filter works
# on case ids instead.
unknown_case_ids = filtered_log.loc[
    filtered_log['completed'] == "Unknown", 'case:concept:name'
].unique()
filtered_log = filtered_log[~filtered_log['case:concept:name'].isin(unknown_case_ids)]

# Report how many events survive the filter, relative to the cleaned log.
legal_events = len(filtered_log)
all_events = len(log_df)
# print(f"Filtered cases: {len(filtered_log['case:concept:name'].unique())}")
print(f"Filtered events: {legal_events}/{all_events} ({round((legal_events/all_events) * 100, 2)}%)")

### Zero duration cases

Calculate some statistics about raw logs data

In [None]:
# Duration of every case in the cleaned (not yet filtered) log.
case_durations = pm4py.get_all_case_durations(log_df)

# Extremes and average of those durations.
min_raw, max_raw = min(case_durations), max(case_durations)
mean_raw = np.mean(case_durations)

print(f"Min Case Duration: {min_raw}\nMax Case Duration: {max_raw}\nMean Case Duration: {mean_raw}")

Filter cases with duration 0

In [None]:
# Smallest strictly positive duration: used as the lower bound of the filter
# so that zero-duration cases fall outside the accepted range.
min_not_zero = min(duration for duration in case_durations if duration > 0)
print(min_not_zero)
print(max_raw)
filtered_log = pm4py.filter_case_performance(filtered_log, min_not_zero, max_raw)

# Share of cases that are still in the log, relative to the cleaned log.
legal_cases = len(pd.unique(filtered_log['case:concept:name']))
all_cases = len(pd.unique(log_df['case:concept:name']))
print(f"Filtered cases: {legal_cases}/{all_cases} ({round((legal_cases/all_cases) * 100, 2)}%)")

### Start/End activities

Remove cases with illegal start activities

In [None]:
# print(pm4py.get_start_activities(filtered_log))

# filtered_log = pm4py.filter_start_activities(filtered_log, ['Create Fine'])

# legal_start_cases = len(filtered_log['case:concept:name'].unique())
# all_cases = len(log_df['case:concept:name'].unique())
# print(f"Filtered events: {len(filtered_log)}")
# print(f"Filtered cases: {legal_start_cases}/{all_cases} ({round((legal_start_cases/all_cases) * 100, 2)}%)")

Remove cases with illegal end activities

In [None]:
# End activities present before the filter is applied.
print(pm4py.get_end_activities(filtered_log))

# Keep only the cases whose last activity is one of the accepted ones.
filtered_log = pm4py.filter_end_activities(
    filtered_log,
    [
        'Payment',
        'Send for Credit Collection',
        'Send Appeal to Prefecture',
        'Appeal to Judge',
    ],
)

# Remaining events, and remaining cases relative to the cleaned log.
legal_cases = len(pd.unique(filtered_log['case:concept:name']))
all_cases = len(pd.unique(log_df['case:concept:name']))
print(f"Filtered events: {len(filtered_log)}")
print(f"Filtered cases: {legal_cases}/{all_cases} ({round((legal_cases/all_cases) * 100, 2)}%)")

## Knowledge Uplift Trail

The starting point of the project is the log, provided as an ```.xes``` file. The log is processed in two phases: cleaning and filtering. In the cleaning phase, the log is converted to a DataFrame compatible with the libraries used, and some parts of the log are removed (_matricola_), changed (_amount_) or added (_status_, _completed_, etc.). In the filtering phase, data that is not useful for the analysis is removed through a series of filters (time, values and activities).

After these two preliminary phases, statistical methods are used to do a general analysis of the data contained in the log.

## Statistical Analysis

In [None]:
import matplotlib.pyplot as plt
import time
import datetime

### General analysis

General case durations

In [None]:
# Duration of every case in the filtered log.
case_durations = pm4py.get_all_case_durations(filtered_log)

# Extremes and average, as plain numbers ...
min_duration, max_duration = min(case_durations), max(case_durations)
mean_duration = np.mean(case_durations)

# ... and as timedeltas (the numbers are interpreted as seconds).
min_time = datetime.timedelta(seconds=min_duration)
max_time = datetime.timedelta(seconds=max_duration)
mean_time = datetime.timedelta(seconds=mean_duration)

print(f"Min Case Duration: {min_duration} -> {min_time}")
print(f"Max Case Duration: {max_duration} -> {max_time}")
print(f"Mean Case Duration: {mean_duration} -> {mean_time}")

In [None]:
# One row per case: the first timestamp of the case and the time elapsed
# between its first and its last timestamp.
case_durations_df = filtered_log.groupby('case:concept:name', as_index=False).agg(
    StartTime=pd.NamedAgg(column='time:timestamp', aggfunc=lambda stamps: stamps.min()),
    Duration=pd.NamedAgg(column='time:timestamp', aggfunc=lambda stamps: stamps.max() - stamps.min()),
)

def format_duration(row):
    """Convert a duration into a whole number of seconds.

    Args:
        row: A timedelta-like value exposing ``total_seconds()``.

    Returns:
        The total seconds as an int, with the fractional part truncated.
    """
    # An HH:MM:SS rendering was considered; the plain number of seconds is
    # returned instead.
    return int(row.total_seconds())

# Replace each Duration value with the integer number of seconds returned by format_duration.
case_durations_df['Duration'] = case_durations_df['Duration'].apply(format_duration)

def pick_year(row):
    """Return the calendar year of the row's "StartTime".

    Args:
        row: Row providing "StartTime" as a timestamp or a timestamp-like
            string.

    Returns:
        The year as an integer.
    """
    # Read the field from the timestamp itself. Formatting it as text and
    # parsing it back with a fixed pattern raised ValueError for values with
    # fractional seconds or without a timezone offset.
    return pd.Timestamp(row["StartTime"]).year

def pick_month(row):
    """Return the month (1-12) of the row's "StartTime".

    Args:
        row: Row providing "StartTime" as a timestamp or a timestamp-like
            string.

    Returns:
        The month number as an integer.
    """
    # Read the field from the timestamp itself. Formatting it as text and
    # parsing it back with a fixed pattern raised ValueError for values with
    # fractional seconds or without a timezone offset.
    return pd.Timestamp(row["StartTime"]).month

def pick_day(row):
    """Return the day of the month of the row's "StartTime".

    Args:
        row: Row providing "StartTime" as a timestamp or a timestamp-like
            string.

    Returns:
        The day of the month as an integer.
    """
    # Read the field from the timestamp itself. Formatting it as text and
    # parsing it back with a fixed pattern raised ValueError for values with
    # fractional seconds or without a timezone offset.
    return pd.Timestamp(row["StartTime"]).day

# Split StartTime into three separate columns, one extractor per column.
for target_column, extractor in (
    ("StartYear", pick_year),
    ("StartMonth", pick_month),
    ("StartDay", pick_day),
):
    case_durations_df[target_column] = case_durations_df.apply(extractor, axis="columns")

case_durations_df.head()

In [None]:
# Distinct start years in ascending order.
years = sorted(int(year) for year in case_durations_df["StartYear"].unique().tolist())

# Mean duration of the cases that started in each of those years.
means = [
    case_durations_df.loc[case_durations_df["StartYear"] == year, "Duration"].mean()
    for year in years
]

# Bar chart: one bar per start year.
plt.figure(figsize=(9, 6))
plt.bar(years, means)
plt.title("Case duration by year")
plt.xlabel("Year")
plt.ylabel("Mean Duration (seconds)")
plt.xticks(years, years, rotation=45)
plt.grid(True)
plt.show()

Case durations by month of the year

In [None]:
# Month names indexed by month number minus one.
month_names = ["January", "February", "March", "April", "May", "June",
               "July", "August", "September", "October", "November", "December"]

# Distinct start months (1-12) in ascending order.
months = [int(x) for x in case_durations_df["StartMonth"].unique().tolist()]
months.sort()

# Mean duration of the cases that started in each of those months.
means = []
for x in months:
    filtered_durations = case_durations_df[case_durations_df["StartMonth"] == x]
    means.append(filtered_durations["Duration"].mean())

plt.figure(figsize=(9, 6))
plt.bar(months, means)
# The x axis shows months (it was mislabelled "Year").
plt.xlabel("Month")
plt.ylabel("Mean Duration (seconds)")
plt.title("Case duration by month")
# plt.yticks(means, means)
# Label only the months that are present, so the number of labels always
# matches the number of tick positions.
plt.xticks(months, [month_names[m - 1] for m in months], rotation=45)
plt.grid(True)
plt.show()

Case frequency by duration

In [None]:
# Bucket labels, ordered from the shortest to the longest duration.
xlabels = [
    "Less than 1 day",
    "Less than 1 week",
    "Less than 1 month",
    "Less than 6 months",
    "Less than 1 year",
    "Less than 2 years",
    "Less than 5 years",
    "Others",
]

# Upper bound in seconds (inclusive) of every bucket except the last one.
# A month is counted as 4 weeks, and 6 months as 6 such months.
one_day = 60*60*24
one_week = one_day * 7
one_month = one_week * 4
one_year = one_day * 365
upper_bounds = [one_day, one_week, one_month, one_month * 6, one_year, one_year * 2, one_year * 5]

# Cumulative counts: cases lasting at most each bound, then all cases.
total_cases = len(case_durations_df)
cumulative = [
    len(case_durations_df[case_durations_df["Duration"] <= bound])
    for bound in upper_bounds
]
cumulative.append(total_cases)

# Turn the cumulative counts into per-bucket counts.
freqs = [cumulative[0]] + [
    current - previous for previous, current in zip(cumulative, cumulative[1:])
]

for label, freq in zip(xlabels, freqs):
    print(f"{label.ljust(20, ' ')}: {str(freq).ljust(5, ' ')} ({round(freq/total_cases * 100, 2)}%)")

plt.figure(figsize=(9, 6))
plt.bar(xlabels, freqs)
plt.title("Case frequency by duration")
plt.xlabel("Duration")
plt.ylabel("Frequency")

# Y ticks: multiples of 5000 below 50000 except 5000 itself, plus the
# smallest count among all buckets but the last, plus the largest count.
y_custom_ticks = sorted(
    [tick for tick in range(0, 50000, 5000) if tick != 5000]
    + [min(freqs[:-1]), max(freqs)]
)

plt.yticks(y_custom_ticks)
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

Further statistical analyses are needed, but they depend on what we want to demonstrate; to be continued later

In [None]:
# Goal: understand why some cases take so long,
# where "long" means more than 1 year.

# case_ids = case_durations_df.unique().tolist()

# Independent copy of the filtered log for the step-level exploration.
log_steps = filtered_log.copy(deep=True)

# Split the cases at the one-year mark (365 days, expressed in seconds).
one_year_seconds = 60*60*24 * 365
case_seconds = case_durations_df["Duration"]
durations_short = case_durations_df[case_seconds <= one_year_seconds]
durations_long = case_durations_df[case_seconds > one_year_seconds]

# Disabled sketch: time elapsed since the previous activity of the case.
# last_timestamp = 0
# def time_from_previous_activity(row):
#     global last_timestamp
#     if row["concept:name"] == "Create Fine":
#         last_timestamp = row["time:timestamp"]
#         return 0
#     else:
#         elapsed_time = row["time:timestamp"] - last_timestamp
#         last_timestamp = row["time:timestamp"]
#         return elapsed_time.total_seconds()

# log_steps["step_duration"] = log_steps.apply(time_from_previous_activity, axis="columns")

# log_steps.head(10)

# log_steps[log_steps["amount"] == log_steps["amount"].max()]

## Process Discovery

In [None]:
def print_top_variants_info(input_log, k=5):
    """Report the coverage of the top-k variants of a log and show their model.

    Restricts *input_log* to its k most frequent variants, prints how many
    events and cases of *input_log* that subset retains, then discovers a
    Petri net from the subset with the inductive miner and displays it.

    Args:
        input_log: Event log to analyse; must provide the
            'case:concept:name' column.
        k: Number of most frequent variants to keep.
    """
    filtered_log_top = pm4py.filter_variants_top_k(input_log, k)

    # Totals come from the log that was passed in, not from a notebook-level
    # variable, so the percentages are correct for any input log.
    num_events = len(input_log)
    num_events_top = len(filtered_log_top)
    num_cases = len(input_log['case:concept:name'].unique())
    num_cases_top = len(filtered_log_top['case:concept:name'].unique())
    print(f"Number of events: {num_events_top}/{num_events} ({round(num_events_top/num_events * 100, 2)}%)")
    print(f"Number of cases: {num_cases_top}/{num_cases} ({round(num_cases_top/num_cases * 100, 2)}%)")

    # Discover and display the process model of the retained variants.
    net, im, fm = pm4py.discover_petri_net_inductive(filtered_log_top)
    pm4py.view_petri_net(net, im, fm, format='png')

In [None]:
# Coverage and model of the single most frequent variant.
print_top_variants_info(filtered_log, 1)

In [None]:
# Coverage and model of the 3 most frequent variants.
print_top_variants_info(filtered_log, 3)

In [None]:
# Coverage and model of the 5 most frequent variants.
print_top_variants_info(filtered_log, 5)

In [None]:
# Coverage and model of the 10 most frequent variants.
print_top_variants_info(filtered_log, 10)

## Conformance Checking

aaa

## Organizational Goal

aaa

## Further Work

aaa