# Road Traffic Fine Management

## Setup

In [None]:
import pandas as pd
import pm4py

In [None]:
# Read the event log from the XES file
xes_path = "Road_Traffic_Fine_Management_Process.xes"
log_raw = pm4py.read_xes(xes_path)

In [None]:
# Run the log through pm4py's dataframe formatter, declaring which columns hold
# the case identifier, the activity name and the event timestamp.
log_raw = pm4py.format_dataframe(
    log_raw,
    case_id='case:concept:name',
    activity_key='concept:name',
    timestamp_key='time:timestamp',
)

# Parse the timestamp column with pandas
timestamp_col = 'time:timestamp'
log_raw[timestamp_col] = pd.to_datetime(log_raw[timestamp_col])

log_raw.head(7)

In [None]:
# Raw log analysis: size of the log, then its start, end and overall activities

case_ids = log_raw['case:concept:name']
num_events = log_raw.shape[0]
# dropna=False keeps the count identical to len(case_ids.unique())
num_cases = case_ids.nunique(dropna=False)
print(f"Number of events: {num_events}\nNumber of cases: {num_cases}")

# Distinct activity names, in order of first appearance
all_activities = log_raw["concept:name"].drop_duplicates().tolist()
start_activities = pm4py.get_start_activities(log_raw)
end_activities = pm4py.get_end_activities(log_raw)
print(f"Start activities: {start_activities}\nEnd activities: {end_activities}\nAll activites: {all_activities}")

## Data Cleaning

### NaN values

Check which columns have NaN values

In [None]:
# Work on a copy of the raw log
log_df = log_raw.copy()

# Report, column by column, whether at least one value is missing
for col, has_missing in log_df.isna().any().items():
    if not has_missing:
        print(f"{col.ljust(22, ' ')}: clean")
    else:
        print(f"{col.ljust(22, ' ')}: missing values")

Change NaN values to _zero_ in columns _amount_, _paymentAmount_, _totalPaymentAmount_ and _expense_

In [None]:
# Replace missing values with 0 in each of the monetary columns
for money_col in ("amount", "paymentAmount", "totalPaymentAmount", "expense"):
    log_df[money_col] = log_df[money_col].fillna(0)

log_df.head()

### Remove matricola column

Remove the _matricola_ attribute: its value is always either NaN or 0, so it carries no useful information.

In [None]:
# Show the distinct values found in the matricola column
matricola_values = log_df["matricola"].unique()
print(matricola_values)

In [None]:
# Drop the matricola column from the working log, in place
log_df.drop(columns=["matricola"], inplace=True)

log_df.head()

### Rename columns

In [None]:
# log_df.rename(columns={"amount" : "amount",
#                         "expense" : "extraAmount",
#                         "paymentAmount": "paymentAmount",
#                         "totalPaymentAmount" : "totalAmount"}, inplace=True)

# log_df.head()

### Fix _amount_ column

In [None]:
def correctAmount(log_row):
    """Pick the monetary value that belongs to an event, based on its activity.

    Args:
        log_row: A mapping-like event row. ``"concept:name"`` is always read;
            one of ``"amount"``, ``"expense"`` or ``"paymentAmount"`` is read
            depending on the activity.

    Returns:
        ``amount`` for "Create Fine" and "Add Penalty", ``expense`` for
        "Send Fine", ``paymentAmount`` for "Payment", and ``0`` for every
        other activity.
    """
    event_name = log_row["concept:name"]

    if event_name in ("Create Fine", "Add Penalty"):
        source_column = "amount"
    elif event_name == "Send Fine":
        source_column = "expense"
    elif event_name == "Payment":
        source_column = "paymentAmount"
    else:
        # Activities that carry no monetary value
        return 0

    return log_row[source_column]

# Overwrite amount with the per-event value selected by correctAmount (row-wise apply)
log_df["amount"] = log_df.apply(correctAmount, axis=1)

log_df.head()

### Add _dueAmount_ column

In [None]:
# Preview the first ten events of the working log
log_df.head(n=10)

In [None]:
# Running state shared by successive calls to incrementalDueAmount: the amount
# accumulated so far and the case it belongs to. Re-initialised every time this
# cell runs.
incr_amount = 0
last_case = None


def incrementalDueAmount(log_row):
    """Return the running sum of ``amount`` within the current case.

    The function is stateful: it remembers the case of the previous call in
    the module-level ``last_case`` and the accumulated value in
    ``incr_amount``. The sum restarts whenever the case identifier differs
    from the one seen on the previous call, so rows must be supplied with all
    events of a case next to each other.

    Args:
        log_row: A mapping-like event row providing ``"case:concept:name"``
            and ``"amount"``.

    Returns:
        The accumulated ``amount`` of the row's case up to and including
        this row.
    """
    global incr_amount, last_case
    case_id = log_row["case:concept:name"]
    if last_case is None or case_id != last_case:
        # First row of a new case: restart the accumulator from this row
        last_case = case_id
        incr_amount = log_row["amount"]
    else:
        incr_amount += log_row["amount"]
    return incr_amount

# Build dueAmount by feeding every row, in frame order, to incrementalDueAmount
log_df["dueAmount"] = log_df.apply(incrementalDueAmount, axis=1)

log_df.head(n=10)

### Add _status_ values

In [None]:
def setStatus(log_row):
    """Translate the ``dismissal`` code of an event into a status label.

    Args:
        log_row: A mapping-like event row providing ``"dismissal"``.

    Returns:
        "Prefecture" for ``"#"``, "Judge" for ``"G"``, "Not Payed" for
        ``"NIL"``, "Unknown" when the code is missing, and the integer ``0``
        for any other code.
    """
    code = log_row["dismissal"]

    # Known dismissal codes, checked in this order
    known_labels = (("#", "Prefecture"), ("G", "Judge"), ("NIL", "Not Payed"))
    for known_code, label in known_labels:
        if code == known_code:
            return label

    # NOTE(review): unrecognised codes produce the integer 0 in an otherwise
    # textual column — confirm this is intended.
    return "Unknown" if pd.isna(code) else 0

# Derive the status column from each row's dismissal code
log_df["status"] = log_df.apply(setStatus, axis=1)

log_df.head(n=10)

### Change column order

Change order of columns to make the dataframe more readable

In [None]:
# List the current column labels before reordering them
current_columns = log_df.columns
print(current_columns)

In [None]:
# Target column layout, grouped by role
columns = [
    # general attributes
    'case:concept:name', 'concept:name', 'org:resource', 'time:timestamp',
    # number attributes
    'amount', 'expense', 'points', 'paymentAmount',
    # payment total and running due amount
    'totalPaymentAmount', 'dueAmount',
    # status code
    'dismissal', 'status',
    # remaining attributes
    'lastSent', 'vehicleClass', 'article', 'notificationType',
    'lifecycle:transition', '@@index', '@@case_index',
]

# Select the columns in the new order
log_df = log_df.loc[:, columns]

log_df.head(n=10)

## Data Filtering

Remove the cases that have a duration of 0 and do not end with a payment (here: cases whose only activities are _Create Fine_ followed by _Send Fine_).

In [None]:
# Keep only the cases whose variant is exactly Create Fine -> Send Fine ...
target_variants = [('Create Fine', 'Send Fine')]
filtered_log = pm4py.filter_variants(log_df, target_variants, retain=True)
# ... then restrict them with the case-performance filter, using 0 for both bounds
filtered_log = pm4py.filter_case_performance(filtered_log, 0, 0)
variants = pm4py.get_variants(filtered_log)

variants

In [None]:
# Remove from the working log every case that was selected into filtered_log.
# Selecting on the case identifier replaces concat + drop_duplicates(keep=False),
# which only works while every row is unique and byte-identical in both frames
# and has to hash every column of the whole log.
# Assumes filtered_log holds whole cases taken from log_df — TODO confirm against
# the filters that built it.
cases_to_remove = filtered_log['case:concept:name'].unique()
log_df = log_df[~log_df['case:concept:name'].isin(cases_to_remove)]

num_events = len(log_df)
# dropna=False keeps the count identical to len(...unique())
num_cases = log_df['case:concept:name'].nunique(dropna=False)
print(f"Number of events: {num_events}\nNumber of cases: {num_cases}")

Compute the duration of each case with a custom group-by aggregation

In [None]:
# Number of events recorded for each activity
events_per_activity = log_df.groupby('concept:name').size()
events_per_activity

In [None]:
# Per-case summary: number of events, first and last timestamp, and the time
# elapsed between them. Built-in aggregations replace the per-group Python
# lambdas, and the duration is a single vectorized subtraction.
case_durations = log_df.groupby('case:concept:name').agg(
    Events=('case:concept:name', 'count'),
    FirstOccurence=('time:timestamp', 'min'),
    LastOccurence=('time:timestamp', 'max'),
)
case_durations['Duration'] = case_durations['LastOccurence'] - case_durations['FirstOccurence']

case_durations

In [None]:
# Inspect the range of case durations (a minimum of 0 would mean zero-duration cases exist)
durations = case_durations['Duration']
min_case_duration = durations.min()
max_case_duration = durations.max()
mean_case_duration = durations.mean()

summary_template = "Min Case Duration: {}\nMax Case Duration: {}\nMean Case Duration: {}"
print(summary_template.format(min_case_duration, max_case_duration, mean_case_duration))