# Featurize the data

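* Read the prepared activity data (`activities.csv`) from the cache directory
* Extract per-case, per-task features and export them to `features.csv`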

## CaseFeatures
### Simple7

CASE | TASK | FIRST_TS | LAST_TS | ACTIVITY_COUNT | DISTINCT_ACTIVITY_COUNT |TASK_SWITCHES
---- | ---- | -------- | ------- | -------------- | ------------- |  -------------
C1   | T1   | 0        | 0       | 1              | 1             |0
C1   | T2   | 0        | 0       | 1              | 1             |0
C2   | T1   | 0        | 0       | 1              | 1             |0
C2   | T2   | 0        | 0       | 1              | 1             |0
C3   | T1   | 0        | 0       | 1              | 1             |0
C3   | T2   | 0        | 0       | 1              | 1             |0
C3   | T4   | 0        | 0       | 1              | 1             |0
C4   | T2   | 0        | 0       | 1              | 1             |0
C4   | T1   | 0        | 0       | 1              | 1             |0
C4   | T4   | 0        | 0       | 1              | 1             |0
C5   | T1   | 0        | 0       | 1              | 1             |0
C5   | T2   | 0        | 0       | 1              | 1             |0
C5   | T3   | 0        | 0       | 1              | 1             |0
C6   | T2   | 0        | 0       | 1              | 1             |0
C6   | T1   | 0        | 0       | 1              | 1             |0
C6   | T3   | 0        | 0       | 1              | 1             |0
C7   | T1   | 0        | 0       | 1              | 1             |0
C7   | T2   | 0        | 0       | 1              | 1             |0
C7   | T3   | 0        | 0       | 1              | 1             |0



## Import required libraries

In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import json
from collections import defaultdict
from configs.config import cache as cache_path

## Assign input and output data paths

In [2]:
# File names inside the cache directory: activity events in, features out.
in_file_name, out_file_name = "activities.csv", "features.csv"

# Full path of the exported feature file.
out_path = f"{cache_path}{out_file_name}"

## Read the data into a dataframe

In [3]:
# read activities json into pandas dataframe
def read_json_to_df(json_file):
    """Load a JSON file and return its content as a pandas DataFrame.

    Parameters
    ----------
    json_file : str or path-like
        Path of the JSON document to read.

    Returns
    -------
    pandas.DataFrame
        The result of passing the parsed JSON value to ``pd.DataFrame``.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default, so non-ASCII text loads the same everywhere.
    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)
    return pd.DataFrame(data)

# Load the cached activities CSV into a pandas DataFrame.
activities_df = pd.read_csv(filepath_or_buffer=cache_path + in_file_name)

## Functions to featurize

In [4]:
def count_switches(lst):
    """Count, per element, how often it reappears after a different element.

    An occurrence of an element counts as a switch when the element has
    already appeared earlier in ``lst`` and the item immediately before
    this occurrence is a different element. First occurrences and
    immediate repeats are not counted.

    Returns a ``defaultdict(int)`` with one entry for every distinct
    element of ``lst``; elements that never switch map to 0.
    """
    tally = defaultdict(int)
    already_seen = set()
    previous = None

    for current in lst:
        if current in already_seen:
            # A repeat is only a switch when something else came in between.
            if current != previous:
                tally[current] += 1
        else:
            already_seen.add(current)
        previous = current

    # Make sure elements without any switch are reported with an explicit 0.
    for item in set(lst):
        tally.setdefault(item, 0)

    return tally

def featurize(activities_df):
    """Aggregate activity events into one feature row per (case, task).

    Every aggregate is computed on the full event table *before* it is
    collapsed to one row per (case, task). Collapsing first would make
    ``last_TS`` identical to ``first_TS`` and every ``task_switch`` equal
    to 0, because each group would contain a single row.

    Parameters
    ----------
    activities_df : pandas.DataFrame
        Event-level data. The columns ``case``, ``task``, ``activity``,
        ``event_time`` and ``processing_time`` are read here. Any other
        column is carried through from the earliest event of its
        (case, task) group. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per (case, task), without the ``activity`` and
        ``event_time`` columns and with these columns added/replaced:

        * ``activity_count`` - number of non-null activities in the group
        * ``distinct_activity_count`` - number of distinct activities
        * ``processing_time`` - sum of ``processing_time`` over the group
        * ``task_switch`` - value reported by ``count_switches`` for the
          task within the case's time-ordered task sequence
        * ``first_TS`` / ``last_TS`` - smallest / largest ``event_time`` of
          the group, converted with ``pd.to_datetime``
    """
    group_keys = ['case', 'task']

    # Order the events by case, task and time. Rows without a case or task
    # are removed explicitly; a groupby on these keys would skip them anyway.
    events = (
        activities_df.dropna(subset=group_keys)
        .sort_values(group_keys + ['event_time'], kind='stable')
        .reset_index(drop=True)
    )
    per_task = events.groupby(group_keys)

    # Per-group activity counts and total processing time, broadcast to
    # every event row of the group.
    events = events.assign(
        activity_count=per_task['activity'].transform('count'),
        distinct_activity_count=per_task['activity'].transform('nunique'),
        processing_time=per_task['processing_time'].transform('sum'),
    )

    # First and last timestamp of each group, taken over all of its events.
    # NOTE(review): min/max compare the raw event_time values; this assumes
    # they order chronologically as stored (e.g. ISO-8601) - TODO confirm.
    ts_bounds = (
        per_task['event_time']
        .agg(FIRST_TS='min', LAST_TS='max')
        .reset_index()
    )

    # Task switches are counted on each case's time-ordered task sequence.
    timeline = events.sort_values(['case', 'event_time'], kind='stable')
    switch_rows = [
        (case, task, switches)
        for case, tasks in timeline.groupby('case')['task']
        for task, switches in count_switches(tasks.tolist()).items()
    ]
    switch_counts_df = pd.DataFrame(
        switch_rows, columns=group_keys + ['task_switch']
    )

    # Collapse to the earliest event of each (case, task) and attach the
    # per-group features computed above.
    features = (
        events.drop(columns='activity')
        .drop_duplicates(subset=group_keys, keep='first')
        .merge(switch_counts_df, on=group_keys)
        .merge(ts_bounds, on=group_keys)
        .drop(columns='event_time')
    )

    # Convert the timestamp bounds to datetime and apply the output names.
    ts_columns = ['FIRST_TS', 'LAST_TS']
    features[ts_columns] = features[ts_columns].apply(pd.to_datetime)
    return features.rename(
        columns={'FIRST_TS': 'first_TS', 'LAST_TS': 'last_TS'}
    )

## Featurize the data and export to features.csv

In [5]:
# Drop rows whose case id is one of the placeholder strings
# "Missing", "0" or "None".
activities_df = activities_df.query('case != "Missing" and case != "0" and case != "None"')

# Build the per-(case, task) features and write them to the output path
# assigned in the paths cell (cache directory + "features.csv").
final_df = featurize(activities_df)
final_df.to_csv(out_path, index=False)

In [6]:
# # show final_df as a pivot table
# final_df.pivot_table(
#     index='case', 
#     columns='task', 
#     values='activity_count', 
#     aggfunc='count', 
#     fill_value=0, 
#     margins=True).astype(int).sort_values('All', ascending=False, axis=0)