In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from numpy.random import seed
seed(42)

In [3]:
import os
import pandas as pd
import math
import numpy as np

In [4]:
! pip install wget


[notice] A new release of pip available: 22.1.2 -> 22.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import wget
import os.path

if(not os.path.exists('finale.csv')):
    wget.download('https://data.4tu.nl/ndownloader/files/23993303')

In [6]:
df_finale = pd.read_csv('finale.csv', index_col=0).reset_index()

In [7]:
df_finale.columns

Index(['Case ID', 'Activity', 'Resource', 'Complete Timestamp', 'Variant',
       'Variant index', 'Variant.1', 'seriousness', 'customer', 'product',
       'responsible_section', 'seriousness_2', 'service_level', 'service_type',
       'support_section', 'workgroup'],
      dtype='object')

In [8]:
df_finale.shape

(21348, 16)

String to datetime conversion

In [9]:
df_finale['Complete Timestamp'] = pd.to_datetime(df_finale['Complete Timestamp'])

Sorting case ids by early timestamp

In [11]:
sorted_time_cases = df_finale.sort_values('Complete Timestamp')['Case ID'].drop_duplicates().values

#### Feature creation
- **Duration**: Duration of previous activity
- **Passed Time**: Passed time between first activity and current one
- **Time to conclusion**: Remaining time to process finishes

In [12]:
from tqdm import tqdm

data = []

for case in tqdm(sorted_time_cases):
    #print(case)
    df_case = df_finale[df_finale['Case ID'] == case]
    df_case['Complete Timestamp Shift'] = df_case['Complete Timestamp'].shift(1)
    
    ##datetime
    df_case['Duration'] = df_case['Complete Timestamp'] - df_case['Complete Timestamp Shift']
    
    ## day as float 
    df_case['Duration Float'] = df_case['Duration'] / pd.to_timedelta(1, unit='D')
    df_case['Duration Float'] = df_case['Duration Float'].fillna(0).round(4)
    
    time_to_conclusion = []
    total_time = df_case['Duration Float'].sum()
    
    for time in df_case['Duration Float']:
        total_time = total_time - time
        time_to_conclusion.append(total_time)
        
    df_case['Time to conclusion'] = time_to_conclusion
    df_case['Time to conclusion'] = df_case['Time to conclusion'].round(4)
    
    df_case['Passed Time'] = df_case['Time to conclusion'].iloc[::-1].values
    df_case['Passed Time'] = df_case['Passed Time'].round(4)
    
    df_case['Step'] = np.arange(1, df_case.shape[0]+1)    
    
    data.append(df_case)

100%|██████████| 4580/4580 [06:53<00:00, 11.09it/s]


Example:

In [13]:
df_case[['Step', 'Duration Float', 'Passed Time', 'Time to conclusion']]

Unnamed: 0,Step,Duration Float,Passed Time,Time to conclusion
8109,1,0.0,-0.0,33.6809
8110,2,17.959,15.0002,15.7219
8111,3,0.7217,15.0002,15.0002
8112,4,0.0,15.7219,15.0002
8113,5,15.0002,33.6809,-0.0


In [14]:
appended_data = pd.concat(data)

In [15]:
pd.DataFrame(appended_data).to_csv('finale_time_features.csv', index=False)