In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from numpy.random import seed
seed(42)

In [3]:
import os
import pandas as pd
import math
import numpy as np

In [4]:
! pip install wget


[notice] A new release of pip available: 22.1.2 -> 22.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import wget
import os.path

if(not os.path.exists('finale.csv')):
    wget.download('https://data.4tu.nl/ndownloader/files/23993303')

In [6]:
df_finale = pd.read_csv('finale.csv', index_col=0).reset_index()

In [7]:
df_finale.columns

Index(['Case ID', 'Activity', 'Resource', 'Complete Timestamp', 'Variant',
       'Variant index', 'Variant.1', 'seriousness', 'customer', 'product',
       'responsible_section', 'seriousness_2', 'service_level', 'service_type',
       'support_section', 'workgroup'],
      dtype='object')

In [8]:
df_finale.shape

(21348, 16)

String to datetime conversion

In [9]:
df_finale['Complete Timestamp'] = pd.to_datetime(df_finale['Complete Timestamp'])

Sorting case ids by early timestamp

In [10]:
sorted_time_cases = df_finale.sort_values('Complete Timestamp')['Case ID'].drop_duplicates().values

#### Feature creation
- **Duration**: Duration of previous activity
- **Passed Time**: Passed time between first activity and current one
- **Time to conclusion**: Remaining time to process finishes

In [19]:
from tqdm import tqdm

data = []

for case in tqdm(sorted_time_cases):
    #print(case)
    df_case = df_finale[df_finale['Case ID'] == case]
    df_case['Complete Timestamp Shift'] = df_case['Complete Timestamp'].shift(1)
    
    ##datetime
    df_case['Duration'] = df_case['Complete Timestamp'] - df_case['Complete Timestamp Shift']
    
    ## day as float 
    df_case['Duration Float'] = df_case['Duration'] / pd.to_timedelta(1, unit='D')
    df_case['Duration Float'] = df_case['Duration Float'].fillna(0).round(4)
    
    time_to_conclusion = []
    total_time = df_case['Duration Float'].sum()
    
    for time in df_case['Duration Float']:
        total_time = total_time - time
        time_to_conclusion.append(total_time)
        
    df_case['Time to conclusion'] = time_to_conclusion
    df_case['Time to conclusion'] = df_case['Time to conclusion'].round(4)
    
    df_case['Passed Time'] = df_case['Time to conclusion'].iloc[::-1].values
    df_case['Passed Time'] = df_case['Passed Time'].round(4)
    
    df_case['Step'] = np.arange(1, df_case.shape[0]+1)    
    
    data.append(df_case)

  0%|          | 0/4580 [00:00<?, ?it/s]

Case 3608





In [13]:
df_case.head()

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,Variant.1,seriousness,customer,product,...,service_level,service_type,support_section,workgroup,Complete Timestamp Shift,Duration,Duration Float,Time to conclusion,Passed Time,Step
16857,Case 3608,Assign seriousness,Value 2,2010-01-13 08:40:25,Variant 33,33,Variant 33,Value 1,Value 63,Value 3,...,Value 2,Value 1,Value 4,Value 3,NaT,NaT,0.0,31.0087,0.0,1
16858,Case 3608,Take in charge ticket,Value 2,2010-01-29 08:52:27,Variant 33,33,Variant 33,Value 1,Value 63,Value 3,...,Value 2,Value 1,Value 4,Value 3,2010-01-13 08:40:25,16 days 00:12:02,16.0084,15.0003,0.0,2
16859,Case 3608,Resolve ticket,Value 2,2010-01-29 08:52:34,Variant 33,33,Variant 33,Value 1,Value 63,Value 3,...,Value 2,Value 1,Value 4,Value 3,2010-01-29 08:52:27,0 days 00:00:07,0.0001,15.0002,15.0002,3
16860,Case 3608,Closed,Value 5,2010-02-13 08:52:48,Variant 33,33,Variant 33,Value 1,Value 63,Value 3,...,Value 2,Value 1,Value 4,Value 3,2010-01-29 08:52:34,15 days 00:00:14,15.0002,0.0,15.0003,4
16861,Case 3608,Closed,Value 5,2010-02-13 08:52:48,Variant 33,33,Variant 33,Value 1,Value 63,Value 3,...,Value 2,Value 1,Value 4,Value 3,2010-02-13 08:52:48,0 days 00:00:00,0.0,0.0,31.0087,5


In [12]:
df_case['Complete Timestamp']

16857   2010-01-13 08:40:25
16858   2010-01-29 08:52:27
16859   2010-01-29 08:52:34
16860   2010-02-13 08:52:48
16861   2010-02-13 08:52:48
Name: Complete Timestamp, dtype: datetime64[ns]

In [20]:
df_case['Complete Timestamp'].shift(1)

8109                   NaT
8110   2013-11-28 17:07:59
8111   2013-12-16 16:08:53
8112   2013-12-17 09:28:06
8113   2013-12-17 09:28:07
Name: Complete Timestamp, dtype: datetime64[ns]

Example:

In [13]:
df_case[['Step', 'Duration Float', 'Passed Time', 'Time to conclusion']]

Unnamed: 0,Step,Duration Float,Passed Time,Time to conclusion
8109,1,0.0,-0.0,33.6809
8110,2,17.959,15.0002,15.7219
8111,3,0.7217,15.0002,15.0002
8112,4,0.0,15.7219,15.0002
8113,5,15.0002,33.6809,-0.0


In [14]:
appended_data = pd.concat(data)

In [15]:
pd.DataFrame(appended_data).to_csv('finale_time_features.csv', index=False)

In [20]:
df_final = pd.read_csv('finale_time_features.csv')

In [26]:
last_activities = []

for name, group in df_final.groupby('Case ID'):
    last_activities.append(group.tail(1)['Activity'].values[0])

In [31]:
from collections import Counter

Counter(last_activities)

Counter({'Closed': 4557,
         'Wait': 8,
         'VERIFIED': 1,
         'Require upgrade': 3,
         'Take in charge ticket': 1,
         'Resolve ticket': 10})