## Importing libraries

In [0]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
# Authenticate in Colab, then build a PyDrive client from the
# application-default Google credentials. `drive` is used by the
# download cells below to fetch the CSV files.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Open training data

In [0]:
# Shareable Drive link of the training data; the file id is the text after the last '='.
link = 'https://drive.google.com/open?id=1f8gIhhWgbp4nYQ72VpW5HFUFjZspRMD-'
# Split once from the right so extra '=' characters earlier in the link cannot
# break the unpacking, and do not shadow the built-in `id`.
_, file_id = link.rsplit('=', 1)
downloaded = drive.CreateFile({'id': file_id})
# Download the file to the local working directory, then load it with pandas.
downloaded.GetContentFile('Filename.csv')
df_training = pd.read_csv('Filename.csv')

In [0]:
# Preview the loaded event log (pandas elides the middle rows of a long frame).
df_training

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256
...,...,...,...,...,...,...,...
214372,38835094290529,201854,2012-01-18T02:09:07.029+01:00,50000,O_CANCELLED,COMPLETE,14-03-2012 15:30:19.361
214373,38835094290528,201854,2012-01-18T02:09:07.029+01:00,50000,A_CANCELLED,COMPLETE,14-03-2012 15:30:19.361
214374,38835094290530,201854,2012-01-18T02:09:07.029+01:00,50000,W_Nabellen incomplete dossiers,COMPLETE,14-03-2012 15:30:23.187
214375,35858681954366,199678,2012-01-10T19:16:52.800+01:00,30000,W_Nabellen offertes,START,14-03-2012 15:36:15.299


# Multi-index by case concept name and event concept name

In [0]:
# Index the log by ('case concept:name', 'event concept:name') and sort it so
# that all rows of one case can be selected with a single .loc lookup.
# The result is rebound to the same name instead of using inplace=True, which
# keeps the step a single chained expression.
# NOTE(review): sorting on the event name orders the events of a case
# alphabetically, not chronologically (see 'event time:timestamp') - confirm
# that this ordering is intended.
df_training = (
    df_training
    .set_index(['case concept:name', 'event concept:name'])
    .sort_index()
)

In [0]:
# Inspect the frame again now that it carries the two-level index.
df_training

Unnamed: 0_level_0,Unnamed: 1_level_0,eventID,case REG_DATE,case AMOUNT_REQ,event lifecycle:transition,event time:timestamp
case concept:name,event concept:name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
173688,A_ACCEPTED,5,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 11:42:43.308
173688,A_ACTIVATED,24,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
173688,A_APPROVED,22,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
173688,A_FINALIZED,7,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 11:45:09.243
173688,A_PARTLYSUBMITTED,1,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 00:38:44.880
...,...,...,...,...,...,...
206318,W_Afhandelen leads,44955422687235,2012-02-03T17:07:38.334+01:00,5000,START,03-02-2012 17:49:29.108
206318,W_Afhandelen leads,44955422687237,2012-02-03T17:07:38.334+01:00,5000,COMPLETE,03-02-2012 17:56:00.998
206321,A_DECLINED,44959717654530,2012-02-03T17:08:39.199+01:00,2000,COMPLETE,03-02-2012 17:09:19.112
206321,A_PARTLYSUBMITTED,44959717654529,2012-02-03T17:08:39.199+01:00,2000,COMPLETE,03-02-2012 17:08:39.459


Example of a single case

In [0]:
# A single key passed to .loc matches the first index level, so this returns
# every event row of case 173688.
df_training.loc[173688]

Unnamed: 0_level_0,eventID,case REG_DATE,case AMOUNT_REQ,event lifecycle:transition,event time:timestamp
event concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A_ACCEPTED,5,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 11:42:43.308
A_ACTIVATED,24,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
A_APPROVED,22,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
A_FINALIZED,7,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 11:45:09.243
A_PARTLYSUBMITTED,1,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 00:38:44.880
A_PREACCEPTED,2,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 00:39:37.906
A_REGISTERED,21,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
A_SUBMITTED,0,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 00:38:44.546
O_ACCEPTED,23,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
O_CREATED,8,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 11:45:11.197


Another (smaller) example

In [0]:
# Same first-level lookup for case 206318, which has fewer events.
df_training.loc[206318]

Unnamed: 0_level_0,eventID,case REG_DATE,case AMOUNT_REQ,event lifecycle:transition,event time:timestamp
event concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A_DECLINED,44955422687236,2012-02-03T17:07:38.334+01:00,5000,COMPLETE,03-02-2012 17:55:57.294
A_PARTLYSUBMITTED,44955422687233,2012-02-03T17:07:38.334+01:00,5000,COMPLETE,03-02-2012 17:07:38.843
A_SUBMITTED,44955422687232,2012-02-03T17:07:38.334+01:00,5000,COMPLETE,03-02-2012 17:07:38.334
W_Afhandelen leads,44955422687234,2012-02-03T17:07:38.334+01:00,5000,SCHEDULE,03-02-2012 17:07:52.820
W_Afhandelen leads,44955422687235,2012-02-03T17:07:38.334+01:00,5000,START,03-02-2012 17:49:29.108
W_Afhandelen leads,44955422687237,2012-02-03T17:07:38.334+01:00,5000,COMPLETE,03-02-2012 17:56:00.998


# Split in training and validation set

### 1. `indices = df_training.index.levels[0]` takes all unique values of the level-0 index, i.e. all cases
### 2. `train_indices = np.random.choice(indices, size=int(len(indices)*0.80), replace=False)` samples 80% of the cases from the previous step without replacement; the remaining 20% of the cases form the validation set, which gives an 80/20 split of the data by case
### 3. Finally, we select the training and validation rows according to these two sets of cases

In [0]:
#1 unique case identifiers (first level of the MultiIndex)
indices = df_training.index.levels[0]

#2 sample 80% of the cases without replacement; the remaining cases go to validation
# NOTE(review): no random seed is set, so the split differs between runs and
# the hard-coded case lookups in the cells below may not reproduce.
train_indices = np.random.choice(indices,size=int(len(indices)*0.80), replace=False)
validation_indices = np.setdiff1d(indices, train_indices)

#3 keep the rows whose case belongs to the respective set
# np.isin is the supported replacement for np.in1d (deprecated in NumPy 2.0).
case_ids = df_training.index.get_level_values(0)
train = df_training[np.isin(case_ids, train_indices)]
validation = df_training[np.isin(case_ids, validation_indices)]

Check that the same case does not appear in both the training and the validation set

In [0]:
# In this run case 173688 was sampled into the training split, so the lookup succeeds.
train.loc[173688]

Unnamed: 0_level_0,eventID,case REG_DATE,case AMOUNT_REQ,event lifecycle:transition,event time:timestamp
event concept:name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A_ACCEPTED,5,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 11:42:43.308
A_ACTIVATED,24,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
A_APPROVED,22,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
A_FINALIZED,7,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 11:45:09.243
A_PARTLYSUBMITTED,1,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 00:38:44.880
A_PREACCEPTED,2,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 00:39:37.906
A_REGISTERED,21,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
A_SUBMITTED,0,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 00:38:44.546
O_ACCEPTED,23,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,13-10-2011 10:37:29.226
O_CREATED,8,2011-10-01T00:38:44.546+02:00,20000,COMPLETE,01-10-2011 11:45:11.197


In [0]:
# The same lookup on the validation split raises KeyError (see the output),
# which shows that case 173688 is not part of the validation set.
validation.loc[173688]

KeyError: ignored

## The lengths of the two dataframes should add up to 214377

In [0]:
# Combined row count of the two splits; every row should land in exactly one of them.
len(train)+len(validation)

214377

# Function to split any dataframe into training and validation sets (according to an 80/20 split by case)

In [0]:
def split_data(df1, column1, column2, train_frac=0.80, seed=None):
  """Split a dataframe into training and validation sets by group.

  The frame is indexed by (column1, column2) and sorted. The unique values of
  column1 are then sampled without replacement: `train_frac` of them go to the
  training set and the rest to the validation set. All rows that share a
  column1 value therefore end up in the same split.

  Unlike an in-place `set_index`, the caller's dataframe is not modified, so
  the function can be called repeatedly on the same frame.

  Args:
    df1: DataFrame that contains `column1` and `column2` as regular columns.
    column1: Column whose unique values are distributed over the two splits.
    column2: Column used as the second index level.
    train_frac: Fraction of the unique `column1` values assigned to training
      (default 0.80); the number of sampled values is truncated to an int.
    seed: Optional seed for a reproducible split. When None, NumPy's global
      random state is used.

  Returns:
    A `(train, validation)` tuple of DataFrames indexed by (column1, column2).

  Raises:
    ValueError: If `train_frac` is not within [0, 1].
  """
  if not 0.0 <= train_frac <= 1.0:
    raise ValueError("train_frac must be between 0 and 1, got %r" % (train_frac,))

  # Build the index on a new frame so the caller's dataframe stays untouched.
  indexed = df1.set_index([column1, column2]).sort_index()

  #1 unique values of the first index level
  indices = indexed.index.levels[0]

  #2 sample the training values; everything not sampled is validation
  rng = np.random if seed is None else np.random.RandomState(seed)
  train_indices = rng.choice(indices, size=int(len(indices)*train_frac), replace=False)
  validation_indices = np.setdiff1d(indices, train_indices)

  #3 select rows by membership (np.isin replaces the deprecated np.in1d)
  group_ids = indexed.index.get_level_values(0)
  train = indexed[np.isin(group_ids, train_indices)]
  validation = indexed[np.isin(group_ids, validation_indices)]

  return train, validation



Try with a different dataset

In [0]:
# Shareable Drive link of the second dataset; the file id is the text after the last '='.
link = 'https://drive.google.com/open?id=1wNw7ozOZTP_CJHGmIALZF1ssAHPrQ_ms'
# Split once from the right so extra '=' characters earlier in the link cannot
# break the unpacking, and do not shadow the built-in `id`.
_, file_id = link.rsplit('=', 1)
downloaded = drive.CreateFile({'id': file_id})
# Download the file to the local working directory, then load it with pandas.
downloaded.GetContentFile('Filename.csv')
df_training2 = pd.read_csv('Filename.csv')

In [0]:
# Keep only the first 250,000 rows of the second dataset.
# NOTE(review): a fixed row cut-off may drop part of the events of some cases -
# confirm that this is acceptable.
df_training2 = df_training2.head(250000)

In [0]:
# Apply the reusable splitter to the second dataset, grouping rows by case.
train2, validation2 = split_data(
    df_training2,
    column1='case concept:name',
    column2='event concept:name',
)

In [0]:
# Inspect the training split of the second dataset.
train2

Unnamed: 0_level_0,Unnamed: 1_level_0,eventID,case LoanGoal,case ApplicationType,case RequestedAmount,event Action,event org:resource,event EventOrigin,event EventID,event lifecycle:transition,event time:timestamp
case concept:name,event concept:name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Application_1000339879,A_Accepted,25713969201159,Existing loan takeover,New credit,37500.0,statechange,User_28,Application,ApplState_2061518870,complete,18-03-2016 09:25:20.591
Application_1000339879,A_Complete,25713969201166,Existing loan takeover,New credit,37500.0,statechange,User_28,Application,ApplState_947494278,complete,18-03-2016 09:31:53.211
Application_1000339879,A_Concept,25713969201157,Existing loan takeover,New credit,37500.0,statechange,User_1,Application,ApplState_454356197,complete,17-03-2016 13:58:21.623
Application_1000339879,A_Create Application,25713969201152,Existing loan takeover,New credit,37500.0,Created,User_1,Application,Application_1000339879,complete,17-03-2016 13:57:10.159
Application_1000339879,A_Incomplete,25713969201183,Existing loan takeover,New credit,37500.0,statechange,User_75,Application,ApplState_1375801582,complete,28-03-2016 13:35:50.445
...,...,...,...,...,...,...,...,...,...,...,...
Application_999507989,W_Handle leads,25602300051459,Not speficied,New credit,24000.0,Deleted,User_1,Workflow,Workitem_1968459994,withdraw,17-03-2016 07:54:40.461
Application_999507989,W_Validate application,25602300051476,Not speficied,New credit,24000.0,Created,User_116,Workflow,Workitem_781266784,schedule,23-03-2016 10:05:10.324
Application_999507989,W_Validate application,25602300051477,Not speficied,New credit,24000.0,Obtained,User_116,Workflow,Workitem_1152707742,start,23-03-2016 10:05:10.327
Application_999507989,W_Validate application,25602300051480,Not speficied,New credit,24000.0,Released,User_116,Workflow,Workitem_20293026,suspend,23-03-2016 10:12:03.011


In [0]:
# Inspect the validation split of the second dataset.
validation2

Unnamed: 0_level_0,Unnamed: 1_level_0,eventID,case LoanGoal,case ApplicationType,case RequestedAmount,event Action,event org:resource,event EventOrigin,event EventID,event lifecycle:transition,event time:timestamp
case concept:name,event concept:name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Application_1002996676,A_Accepted,16196321673223,Home improvement,New credit,7750.0,statechange,User_35,Application,ApplState_1178321951,complete,22-02-2016 14:51:01.329
Application_1002996676,A_Cancelled,16196321673234,Home improvement,New credit,7750.0,statechange,User_1,Application,ApplState_411116734,complete,24-03-2016 08:00:19.559
Application_1002996676,A_Complete,16196321673230,Home improvement,New credit,7750.0,statechange,User_35,Application,ApplState_504430590,complete,22-02-2016 14:54:09.638
Application_1002996676,A_Concept,16196321673221,Home improvement,New credit,7750.0,statechange,User_1,Application,ApplState_1854649028,complete,19-02-2016 15:07:53.518
Application_1002996676,A_Create Application,16196321673216,Home improvement,New credit,7750.0,Created,User_1,Application,Application_1002996676,complete,19-02-2016 15:07:16.599
...,...,...,...,...,...,...,...,...,...,...,...
Application_998600110,W_Validate application,13361643257893,"Other, see explanation",Limit raise,6500.0,Released,User_116,Workflow,Workitem_628278010,suspend,02-03-2016 14:02:09.376
Application_998600110,W_Validate application,13361643257894,"Other, see explanation",Limit raise,6500.0,Obtained,User_87,Workflow,Workitem_368108315,resume,04-03-2016 09:04:34.203
Application_998600110,W_Validate application,13361643257895,"Other, see explanation",Limit raise,6500.0,Released,User_87,Workflow,Workitem_51993413,suspend,04-03-2016 09:04:39.577
Application_998600110,W_Validate application,13361643257896,"Other, see explanation",Limit raise,6500.0,Obtained,User_30,Workflow,Workitem_5830979,resume,04-03-2016 12:59:48.338
