# Exercise PPM Part 2 - Small dataset

Imports

In [1]:
import pandas as pd
import warnings
import numpy as np
from datetime import datetime
import calendar
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
warnings.simplefilter("ignore")

Import an event log as a pandas dataframe

In [2]:
df = pd.read_csv("Event_Logs/Sampled_BPIC17_offer_log.csv")
df

Unnamed: 0,index,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,(case) Accepted,(case) ApplicationID,(case) CreditScore,(case) FirstWithdrawalAmount,(case) MonthlyCost,(case) NumberOfTerms,(case) OfferedAmount,(case) Selected,Action,EventID,EventOrigin,OfferID,lifecycle:transition
0,0,Offer_247135719,O_Create Offer,User_17,2016/01/02 18:17:05.720,Variant 8,8,True,Application_196483749,0,10000.0,201.76,57,10000.0,False,Created,Offer_247135719,Offer,,complete
1,1,Offer_247135719,O_Created,User_17,2016/01/02 18:17:08.762,Variant 8,8,True,Application_196483749,0,10000.0,201.76,57,10000.0,False,statechange,OfferState_124849367,Offer,Offer_247135719,complete
2,2,Offer_247135719,O_Sent (online only),User_17,2016/01/02 18:19:21.330,Variant 8,8,True,Application_196483749,0,10000.0,201.76,57,10000.0,False,statechange,OfferState_440662877,Offer,Offer_247135719,complete
3,3,Offer_247135719,O_Cancelled,User_17,2016/01/02 18:21:26.034,Variant 8,8,True,Application_196483749,0,10000.0,201.76,57,10000.0,False,statechange,OfferState_591416028,Offer,Offer_247135719,complete
4,4,Offer_941964966,O_Create Offer,User_17,2016/01/02 18:21:42.022,Variant 1,1,True,Application_196483749,0,4100.0,201.76,57,10000.0,False,Created,Offer_941964966,Offer,,complete
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22404,22404,Offer_1064426652,O_Create Offer,User_23,2016/02/18 22:50:01.446,Variant 2,2,True,Application_1574359134,767,6000.0,118.56,59,6000.0,True,Created,Offer_1064426652,Offer,,complete
22405,22405,Offer_1064426652,O_Created,User_23,2016/02/18 22:50:02.655,Variant 2,2,True,Application_1574359134,767,6000.0,118.56,59,6000.0,True,statechange,OfferState_189916093,Offer,Offer_1064426652,complete
22406,22406,Offer_1064426652,O_Sent (mail and online),User_23,2016/02/18 22:50:22.174,Variant 2,2,True,Application_1574359134,767,6000.0,118.56,59,6000.0,True,statechange,OfferState_24747308,Offer,Offer_1064426652,complete
22407,22407,Offer_1064426652,O_Returned,User_117,2016/02/29 19:08:56.200,Variant 2,2,True,Application_1574359134,767,6000.0,118.56,59,6000.0,True,statechange,OfferState_1809485991,Offer,Offer_1064426652,complete


Create an event log that retains only the attributes caseid, activity, timestamp, resource and outcome.

In [3]:
df = df.rename(columns={"Case ID":"caseid", "Activity":"activity", "Complete Timestamp":"ts", "Resource":"resource", "(case) Accepted":"outcome"})
df_2 = df[["caseid", "activity", "ts", "resource", "outcome"]].copy()
df_2

Unnamed: 0,caseid,activity,ts,resource,outcome
0,Offer_247135719,O_Create Offer,2016/01/02 18:17:05.720,User_17,True
1,Offer_247135719,O_Created,2016/01/02 18:17:08.762,User_17,True
2,Offer_247135719,O_Sent (online only),2016/01/02 18:19:21.330,User_17,True
3,Offer_247135719,O_Cancelled,2016/01/02 18:21:26.034,User_17,True
4,Offer_941964966,O_Create Offer,2016/01/02 18:21:42.022,User_17,True
...,...,...,...,...,...
22404,Offer_1064426652,O_Create Offer,2016/02/18 22:50:01.446,User_23,True
22405,Offer_1064426652,O_Created,2016/02/18 22:50:02.655,User_23,True
22406,Offer_1064426652,O_Sent (mail and online),2016/02/18 22:50:22.174,User_23,True
22407,Offer_1064426652,O_Returned,2016/02/29 19:08:56.200,User_117,True


 Create a function to extract the prefixes of length L from the event log.

In [4]:
def create_prefix(df, L):
    res0 = df.sort_values(by=['caseid','ts']).groupby('caseid', as_index=False)['activity'].apply(list)    
    res1 = df.sort_values(by=['caseid','ts']).groupby('caseid', as_index=False)['ts'].apply(list)
    res2 = df.sort_values(by=['caseid','ts']).groupby('caseid', as_index=False)['resource'].apply(list)
    
    res = pd.merge(res0, pd.merge(res1, res2, on="caseid") , on="caseid")
    res["outcome"] = df["outcome"]

    
    event_num = df.groupby('caseid')['activity'].count().to_frame('event_number')
    res = pd.merge(res, event_num, on="caseid")
    
    res[res["event_number"] < L] = np.nan
    res = res.dropna()
    
    def function(row, label):
        activity_array = row[label][0:L]
        return activity_array
    
    for c in ["activity", "ts", "resource"]:
        res[c] = res.apply(lambda row: function(row, c), axis=1)
    
    res = res.drop(columns=["event_number"])
    return res
    

In [5]:
x = create_prefix(df, 5)
x

Unnamed: 0,caseid,activity,ts,resource,outcome
0,Offer_1000681710,"[O_Create Offer, O_Created, O_Sent (mail and o...","[2016/02/02 18:24:54.170, 2016/02/02 18:24:55....","[User_20, User_20, User_20, User_117, User_115]",True
1,Offer_1001553250,"[O_Create Offer, O_Created, O_Sent (mail and o...","[2016/01/16 03:16:41.868, 2016/01/16 03:16:43....","[User_2, User_2, User_2, User_113, User_30]",True
2,Offer_1002136393,"[O_Create Offer, O_Created, O_Sent (mail and o...","[2016/01/29 17:46:49.529, 2016/01/29 17:46:50....","[User_85, User_85, User_85, User_117, User_118]",True
3,Offer_1002236598,"[O_Create Offer, O_Created, O_Sent (mail and o...","[2016/01/12 18:24:55.557, 2016/01/12 18:24:56....","[User_49, User_49, User_49, User_113, User_102]",True
9,Offer_1005366187,"[O_Create Offer, O_Created, O_Sent (mail and o...","[2016/02/01 21:21:51.853, 2016/02/01 21:21:53....","[User_16, User_16, User_16, User_87, User_87]",True
...,...,...,...,...,...
4991,Offer_99101290,"[O_Create Offer, O_Created, O_Sent (online onl...","[2016/02/12 23:56:11.915, 2016/02/12 23:56:13....","[User_41, User_41, User_41, User_115, User_115]",True
4992,Offer_992471522,"[O_Create Offer, O_Created, O_Sent (mail and o...","[2016/02/18 00:12:18.561, 2016/02/18 00:12:19....","[User_5, User_5, User_5, User_116, User_118]",True
4997,Offer_99473283,"[O_Create Offer, O_Created, O_Sent (mail and o...","[2016/02/03 20:59:31.775, 2016/02/03 20:59:33....","[User_19, User_19, User_19, User_116, User_113]",True
4998,Offer_995784215,"[O_Create Offer, O_Created, O_Sent (mail and o...","[2016/02/15 17:06:12.610, 2016/02/15 17:06:15....","[User_25, User_25, User_25, User_119, User_102]",True


Create a function that does aggregation encoding of a prefix extracted from the event log

In [8]:
def one_hot(x, label, fun):
    unique_val = []
    #Extract unique values of label
    for element in x[label]:
        for e in element:
            if e not in unique_val:
                unique_val.append(e)
    
    result = x[["caseid"]].copy()
    
    if fun == "index":
        #Create columns
        L = len(x[label][1]) # Length of the prefix
        for e in unique_val:
            for i in range(1, L+1): 
                result[label + "_" + str(i)+ "_" + e] = 0
        
        for index, row in x.iterrows():
            k = 1
            for e in row[label]:
                result.at[index, label + "_" + str(k) + "_" + e] += 1
                k +=1
    
    else:
        #Create columns
        for t in unique_val:
            result[t] = 0
        
        #Add one to the right column if value is in row
        for index, row in x.iterrows():
            for element in row[label]:
                result.at[index, element] += 1

    return result

def average_timestamp(df, label):

    result = df[["caseid"]].copy()
    result["avg_"+label] = 0

    L = len(df[label][1])
    
    for index, row in df.iterrows():
        temp = 0
        for element in row[label]:
            temp += calendar.timegm(datetime.strptime(element, '%Y/%m/%d %H:%M:%S.%f').timetuple())
        result.at[index, "avg_" + label]= time.strftime('%Y/%m/%d %H:%M:%S', time.gmtime(temp/L))
    
    return result


In [9]:
average_timestamp(x, "ts")

Unnamed: 0,caseid,avg_ts
0,Offer_1000681710,2016/02/05 15:45:27
1,Offer_1001553250,2016/01/19 14:03:45
2,Offer_1002136393,2016/02/07 03:09:55
3,Offer_1002236598,2016/01/14 14:34:58
9,Offer_1005366187,2016/02/06 17:00:58
...,...,...
4991,Offer_99101290,2016/02/14 12:03:01
4992,Offer_992471522,2016/02/27 07:35:00
4997,Offer_99473283,2016/02/09 00:34:14
4998,Offer_995784215,2016/02/17 23:22:35


In [10]:
one_hot(x, "activity", "aggr")

Unnamed: 0,caseid,O_Create Offer,O_Created,O_Sent (mail and online),O_Returned,O_Accepted,O_Refused,O_Sent (online only),O_Cancelled
0,Offer_1000681710,1,1,1,1,1,0,0,0
1,Offer_1001553250,1,1,1,1,1,0,0,0
2,Offer_1002136393,1,1,1,1,1,0,0,0
3,Offer_1002236598,1,1,1,1,0,1,0,0
9,Offer_1005366187,1,1,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
4991,Offer_99101290,1,1,0,1,1,0,1,0
4992,Offer_992471522,1,1,1,1,1,0,0,0
4997,Offer_99473283,1,1,1,1,1,0,0,0
4998,Offer_995784215,1,1,1,1,1,0,0,0


In [11]:
one_hot(x, "resource", "aggr")

Unnamed: 0,caseid,User_20,User_117,User_115,User_2,User_113,User_30,User_85,User_118,User_49,...,User_144,User_12,User_52,User_89,User_101,User_86,User_50,User_26,User_76,User_32
0,Offer_1000681710,3,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Offer_1001553250,0,0,0,3,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Offer_1002136393,0,1,0,0,0,0,3,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Offer_1002236598,0,0,0,0,1,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
9,Offer_1005366187,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4991,Offer_99101290,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4992,Offer_992471522,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4997,Offer_99473283,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,Offer_995784215,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
def aggregation_enc(df):
    res0 = one_hot(df, "activity", "aggr")
    res1 = one_hot(df, "resource", "aggr")
    res2 = average_timestamp(df, "ts")
    
    res = pd.merge(res0, pd.merge(res1, res2), on="caseid")
    
    temp = df[["caseid", "outcome"]].copy()
    res = pd.merge(res, temp, on="caseid")

    return res

In [13]:
aggregation_enc(x)

Unnamed: 0,caseid,O_Create Offer,O_Created,O_Sent (mail and online),O_Returned,O_Accepted,O_Refused,O_Sent (online only),O_Cancelled,User_20,...,User_52,User_89,User_101,User_86,User_50,User_26,User_76,User_32,avg_ts,outcome
0,Offer_1000681710,1,1,1,1,1,0,0,0,3,...,0,0,0,0,0,0,0,0,2016/02/05 15:45:27,True
1,Offer_1001553250,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2016/01/19 14:03:45,True
2,Offer_1002136393,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2016/02/07 03:09:55,True
3,Offer_1002236598,1,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,2016/01/14 14:34:58,True
4,Offer_1005366187,1,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,2016/02/06 17:00:58,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,Offer_99101290,1,1,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,2016/02/14 12:03:01,True
2543,Offer_992471522,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2016/02/27 07:35:00,True
2544,Offer_99473283,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2016/02/09 00:34:14,True
2545,Offer_995784215,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2016/02/17 23:22:35,True


Create a function that does index-based encoding of a prefix extracted from the event log.

In [14]:
def index_label(df, label):

    L = len(df[label][1])
    result = x[["caseid"]].copy()
    
    # Create the column and put them to 0
    for i in range(1, L+1):
        result[label + "_" + str(i)] = 0
        
    for index, row in x.iterrows():
        k = 1
        for e in row[label]:
            result.at[index, label + "_" + str(k)] = e
            k +=1
    
    return result

In [15]:
one_hot(x, "activity", "index")

Unnamed: 0,caseid,activity_1_O_Create Offer,activity_2_O_Create Offer,activity_3_O_Create Offer,activity_4_O_Create Offer,activity_5_O_Create Offer,activity_1_O_Created,activity_2_O_Created,activity_3_O_Created,activity_4_O_Created,...,activity_1_O_Sent (online only),activity_2_O_Sent (online only),activity_3_O_Sent (online only),activity_4_O_Sent (online only),activity_5_O_Sent (online only),activity_1_O_Cancelled,activity_2_O_Cancelled,activity_3_O_Cancelled,activity_4_O_Cancelled,activity_5_O_Cancelled
0,Offer_1000681710,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Offer_1001553250,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Offer_1002136393,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Offer_1002236598,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Offer_1005366187,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4991,Offer_99101290,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
4992,Offer_992471522,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,Offer_99473283,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,Offer_995784215,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
one_hot(x, "resource", "index")

Unnamed: 0,caseid,resource_1_User_20,resource_2_User_20,resource_3_User_20,resource_4_User_20,resource_5_User_20,resource_1_User_117,resource_2_User_117,resource_3_User_117,resource_4_User_117,...,resource_1_User_76,resource_2_User_76,resource_3_User_76,resource_4_User_76,resource_5_User_76,resource_1_User_32,resource_2_User_32,resource_3_User_32,resource_4_User_32,resource_5_User_32
0,Offer_1000681710,1,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,Offer_1001553250,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Offer_1002136393,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Offer_1002236598,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,Offer_1005366187,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4991,Offer_99101290,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4992,Offer_992471522,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,Offer_99473283,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,Offer_995784215,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
index_label(x, "ts")

Unnamed: 0,caseid,ts_1,ts_2,ts_3,ts_4,ts_5
0,Offer_1000681710,2016/02/02 18:24:54.170,2016/02/02 18:24:55.833,2016/02/02 18:25:14.889,2016/02/09 22:36:31.755,2016/02/10 00:55:41.368
1,Offer_1001553250,2016/01/16 03:16:41.868,2016/01/16 03:16:43.056,2016/01/16 03:16:58.528,2016/01/22 17:14:16.014,2016/01/26 19:14:10.052
2,Offer_1002136393,2016/01/29 17:46:49.529,2016/01/29 17:46:50.822,2016/01/29 17:47:10.378,2016/02/16 18:46:06.873,2016/02/22 15:42:42.278
3,Offer_1002236598,2016/01/12 18:24:55.557,2016/01/12 18:24:56.826,2016/01/12 18:25:08.174,2016/01/15 19:06:38.165,2016/01/18 22:33:14.113
9,Offer_1005366187,2016/02/01 21:21:51.853,2016/02/01 21:21:53.307,2016/02/01 21:22:06.186,2016/02/11 22:35:14.175,2016/02/15 22:23:50.211
...,...,...,...,...,...,...
4991,Offer_99101290,2016/02/12 23:56:11.915,2016/02/12 23:56:13.312,2016/02/12 23:56:35.610,2016/02/16 18:12:46.353,2016/02/16 18:13:23.726
4992,Offer_992471522,2016/02/18 00:12:18.561,2016/02/18 00:12:19.903,2016/02/18 00:12:37.882,2016/03/01 21:47:21.587,2016/03/22 15:30:26.199
4997,Offer_99473283,2016/02/03 20:59:31.775,2016/02/03 20:59:33.058,2016/02/03 21:03:57.787,2016/02/10 19:35:03.340,2016/02/22 16:13:10.784
4998,Offer_995784215,2016/02/15 17:06:12.610,2016/02/15 17:06:15.203,2016/02/15 17:06:31.030,2016/02/18 20:46:52.744,2016/02/23 20:47:09.974


In [18]:
def index_enc(df):
    res0 = one_hot(df, "activity", "index")
    res1 = one_hot(df, "resource", "index")
    res2 = index_label(df, "ts")
    
    res = pd.merge(res0, pd.merge(res1, res2), on="caseid")
    
    temp = df[["caseid", "outcome"]].copy()
    res = pd.merge(res, temp, on="caseid")

    return res

In [19]:
index_enc(x)

Unnamed: 0,caseid,activity_1_O_Create Offer,activity_2_O_Create Offer,activity_3_O_Create Offer,activity_4_O_Create Offer,activity_5_O_Create Offer,activity_1_O_Created,activity_2_O_Created,activity_3_O_Created,activity_4_O_Created,...,resource_2_User_32,resource_3_User_32,resource_4_User_32,resource_5_User_32,ts_1,ts_2,ts_3,ts_4,ts_5,outcome
0,Offer_1000681710,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/02/02 18:24:54.170,2016/02/02 18:24:55.833,2016/02/02 18:25:14.889,2016/02/09 22:36:31.755,2016/02/10 00:55:41.368,True
1,Offer_1001553250,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/01/16 03:16:41.868,2016/01/16 03:16:43.056,2016/01/16 03:16:58.528,2016/01/22 17:14:16.014,2016/01/26 19:14:10.052,True
2,Offer_1002136393,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/01/29 17:46:49.529,2016/01/29 17:46:50.822,2016/01/29 17:47:10.378,2016/02/16 18:46:06.873,2016/02/22 15:42:42.278,True
3,Offer_1002236598,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/01/12 18:24:55.557,2016/01/12 18:24:56.826,2016/01/12 18:25:08.174,2016/01/15 19:06:38.165,2016/01/18 22:33:14.113,True
4,Offer_1005366187,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/02/01 21:21:51.853,2016/02/01 21:21:53.307,2016/02/01 21:22:06.186,2016/02/11 22:35:14.175,2016/02/15 22:23:50.211,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2542,Offer_99101290,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/02/12 23:56:11.915,2016/02/12 23:56:13.312,2016/02/12 23:56:35.610,2016/02/16 18:12:46.353,2016/02/16 18:13:23.726,True
2543,Offer_992471522,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/02/18 00:12:18.561,2016/02/18 00:12:19.903,2016/02/18 00:12:37.882,2016/03/01 21:47:21.587,2016/03/22 15:30:26.199,True
2544,Offer_99473283,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/02/03 20:59:31.775,2016/02/03 20:59:33.058,2016/02/03 21:03:57.787,2016/02/10 19:35:03.340,2016/02/22 16:13:10.784,True
2545,Offer_995784215,1,0,0,0,0,0,1,0,0,...,0,0,0,0,2016/02/15 17:06:12.610,2016/02/15 17:06:15.203,2016/02/15 17:06:31.030,2016/02/18 20:46:52.744,2016/02/23 20:47:09.974,True


Train a decision tree (DT) and a randomforest (RF) from the scikit-learn package for outcome label prediction for the event log for prefix lengths L=2 and L=5.  Split 70-30 for training-testing (no temporal splitting or cross-validation needed!).

In [20]:
def classification(X, y, classifier, seed):

    X = X.astype(int)
    y = y.astype(int)

    X = np.nan_to_num(X)
    clf = RandomForestClassifier()

    if classifier == "RF":
        clf = RandomForestClassifier(n_estimators=100, oob_score=True)
    elif classifier == "DT":
        clf = DecisionTreeClassifier(max_depth=5)

    print("Training for "+classifier+"...")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    print("Precision: "+str(precision))
    print("Recall: "+str(recall))
    print("F1: "+str(f1))

    return {"precision": precision,
            "recall": recall,
            "f1": f1}

In [21]:
x = create_prefix(df, 2)
x_encoded = one_hot(x, "activity", "aggr")
y = x["outcome"]
columns = x_encoded.columns
columns = x_encoded.columns[1:len(columns)]
classification(x_encoded[columns], y, "RF", 1)

Training for RF...
Precision: 0.43384177777777777
Recall: 0.6586666666666666
F1: 0.5231211146838156


{'precision': 0.43384177777777777,
 'recall': 0.6586666666666666,
 'f1': 0.5231211146838156}

In [22]:
x = create_prefix(df, 5)
x_encoded = one_hot(x, "activity", "aggr")
y = x["outcome"]
columns = x_encoded.columns
columns = x_encoded.columns[1:len(columns)]
classification(x_encoded[columns], y, "RF", 1)

Training for RF...
Precision: 0.648287101367231
Recall: 0.6379084967320261
F1: 0.500590445607946


{'precision': 0.648287101367231,
 'recall': 0.6379084967320261,
 'f1': 0.500590445607946}

In [23]:
x = create_prefix(df, 2)
x_encoded = one_hot(x, "activity", "aggr")
y = x["outcome"]
columns = x_encoded.columns
columns = x_encoded.columns[1:len(columns)]
classification(x_encoded[columns], y, "DT", 1)

Training for DT...
Precision: 0.43384177777777777
Recall: 0.6586666666666666
F1: 0.5231211146838156


{'precision': 0.43384177777777777,
 'recall': 0.6586666666666666,
 'f1': 0.5231211146838156}

In [24]:
x = create_prefix(df, 5)
x_encoded = one_hot(x, "activity", "aggr")
y = x["outcome"]
columns = x_encoded.columns
columns = x_encoded.columns[1:len(columns)]
classification(x_encoded[columns], y, "DT", 1)


Training for DT...
Precision: 0.6490531917641892
Recall: 0.6392156862745098
F1: 0.5058250804673823


{'precision': 0.6490531917641892,
 'recall': 0.6392156862745098,
 'f1': 0.5058250804673823}