In [1]:
import pandas as pd 
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import time
import scipy.stats as stats
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Concatenate, GRU
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import BatchNormalization, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import metrics, regularizers




In [2]:
## import data
df = pd.read_csv('Customer Behavior.csv')
df.head()


Unnamed: 0,Contact,Actor,Action,Timestamp,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,c838ef06-7a17-4519-a6f6-f133439d8abc,Contact Action,EmailOpen,2024-02-14 23:04:02,,,
1,c838ef06-7a17-4519-a6f6-f133439d8abc,Contact Action,EmailOpen,2024-02-14 21:46:12,,,
2,ed464759-6c83-45e1-979a-3f2e8dc171d4,Contact Action,PartialForm,2024-02-14 21:34:40,,,
3,19c0f3be-6424-45d1-84cf-a44dd459da43,Contact Action,EmailOpen,2024-02-14 21:34:39,,,
4,ed464759-6c83-45e1-979a-3f2e8dc171d4,Contact Action,SiteVisit,2024-02-14 21:34:28,,,


In [3]:
## dropping redundant columns
df.drop(columns=['Unnamed: 4',	'Unnamed: 5',	'Unnamed: 6'],inplace=True)
df.head()

Unnamed: 0,Contact,Actor,Action,Timestamp
0,c838ef06-7a17-4519-a6f6-f133439d8abc,Contact Action,EmailOpen,2024-02-14 23:04:02
1,c838ef06-7a17-4519-a6f6-f133439d8abc,Contact Action,EmailOpen,2024-02-14 21:46:12
2,ed464759-6c83-45e1-979a-3f2e8dc171d4,Contact Action,PartialForm,2024-02-14 21:34:40
3,19c0f3be-6424-45d1-84cf-a44dd459da43,Contact Action,EmailOpen,2024-02-14 21:34:39
4,ed464759-6c83-45e1-979a-3f2e8dc171d4,Contact Action,SiteVisit,2024-02-14 21:34:28


In [4]:
## data frame info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10008 entries, 0 to 10007
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Contact    10008 non-null  object
 1   Actor      10008 non-null  object
 2   Action     10008 non-null  object
 3   Timestamp  10008 non-null  object
dtypes: object(4)
memory usage: 312.9+ KB


In [5]:
## contact column info
df['Contact'].value_counts()

Contact
636688d2-742e-4e39-b49f-ca6d5dc666a2    2313
c9e1a988-a85f-4e7b-8fa7-5e3fbd4db0e4    1615
8668e2ff-0b98-41ca-a8cc-49b86afb50b0     926
3b51815e-773f-400f-9a7a-af0fbdcc9ffb     860
6c17c5bb-58c8-450c-88ba-2c21e7606f9b     684
2dd7a997-61d9-43bb-a0f0-d5a7ea3b34c6     535
f953cb48-5e1d-4a2e-9b66-70429858b8cf     434
fbd52aea-921b-4a8e-9bfc-553bb0792ee8     418
19c0f3be-6424-45d1-84cf-a44dd459da43     405
0eae917b-a265-40ba-ba06-a0d4036a4352     375
0c37865e-d6dc-48ed-b877-dd3f683f7c88     300
5f972ff0-0fa8-4d6b-b5d2-2b00423f8ba2     258
f1edf078-c0ef-4699-b420-66ca262b0c51     225
8a055888-3bc8-4963-9183-db787ea83458     198
ed464759-6c83-45e1-979a-3f2e8dc171d4     197
4c2066ba-da74-45d9-97f4-49399b665d64      85
8053f8ac-1263-4691-9878-4165b0e37448      72
853a7b24-e6df-494f-9d7f-a7b7330244ec      39
1d54729f-8b8b-449e-969c-0ad7a2394de4      26
7a49d2c0-003b-4cda-b323-b6979f17fcb8      26
c838ef06-7a17-4519-a6f6-f133439d8abc       9
3c7aaab1-4d91-4eaa-997a-a162ddaafed8       8
Na

In [6]:
## action column info
df['Action'].value_counts()

Action
DynamicBlockView          2116
ADLogin                   1708
ADWorkflowEdit            1623
EmailOpen                 1106
PopupView                  636
Email Log                  630
ADCampaignCreation         459
SiteVisit                  449
SubmittedForm              184
ShortUrlClick              143
EmailLinkClick             134
DynamicBlockClick          126
DynamicImageView           108
Appointment                 82
PhoneCall                   80
ContentCardTargetVisit      65
ADEmailCreation             52
GoalCompletion              40
WebChatMessage              35
ADPhonePurchase             34
PartialForm                 29
ADWebFormCreation           25
ContentCardView             23
ADPageCreation              18
TextMessage                 12
ADSubAccountCreation        12
Live Chat                   12
WonDeal                     10
AppointmentAttendance        8
ADEmployeeCreation           7
OptIn                        7
WebinarRegistration          6
P

In [7]:
## contact column info
df['Contact'].unique()


array(['c838ef06-7a17-4519-a6f6-f133439d8abc',
       'ed464759-6c83-45e1-979a-3f2e8dc171d4',
       '19c0f3be-6424-45d1-84cf-a44dd459da43',
       '8668e2ff-0b98-41ca-a8cc-49b86afb50b0',
       'fbd52aea-921b-4a8e-9bfc-553bb0792ee8',
       '7a49d2c0-003b-4cda-b323-b6979f17fcb8',
       '8a055888-3bc8-4963-9183-db787ea83458',
       '3c7aaab1-4d91-4eaa-997a-a162ddaafed8',
       '853a7b24-e6df-494f-9d7f-a7b7330244ec',
       '2dd7a997-61d9-43bb-a0f0-d5a7ea3b34c6',
       '636688d2-742e-4e39-b49f-ca6d5dc666a2',
       '3b51815e-773f-400f-9a7a-af0fbdcc9ffb',
       '5f972ff0-0fa8-4d6b-b5d2-2b00423f8ba2',
       'c9e1a988-a85f-4e7b-8fa7-5e3fbd4db0e4',
       'f1edf078-c0ef-4699-b420-66ca262b0c51',
       'f953cb48-5e1d-4a2e-9b66-70429858b8cf',
       '0c37865e-d6dc-48ed-b877-dd3f683f7c88',
       '1d54729f-8b8b-449e-969c-0ad7a2394de4',
       '4c2066ba-da74-45d9-97f4-49399b665d64',
       '8053f8ac-1263-4691-9878-4165b0e37448',
       '6c17c5bb-58c8-450c-88ba-2c21e7606f9b',
       '0eae9

In [8]:
## action column length
len(df['Action'].unique())

40

In [9]:
## adding Contact Id column
df['Contact_id']=df['Contact']
df=df[['Contact','Contact_id','Actor','Action','Timestamp']]
values_to_replace = list(df['Contact_id'].unique())
new_values = [i for i in range(1, len(list(df['Contact_id'].unique()))+1)]
df['Contact_id'] = df['Contact_id'].replace(values_to_replace, new_values)

In [10]:
# contact id of the desired contact
df[df['Contact']=='853a7b24-e6df-494f-9d7f-a7b7330244ec'].iloc[0,1]

9

In [11]:
# contact id info
print(df['Contact_id'].value_counts())
print(df['Contact_id'].unique())

Contact_id
11    2313
14    1615
4      926
12     860
21     684
10     535
16     434
5      418
3      405
22     375
17     300
13     258
15     225
7      198
2      197
19      85
20      72
9       39
18      26
6       26
1        9
8        8
Name: count, dtype: int64
[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22]


In [12]:
# removing users with less action than 20 actions
df=df[(df['Contact_id']!= 1) & (df['Contact_id']!= 8)]
print(df['Contact_id'].value_counts())

Contact_id
11    2313
14    1615
4      926
12     860
21     684
10     535
16     434
5      418
3      405
22     375
17     300
13     258
15     225
7      198
2      197
19      85
20      72
9       39
18      26
6       26
Name: count, dtype: int64


In [13]:
# making dummies for columns then turning time to second and sorting by contact id and time
df_1=df.drop(columns ='Contact')
df_1 = pd.get_dummies(df_1, columns=['Actor'], dtype=int)
df_1 = pd.get_dummies(df_1, columns=['Action'], dtype=int)
df_1['Timestamp'] = pd.to_datetime(df_1['Timestamp'])
df_1['Timestamp'] = df_1['Timestamp'].astype('int64') // 10**9
df_1['Timestamp'] = df_1['Timestamp'] - df_1['Timestamp'].min()
df_1=df_1.sort_values(by=['Contact_id', 'Timestamp'], ascending=[True, True])
df_1['Action_WonDeal'].value_counts()

Action_WonDeal
0    9981
1      10
Name: count, dtype: int64

In [14]:
df_1.head(15)

Unnamed: 0,Contact_id,Timestamp,Actor_Contact Action,Actor_System Action,Action_ADAccountPurchase,Action_ADCampaignCreation,Action_ADCloseAccount,Action_ADEmailCreation,Action_ADEmployeeCreation,Action_ADLogin,...,Action_PopupView,Action_Referral,Action_ShortUrlClick,Action_SiteVisit,Action_SubmittedForm,Action_TextMessage,Action_Unsubscribe,Action_WebChatMessage,Action_WebinarRegistration,Action_WonDeal
349,2,116643947,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
345,2,116643950,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
346,2,116643950,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
347,2,116643950,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
348,2,116643950,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
344,2,116643961,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
340,2,116644100,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
341,2,116644100,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
342,2,116644100,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
343,2,116644100,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#getting attribute columns
a=df_1. columns. values. tolist()
#a.remove('Contact_id')
a.remove('Timestamp')
#a.remove('Action_WonDeal')
a

['Contact_id',
 'Actor_Contact Action',
 'Actor_System Action',
 'Action_ADAccountPurchase',
 'Action_ADCampaignCreation',
 'Action_ADCloseAccount',
 'Action_ADEmailCreation',
 'Action_ADEmployeeCreation',
 'Action_ADLogin',
 'Action_ADPageCreation',
 'Action_ADPhonePurchase',
 'Action_ADSubAccountCreation',
 'Action_ADTrainingTask',
 'Action_ADWebFormCreation',
 'Action_ADWorkflowEdit',
 'Action_Appointment',
 'Action_AppointmentAttendance',
 'Action_AssetDownload',
 'Action_ContentCardTargetVisit',
 'Action_ContentCardView',
 'Action_DynamicBlockClick',
 'Action_DynamicBlockView',
 'Action_DynamicImageView',
 'Action_Email Log',
 'Action_EmailError',
 'Action_EmailLinkClick',
 'Action_EmailOpen',
 'Action_GoalCompletion',
 'Action_Live Chat',
 'Action_OptIn',
 'Action_PartialForm',
 'Action_PhoneCall',
 'Action_PopupClick',
 'Action_PopupView',
 'Action_Referral',
 'Action_ShortUrlClick',
 'Action_SiteVisit',
 'Action_SubmittedForm',
 'Action_TextMessage',
 'Action_Unsubscribe',
 'Ac

In [16]:
#number of wins for each contact
for i in df['Contact_id'].unique():
    df_b = df_1[df_1['Contact_id'] == i]
    print(i,df_b['Action_WonDeal'].value_counts().get(1, 0))

2 0
3 0
4 0
5 0
6 0
7 0
9 0
10 0
11 0
12 0
13 2
14 0
15 0
16 0
17 3
18 1
19 1
20 1
21 1
22 1


In [17]:
#train set
trainX = []
trainY = []

n_steps = 20

for i in df['Contact_id'].unique():
    if i == 10 or i == 20 or i ==9 :
        continue 
    for j in range (len(df_1[df_1['Contact_id']==i]) - n_steps):
        trainX.append(df_1[df_1['Contact_id']==i].iloc[j:j+n_steps+1,4:-1])
        if 1 in df_1[df_1['Contact_id']==i].iloc[j:j+n_steps+1,-1].values:
            trainY.append(1)
        else:
            trainY.append(0)
trainX, trainY = np.array(trainX), np.array(trainY)    

In [18]:
#test set
testX = []
testY = []

n_steps = 20

for i in [10,20]:
    for j in range (len(df_1[df_1['Contact_id']==i]) - n_steps):
        testX.append(df_1[df_1['Contact_id']==i].iloc[j:j+n_steps+1,4:-1])
        if 1 in df_1[df_1['Contact_id']==i].iloc[j:j+n_steps+1,-1].values:
            testY.append(1)
        else:
            testY.append(0)
testX, testY = np.array(testX), np.array(testY)


In [19]:
#pred set
pred_x=[]
for j in range (len(df_1[df_1['Contact_id']==9]) - n_steps):
    pred_x.append(df_1[df_1['Contact_id']==9].iloc[j:j+n_steps+1,4:-1])
pred_x = np.array(pred_x)

In [20]:
trainX.shape

(9005, 21, 39)

In [21]:
#defining model archituctures
model = Sequential()
model.add(Bidirectional(LSTM(units=16, activation='relu', kernel_regularizer=regularizers.l2(0.01), return_sequences=True), input_shape=(trainX.shape[1], trainX.shape[2])))
model.add(Bidirectional(LSTM(units=8, activation='relu', kernel_regularizer=regularizers.l2(0.01), return_sequences=False)))
#model.add(Dropout(0.2))
model.add(Dense(4,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 21, 32)            7168      
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 16)                2624      
 onal)                                                           
                                                                 
 dense (Dense)               (None, 4)                 68        
                                                                 
 dense_1 (Dense)             (None, 1)                 5         
                                                                 
Total params: 9865 (38.54 KB)
Trainable params: 9865 (38.54 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
#trianing the model and checking train and test accuracy
training_start = time.perf_counter()
history = model.fit(trainX, trainY, epochs=50, batch_size=64, validation_split=0.1, verbose=1)
training_end = time.perf_counter()

xgb_train_time = training_end-training_start
print("Time consumed for training: %4.3f" % (xgb_train_time))

test_loss, test_accuracy = model.evaluate(testX, testY)

print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Time consumed for training: 175.730
Test Loss: 0.018176672980189323
Test Accuracy: 0.998236358165741


In [23]:
#propbability of winning contact 9
pred = model.predict(pred_x)
print("Contact 9 winning probability is {}".format(pred[0,-1]))

Contact 9 winning probability is 0.0005121948779560626


In [24]:
#getting the most imporatant system action
first_layer_weights = model.layers[0].get_weights()[0]

# Calculate the importance of each input feature
feature_importance = np.abs(first_layer_weights).sum(axis=1)
# Plot the importance of each input feature
input_features = df_1.columns[4:-1]
feature_importance_df = pd.DataFrame({'Feature': input_features, 'Importance': feature_importance})
print(feature_importance_df[feature_importance_df['Feature'].isin(['Action_Email Log','Action_PhoneCall','Action_TextMessage','Action_WebChatMessage'])])
print("Most important system action is Email Log")


                  Feature  Importance
20       Action_Email Log    0.273928
28       Action_PhoneCall    0.125356
35     Action_TextMessage    0.097258
37  Action_WebChatMessage    0.041934
Most important system action is Email Log
