In [3]:
import pandas as pd 
from sklearn.model_selection import train_test_split, cross_val_predict
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import time
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
import numpy as np
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

In [4]:
## import data
df = pd.read_csv('Customer Behavior.csv')
df.head()


Unnamed: 0,Contact,Actor,Action,Timestamp,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,c838ef06-7a17-4519-a6f6-f133439d8abc,Contact Action,EmailOpen,2024-02-14 23:04:02,,,
1,c838ef06-7a17-4519-a6f6-f133439d8abc,Contact Action,EmailOpen,2024-02-14 21:46:12,,,
2,ed464759-6c83-45e1-979a-3f2e8dc171d4,Contact Action,PartialForm,2024-02-14 21:34:40,,,
3,19c0f3be-6424-45d1-84cf-a44dd459da43,Contact Action,EmailOpen,2024-02-14 21:34:39,,,
4,ed464759-6c83-45e1-979a-3f2e8dc171d4,Contact Action,SiteVisit,2024-02-14 21:34:28,,,


In [5]:
## dropping redundant columns
df.drop(columns=['Unnamed: 4',	'Unnamed: 5',	'Unnamed: 6'],inplace=True)
df.head()

Unnamed: 0,Contact,Actor,Action,Timestamp
0,c838ef06-7a17-4519-a6f6-f133439d8abc,Contact Action,EmailOpen,2024-02-14 23:04:02
1,c838ef06-7a17-4519-a6f6-f133439d8abc,Contact Action,EmailOpen,2024-02-14 21:46:12
2,ed464759-6c83-45e1-979a-3f2e8dc171d4,Contact Action,PartialForm,2024-02-14 21:34:40
3,19c0f3be-6424-45d1-84cf-a44dd459da43,Contact Action,EmailOpen,2024-02-14 21:34:39
4,ed464759-6c83-45e1-979a-3f2e8dc171d4,Contact Action,SiteVisit,2024-02-14 21:34:28


In [6]:
## data frame info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10008 entries, 0 to 10007
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Contact    10008 non-null  object
 1   Actor      10008 non-null  object
 2   Action     10008 non-null  object
 3   Timestamp  10008 non-null  object
dtypes: object(4)
memory usage: 312.9+ KB


In [7]:
## contact column info
df['Contact'].value_counts()

Contact
636688d2-742e-4e39-b49f-ca6d5dc666a2    2313
c9e1a988-a85f-4e7b-8fa7-5e3fbd4db0e4    1615
8668e2ff-0b98-41ca-a8cc-49b86afb50b0     926
3b51815e-773f-400f-9a7a-af0fbdcc9ffb     860
6c17c5bb-58c8-450c-88ba-2c21e7606f9b     684
2dd7a997-61d9-43bb-a0f0-d5a7ea3b34c6     535
f953cb48-5e1d-4a2e-9b66-70429858b8cf     434
fbd52aea-921b-4a8e-9bfc-553bb0792ee8     418
19c0f3be-6424-45d1-84cf-a44dd459da43     405
0eae917b-a265-40ba-ba06-a0d4036a4352     375
0c37865e-d6dc-48ed-b877-dd3f683f7c88     300
5f972ff0-0fa8-4d6b-b5d2-2b00423f8ba2     258
f1edf078-c0ef-4699-b420-66ca262b0c51     225
8a055888-3bc8-4963-9183-db787ea83458     198
ed464759-6c83-45e1-979a-3f2e8dc171d4     197
4c2066ba-da74-45d9-97f4-49399b665d64      85
8053f8ac-1263-4691-9878-4165b0e37448      72
853a7b24-e6df-494f-9d7f-a7b7330244ec      39
1d54729f-8b8b-449e-969c-0ad7a2394de4      26
7a49d2c0-003b-4cda-b323-b6979f17fcb8      26
c838ef06-7a17-4519-a6f6-f133439d8abc       9
3c7aaab1-4d91-4eaa-997a-a162ddaafed8       8
Na

In [8]:
## action column info
df['Action'].value_counts()

Action
DynamicBlockView          2116
ADLogin                   1708
ADWorkflowEdit            1623
EmailOpen                 1106
PopupView                  636
Email Log                  630
ADCampaignCreation         459
SiteVisit                  449
SubmittedForm              184
ShortUrlClick              143
EmailLinkClick             134
DynamicBlockClick          126
DynamicImageView           108
Appointment                 82
PhoneCall                   80
ContentCardTargetVisit      65
ADEmailCreation             52
GoalCompletion              40
WebChatMessage              35
ADPhonePurchase             34
PartialForm                 29
ADWebFormCreation           25
ContentCardView             23
ADPageCreation              18
TextMessage                 12
ADSubAccountCreation        12
Live Chat                   12
WonDeal                     10
AppointmentAttendance        8
ADEmployeeCreation           7
OptIn                        7
WebinarRegistration          6
P

In [9]:
## contact column info
df['Contact'].unique()


array(['c838ef06-7a17-4519-a6f6-f133439d8abc',
       'ed464759-6c83-45e1-979a-3f2e8dc171d4',
       '19c0f3be-6424-45d1-84cf-a44dd459da43',
       '8668e2ff-0b98-41ca-a8cc-49b86afb50b0',
       'fbd52aea-921b-4a8e-9bfc-553bb0792ee8',
       '7a49d2c0-003b-4cda-b323-b6979f17fcb8',
       '8a055888-3bc8-4963-9183-db787ea83458',
       '3c7aaab1-4d91-4eaa-997a-a162ddaafed8',
       '853a7b24-e6df-494f-9d7f-a7b7330244ec',
       '2dd7a997-61d9-43bb-a0f0-d5a7ea3b34c6',
       '636688d2-742e-4e39-b49f-ca6d5dc666a2',
       '3b51815e-773f-400f-9a7a-af0fbdcc9ffb',
       '5f972ff0-0fa8-4d6b-b5d2-2b00423f8ba2',
       'c9e1a988-a85f-4e7b-8fa7-5e3fbd4db0e4',
       'f1edf078-c0ef-4699-b420-66ca262b0c51',
       'f953cb48-5e1d-4a2e-9b66-70429858b8cf',
       '0c37865e-d6dc-48ed-b877-dd3f683f7c88',
       '1d54729f-8b8b-449e-969c-0ad7a2394de4',
       '4c2066ba-da74-45d9-97f4-49399b665d64',
       '8053f8ac-1263-4691-9878-4165b0e37448',
       '6c17c5bb-58c8-450c-88ba-2c21e7606f9b',
       '0eae9

In [10]:
## action column length
len(df['Action'].unique())

40

In [11]:
## adding Contact Id column
df['Contact_id']=df['Contact']
df=df[['Contact','Contact_id','Actor','Action','Timestamp']]
values_to_replace = list(df['Contact_id'].unique())
new_values = [i for i in range(1, len(list(df['Contact_id'].unique()))+1)]
df['Contact_id'] = df['Contact_id'].replace(values_to_replace, new_values)

In [12]:
# contact id of the wanted contact
#df[df['Contact']=='853a7b24-e6df-494f-9d7f-a7b7330244ec']
print('The desired contact id number is {}'.format(df[df['Contact']=='853a7b24-e6df-494f-9d7f-a7b7330244ec'].iloc[0,1]))

The desired contact id number is 9


In [13]:
# contact id info
df['Contact_id'].unique()


array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22], dtype=int64)

In [14]:
# making dummies for columns then turning time to second and sorting by contact id and time
df_1=df.drop(columns ='Contact')
df_1 = pd.get_dummies(df_1, columns=['Actor'], dtype=int)
df_1 = pd.get_dummies(df_1, columns=['Action'], dtype=int)
df_1['Timestamp'] = pd.to_datetime(df_1['Timestamp'])
df_1['Timestamp'] = df_1['Timestamp'].astype('int64') // 10**9
df_1['Timestamp'] = df_1['Timestamp'] - df_1['Timestamp'].min()
df_1=df_1.sort_values(by=['Contact_id', 'Timestamp'], ascending=[True, True])
df_1.head(15)
df_1['Action_WonDeal'].value_counts()

Action_WonDeal
0    9998
1      10
Name: count, dtype: int64

In [15]:
#getting attribute columns
a=df_1. columns. values. tolist()
a.remove('Contact_id')
a.remove('Timestamp')
#a.remove('Action_WonDeal')
a

['Actor_Contact Action',
 'Actor_System Action',
 'Action_ADAccountPurchase',
 'Action_ADCampaignCreation',
 'Action_ADCloseAccount',
 'Action_ADEmailCreation',
 'Action_ADEmployeeCreation',
 'Action_ADLogin',
 'Action_ADPageCreation',
 'Action_ADPhonePurchase',
 'Action_ADSubAccountCreation',
 'Action_ADTrainingTask',
 'Action_ADWebFormCreation',
 'Action_ADWorkflowEdit',
 'Action_Appointment',
 'Action_AppointmentAttendance',
 'Action_AssetDownload',
 'Action_ContentCardTargetVisit',
 'Action_ContentCardView',
 'Action_DynamicBlockClick',
 'Action_DynamicBlockView',
 'Action_DynamicImageView',
 'Action_Email Log',
 'Action_EmailError',
 'Action_EmailLinkClick',
 'Action_EmailOpen',
 'Action_GoalCompletion',
 'Action_Live Chat',
 'Action_OptIn',
 'Action_PartialForm',
 'Action_PhoneCall',
 'Action_PopupClick',
 'Action_PopupView',
 'Action_Referral',
 'Action_ShortUrlClick',
 'Action_SiteVisit',
 'Action_SubmittedForm',
 'Action_TextMessage',
 'Action_Unsubscribe',
 'Action_WebChatMes

In [16]:
#number of wins for each contact
for i in range(1,23):
    df_b = df_1[df_1['Contact_id'] == i]
    print(i,df_b['Action_WonDeal'].value_counts().get(1, 0))

1 0
2 0
3 0
4 0
5 0
6 0
7 0
8 0
9 0
10 0
11 0
12 0
13 2
14 0
15 0
16 0
17 3
18 1
19 1
20 1
21 1
22 1


In [17]:
# getting the cumulative dataset for XGBoost


df_x= pd.DataFrame()

for i in range(1,23):

    df_a = df_1[df_1['Contact_id'] == i]
    df_a.reset_index(inplace=True)
    if i == 9:
        continue


    if df_a['Action_WonDeal'].value_counts().get(1, 0) == 0:
        df_x = pd.concat([df_x,df_a[a].cumsum()])

    if df_a['Action_WonDeal'].value_counts().get(1, 0) == 1:
        df_b = df_a[a].cumsum()
        w_d=len(df_b)-2
        df_b = df_b.drop(df_b.index[w_d])
        df_x = pd.concat([df_x,df_b])
    if df_a['Action_WonDeal'].value_counts().get(1, 0) > 1:
        df_b= pd.DataFrame()
        row_number = df_a.index[df_a['Action_WonDeal'] == 1]
        df_c = df_a[a].iloc[:row_number[0]+1].cumsum()
        w_d=len(df_c)-2
        df_c = df_c.drop(df_c.index[w_d])
        df_b = pd.concat([df_b,df_c])
        df_c = df_a[a].iloc[row_number[0]+1:row_number[1]+1].cumsum()
        w_d=len(df_c)-2
        df_c = df_c.drop(df_c.index[w_d])
        df_b = pd.concat([df_b,df_c])
        df_x = pd.concat([df_x,df_b])

df_x['Action_WonDeal'].value_counts()
df_x


Unnamed: 0,Actor_Contact Action,Actor_System Action,Action_ADAccountPurchase,Action_ADCampaignCreation,Action_ADCloseAccount,Action_ADEmailCreation,Action_ADEmployeeCreation,Action_ADLogin,Action_ADPageCreation,Action_ADPhonePurchase,...,Action_PopupView,Action_Referral,Action_ShortUrlClick,Action_SiteVisit,Action_SubmittedForm,Action_TextMessage,Action_Unsubscribe,Action_WebChatMessage,Action_WebinarRegistration,Action_WonDeal
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,360,10,0,0,0,0,1,18,0,0,...,42,0,9,11,7,1,0,0,0,0
370,361,10,0,0,0,0,1,18,0,0,...,42,0,9,11,7,1,0,0,0,0
371,362,10,0,0,0,0,1,18,0,0,...,42,0,9,11,7,1,0,0,0,0
372,363,10,1,0,0,0,1,18,0,0,...,42,0,9,11,7,1,0,0,0,0


In [18]:
# contact 9 input array
b=a
b.remove('Action_WonDeal')
df_a = df_1[df_1['Contact_id'] == 9]
df_a.reset_index(inplace=True)
df_c = df_a[b].cumsum()
pred_x=df_c.iloc[[-1]]
pred_x

Unnamed: 0,Actor_Contact Action,Actor_System Action,Action_ADAccountPurchase,Action_ADCampaignCreation,Action_ADCloseAccount,Action_ADEmailCreation,Action_ADEmployeeCreation,Action_ADLogin,Action_ADPageCreation,Action_ADPhonePurchase,...,Action_PopupClick,Action_PopupView,Action_Referral,Action_ShortUrlClick,Action_SiteVisit,Action_SubmittedForm,Action_TextMessage,Action_Unsubscribe,Action_WebChatMessage,Action_WebinarRegistration
38,36,3,0,0,0,0,0,0,0,0,...,0,2,0,3,4,2,0,0,0,0


In [19]:
#defining X and Y
Y = df_x['Action_WonDeal']
X = df_x.drop(columns='Action_WonDeal')
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=5)


In [20]:

y_train.value_counts()

Action_WonDeal
0    7958
1       9
Name: count, dtype: int64

In [21]:
# training the XGBoost model
xgb = XGBClassifier(n_estimators=100)
training_start = time.perf_counter()
xgb.fit(X_train, y_train)
training_end = time.perf_counter()


In [22]:
# predicting test set and geting accuracy
prediction_start = time.perf_counter()
preds = xgb.predict(X_test)
prediction_end = time.perf_counter()

preds_2=xgb.predict(X_train)
train_accuracy = accuracy_score(y_train, preds_2)
print("Model train accuracy is {}".format(train_accuracy))

preds_1=xgb.predict(X_test)
test_accuracy = accuracy_score(y_test, preds_1)
print("Model test accuracy is {}".format(test_accuracy))

xgb_train_time = training_end-training_start
xgb_prediction_time = prediction_end-prediction_start
print("Time consumed for training: %4.3f" % (xgb_train_time))
print("Time consumed for prediction: %6.5f seconds" % (xgb_prediction_time))

Model train accuracy is 0.9996234467177105
Model test accuracy is 1.0
Time consumed for training: 0.550
Time consumed for prediction: 0.03523 seconds


In [23]:
# getting contact 9 winning peobability
main_pred=xgb.predict_proba(pred_x)
print("Contact 9 winning probability is {}".format(main_pred[0,1]))

Contact 9 winning probability is 8.755650924285874e-06


In [24]:
#finding the msot imprtant system action
importance_scores = xgb.feature_importances_

# Get the names of the features
feature_names = X_train.columns  # Assuming X_train is a DataFrame with named columns
df = pd.DataFrame({'Feature': feature_names, 'Importance': importance_scores})
print(df[df['Feature'].isin(['Action_Email Log','Action_PhoneCall','Action_TextMessage','Action_WebChatMessage'])])
print("Most important system action is Phone Call")

                  Feature  Importance
22       Action_Email Log    0.028842
30       Action_PhoneCall    0.029793
37     Action_TextMessage    0.000000
39  Action_WebChatMessage    0.000000
Most important system action is Phone Call
