In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

In [2]:
df_final_web_data_cc = pd.read_csv('cleaned_data/df_final_web_data_cc_clean.csv')
df_final_web_data_cc['process_step'].value_counts()

process_step
start      202439
step_1     135733
step_2     110366
step_3      92714
confirm     85631
Name: count, dtype: int64

In [3]:
#check if client_id always has the same visitor_id:

# Group by 'client_id' and calculate the number of unique 'visitor_id' values
unique_visitor_counts = df_final_web_data_cc.groupby('client_id')['visitor_id'].nunique()

# Check if every 'client_id' has only 1 unique 'visitor_id'
consistent_mapping = unique_visitor_counts == 1

# Print any inconsistencies
if consistent_mapping.all():
    print("All client_ids always map to the same visitor_id.")
else:
    inconsistent_clients = unique_visitor_counts[unique_visitor_counts > 1]
    print("The following client_ids map to multiple visitor_ids:")
    print(inconsistent_clients)


#7800 client ids with multiple visitor ids, visitor_id = device?!

The following client_ids map to multiple visitor_ids:
client_id
1643       2
1680       2
4653       2
6130       3
7367       2
          ..
9996088    2
9996404    2
9997470    2
9998342    2
9999729    3
Name: visitor_id, Length: 7841, dtype: int64


In [12]:
#load info about grouping (test vs. control) of clients
df_experiment_clients = pd.read_csv('cleaned_data/df_final_experiment_clients_clean.csv')
df_experiment_clients
#merge info about grouping (test vs. control) of clients into web experiment data
df_web_ex = df_final_web_data_cc.merge(df_experiment_clients, on='client_id', how='left')
df_web_ex['Variation'].value_counts()

Variation
Test       177847
Control    143462
Name: count, dtype: int64

In [17]:
df_web_ex.info() #626.883 entries
df_web_ex['visitor_id'].nunique() #108.216

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 626883 entries, 0 to 626882
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   client_id     626883 non-null  int64 
 1   visitor_id    626883 non-null  object
 2   visit_id      626883 non-null  object
 3   process_step  626883 non-null  object
 4   date_time     626883 non-null  object
 5   Variation     321309 non-null  object
dtypes: int64(1), object(5)
memory usage: 28.7+ MB


108216

In [19]:
# Step 1: Group by 'visitor_id'
grouped = df_web_ex.groupby('visitor_id')

# Step 2: Find groups where 'Variation' has more than one unique value
ambiguous_visitors = grouped.filter(lambda x: x['Variation'].nunique() > 1)

# Step 3: Get unique visitor IDs from confusing groups
ambiguous_visitor_ids = ambiguous_visitors['visitor_id'].unique()

# Output the result
print(f"Confusing visitor_id groups with more than one unique 'Variation' value: {len(ambiguous_visitor_ids)}")
print(f"List of ambiguous visitor group IDs: {ambiguous_visitor_ids}")

Confusing visitor_id groups with more than one unique 'Variation' value: 185
List of ambiguous visitor group IDs: ['2304489_84317775004' '604581053_81626830443' '173063648_31762200101'
 '268059948_82733858999' '123678861_93517709373' '454881699_42713236223'
 '580100142_70760670940' '579276105_69624603060' '571763490_44880124691'
 '367561835_85025056674' '666714263_61824030337' '214050749_58021197086'
 '253357465_8352028794' '486653995_31818439461' '96089498_16066779499'
 '199312490_38692028695' '686457792_50130641082' '60396051_9457024068'
 '738569748_4014232718' '423015939_19738622602' '571853589_19816269571'
 '652206951_94195881749' '952443163_11112691478' '26252461_90603995066'
 '198039515_32206414676' '280197097_2758137299' '824327020_10372883807'
 '435478449_41974372268' '105986634_15115236135' '266717833_98903612091'
 '869851118_61172024661' '531755_55417899190' '358981391_35001155514'
 '876787826_20478980103' '989197874_24115340213' '667315159_68457727652'
 '67620593_47154145788

In [91]:
df_web_ex[df_web_ex['visitor_id'] == '454881699_42713236223']

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation
3667,7460349,454881699_42713236223,399371316_39189899496_559976,step_1,2017-04-05 16:14:01,Control
3668,7460349,454881699_42713236223,399371316_39189899496_559976,step_1,2017-04-05 16:13:30,Control
3669,7460349,454881699_42713236223,399371316_39189899496_559976,step_1,2017-04-05 16:12:55,Control
3670,7460349,454881699_42713236223,399371316_39189899496_559976,start,2017-04-05 16:12:36,Control
195402,7208557,454881699_42713236223,399371316_39189899496_559976,start,2017-04-05 16:19:36,Test
195403,7208557,454881699_42713236223,399371316_39189899496_559976,step_2,2017-04-05 16:15:33,Test
195404,7208557,454881699_42713236223,399371316_39189899496_559976,step_1,2017-04-05 16:15:03,Test
195405,7208557,454881699_42713236223,399371316_39189899496_559976,start,2017-04-05 16:14:55,Test
480674,7208557,454881699_42713236223,430342294_16750074009_383709,start,2017-05-01 18:24:00,Test
480675,7208557,454881699_42713236223,430342294_16750074009_383709,step_2,2017-05-01 18:20:31,Test


In [20]:
#getting rid of the clients with the ambiguous visitor_ids
#from 185 visitor_ids that have more than one variation take the client ids into a drop list
client_ids_to_drop = []
for visitor_id in ambiguous_visitor_ids:
    for i in range(len(df_web_ex[df_web_ex['visitor_id'] == f'{visitor_id}']['client_id'].unique())):
        client_ids_to_drop.append(df_web_ex[df_web_ex['visitor_id'] == f'{visitor_id}']['client_id'].unique()[i])
client_ids_to_drop
#372 client_ids to drop           

[9207047,
 765501,
 3155140,
 8633436,
 1428836,
 2842705,
 1344819,
 2683949,
 7460349,
 7208557,
 7460349,
 7208557,
 7303592,
 2648176,
 1144410,
 8883167,
 1663358,
 6167838,
 5856806,
 6243085,
 6226635,
 9927319,
 8889840,
 4514081,
 4054538,
 9458283,
 3742706,
 7263987,
 6351678,
 8626474,
 6351678,
 8626474,
 6731654,
 1813291,
 9152344,
 6635872,
 730714,
 9031268,
 4473543,
 461591,
 9941169,
 1932870,
 8146476,
 8471730,
 4345628,
 3602878,
 3423009,
 8563144,
 1646603,
 9379616,
 3573864,
 2457672,
 553978,
 9969431,
 4675712,
 187837,
 9997125,
 4262428,
 9205347,
 1921173,
 5250597,
 89039,
 8632835,
 7726374,
 1003184,
 3980473,
 7367695,
 2483737,
 2221946,
 3517625,
 6848944,
 9560313,
 7546162,
 823619,
 6543689,
 1555759,
 6233209,
 2982738,
 9366159,
 4404931,
 4408117,
 1995384,
 9583832,
 2762642,
 2102149,
 6775330,
 9844262,
 7419662,
 826682,
 4315157,
 2772609,
 4524801,
 7126892,
 8811666,
 1794555,
 8278713,
 4874181,
 1517855,
 1838227,
 8408236,
 5224523,

In [7]:
# Drop rows where client_id is in client_ids_to_drop
df_web_ex = df_web_ex[~df_web_ex['client_id'].isin(client_ids_to_drop)]
df_web_ex.info() #624.446 entries, before 626.883
df_web_ex['visitor_id'].nunique() #107.993 before 108216

NameError: name 'df_web_ex' is not defined

In [109]:
df_web_exp.to_csv('df_web_ex_cleaned_part2_visitor_ids.csv', index=False)

In [29]:
#der eine special fall der gleiche visitor id gleiche visit id 2 client ids 2 erfolgreiche prozesse
# und auch der fall von start auf confirm:
df_web_ex[df_web_ex['visit_id'] == '200827274_88708804706_343871']

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation
38668,507290,939712117_71587565939,200827274_88708804706_343871,confirm,2017-04-08 15:24:25,Test
38669,507290,939712117_71587565939,200827274_88708804706_343871,step_3,2017-04-08 15:23:21,Test
38670,507290,939712117_71587565939,200827274_88708804706_343871,step_2,2017-04-08 15:22:33,Test
38671,507290,939712117_71587565939,200827274_88708804706_343871,step_3,2017-04-08 15:22:04,Test
38672,507290,939712117_71587565939,200827274_88708804706_343871,step_2,2017-04-08 15:21:50,Test
38673,507290,939712117_71587565939,200827274_88708804706_343871,step_3,2017-04-08 15:17:57,Test
38674,507290,939712117_71587565939,200827274_88708804706_343871,step_2,2017-04-08 15:14:22,Test
38675,507290,939712117_71587565939,200827274_88708804706_343871,step_1,2017-04-08 15:13:44,Test
38676,507290,939712117_71587565939,200827274_88708804706_343871,start,2017-04-08 15:12:25,Test
178870,7116423,939712117_71587565939,200827274_88708804706_343871,step_1,2017-04-08 15:49:48,Test
