In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the four raw extracts. "," is already read_csv's default separator,
# so it is not passed explicitly.
# df_demo is loaded here but is not referenced by the cells visible below.
df_demo = pd.read_csv("data/raw/df_final_demo.txt")
df_web_pt1 = pd.read_csv("data/raw/df_final_web_data_pt_1.txt")
df_web_pt2 = pd.read_csv("data/raw/df_final_web_data_pt_2.txt")
df_exp = pd.read_csv("data/raw/df_final_experiment_clients.txt")

In [3]:
# Stack the two web-event extracts row-wise (concat's default axis) and
# renumber the rows 0..n-1 so the index is unique across both parts.
df_web = pd.concat((df_web_pt1, df_web_pt2), ignore_index=True)
df_web.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


In [4]:
# Preview the experiment assignment table (one Variation label per client_id row).
df_exp.head(5)

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


### KPI 1: Completion Rate
A visit is considered completed if it contains at least one "confirm" step.
Completion rate is calculated at the visit level.

In [5]:
# Attach each client's experiment arm (Variation) to their web events.
#
# - drop(..., errors="ignore") keeps the cell idempotent: df_web is overwritten
#   with its own merge result, so without the drop a second run would merge again
#   and produce Variation_x / Variation_y instead of the Variation column that
#   every later groupby relies on.
# - validate="many_to_one" raises pandas.errors.MergeError if a client_id occurs
#   more than once in df_exp; such duplicates would otherwise silently multiply
#   that client's events and bias every KPI below.
# - how="inner" keeps only events whose client_id is present in df_exp.
df_web = (
    df_web
    .drop(columns="Variation", errors="ignore")
    .merge(
        df_exp[["client_id", "Variation"]],
        on="client_id",
        how="inner",
        validate="many_to_one",
    )
)

In [6]:
# One row per (visit_id, Variation) with a boolean `completed` flag that is True
# when the visit contains at least one "confirm" event.
# The flag is computed once, vectorised over all rows, and reduced with the
# built-in groupby `any()` instead of calling a Python lambda for every visit.
visit_completion = (
    df_web
    .assign(completed=lambda events: events["process_step"].eq("confirm"))
    .groupby(["visit_id", "Variation"])["completed"]
    .any()
    .reset_index()
)
visit_completion

Unnamed: 0,visit_id,Variation,completed
0,100012776_37918976071_457913,Test,True
1,100019538_17884295066_43909,Test,True
2,100022086_87870757897_149620,Test,True
3,100030127_47967100085_936361,Control,False
4,100037962_47432393712_705583,Control,False
...,...,...,...
69320,999971096_28827267783_236076,Test,True
69321,999976049_95772503197_182554,Test,True
69322,999984454_18731538378_781808,Test,True
69323,999985675_64610694964_443659,Control,True


In [7]:
# Share of completed visits per arm: the mean of a boolean column is the
# fraction of True values. as_index=False keeps Variation as a regular column.
completion_rate = visit_completion.groupby("Variation", as_index=False)["completed"].mean()

completion_rate

Unnamed: 0,Variation,completed
0,Control,0.498493
1,Test,0.585173


The Test variation shows a higher completion rate compared to the Control group, suggesting that the new design may encourage more users to complete the process.

### KPI 2: Time Spent on Each Step

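Time per step is measured as the time elapsed since the previous action within the same visit, and is attributed to the step the user arrives at. Each value is therefore the time taken to reach a step (i.e., the time spent on the preceding action); the first action of every visit has no value.
This metric helps evaluate whether the new design enables users to progress through the process more efficiently.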

In [8]:
# Parse the event timestamps into datetime64 so they can be sorted and subtracted.
df_web = df_web.assign(date_time=pd.to_datetime(df_web["date_time"]))

In [9]:
# Order events chronologically within each visit; the per-visit diffs computed
# in later cells depend on this ordering.
# NOTE(review): the displayed timestamps have one-second resolution, so events of
# one visit that share a timestamp keep whatever relative order they had in the
# raw frame — confirm this cannot create spurious backward steps.
df_web_sorted = df_web.sort_values(by=["visit_id", "date_time"])

In [10]:
# Seconds elapsed since the previous event of the same visit
# (current timestamp - previous timestamp, converted from Timedelta to float).
# The first event of every visit has no predecessor and gets NaN.
# The gap is stored on the row of the step the user arrived at, i.e. it is the
# time taken to reach that step, not the time subsequently spent on it.
df_web_sorted = df_web_sorted.assign(
    time_diff_sec=lambda events: (
        events.groupby("visit_id")["date_time"].diff().dt.total_seconds()
    )
)

In [11]:
# Spot-check the first ten sorted events and their gap to the previous event.
df_web_sorted.head(10)[["visit_id", "process_step", "time_diff_sec"]]

Unnamed: 0,visit_id,process_step,time_diff_sec
106863,100012776_37918976071_457913,confirm,
106862,100012776_37918976071_457913,confirm,52.0
236564,100019538_17884295066_43909,start,
236563,100019538_17884295066_43909,step_1,16.0
236562,100019538_17884295066_43909,step_2,9.0
236561,100019538_17884295066_43909,step_1,14.0
236560,100019538_17884295066_43909,step_1,6.0
236559,100019538_17884295066_43909,start,4.0
236558,100019538_17884295066_43909,start,14.0
236557,100019538_17884295066_43909,step_1,5.0


In [12]:
# Mean gap (seconds) per arm and step. mean() skips the NaN rows, so the first
# event of each visit does not contribute.
avg_time_per_step = df_web_sorted.groupby(
    ["Variation", "process_step"], as_index=False
)["time_diff_sec"].mean()

avg_time_per_step

Unnamed: 0,Variation,process_step,time_diff_sec
0,Control,confirm,128.664715
1,Control,start,154.384378
2,Control,step_1,42.998741
3,Control,step_2,38.667857
4,Control,step_3,92.944258
5,Test,confirm,128.881395
6,Test,start,148.549196
7,Test,step_1,37.678682
8,Test,step_2,48.131611
9,Test,step_3,96.850972


Each interval is attributed to the step the user arrives at, so these averages measure the time taken to reach a step. Overall, they are similar between the Control and Test variations for most steps, with the largest gap at step_2.
The Test group reaches step_1 slightly faster (37.7 s vs. 43.0 s), but takes longer to reach step_2 (48.1 s vs. 38.7 s) and step_3 (96.9 s vs. 92.9 s).
This suggests that the new design does not reduce time across all steps, but may alter how users allocate time within the process.

### KPI 3: Error Rate

An error is defined as a backward movement in the process flow, where a user moves from a later step to an earlier step within the same visit.
This metric helps identify potential confusion or friction in the user journey.

In [13]:
# Position of each step in the intended flow (0 = first, 4 = last).
step_order = {
    step: position
    for position, step in enumerate(["start", "step_1", "step_2", "step_3", "confirm"])
}

# Translate step names to their position; any name missing from step_order maps to NaN.
df_web_sorted = df_web_sorted.assign(
    step_num=df_web_sorted["process_step"].map(step_order)
)

In [14]:
# Change in step position relative to the previous event of the same visit:
#   step_diff > 0 → moved forward (normal)
#   step_diff = 0 → repeated the same step
#   step_diff < 0 → moved backward (counted as an error)
# The first event of each visit has nothing to compare against and gets NaN.
df_web_sorted = df_web_sorted.assign(
    step_diff=lambda events: events.groupby("visit_id")["step_num"].diff()
)

In [15]:
# Flag backward moves. NaN (first event of a visit) compares as False, so it is
# never counted as an error.
df_web_sorted["error"] = df_web_sorted["step_diff"].lt(0)

In [16]:
# Collapse to one row per (visit_id, Variation): True when the visit contains
# at least one backward move.
visit_errors = (
    df_web_sorted
    .groupby(["visit_id", "Variation"])["error"]
    .agg("any")
    .reset_index()
)
visit_errors

Unnamed: 0,visit_id,Variation,error
0,100012776_37918976071_457913,Test,False
1,100019538_17884295066_43909,Test,True
2,100022086_87870757897_149620,Test,False
3,100030127_47967100085_936361,Control,False
4,100037962_47432393712_705583,Control,True
...,...,...,...
69320,999971096_28827267783_236076,Test,False
69321,999976049_95772503197_182554,Test,False
69322,999984454_18731538378_781808,Test,False
69323,999985675_64610694964_443659,Control,False


In [17]:
# Share of visits per arm that contain at least one backward move
# (mean of the boolean per-visit flag).
error_rate = visit_errors.groupby("Variation", as_index=False)["error"].mean()

error_rate

Unnamed: 0,Variation,error
0,Control,0.207245
1,Test,0.271462


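The Test variation shows a higher error rate than the Control group: 27.1% of Test visits contain at least one backward step, compared with 20.7% of Control visits.
This suggests that the new design does not reduce user confusion; Test users move backward in the process more often, which should be investigated alongside the higher completion rate.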