In [7]:
import matplotlib.pyplot as plt
import numpy as np

In [8]:
from function import (
    load_and_concat_data,
    compute_completion_rate_by_variation,
    compute_avg_time_per_step_by_variation,
    compute_error_rate_by_variation
)

In [9]:
# Load the raw inputs through the project helper: one demo file, two web-data
# parts and the experiment client list. The three returned objects are bound as
# df_demo, df_web and df_exp.
# NOTE(review): presumably the two web-data parts are concatenated into df_web
# - confirm in function.py.
df_demo, df_web, df_exp = load_and_concat_data(
    "../data/raw/df_final_demo.txt",
    "../data/raw/df_final_web_data_pt_1.txt",
    "../data/raw/df_final_web_data_pt_2.txt",
    "../data/raw/df_final_experiment_clients.txt"
)

In [10]:
# KPI 1: completion rate per Variation (one row each for Control and Test),
# computed by the project helper from the web events and experiment assignment.
kpi1 = compute_completion_rate_by_variation(df_web, df_exp)
kpi1

Unnamed: 0,Variation,completed
0,Control,0.498493
1,Test,0.585173


In [11]:
# KPI 2: time per process step for each Variation, returned as one row per
# (Variation, process_step) with a time_diff_sec column.
# NOTE(review): presumably a mean in seconds, going by the helper name - confirm
# in function.py.
kpi2 = compute_avg_time_per_step_by_variation(df_web, df_exp)
kpi2

Unnamed: 0,Variation,process_step,time_diff_sec
0,Control,confirm,128.460914
1,Control,start,154.883131
2,Control,step_1,42.998741
3,Control,step_2,38.667857
4,Control,step_3,92.944258
5,Test,confirm,129.175744
6,Test,start,148.94289
7,Test,step_1,37.690185
8,Test,step_2,48.131611
9,Test,step_3,96.850972


In [12]:
# KPI 3: error rate per Variation (one row each for Control and Test).
# NOTE(review): the helper's definition of "error" is not visible here - confirm
# it matches the backward-step definition used in the hypothesis test below.
kpi3 = compute_error_rate_by_variation(df_web, df_exp)
kpi3

Unnamed: 0,Variation,error
0,Control,0.207245
1,Test,0.271462


Hypothesis Test 1: Completion Rate (Chi-square)
H₀: Completion is independent of experiment variation (the completion rates are the same in Test and Control).
H₁: Completion depends on experiment variation (the completion rates differ between Test and Control).

In [13]:
from scipy.stats import chi2_contingency

# Attach each web event to its experiment arm. The inner join keeps only clients
# present in df_exp; rows whose Variation is missing are then dropped.
df_web_var = (
    df_web
    .merge(df_exp[["client_id", "Variation"]], on="client_id", how="inner")
    .dropna(subset=["Variation"])
)

# One row per (visit, arm): a visit is completed when any of its events is the
# "confirm" step. The flag is computed vectorised and reduced with the built-in
# groupby().any() instead of a Python lambda evaluated once per visit.
visit_completion = (
    df_web_var
    .assign(completed=lambda events: events["process_step"].eq("confirm"))
    .groupby(["visit_id", "Variation"])["completed"]
    .any()
    .reset_index()
)

# Per arm: number of completed visits ("sum") and total visits ("count").
tab = visit_completion.groupby("Variation")["completed"].agg(["sum", "count"])

a = tab.loc["Test", "sum"]  # Test, completed
b = tab.loc["Test", "count"] - a  # Test, not completed
c = tab.loc["Control", "sum"]  # Control, completed
d = tab.loc["Control", "count"] - c  # Control, not completed

# 2x2 contingency table: rows = (Test, Control), columns = (completed, not completed).
table = np.array([[a, b],
                  [c, d]])

# correction=False disables the Yates continuity correction.
chi2, p, dof, expected = chi2_contingency(table, correction=False)

chi2, p

(np.float64(522.4351890093079), np.float64(1.249501073264908e-115))

A chi-square test of independence indicates a statistically significant association between variation and completion (χ² = 522.44, p ≈ 1.25e−115). Therefore, we reject the null hypothesis at α = 0.05 and conclude that the new design has a significantly different completion rate compared to the old design.

In [14]:
# Completion-rate uplift (Test minus Control), recomputed from the per-visit
# table. The scratch names a / c / tab are deliberately not used here: later
# cells rebind them (c becomes the Control durations, tab the error table), so
# re-running this cell out of order would silently give a wrong number.
completion_rate = visit_completion.groupby("Variation")["completed"].mean()
uplift = completion_rate["Test"] - completion_rate["Control"]
uplift

np.float64(0.08668014253091483)

Business threshold (≥ 5 percentage points)
The observed completion-rate uplift is ≈ 8.67 percentage points, which exceeds the predefined 5-percentage-point threshold.

Hypothesis Test 2: Visit Duration
H₀: The average visit duration is the same for the Test and Control groups.
H₁: The average visit duration differs between the Test and Control groups.

In [15]:
import pandas as pd

# Event-level frame for the duration test: web events joined to the experiment
# table on client_id, restricted to rows with a Variation, with date_time parsed
# to datetimes so timestamps can be subtracted.
df = (
    df_web
    .merge(df_exp, on="client_id", how="inner")
    .dropna(subset=["Variation"])
    .assign(date_time=lambda events: pd.to_datetime(events["date_time"]))
)

In [16]:
# Earliest and latest event timestamp of every visit, keyed by client, visit and arm.
visit_bounds = df.groupby(["client_id", "visit_id", "Variation"])["date_time"].agg(["min", "max"])
visit_time = visit_bounds.reset_index()

# Visit duration = last event minus first event, in seconds. A visit with a
# single timestamp therefore has a duration of 0.
elapsed = visit_time["max"] - visit_time["min"]
visit_time["duration_sec"] = elapsed.dt.total_seconds()

visit_time.head()

Unnamed: 0,client_id,visit_id,Variation,min,max,duration_sec
0,555,637149525_38041617439_716659,Test,2017-04-15 12:57:56,2017-04-15 13:00:34,158.0
1,647,40369564_40101682850_311847,Test,2017-04-12 15:41:28,2017-04-12 15:47:45,377.0
2,934,7076463_57954418406_971348,Test,2017-04-18 02:36:30,2017-04-18 02:38:52,142.0
3,1028,557292053_87239438319_391157,Control,2017-04-08 18:51:28,2017-04-08 19:00:26,538.0
4,1104,543158812_46395476577_767725,Control,2017-06-12 07:49:18,2017-06-12 07:49:18,0.0


In [17]:
# Split the per-visit durations into the two samples compared by the t-test.
# Note: `c` is rebound here; it previously held the Control completed-visit count.
is_test = visit_time["Variation"] == "Test"
is_control = visit_time["Variation"] == "Control"

t = visit_time.loc[is_test, "duration_sec"].dropna()
c = visit_time.loc[is_control, "duration_sec"].dropna()

len(t), len(c)

(37204, 32243)

In [18]:
from scipy.stats import ttest_ind

# Welch's two-sample t-test: equal_var=False drops the equal-variance assumption.
welch = ttest_ind(t, c, equal_var=False)
t_stat, p_ttest = welch.statistic, welch.pvalue
t_stat, p_ttest

(np.float64(8.233750539594977), np.float64(1.8456019963252697e-16))

In [19]:
# Displayed order: Test mean, Control mean, Test median, Control median.
mean_test, mean_control = t.mean(), c.mean()
median_test, median_control = t.median(), c.median()
mean_test, mean_control, median_test, median_control

(np.float64(315.4559993549081),
 np.float64(280.5106534751729),
 np.float64(168.0),
 np.float64(160.0))

Methodology

Visit duration was analyzed at the visit level. Given the large sample size and potential heteroskedasticity, a Welch two-sample t-test was used to compare mean visit duration between the Test and Control groups.

Results

The Welch t-test indicates a statistically significant difference in visit duration between the two groups (t = 8.23, p < 0.001). The Test group exhibits a higher average visit duration (315.5 seconds) compared to the Control group (280.5 seconds). Median visit duration is similar across groups, suggesting that the difference is driven by a subset of longer sessions in the Test group.


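Hypothesis Test 3: Error Rate
H₀: p_Test ≥ p_Control (the error rate in the Test group is at least as high as in the Control group).
H₁: p_Test < p_Control (the new design reduces the error rate).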

In [20]:
# Event-level frame for the error analysis: web events joined to the experiment
# table, rows without a Variation removed, timestamps parsed, and events ordered
# chronologically within each visit so consecutive steps can be compared.
df_err = (
    df_web
    .merge(df_exp, on="client_id", how="inner")
    .dropna(subset=["Variation"])
    .assign(date_time=lambda events: pd.to_datetime(events["date_time"]))
    .sort_values(["visit_id", "date_time"])
)

In [21]:
# Funnel position of each process step, from 0 ("start") to 4 ("confirm").
funnel_steps = ["start", "step_1", "step_2", "step_3", "confirm"]
step_order = {step: position for position, step in enumerate(funnel_steps)}

df_err["step_num"] = df_err["process_step"].map(step_order)

# Change in funnel position versus the previous event of the same visit.
# Relies on df_err already being sorted by visit_id and date_time; the first
# event of each visit has no predecessor and gets NaN.
df_err["step_diff"] = df_err.groupby("visit_id")["step_num"].diff()

In [22]:
# An event is an error when the user moved backwards in the funnel
# (negative step_diff). NaN differences compare as False.
df_err["error"] = df_err["step_diff"].lt(0)

In [23]:
# One row per (visit, arm): True when the visit contains at least one error event.
had_error = df_err.groupby(["visit_id", "Variation"])["error"].any()
visit_errors = had_error.reset_index()

visit_errors.head()

Unnamed: 0,visit_id,Variation,error
0,100012776_37918976071_457913,Test,False
1,100019538_17884295066_43909,Test,True
2,100022086_87870757897_149620,Test,False
3,100030127_47967100085_936361,Control,False
4,100037962_47432393712_705583,Control,True


In [24]:
# Per arm: visits with at least one error ("sum") and total visits ("count").
tab = visit_errors.groupby("Variation")["error"].agg(["sum", "count"])

# 2x2 contingency table: rows = (Test, Control), columns = (error, no error).
a, c = tab.loc["Test", "sum"], tab.loc["Control", "sum"]
b = tab.loc["Test", "count"] - a
d = tab.loc["Control", "count"] - c
table = np.array([[a, b], [c, d]])

# Two-sided chi-square test without the Yates continuity correction.
chi2, p_two_sided, dof, exp = chi2_contingency(table, correction=False)

# One-sided p-value for H1: p_Test < p_Control. Half the two-sided p-value when
# the observed difference points in the H1 direction, otherwise its complement.
p_test, p_control = a / (a + b), c / (c + d)
p_one_sided = p_two_sided / 2 if p_test < p_control else 1 - p_two_sided / 2

chi2, p_one_sided

(np.float64(392.94698358560504), np.float64(1.0))

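A one-sided test, derived from the 2×2 chi-square statistic, was conducted to evaluate whether the new design reduces the error (backtracking) rate.
The Test group exhibits a higher error rate than the Control group (27.1% vs. 20.7%).
Because the observed difference is in the opposite direction to H₁, the one-sided p-value is ≈ 1.00; therefore, we fail to reject the null hypothesis.
There is no evidence that the new design reduces user errors; the large chi-square statistic (χ² = 392.95) instead indicates that the error rate is significantly higher in the Test group, suggesting that the new design introduces additional friction.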