In [1]:
import pandas as pd
import numpy as np
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

df = pd.read_csv("../data/processed/ab_test_validated.csv")



In [2]:
summary = (
    df.groupby("variant")
      .agg(
          users=("user_id", "count"),
          conversions=("converted", "sum")
      )
)

summary["conversion_rate"] = summary["conversions"] / summary["users"]
summary


Unnamed: 0_level_0,users,conversions,conversion_rate
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,25046,2469,0.098579
treatment,24954,2775,0.111205


In [3]:
conversions = summary["conversions"].values
users = summary["users"].values


In [4]:
summary = summary.loc[["control", "treatment"]]
conversions = summary["conversions"].values
users = summary["users"].values


In [5]:
z_stat, p_value = proportions_ztest(
    count=conversions,
    nobs=users,
    alternative="smaller"
)

z_stat, p_value


(np.float64(-4.607158207871794), np.float64(2.0410470882781514e-06))

In [6]:
uplift = summary.loc["treatment", "conversion_rate"] - summary.loc["control", "conversion_rate"]
uplift


np.float64(0.012626001146589488)

In [7]:
ci_control = proportion_confint(
    summary.loc["control", "conversions"],
    summary.loc["control", "users"],
    alpha=0.05,
    method="normal"
)

ci_treatment = proportion_confint(
    summary.loc["treatment", "conversions"],
    summary.loc["treatment", "users"],
    alpha=0.05,
    method="normal"
)

ci_uplift = (
    ci_treatment[0] - ci_control[1],
    ci_treatment[1] - ci_control[0]
)

ci_uplift


(0.005033555086798466, 0.02021844720638051)

In [8]:
results = pd.DataFrame({
    "metric": ["conversion_rate"],
    "control": [summary.loc["control", "conversion_rate"]],
    "treatment": [summary.loc["treatment", "conversion_rate"]],
    "uplift": [uplift],
    "p_value": [p_value]
})

results


Unnamed: 0,metric,control,treatment,uplift,p_value
0,conversion_rate,0.098579,0.111205,0.012626,2e-06


In [9]:
results.to_csv("../data/processed/conversion_results.csv", index=False)


In [10]:
import pandas as pd
df = pd.read_csv("../data/processed/ab_test_validated.csv")


In [11]:
secondary_summary = df.groupby("variant").agg(
    users=("user_id", "count"),
    bounce_rate=("bounced", "mean"),
    avg_revenue=("revenue", "mean")
)

secondary_summary


Unnamed: 0_level_0,users,bounce_rate,avg_revenue
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
control,25046,0.401901,10.022346
treatment,24954,0.379739,11.136829


In [12]:
secondary_summary["bounce_rate_uplift"] = (
    secondary_summary.loc["treatment", "bounce_rate"] - 
    secondary_summary.loc["control", "bounce_rate"]
)

secondary_summary["avg_revenue_uplift"] = (
    secondary_summary.loc["treatment", "avg_revenue"] - 
    secondary_summary.loc["control", "avg_revenue"]
)

secondary_summary


Unnamed: 0_level_0,users,bounce_rate,avg_revenue,bounce_rate_uplift,avg_revenue_uplift
variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
control,25046,0.401901,10.022346,-0.022162,1.114483
treatment,24954,0.379739,11.136829,-0.022162,1.114483


In [13]:
from statsmodels.stats.proportion import proportions_ztest

bounce_conversions = df.groupby("variant")["bounced"].sum().values
users = df.groupby("variant")["user_id"].count().values

z_stat_bounce, p_value_bounce = proportions_ztest(
    count=bounce_conversions,
    nobs=users,
    alternative="two-sided"
)

z_stat_bounce, p_value_bounce


(np.float64(5.078012836972882), np.float64(3.81402931817657e-07))

In [14]:
from scipy.stats import ttest_ind

revenue_control = df[df["variant"]=="control"]["revenue"]
revenue_treatment = df[df["variant"]=="treatment"]["revenue"]

t_stat, p_value_revenue = ttest_ind(revenue_treatment, revenue_control, equal_var=False)
t_stat, p_value_revenue


(np.float64(3.202633684421381), np.float64(0.0013626261641633047))

In [15]:
secondary_results = pd.DataFrame({
    "metric": ["bounce_rate", "avg_revenue"],
    "control": [secondary_summary.loc["control","bounce_rate"], secondary_summary.loc["control","avg_revenue"]],
    "treatment": [secondary_summary.loc["treatment","bounce_rate"], secondary_summary.loc["treatment","avg_revenue"]],
    "uplift": [secondary_summary.loc["treatment","bounce_rate_uplift"], secondary_summary.loc["treatment","avg_revenue_uplift"]],
    "p_value": [p_value_bounce, p_value_revenue]
})

secondary_results.to_csv("../data/processed/secondary_results.csv", index=False)
