In [15]:
# Read the event export collected from Google Analytics into a DataFrame.
event_data_path = "event analysis.csv"
event_df = pd.read_csv(filepath_or_buffer=event_data_path)

# Preview the first five rows to check how the columns were parsed.
event_df.head(n=5)

Unnamed: 0,group,Event name,Event count,Event count per active user,Average engagement time per session,Bounce rate,Unnamed: 6
0,,,1375,10.338346,42.177419,0.413978,Grand total
1,(not set),page_view,382,2.87218,0.0,0.39779,
2,(not set),scroll,331,2.903509,2.519231,0.320513,
3,(not set),user_engagement,241,4.82,70.180723,0.144578,
4,(not set),session_start,186,1.398496,0.0,0.413978,


In [16]:
# Build per-group simulation parameters (CTR, completion rate, bounce rate,
# observed sessions and an up-scaling factor) from the raw event export.

# Keep only rows that have an event name and belong to one of the four A/B
# variants, then normalise the column names. This drops the "Grand total"
# summary row and the "(not set)" traffic.
event_df_clean = (
    event_df
    .dropna(subset=["Event name"])
    .loc[
        lambda df: df["group"].isin(["blue-large", "blue-small", "red-large", "red-small"]),
        ["group", "Event name", "Event count"],
    ]
    .rename(columns={"Event name": "event_name", "Event count": "event_count"})
)

# Pivot to one row per group and one column per event type.
# pivot_table sums repeated (group, event) rows instead of raising ValueError
# the way DataFrame.pivot does, and missing combinations count as 0 events.
pivot_df = event_df_clean.pivot_table(
    index="group",
    columns="event_name",
    values="event_count",
    aggfunc="sum",
    fill_value=0,
).astype(float)

# Guarantee the events used below exist even if no group ever fired them,
# so the rate calculations cannot fail with a KeyError.
for event in ("enter_feature_page", "apply_transformation_click", "reach_last_tab", "bounce_detected"):
    if event not in pivot_df.columns:
        pivot_df[event] = 0.0

# Add sample size (enter_feature_page as proxy for # of sessions per group)
pivot_df["sessions"] = pivot_df["enter_feature_page"]

# A group without any recorded session would otherwise divide by zero and
# produce inf/NaN; mask those denominators and report the rates as 0.
valid_sessions = pivot_df["sessions"].where(pivot_df["sessions"] > 0)

# Calculate rates (event count divided by number of sessions)
pivot_df["CTR"] = (pivot_df["apply_transformation_click"] / valid_sessions).fillna(0.0)
pivot_df["Completion Rate"] = (pivot_df["reach_last_tab"] / valid_sessions).fillna(0.0)
# NOTE(review): this is bounce_detected events per session, not a probability;
# with the recorded data it exceeds 1 for some groups (e.g. 7.5 for blue-small).
# Confirm whether bounce_detected should be de-duplicated per session.
pivot_df["Bounce Rate"] = (pivot_df["bounce_detected"] / valid_sessions).fillna(0.0)

# Keep only relevant data for simulation
sim_params = pivot_df[["CTR", "Completion Rate", "Bounce Rate", "sessions"]].copy()

# Calculate scaling factor to bring each group up to 80 sessions
# (left as NaN for a group with no sessions, where the factor is undefined).
sim_params["target_sessions"] = 80
sim_params["scale"] = sim_params["target_sessions"] / valid_sessions

# Drop the leftover "event_name" label on the column axis and turn the group
# index back into a regular column.
sim_params = sim_params.rename_axis(None, axis="columns").reset_index()
sim_params.head()

event_name,group,CTR,Completion Rate,Bounce Rate,sessions,target_sessions,scale
0,blue-large,0.4,0.6,0.0,5.0,80,16.0
1,blue-small,0.5,0.5,7.5,2.0,80,40.0
2,red-large,0.0,0.0,0.0,2.0,80,40.0
3,red-small,0.0,0.5,4.5,2.0,80,40.0


In [20]:
# Data simulation
#
# Clamp the observed per-group rates into usable probability ranges before
# sampling: CTR and completion rate are capped at 1.0, and the bounce rate is
# forced into [0.5, 0.8] to reflect high bounce behavior on a 7-tab Shiny app.
sim_params["CTR"] = sim_params["CTR"].clip(upper=1.0)
sim_params["Completion Rate"] = sim_params["Completion Rate"].clip(upper=1.0)
sim_params["Bounce Rate"] = sim_params["Bounce Rate"].clip(lower=0.5, upper=0.8)

# Generate one record per simulated session, using a fixed seed.
np.random.seed(42)
harmonized_sessions = []

for group, target, p_click, p_complete, p_bounce in zip(
    sim_params["group"],
    sim_params["target_sessions"],
    sim_params["CTR"],
    sim_params["Completion Rate"],
    sim_params["Bounce Rate"],
):
    n_sessions = int(target)

    for session_number in range(1, n_sessions + 1):
        # The three outcomes are drawn independently of each other, in this
        # order: click, completion, bounce.
        clicked = int(np.random.rand() < p_click)
        completed = int(np.random.rand() < p_complete)
        bounced = int(np.random.rand() < p_bounce)

        # Time spent is normal around 5 for bounced sessions and around 45
        # otherwise (std 10), rounded to one decimal and floored at 0.5.
        mean_time = 5 if bounced else 45
        time_spent = max(round(np.random.normal(mean_time, 10), 1), 0.5)

        harmonized_sessions.append(
            dict(
                session_id=f"{group}_{session_number}",
                group=group,
                clicked=clicked,
                completed=completed,
                bounced=bounced,
                time_spent=time_spent,
            )
        )

# Assemble the simulated sessions into a DataFrame and save it to disk.
harmonized_df = pd.DataFrame(harmonized_sessions)

harmonized_df.to_csv("reproduced_A_B_Test_Dataset.csv", index=False)

In [19]:
# Preview the first five simulated sessions.
harmonized_df.head(n=5)

Unnamed: 0,session_id,group,clicked,completed,bounced,time_spent
0,blue-large_1,blue-large,1,0,0,33.9
1,blue-large_2,blue-large,1,1,0,48.2
2,blue-large_3,blue-large,0,0,1,0.5
3,blue-large_4,blue-large,1,1,0,39.7
4,blue-large_5,blue-large,0,1,0,40.7
