In [65]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 

In [66]:
# Raw input tables, read from CSV files relative to the notebook's working directory.
# `users` carries the expansion_event flag used to split cohorts below;
# `billing` feeds the per-user aggregates built later in the notebook.
users = pd.read_csv("users.csv")
billing = pd.read_csv("billing.csv")
events = pd.read_csv("events.csv")
sessions = pd.read_csv("sessions.csv")

Compare users who had an expansion event against users who did not, to understand how their behaviour differs. First, build the test (expansion) and control (non-expansion) groups that we want to study.

In [67]:
# Treatment cohort: every user flagged with an expansion event.
expand_users = users.loc[users["expansion_event"].eq(1)]
print(expand_users.shape[0])

69309


In [None]:
# NOTE(review): this entire cell is disabled -- every statement is commented out, so running it
# does nothing and defines no variables. The sketch below would derive, from `billing`, each
# user's first "true" expansion month: the first month whose active_seats exceeds the running
# maximum of all of that user's earlier months (a user's first observed month never qualifies,
# because its shifted running max is missing). Either restore it or delete the cell.
# billing['month'] = pd.to_datetime(billing['month']).dt.to_period('M')
# billing = billing.sort_values(["user_id", "month"])

# # High-water mark + previous high
# billing["max_seats_to_date"] = billing.groupby("user_id")["active_seats"].cummax()
# billing["prev_max_seats_to_date"] = billing.groupby("user_id")["max_seats_to_date"].shift()

# # True expansion month (first time seats exceed any prior max)
# billing["true_expansion_month"] = billing["active_seats"] > billing["prev_max_seats_to_date"]

# # First expansion month per user (nullable if they never expand)
# first_expansion = (
#     billing.loc[billing["true_expansion_month"]]
#     .groupby("user_id", as_index=False)["month"].min()
#     .rename(columns={"month": "first_expansion_month"})
# )

In [78]:
# Build a matched control group: for every expansion user, select the single most similar
# non-expansion user according to the profile attributes listed in `match_cols`.
from sklearn.neighbors import NearestNeighbors

# Attributes the two groups are matched on
match_cols = ['plan_tier', 'company_size', 'region', 'industry', 'acquisition_channel', 'is_enterprise']

# Treated pool (expansion users) and candidate control pool (everyone else);
# .copy() so the columns added below never write back into `users`.
expand_users = users[users['expansion_event'] == 1].copy()
no_expand_users = users[users['expansion_event'] == 0].copy()

# One-hot encode the categorical attributes so they can be compared numerically
expand_encoded = pd.get_dummies(expand_users[match_cols])
no_expand_encoded = pd.get_dummies(no_expand_users[match_cols])
# join='left' keeps exactly the expansion group's dummy columns (in that order): categories seen
# only among non-expansion users are dropped, categories they lack are filled with 0.
expand_encoded, no_expand_encoded = expand_encoded.align(no_expand_encoded, join='left', axis=1, fill_value=0)

# Index the control pool, then look up the closest control for each expansion user
nn = NearestNeighbors(n_neighbors=1, metric='euclidean')
nn.fit(no_expand_encoded)
distances, indices = nn.kneighbors(expand_encoded)

# `indices` are row positions into the fitted matrix, which is row-aligned with no_expand_users.
# Each expansion user is matched independently, so the same non-expansion user can be picked
# many times (matching with replacement) and control_users may contain repeated rows;
# control_users['user_id'].nunique() gives the number of distinct controls actually used.
matched_indices = indices.flatten()
control_users = no_expand_users.iloc[matched_indices].copy()
control_users['matched_expansion_user_id'] = expand_users['user_id'].values
# Keep the match distance (previously computed but discarded) so match quality can be audited
# and poor matches filtered out; 0.0 means identical encoded attributes.
control_users['match_distance'] = distances.flatten()

# Output the expansion and control groups for further analysis
print(f"Expansion group size: {len(expand_users)}")
print(f"Control group size: {len(control_users)}")



Expansion group size: 69309
Control group size: 69309


## Example Analysis: Compare Churn Rates Between Expansion and Control Groups
Let's compare the 90-day churn rates between the expansion group and the matched control group to see if expansion is associated with retention.

In [98]:
# Baseline comparison on the raw groups: expansion users vs. ALL non-expansion users
# (no_expand_users), i.e. before any matching. The second group is labelled explicitly as
# unmatched so this output cannot be confused with the matched-control comparison, which
# prints the same metric for control_users.
expansion_churn_2 = expand_users['churned_90d'].mean()
control_churn_2 = no_expand_users['churned_90d'].mean()

print(f"Expansion group 90-day churn rate: {expansion_churn_2:.2%}")
print(f"Unmatched non-expansion group 90-day churn rate: {control_churn_2:.2%}")

# Optional: statistical test (two-sample t-test on the 0/1 churn indicator)
from scipy.stats import ttest_ind
stat, pval = ttest_ind(expand_users['churned_90d'], no_expand_users['churned_90d'])
print(f"T-test p-value: {pval:.4f}")


Expansion group 90-day churn rate: 25.10%
Control group 90-day churn rate: 18.23%
T-test p-value: 0.0000


In [97]:
# Matched comparison: 90-day churn of expansion users vs. their nearest-neighbour controls
from scipy.stats import ttest_ind

expansion_churn, control_churn = (
    group['churned_90d'].mean() for group in (expand_users, control_users)
)

print(f"Expansion group 90-day churn rate: {expansion_churn:.2%}")
print(f"Control group 90-day churn rate: {control_churn:.2%}")

# Optional: two-sample t-test on the 0/1 churn indicator.
# NOTE(review): control_users is drawn with replacement by the matching step, so rows can
# repeat, while ttest_ind treats observations as independent -- confirm this is acceptable.
stat, pval = ttest_ind(expand_users['churned_90d'], control_users['churned_90d'])
print(f"T-test p-value: {pval:.4f}")


Expansion group 90-day churn rate: 25.10%
Control group 90-day churn rate: 25.48%
T-test p-value: 0.1015


### Aggregate and Visualize Total Events per User (Matched Groups)
Let's compare the distribution of total events per user between the expansion and matched control groups.

In [114]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# ---- 1) Build per-user aggregates from billing (history-agnostic summaries)
def build_user_aggregates(billing: pd.DataFrame) -> pd.DataFrame:
    """Collapse billing rows into one summary row per user.

    Parameters
    ----------
    billing : pd.DataFrame
        Billing rows with columns ``user_id``, ``month`` (anything
        ``pd.to_datetime`` can parse), ``active_seats``, ``mrr``,
        ``discount_applied``, ``invoices_overdue`` and ``support_ticket_count``.
        The input frame is not modified; all work happens on a copy.

    Returns
    -------
    pd.DataFrame
        One row per ``user_id`` with: ``months_observed`` (number of distinct
        calendar months with a billing row), ``first_month`` / ``last_month``,
        mean and max of seats and MRR, the mean of the discount and overdue
        columns, the mean ticket count, and ``signup_cohort`` (``first_month``
        rendered as a ``YYYY-MM`` string).
    """
    b = billing.copy()
    b["month"] = pd.to_datetime(b["month"])
    # Integer (year, month) key: lets months_observed use the built-in "nunique" aggregation
    # instead of calling a Python lambda once per user. Missing dates become NaN, which
    # nunique ignores -- the same way the period-based count ignored NaT.
    b["_month_key"] = b["month"].dt.year * 12 + b["month"].dt.month
    agg = b.sort_values(["user_id", "month"]).groupby("user_id").agg(
        months_observed=("_month_key", "nunique"),
        first_month=("month", "min"),
        last_month=("month", "max"),
        avg_seats=("active_seats", "mean"),
        max_seats=("active_seats", "max"),
        avg_mrr=("mrr", "mean"),
        max_mrr=("mrr", "max"),
        pct_months_discount=("discount_applied", "mean"),
        pct_months_overdue=("invoices_overdue", "mean"),
        avg_tickets=("support_ticket_count", "mean")
    ).reset_index()
    # Cohort label = calendar month of the user's first billing row
    agg["signup_cohort"] = agg["first_month"].dt.to_period("M").astype(str)
    return agg

# ---- 2) Merge user attributes (including expansion_event) + aggregates
def assemble_user_frame(users: pd.DataFrame, billing: pd.DataFrame) -> pd.DataFrame:
    """Join user attributes with their per-user billing aggregates.

    Every row of ``users`` is kept (left join on ``user_id``); users without
    billing rows get missing values in the aggregate columns. Flag columns that
    are present are cast to ``int`` so that their mean reads as a rate.
    """
    flag_cols = ("is_enterprise", "churned_30d", "churned_90d", "downgraded", "expansion_event")
    merged = users.merge(build_user_aggregates(billing), on="user_id", how="left")
    # Only cast the flags that actually exist in the merged frame
    casts = {col: int for col in flag_cols if col in merged.columns}
    return merged.astype(casts)

# ---- 3) Stratified descriptive comparison (plan tier × signup cohort)
def stratified_two_group_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Compare expansion vs. non-expansion users within each plan tier x signup cohort.

    Parameters
    ----------
    df : pd.DataFrame
        Per-user frame containing ``plan_tier``, ``signup_cohort``,
        ``expansion_event`` (1 = expansion, 0 = not) and every metric column
        listed in ``metrics`` below.

    Returns
    -------
    pd.DataFrame
        One row per stratum that contains both groups, with the group sizes
        (``n_exp``, ``n_ctl``) and, for each metric ``m``, ``exp_m``, ``ctl_m``
        and ``diff_m`` (expansion mean minus non-expansion mean). Rows are sorted
        by plan tier, signup cohort and descending ``n_exp``. If no stratum
        contains both groups, an empty frame with the same columns is returned.
    """
    # choose metrics to compare
    metrics = ["avg_seats","max_seats","avg_mrr","max_mrr",
               "pct_months_discount","pct_months_overdue","avg_tickets",
               "churned_30d","churned_90d","downgraded","is_enterprise"]
    strata = ["plan_tier", "signup_cohort"]
    # Fixed output schema: without it an empty result has no columns at all and the
    # sort_values call below raises KeyError.
    columns = strata + ["n_exp", "n_ctl"] + [
        f"{prefix}_{m}" for m in metrics for prefix in ("exp", "ctl", "diff")
    ]
    out = []
    for keys, g in df.groupby(strata):
        # A stratum with only one of the two groups has nothing to compare against
        if g["expansion_event"].nunique() < 2:
            continue
        exp = g[g["expansion_event"]==1]
        ctl = g[g["expansion_event"]==0]
        row = dict(zip(strata, keys))
        row["n_exp"] = len(exp)
        row["n_ctl"] = len(ctl)
        for m in metrics:
            row[f"exp_{m}"] = exp[m].mean()
            row[f"ctl_{m}"] = ctl[m].mean()
            row[f"diff_{m}"] = row[f"exp_{m}"] - row[f"ctl_{m}"]
        out.append(row)
    result = pd.DataFrame(out, columns=columns)
    return result.sort_values(strata + ["n_exp"], ascending=[True, True, False])

# ---- 4) (Optional) Standardized mean differences (SMD) to flag imbalance
def standardized_diffs(df: pd.DataFrame, cols) -> pd.Series:
    """Standardized mean difference (expansion minus non-expansion) for each column in ``cols``.

    The difference in group means is divided by the square root of the average
    of the two groups' sample variances (ddof=1). A pooled variance of exactly
    zero is replaced by NaN, so the result for that column is NaN.
    """
    is_expansion = df["expansion_event"] == 1
    is_control = df["expansion_event"] == 0
    treated = df.loc[is_expansion, cols].astype(float)
    control = df.loc[is_control, cols].astype(float)

    pooled_var = (treated.var(ddof=1) + control.var(ddof=1)) / 2.0
    # Swap zero variance for NaN before taking the root so the division yields NaN
    pooled_sd = np.sqrt(pooled_var.replace(0, np.nan))
    return (treated.mean() - control.mean()) / pooled_sd


# Per-user analysis frame: user attributes joined with billing aggregates
ux = assemble_user_frame(users, billing)

# Expansion vs. non-expansion means within each plan tier x signup cohort (first five strata)
summary = stratified_two_group_summary(ux)
print(summary.iloc[:5])

# Balance check across the whole population, largest absolute imbalance first
smd = standardized_diffs(
    ux,
    ["avg_seats", "avg_mrr", "avg_tickets", "pct_months_discount", "pct_months_overdue"],
)
print("SMD:\n", smd.sort_values(key=lambda s: s.abs(), ascending=False))


  plan_tier signup_cohort  n_exp  n_ctl  exp_avg_seats  ctl_avg_seats  \
0      free       2024-02   2772    332       7.704445      53.806894   
1      free       2024-03   2882    353       7.286831      53.779870   
2      free       2024-04   2890    385       7.538992      51.468506   
3      free       2024-05   2900    430       7.285655      51.057364   
4      free       2024-06   2717    385       6.764788      49.718367   

   diff_avg_seats  exp_max_seats  ctl_max_seats  diff_max_seats  ...  \
0      -46.102449      10.480159      56.786145      -46.305986  ...   
1      -46.493039       9.931298      56.677054      -46.745756  ...   
2      -43.929514      10.101384      54.171429      -44.070044  ...   
3      -43.771709       9.762069      53.753488      -43.991419  ...   
4      -42.953580       9.085756      52.228571      -43.142815  ...   

   diff_churned_30d  exp_churned_90d  ctl_churned_90d  diff_churned_90d  \
0          0.015486         0.328644         0.237952

In [115]:
# Show the standardized mean differences computed in the previous cell, in their original
# (unsorted) column order.
smd

avg_seats             -0.863011
avg_mrr               -0.604281
avg_tickets           -0.693281
pct_months_discount   -0.205647
pct_months_overdue     0.019555
dtype: float64

feature_name,jira_board,jira_automation,team_invite,bitbucket_integration,confluence_page,confluence_ai_assist,trello_export_csv,feature_adoption_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0000093e-7258-43c0-b212-ea7212795ddf,1,0,0,0,0,0,0,1
0006fdbd-c35d-4c0d-a948-e8ee10f279d5,0,0,0,0,0,1,0,1
0007ee13-0c60-4865-9464-914b2df654df,1,1,1,0,0,0,0,3
000836d5-0777-4310-a8a5-5c857d2ea5e5,0,0,0,0,1,0,0,1
0008a782-5378-45a1-b332-10c73c4296ff,0,0,1,0,1,1,1,4
