In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
df = pd.read_csv('data/skygeni_sales_data.csv')
df.sample(5)

Unnamed: 0,deal_id,created_date,closed_date,sales_rep_id,industry,region,product_type,lead_source,deal_stage,deal_amount,sales_cycle_days,outcome
3073,D03074,2024-01-05,2024-04-20,rep_13,Ecommerce,India,Core,Partner,Qualified,6802,106,Won
1780,D01781,2023-04-21,2023-07-25,rep_25,SaaS,APAC,Pro,Outbound,Qualified,3160,95,Lost
836,D00837,2024-01-21,2024-05-03,rep_1,SaaS,India,Pro,Inbound,Demo,43680,103,Won
1227,D01228,2024-03-20,2024-06-25,rep_19,EdTech,North America,Enterprise,Referral,Proposal,28573,97,Won
3710,D03711,2023-11-18,2024-03-17,rep_13,Ecommerce,India,Pro,Inbound,Closed,96119,120,Won


# 0. Problem Definition
- The CRO observes declining win rate despite stable pipeline volume.
- The goal is to identify which deal attributes (segment, region, ACV, cycle time, etc.) and sales behaviors are statistically associated with win probability, and to detect which factors contributed to the win rate decline in the last two quarters.

# 1. Feature Engineering

In [3]:
df["created_date"] = pd.to_datetime(df["created_date"])
df["closed_date"] = pd.to_datetime(df["closed_date"])

df["created_month"] = df["created_date"].dt.month
df["created_quarter"] = df["created_date"].dt.quarter
df["created_year"] = df["created_date"].dt.year
df['created_year_and_quarter'] = df["created_date"].dt.to_period("Q").astype(str)
df["created_weekday"] = df["created_date"].dt.weekday

df["closed_month"] = df["closed_date"].dt.month
df["closed_quarter"] = df["closed_date"].dt.quarter
df["closed_year"] = df["closed_date"].dt.year
df['closed_year_and_quarter'] = df["closed_date"].dt.to_period("Q").astype(str)
df["closed_weekday"] = df["closed_date"].dt.weekday

# Filter out the last quarter
latest_quarter = df['closed_year_and_quarter'].max()
df = df[df['closed_year_and_quarter'] != latest_quarter].copy()

# Time buckets (helps CRO thinking)
df["sales_cycle_bucket"] = pd.cut(
    df["sales_cycle_days"],
    bins=[0, 14, 30, 60, 90, np.inf],
    labels=["<2w", "2-4w", "1-2m", "2-3m", "3m+"]
)

# Binary outcome
df["is_won"] = (df["outcome"] == "Won").astype(int)

# ACV buckets (very important) --> Right skewed data, hence log transformed
df["acv_bucket"] = pd.qcut(
    df["deal_amount"], 
    # np.log1p(df["deal_amount"]), 
    q=4,
    labels=["Low", "Mid", "High", "Very High"]
)

# doesn't matter if I apply log or not in pd.qcut as log is a monotonic function (preserves order) 
# and qcut works by splitting thr ranked data into equal sized groups
# hence ranking order of values doesn't change post log applying
# NOTE: for strictly qcut into bucket applications, skip applying log as it is just additional compute, 
# look at examples/qcut-example.ipynb for sanity check

# 2. Detect Structural Changes in Win Rate Drivers

In [4]:
# Sort quarters chronologically
quarters = sorted(df["closed_year_and_quarter"].unique())

# Last 2 completed quarters
decline_quarters = quarters[-2:]

df["period_flag"] = np.where(
    df["closed_year_and_quarter"].isin(decline_quarters),
    "decline_period",
    "baseline_period"
)

df["period_flag"].value_counts()


period_flag
baseline_period    3344
decline_period     1627
Name: count, dtype: int64

In [11]:
def segment_win_rate_drift(df, segment_col):
    summary = (
        df.groupby(["period_flag", segment_col])
          .agg(
              deals=("deal_id", "count"),
              win_rate=("is_won", "mean")
          )
          .reset_index()
    )
    
    pivot = summary.pivot_table(
        index=segment_col,
        columns="period_flag",
        values=["win_rate", "deals"]
    ).reset_index()
    
    pivot["win_rate_delta"] = (
        pivot["decline_period"] - pivot["baseline_period"]
    )
    
    return pivot.sort_values("win_rate_delta")

In [12]:
region_drift = segment_win_rate_drift(df, "region")
industry_drift = segment_win_rate_drift(df, "industry")
acv_drift = segment_win_rate_drift(df, "acv_bucket")
cycle_drift = segment_win_rate_drift(df, "sales_cycle_bucket")

  df.groupby(["period_flag", segment_col])
  pivot = summary.pivot_table(
  df.groupby(["period_flag", segment_col])
  pivot = summary.pivot_table(


##### NOTE: Negative values in win_rate_delta mean drop in win_rate and positive means improvement

In [8]:
region_drift

period_flag,region,baseline_period,decline_period,win_rate_delta
0,APAC,0.458134,0.43073,-0.027403
1,Europe,0.455758,0.457002,0.001245
2,India,0.453052,0.457547,0.004496
3,North America,0.43562,0.47619,0.040571


In [14]:
industry_drift

period_flag,industry,baseline_period,decline_period,win_rate_delta
1,EdTech,0.445636,0.424699,-0.020937
2,FinTech,0.476423,0.481132,0.004709
4,SaaS,0.44812,0.457317,0.009197
0,Ecommerce,0.444444,0.45679,0.012346
3,HealthTech,0.441349,0.458462,0.017113


In [15]:
acv_drift

period_flag,acv_bucket,baseline_period,decline_period,win_rate_delta
0,Low,0.454217,0.452785,-0.001432
2,High,0.458637,0.461905,0.003267
1,Mid,0.42757,0.434109,0.006538
3,Very High,0.462919,0.471744,0.008826


In [16]:
cycle_drift

period_flag,sales_cycle_bucket,baseline_period,decline_period,win_rate_delta
2,1-2m,0.442411,0.405759,-0.036652
1,2-4w,0.512524,0.5,-0.012524
4,3m+,0.440789,0.441065,0.000275
3,2-3m,0.433875,0.488613,0.054738
0,<2w,0.441606,0.5,0.058394


In [26]:
summary = (
    df.groupby(["period_flag", 'region'])
      .agg(
          deals=("deal_id", "count"),
          win_rate=("is_won", "mean")
      )
      .reset_index()
)

summary


Unnamed: 0,period_flag,region,deals,win_rate
0,baseline_period,APAC,836,0.458134
1,baseline_period,Europe,825,0.455758
2,baseline_period,India,852,0.453052
3,baseline_period,North America,831,0.43562
4,decline_period,APAC,397,0.43073
5,decline_period,Europe,407,0.457002
6,decline_period,India,424,0.457547
7,decline_period,North America,399,0.47619


In [25]:
pivot = summary.pivot_table(
    index='region',
    columns="period_flag",
    values=["win_rate", "deals"]
).reset_index()

pivot["win_rate_delta"] = (
    pivot["decline_period"] - pivot["baseline_period"]
)
pivot

KeyError: 'decline_period'