In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', None)

In [2]:
# Load the deal-level CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/skygeni_sales_data.csv')
# Seeded so the preview shows the same rows on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,deal_id,created_date,closed_date,sales_rep_id,industry,region,product_type,lead_source,deal_stage,deal_amount,sales_cycle_days,outcome
628,D00629,2024-03-01,2024-03-16,rep_5,EdTech,North America,Core,Referral,Proposal,70152,15,Lost
1554,D01555,2023-03-27,2023-06-15,rep_20,EdTech,North America,Core,Inbound,Closed,11631,80,Won
2785,D02786,2023-09-03,2023-09-20,rep_25,Ecommerce,Europe,Pro,Referral,Qualified,65576,17,Lost
1954,D01955,2023-06-18,2023-06-25,rep_18,SaaS,India,Core,Partner,Qualified,3784,7,Lost
3059,D03060,2023-01-29,2023-04-26,rep_18,EdTech,Europe,Core,Partner,Negotiation,48199,87,Lost


# 0. Problem Definition
- The CRO observes declining win rate despite stable pipeline volume.
- The goal is to identify which deal attributes (segment, region, ACV, cycle time, etc.) and sales behaviors are statistically associated with win probability, and to detect which factors contributed to the win rate decline in the last two quarters.

# 1. Feature Engineering

In [3]:
# Parse the date columns so the .dt accessors below are available.
df["created_date"] = pd.to_datetime(df["created_date"])
df["closed_date"] = pd.to_datetime(df["closed_date"])

# Calendar features derived from the creation date.
df["created_month"] = df["created_date"].dt.month
df["created_quarter"] = df["created_date"].dt.quarter
df["created_year"] = df["created_date"].dt.year
df['created_year_and_quarter'] = df["created_date"].dt.to_period("Q").astype(str)
df["created_weekday"] = df["created_date"].dt.weekday

# Same calendar features for the close date.
df["closed_month"] = df["closed_date"].dt.month
df["closed_quarter"] = df["closed_date"].dt.quarter
df["closed_year"] = df["closed_date"].dt.year
df['closed_year_and_quarter'] = df["closed_date"].dt.to_period("Q").astype(str)
df["closed_weekday"] = df["closed_date"].dt.weekday

# Filter out the last quarter (presumably because it is incomplete - TODO confirm).
# The max is taken on the Period values rather than on the string column: a
# missing close date becomes the string "NaT", which sorts above every
# "YYYYQn" label and would be picked as the "latest quarter" by a string max.
# NOTE: this reassigns `df`, so re-running this cell without reloading the CSV
# removes one more quarter each time.
latest_quarter = str(df["closed_date"].dt.to_period("Q").max())
df = df[df['closed_year_and_quarter'] != latest_quarter].copy()

# Time buckets (helps CRO thinking)
df["sales_cycle_bucket"] = pd.cut(
    df["sales_cycle_days"],
    bins=[0, 14, 30, 60, 90, np.inf],
    labels=["<2w", "2-4w", "1-2m", "2-3m", "3m+"],
    # Without this the first interval is (0, 14], so a same-day close (0 days)
    # gets a NaN bucket and silently drops out of every groupby below.
    include_lowest=True,
)

# Binary outcome: 1 for "Won", 0 for anything else.
df["is_won"] = (df["outcome"] == "Won").astype(int)

# ACV buckets (very important): quartiles of the raw deal amount.
# deal_amount is right-skewed, but no log transform is applied before qcut:
# log is monotonic (it preserves order) and qcut splits the ranked data into
# equal-sized groups, so the bucket assignment is identical with or without
# the log and the transform would only be extra compute.
# See examples/qcut-example.ipynb for a sanity check.
df["acv_bucket"] = pd.qcut(
    df["deal_amount"],
    q=4,
    labels=["Low", "Mid", "High", "Very High"]
)

# 2. Detect Structural Changes in Win Rate Drivers --> Impact Quantification by time

### This shows where performance truly deteriorated
- Whether it’s a mix shift problem or execution problem
- And which segments the team should focus on to recover win rate.

In [4]:
from scipy.stats import chi2_contingency
def segment_win_rate_drift_weighted(df, segment_col):
    """
    Compare per-segment win rates between a baseline and a decline period.

    The decline period is the last two values of 'closed_year_and_quarter'
    (after sorting the labels); every other quarter is the baseline period.

    Parameters:
    - df: DataFrame containing at least 'closed_year_and_quarter', 'is_won',
      'deal_id', 'deal_amount', and segment_col. The input is not modified.
    - segment_col: column to analyze (e.g., 'sales_cycle_bucket', 'region', 'product_type')

    Returns:
    - DataFrame with one row per segment, sorted ascending by weighted_impact
      (largest drop first), with columns:
        deals_/wins_/revenue_<period>  per-period counts and revenue sums
        win_rate_baseline, win_rate_decline, win_rate_delta
        weighted_impact            win_rate_delta * deals_decline_period
        revenue_delta              revenue_decline_period - revenue_baseline_period
        p_value                    chi-square test on the 2x2 won/lost x period table
        statistically_significant  p_value < 0.05

    Notes:
    - revenue_delta subtracts raw period totals. The baseline period covers
      every quarter except the last two, so the two totals span different
      lengths of time and the delta is not a like-for-like comparison.
    - p_value is NaN when the chi-square test cannot be computed (for example
      a segment with no deals in one period).
    """

    # Quarter labels look like "2023Q4"; sorted() orders them as strings.
    quarters = sorted(df["closed_year_and_quarter"].unique())
    decline_quarters = quarters[-2:]

    df = df.copy()
    df["period_flag"] = np.where(
        df["closed_year_and_quarter"].isin(decline_quarters),
        "decline_period",
        "baseline_period"
    )

    # observed=False is passed explicitly: it keeps every category of a
    # categorical segment column (such as the cut/qcut buckets) in the result
    # and avoids the FutureWarning pandas emits when the default is relied on.
    summary = (
        df.groupby(["period_flag", segment_col], observed=False)
          .agg(
              deals=("deal_id", "count"),
              wins=("is_won", "sum"),
              revenue=("deal_amount", "sum")
          )
          .reset_index()
    )

    # One row per segment, one column per (metric, period). `summary` has a
    # single row per cell, so pivot_table's default mean aggregation just
    # passes the value through (as a float).
    pivot = summary.pivot_table(
        index=segment_col,
        columns="period_flag",
        values=["deals", "wins", "revenue"],
        observed=False
    )

    # Flatten ("deals", "baseline_period") -> "deals_baseline_period".
    pivot.columns = ['_'.join(col) for col in pivot.columns]
    pivot = pivot.reset_index()

    # Win rate calculations
    pivot["win_rate_baseline"] = pivot["wins_baseline_period"] / pivot["deals_baseline_period"]
    pivot["win_rate_decline"] = pivot["wins_decline_period"] / pivot["deals_decline_period"]
    pivot["win_rate_delta"] = pivot["win_rate_decline"] - pivot["win_rate_baseline"]

    # Weighted impact (volume-adjusted): the change in win rate expressed as a
    # number of deals at the decline-period volume.
    pivot["weighted_impact"] = pivot["win_rate_delta"] * pivot["deals_decline_period"]

    # Revenue impact (raw totals; see the note in the docstring)
    pivot["revenue_delta"] = (
        pivot["revenue_decline_period"] - pivot["revenue_baseline_period"]
    )

    # Statistical significance (chi-square) on [[wins, losses]] per period.
    p_values = []
    for wins_base, deals_base, wins_decl, deals_decl in zip(
        pivot["wins_baseline_period"],
        pivot["deals_baseline_period"],
        pivot["wins_decline_period"],
        pivot["deals_decline_period"],
    ):
        table = [
            [wins_base, deals_base - wins_base],
            [wins_decl, deals_decl - wins_decl],
        ]
        try:
            _, p, _, _ = chi2_contingency(table)
        except ValueError:
            # Raised when an expected frequency is zero, e.g. a period with no
            # deals for this segment. Any other error is a real bug and is
            # allowed to propagate.
            p = np.nan
        p_values.append(p)

    pivot["p_value"] = p_values
    pivot["statistically_significant"] = pivot["p_value"] < 0.05

    return pivot.sort_values("weighted_impact")

In [5]:
# Run the baseline-vs-decline drift comparison once per segmentation column.
region_drift, industry_drift, acv_drift, cycle_drift = (
    segment_win_rate_drift_weighted(df, segment_col)
    for segment_col in ("region", "industry", "acv_bucket", "sales_cycle_bucket")
)

  df.groupby(["period_flag", segment_col])
  pivot = summary.pivot_table(
  df.groupby(["period_flag", segment_col])
  pivot = summary.pivot_table(


#### win_rate_delta = win_rate_decline_period - win_rate_baseline_period
#### weighted_impact = win_rate_delta * deals_decline_period
#### NOTE: Negative values in weighted_impact mean drop in win_rate and positive means improvement

#### “This isolates exactly which segments are responsible for the win rate drop, quantifies their impact, and tells us where intervention will have the highest ROI.”

In [6]:
region_drift

Unnamed: 0,region,deals_baseline_period,deals_decline_period,revenue_baseline_period,revenue_decline_period,wins_baseline_period,wins_decline_period,win_rate_baseline,win_rate_decline,win_rate_delta,weighted_impact,revenue_delta,p_value,statistically_significant
0,APAC,836.0,397.0,21898532.0,10493614.0,383.0,171.0,0.458134,0.43073,-0.027403,-10.879187,-11404918.0,0.399462,False
1,Europe,825.0,407.0,21162497.0,11056396.0,376.0,186.0,0.455758,0.457002,0.001245,0.506667,-10106101.0,1.0,False
2,India,852.0,424.0,21771089.0,12284312.0,386.0,194.0,0.453052,0.457547,0.004496,1.906103,-9486777.0,0.926514,False
3,North America,831.0,399.0,22437686.0,9588955.0,362.0,190.0,0.43562,0.47619,0.040571,16.187726,-12848731.0,0.201235,False


In [7]:
industry_drift

Unnamed: 0,industry,deals_baseline_period,deals_decline_period,revenue_baseline_period,revenue_decline_period,wins_baseline_period,wins_decline_period,win_rate_baseline,win_rate_decline,win_rate_delta,weighted_impact,revenue_delta,p_value,statistically_significant
1,EdTech,653.0,332.0,17496747.0,9516510.0,291.0,141.0,0.445636,0.424699,-0.020937,-6.950995,-7980237.0,0.576815,False
2,FinTech,615.0,318.0,15307140.0,8711651.0,293.0,153.0,0.476423,0.481132,0.004709,1.497561,-6595489.0,0.946296,False
4,SaaS,665.0,328.0,17351219.0,9011501.0,298.0,150.0,0.44812,0.457317,0.009197,3.016541,-8339718.0,0.836696,False
0,Ecommerce,729.0,324.0,20084880.0,7939698.0,324.0,148.0,0.444444,0.45679,0.012346,4.0,-12145182.0,0.76062,False
3,HealthTech,682.0,325.0,17029818.0,8243917.0,301.0,149.0,0.441349,0.458462,0.017113,5.561584,-8785901.0,0.657859,False


In [8]:
industry_drift

Unnamed: 0,industry,deals_baseline_period,deals_decline_period,revenue_baseline_period,revenue_decline_period,wins_baseline_period,wins_decline_period,win_rate_baseline,win_rate_decline,win_rate_delta,weighted_impact,revenue_delta,p_value,statistically_significant
1,EdTech,653.0,332.0,17496747.0,9516510.0,291.0,141.0,0.445636,0.424699,-0.020937,-6.950995,-7980237.0,0.576815,False
2,FinTech,615.0,318.0,15307140.0,8711651.0,293.0,153.0,0.476423,0.481132,0.004709,1.497561,-6595489.0,0.946296,False
4,SaaS,665.0,328.0,17351219.0,9011501.0,298.0,150.0,0.44812,0.457317,0.009197,3.016541,-8339718.0,0.836696,False
0,Ecommerce,729.0,324.0,20084880.0,7939698.0,324.0,148.0,0.444444,0.45679,0.012346,4.0,-12145182.0,0.76062,False
3,HealthTech,682.0,325.0,17029818.0,8243917.0,301.0,149.0,0.441349,0.458462,0.017113,5.561584,-8785901.0,0.657859,False


In [9]:
acv_drift

Unnamed: 0,acv_bucket,deals_baseline_period,deals_decline_period,revenue_baseline_period,revenue_decline_period,wins_baseline_period,wins_decline_period,win_rate_baseline,win_rate_decline,win_rate_delta,weighted_impact,revenue_delta,p_value,statistically_significant
0,Low,830.0,413.0,3569955.0,1749845.0,377.0,187.0,0.454217,0.452785,-0.001432,-0.591566,-1820110.0,1.0,False
2,High,822.0,420.0,17060918.0,8811283.0,377.0,194.0,0.458637,0.461905,0.003267,1.372263,-8249635.0,0.960817,False
1,Mid,856.0,387.0,8485784.0,3829979.0,366.0,168.0,0.42757,0.434109,0.006538,2.530374,-4655805.0,0.877801,False
3,Very High,836.0,407.0,58153147.0,29032170.0,387.0,192.0,0.462919,0.471744,0.008826,3.592105,-29120977.0,0.816424,False


In [10]:
cycle_drift

Unnamed: 0,sales_cycle_bucket,deals_baseline_period,deals_decline_period,revenue_baseline_period,revenue_decline_period,wins_baseline_period,wins_decline_period,win_rate_baseline,win_rate_decline,win_rate_delta,weighted_impact,revenue_delta,p_value,statistically_significant
2,1-2m,929.0,382.0,24036047.0,10470062.0,411.0,155.0,0.442411,0.405759,-0.036652,-14.001076,-13565985.0,0.247641,False
1,2-4w,519.0,164.0,12927048.0,3802409.0,266.0,82.0,0.512524,0.5,-0.012524,-2.05395,-9124639.0,0.849248,False
4,3m+,760.0,526.0,20911140.0,14213345.0,335.0,232.0,0.440789,0.441065,0.000275,0.144737,-6697795.0,1.0,False
0,<2w,274.0,72.0,7265163.0,1936964.0,121.0,36.0,0.441606,0.5,0.058394,4.20438,-5328199.0,0.451655,False
3,2-3m,862.0,483.0,22130406.0,13000497.0,374.0,236.0,0.433875,0.488613,0.054738,26.438515,-9129909.0,0.060461,False


# 3. Mix Shift Analysis --> Composition Change

### “This analysis tells us whether our win rate dropped because execution worsened, or because we’re selling more into segments that historically convert worse. It isolates whether this is a performance problem or a mix problem.”

In [11]:
def mix_shift_analysis(df, segment_col):
    """
    Show how the deal mix across a segment differs between periods.

    The decline period is the last two values of 'closed_year_and_quarter'
    (after sorting the labels); every other quarter is the baseline period.

    Parameters:
    - df: DataFrame containing at least 'closed_year_and_quarter', 'is_won',
      'deal_id', and segment_col. The input is not modified.
    - segment_col: column to analyze (e.g., 'region', 'acv_bucket')

    Returns:
    - DataFrame with one row per (period_flag, segment) and columns:
        deals      number of deals in that period and segment
        win_rate   mean of is_won in that period and segment
        mix_share  deals / total deals of the same period
      sorted by period_flag then mix_share, both descending, so the
      decline-period rows come first.
    """
    quarters = sorted(df["closed_year_and_quarter"].unique())
    decline_quarters = quarters[-2:]

    df = df.copy()
    df["period_flag"] = np.where(
        df["closed_year_and_quarter"].isin(decline_quarters),
        "decline_period",
        "baseline_period"
    )

    # observed=False is passed explicitly: it keeps every category of a
    # categorical segment column in the result and avoids the FutureWarning
    # pandas emits when the default is relied on.
    grouped = (
        df.groupby(["period_flag", segment_col], observed=False)
          .agg(
              deals=("deal_id", "count"),
              win_rate=("is_won", "mean")
          )
          .reset_index()
    )

    # Total deals per period, looked up per row with a vectorised map.
    total_deals = df.groupby("period_flag")["deal_id"].count()
    grouped["mix_share"] = grouped["deals"] / grouped["period_flag"].map(total_deals)

    return grouped.sort_values(["period_flag", "mix_share"], ascending=False)

In [12]:
# Compute the period-by-segment deal mix once per segmentation column.
region_shift, industry_shift, acv_shift, cycle_shift = (
    mix_shift_analysis(df, segment_col)
    for segment_col in ("region", "industry", "acv_bucket", "sales_cycle_bucket")
)

  df.groupby(["period_flag", segment_col])
  df.groupby(["period_flag", segment_col])


In [13]:
region_shift

Unnamed: 0,period_flag,region,deals,win_rate,mix_share
6,decline_period,India,424,0.457547,0.260602
5,decline_period,Europe,407,0.457002,0.250154
7,decline_period,North America,399,0.47619,0.245237
4,decline_period,APAC,397,0.43073,0.244007
2,baseline_period,India,852,0.453052,0.254785
0,baseline_period,APAC,836,0.458134,0.25
3,baseline_period,North America,831,0.43562,0.248505
1,baseline_period,Europe,825,0.455758,0.246711


In [14]:
industry_shift

Unnamed: 0,period_flag,industry,deals,win_rate,mix_share
6,decline_period,EdTech,332,0.424699,0.204057
9,decline_period,SaaS,328,0.457317,0.201598
8,decline_period,HealthTech,325,0.458462,0.199754
5,decline_period,Ecommerce,324,0.45679,0.19914
7,decline_period,FinTech,318,0.481132,0.195452
0,baseline_period,Ecommerce,729,0.444444,0.218002
3,baseline_period,HealthTech,682,0.441349,0.203947
4,baseline_period,SaaS,665,0.44812,0.198864
1,baseline_period,EdTech,653,0.445636,0.195275
2,baseline_period,FinTech,615,0.476423,0.183911


In [15]:
acv_shift

Unnamed: 0,period_flag,acv_bucket,deals,win_rate,mix_share
6,decline_period,High,420,0.461905,0.258144
4,decline_period,Low,413,0.452785,0.253841
7,decline_period,Very High,407,0.471744,0.250154
5,decline_period,Mid,387,0.434109,0.237861
1,baseline_period,Mid,856,0.42757,0.255981
3,baseline_period,Very High,836,0.462919,0.25
0,baseline_period,Low,830,0.454217,0.248206
2,baseline_period,High,822,0.458637,0.245813


In [16]:
cycle_shift

Unnamed: 0,period_flag,sales_cycle_bucket,deals,win_rate,mix_share
9,decline_period,3m+,526,0.441065,0.323294
8,decline_period,2-3m,483,0.488613,0.296865
7,decline_period,1-2m,382,0.405759,0.234788
6,decline_period,2-4w,164,0.5,0.100799
5,decline_period,<2w,72,0.5,0.044253
2,baseline_period,1-2m,929,0.442411,0.277811
3,baseline_period,2-3m,862,0.433875,0.257775
4,baseline_period,3m+,760,0.440789,0.227273
1,baseline_period,2-4w,519,0.512524,0.155203
0,baseline_period,<2w,274,0.441606,0.081938


# 4. Explainable Logistic Model (Driver Quantification)

### “This model isolates the true drivers of win probability while controlling for all other deal characteristics. It tells us structurally which segments, products, and deal types increase or decrease our odds of winning.”

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Predictors fed to the model; all of them go through one-hot encoding below.
features = ["region", "product_type", "acv_bucket", "sales_cycle_bucket", "lead_source"]

X, y = df[features], df["is_won"]

# Hold out 20% for evaluation, stratified on the outcome so both splits keep
# the same won/lost ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# drop="first" removes one level per feature, so each coefficient is measured
# relative to that dropped reference level.
preprocessor = ColumnTransformer(
    transformers=[("cat", OneHotEncoder(drop="first"), features)]
)

model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("logreg", LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the training split; the fitted pipeline is the cell's display value.
model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('logreg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [25]:

# Score the held-out split with the fitted pipeline.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report overall accuracy, then the confusion matrix and per-class metrics.
print(f"Test Accuracy: {accuracy:.4f}\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Test Accuracy: 0.5427

Confusion Matrix:
[[503  42]
 [413  37]]

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.92      0.69       545
           1       0.47      0.08      0.14       450

    accuracy                           0.54       995
   macro avg       0.51      0.50      0.41       995
weighted avg       0.51      0.54      0.44       995



In [26]:
# Recover the one-hot column names and the matching logistic coefficients
# from the fitted pipeline steps.
cat_encoder = model.named_steps["preprocess"].named_transformers_["cat"]
feature_names = cat_encoder.get_feature_names_out(features)
coefficients = model.named_steps["logreg"].coef_[0]

# One row per encoded feature: coefficient, its sign as a label, and exp(coefficient).
driver_df = pd.DataFrame(
    {"feature": feature_names, "coefficient": coefficients}
).assign(
    impact_direction=lambda d: np.where(
        d["coefficient"] > 0, "Improves Win Rate", "Hurts Win Rate"
    ),
    odds_ratio=lambda d: np.exp(d["coefficient"]),
)

driver_df.sort_values("coefficient")


Unnamed: 0,feature,coefficient,impact_direction,odds_ratio
6,acv_bucket_Mid,-0.160811,Hurts Win Rate,0.851453
14,lead_source_Referral,-0.109278,Hurts Win Rate,0.896481
13,lead_source_Partner,-0.077659,Hurts Win Rate,0.925279
12,lead_source_Outbound,-0.028886,Hurts Win Rate,0.971528
2,region_North America,-0.018505,Hurts Win Rate,0.981665
5,acv_bucket_Low,-0.018314,Hurts Win Rate,0.981853
7,acv_bucket_Very High,-0.004888,Hurts Win Rate,0.995124
3,product_type_Enterprise,0.006267,Improves Win Rate,1.006287
4,product_type_Pro,0.019478,Improves Win Rate,1.019669
0,region_Europe,0.04796,Improves Win Rate,1.049129


In [27]:
# Five most negative and five most positive coefficients.
top_negative = driver_df.sort_values(by="coefficient", ascending=True).iloc[:5]
top_positive = driver_df.sort_values(by="coefficient", ascending=False).iloc[:5]

In [28]:
top_negative

Unnamed: 0,feature,coefficient,impact_direction,odds_ratio
6,acv_bucket_Mid,-0.160811,Hurts Win Rate,0.851453
14,lead_source_Referral,-0.109278,Hurts Win Rate,0.896481
13,lead_source_Partner,-0.077659,Hurts Win Rate,0.925279
12,lead_source_Outbound,-0.028886,Hurts Win Rate,0.971528
2,region_North America,-0.018505,Hurts Win Rate,0.981665


In [29]:
top_positive

Unnamed: 0,feature,coefficient,impact_direction,odds_ratio
9,sales_cycle_bucket_2-4w,0.262842,Improves Win Rate,1.300621
11,sales_cycle_bucket_<2w,0.183112,Improves Win Rate,1.200948
8,sales_cycle_bucket_2-3m,0.089657,Improves Win Rate,1.093799
10,sales_cycle_bucket_3m+,0.058854,Improves Win Rate,1.06062
1,region_India,0.051177,Improves Win Rate,1.052509
