In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Display setting: None removes the column limit, so wide frames are shown
# with every column instead of being truncated with "...".
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw deals export. Both date columns are parsed at load time so that
# every later cell (including the date-range checks) works with real Timestamps
# on a fresh kernel, rather than relying on a conversion cell having run first.
df = pd.read_csv(
    'data/skygeni_sales_data.csv',
    parse_dates=['created_date', 'closed_date'],
)

# Quick look at a few random rows.
df.sample(5)

Unnamed: 0,deal_id,created_date,closed_date,sales_rep_id,industry,region,product_type,lead_source,deal_stage,deal_amount,sales_cycle_days,outcome
3317,D03318,2024-03-23,2024-06-01,rep_12,HealthTech,North America,Enterprise,Outbound,Qualified,4920,70,Won
3540,D03541,2023-04-13,2023-07-11,rep_12,Ecommerce,India,Core,Referral,Negotiation,4418,89,Won
102,D00103,2023-02-05,2023-05-29,rep_25,HealthTech,North America,Pro,Outbound,Proposal,3697,113,Lost
1363,D01364,2023-07-11,2023-08-18,rep_11,HealthTech,APAC,Enterprise,Inbound,Closed,12435,38,Won
1481,D01482,2023-03-18,2023-07-05,rep_12,EdTech,Europe,Enterprise,Referral,Qualified,68439,109,Won


In [5]:
# Earliest and latest deal creation dates.
# Series.min()/.max() are vectorized and skip missing dates (NaT); the Python
# builtins iterate element by element and give order-dependent results with NaT.
df['created_date'].min(), df['created_date'].max()

(Timestamp('2023-01-01 00:00:00'), Timestamp('2024-03-26 00:00:00'))

In [6]:
# Overall span covered by the data: earliest creation date to latest close date.
# Series.min()/.max() are vectorized and skip missing dates (NaT); the Python
# builtins iterate element by element and give order-dependent results with NaT.
df['created_date'].min(), df['closed_date'].max()

(Timestamp('2023-01-01 00:00:00'), Timestamp('2024-07-20 00:00:00'))

# Feature Engineering

In [7]:
# Make sure both date columns are datetime64 so the .dt accessor below works
# (pd.to_datetime leaves already-converted columns unchanged).
df["created_date"] = pd.to_datetime(df["created_date"])
df["closed_date"] = pd.to_datetime(df["closed_date"])

# Calendar features, derived identically for the creation and the close date.
# For each prefix the columns are added in this order:
#   <prefix>_month, <prefix>_quarter, <prefix>_year,
#   <prefix>_year_and_quarter (string such as "2023Q1"),
#   <prefix>_weekday (Monday=0 ... Sunday=6).
for prefix in ("created", "closed"):
    dates = df[f"{prefix}_date"]
    df[f"{prefix}_month"] = dates.dt.month
    df[f"{prefix}_quarter"] = dates.dt.quarter
    df[f"{prefix}_year"] = dates.dt.year
    df[f"{prefix}_year_and_quarter"] = dates.dt.to_period("Q").astype(str)
    df[f"{prefix}_weekday"] = dates.dt.weekday

# Time buckets (helps CRO thinking).
# Bins are right-closed: (0, 14], (14, 30], (30, 60], (60, 90], (90, inf).
# include_lowest=True also puts a 0-day cycle (deal closed on the day it was
# created) into the first bucket instead of silently turning it into NaN.
df["sales_cycle_bucket"] = pd.cut(
    df["sales_cycle_days"],
    bins=[0, 14, 30, 60, 90, np.inf],
    labels=["<2w", "2-4w", "1-2m", "2-3m", "3m+"],
    include_lowest=True,
)

# Binary outcome: 1 for "Won", 0 for anything else.
df["is_won"] = (df["outcome"] == "Won").astype(int)

# ACV buckets (very important): quartiles of deal_amount.
# The raw amounts are bucketed directly; no log transform is applied, because
# quantile bucketing depends on the rank order of the values, which log preserves.
df["acv_bucket"] = pd.qcut(
    df["deal_amount"],
    q=4,
    labels=["Low", "Mid", "High", "Very High"],
)

# It doesn't matter whether log is applied before pd.qcut: log is a monotonic function (it preserves order),
# and qcut works by splitting the ranked data into equal-sized groups,
# so the rank order of the values does not change after applying log.
# NOTE: when the only goal is bucketing with qcut, skip the log transform, as it is just additional compute;
# see examples/qcut-example.ipynb for a sanity check.

In [4]:
# deal_amount summary per creation quarter and ACV bucket.
# acv_bucket is a categorical column (it comes from pd.qcut), so `observed` is
# passed explicitly: observed=False keeps every category combination in the
# result and avoids pandas' FutureWarning about that default changing.
df.groupby(["created_year_and_quarter", "acv_bucket"], observed=False)["deal_amount"].agg(
    ["min", "max", "sum", "median", "mean", "count"]
)

  df.groupby(["created_year_and_quarter","acv_bucket"])["deal_amount"].agg(["min", "max", "sum", "median", "mean", "count"])


Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,sum,median,mean,count
created_year_and_quarter,acv_bucket,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023Q1,Low,2052,6550,360441,4349.0,4342.662651,83
2023Q1,Mid,6680,14119,930299,9717.0,10003.215054,93
2023Q1,High,14244,35218,1805689,18608.0,20755.045977,87
2023Q1,Very High,39701,99320,4593538,71775.0,70669.815385,65
2023Q2,Low,2002,6608,1090590,4116.0,4210.772201,259
2023Q2,Mid,6639,14161,2416438,9984.0,9985.280992,242
2023Q2,High,14178,39053,4963361,18475.0,20854.457983,238
2023Q2,Very High,39244,99377,16410000,70697.0,70128.205128,234
2023Q3,Low,2045,6569,1039567,4474.0,4386.35865,237
2023Q3,Mid,6612,14143,2376404,9459.0,9779.440329,243
