## Codes for last model run in chapter 3.2.4
#### first cell is for aiming for balanced model, improving NPV but keeping OK score with win class and last cell with adjusting the treshold to more conservative to reach better precision

### Balanced model

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    confusion_matrix,
)

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import shap

# ------------------------------------------------------------------
# 1. LOAD & PREPARE DATA 
# ------------------------------------------------------------------

# InputData CSV exported from Process mining platform 
df = pd.read_csv("InputData3.2.4.csv")
data = df.copy()

# Filtering based on cutoff date and changing the datatypes to date columns
cutoff_dateclose = pd.Timestamp("2025-1-12")
cutoff_datecreate = pd.Timestamp("2025-4-1")

data["CloseDateNew"] = pd.to_datetime(data["CloseDateNew"])
data["CreatedDate"] = pd.to_datetime(data["CreatedDate"])
data["Timestamp"] = pd.to_datetime(data["Timestamp"])

# Filter by segment if needed
# data = data[data["MarketSegmentc"] == "NAM"]

# Choose your cutoff logic:
# data = data[data["CloseDateNew"] > cutoff_dateclose]
# data = data[(data["CreatedDate"] > cutoff_datecreate ) & (data["CloseDateNew"] > cutoff_dateclose)]
data = data[data["CreatedDate"] > cutoff_datecreate]  # current choice
data = data[data["Timestamp"] > cutoff_datecreate]    # current choice

# Only closed opportunities (comment out to include open ones)
data = data[data["IsClosed"] == 1]

# Values to remove from Activity as these might be leaky
to_drop = ["CloseWon", "ClosedLost", "DidNotMaterialise", "ChangeCloseDate"]
data = data[~data["Activity"].isin(to_drop)].copy()

print(data["Activity"].value_counts())
print(data.shape)
data.info()

n_unique_opps = data["SFOpportunity ID"].nunique()
print("Number of distinct opportunities:", n_unique_opps)

# ------------------------------------------------------------------
# 2. HANDLE MISSING & ENCODE
# ------------------------------------------------------------------

# Columns to explicitly drop (helper columns not used in training)
explicit_drop = [
    "Customer",
    "SFOpportunity ID",
    "IsClosed",
    "CloseDateNew",
    "CreatedDate",
]

# Find columns with >60% missing values
missing_pct = data.isnull().sum() / len(data) * 100
high_missing = missing_pct[missing_pct > 60].index.tolist()

# Combine all columns to drop
cols_to_drop = list(set(explicit_drop + high_missing))

data_cleaned = data.drop(columns=cols_to_drop)

# Missing values summary
missing_counts = data_cleaned.isnull().sum()
missing_pct = (missing_counts / len(data_cleaned) * 100).round(2)

missing_summary = pd.DataFrame({
    "Missing Values": missing_counts,
    "Missing %": missing_pct
})

display(missing_summary)

# Fill missing in categorical columns
for col in data_cleaned.select_dtypes(include="object").columns:
    data_cleaned[col] = data_cleaned[col].fillna("Unknown")

# Fill missing in numeric columns
for col in data_cleaned.select_dtypes(include=["float64", "int64"]).columns:
    if col == "CustomerPrevRevenue":
        # Special rule: treat missing as 0 (no previous revenue)
        data_cleaned[col] = data_cleaned[col].fillna(0)
    else:
        if data_cleaned[col].isnull().any():
            median_val = data_cleaned[col].median()
            data_cleaned[col] = data_cleaned[col].fillna(median_val)

# Data summary
summary = pd.DataFrame({
    "Unique Values": data_cleaned.nunique(dropna=True),
    "Missing Values": data_cleaned.isnull().sum(),
    "Missing %": (data_cleaned.isnull().sum() / len(data_cleaned) * 100).round(2),
    "Data Type": data_cleaned.dtypes.astype(str)
}).sort_values("Unique Values", ascending=False)

display(summary)

# One-hot encode categoricals
df_encoded = pd.get_dummies(data_cleaned, drop_first=False)

# Ensure Timestamp is datetime and sort by time
df_encoded["Timestamp"] = pd.to_datetime(df_encoded["Timestamp"])
df_encoded = df_encoded.sort_values("Timestamp").reset_index(drop=True)

# ------------------------------------------------------------------
# 3. TRAIN / HOLDOUT SPLIT (LAST 30 DAYS AS HOLDOUT)
# ------------------------------------------------------------------

holdout_horizon_days = 30
last_timestamp = df_encoded["Timestamp"].max()
holdout_cutoff = last_timestamp - pd.Timedelta(days=holdout_horizon_days)

train_mask = df_encoded["Timestamp"] < holdout_cutoff
holdout_mask = ~train_mask

train_df = df_encoded[train_mask].reset_index(drop=True)
holdout_df = df_encoded[holdout_mask].reset_index(drop=True)

print("\nTrain period:")
print("  From:", train_df["Timestamp"].min(), "To:", train_df["Timestamp"].max())
print("Holdout period (last 30 days):")
print("  From:", holdout_df["Timestamp"].min(), "To:", holdout_df["Timestamp"].max())
print("Train size:", len(train_df), "rows; Holdout size:", len(holdout_df), "rows")

# Features and target
X = train_df.drop(columns=["IsWon", "Timestamp"])
y = train_df["IsWon"]

X_holdout = holdout_df.drop(columns=["IsWon", "Timestamp"])
y_holdout = holdout_df["IsWon"]

# Holdout set class balance
holdout_pos = (y_holdout == 1).sum()
holdout_neg = (y_holdout == 0).sum()
holdout_total = len(y_holdout)
holdout_pos_rate = (holdout_pos / holdout_total * 100).round(2)

print("\nHoldout set class balance:")
print(f"  Positives: {holdout_pos}")
print(f"  Negatives: {holdout_neg}")
print(f"  % positives: {holdout_pos_rate}%")

# ------------------------------------------------------------------
# 3b. GLOBAL CLASS WEIGHT ON TRAINING WINDOW
# ------------------------------------------------------------------

pos_train = (y == 1).sum()
neg_train = (y == 0).sum()
scale_pos_weight = neg_train / pos_train
print(f"\nGlobal scale_pos_weight (neg/pos on train window): {scale_pos_weight:.3f}")

# ------------------------------------------------------------------
# 4. TIME-SERIES SPLIT ON TRAINING DATA
# ------------------------------------------------------------------

tscv = TimeSeriesSplit(n_splits=100)

fold_rows = []   # store per-fold date ranges and class balances (based on training data)

for fold, (train_idx, test_idx) in enumerate(tscv.split(X, y), start=1):
    train_start = train_df.loc[train_idx[0], "Timestamp"]
    train_end   = train_df.loc[train_idx[-1], "Timestamp"]
    test_start  = train_df.loc[test_idx[0], "Timestamp"]
    test_end    = train_df.loc[test_idx[-1], "Timestamp"]

    print(f"Fold {fold}:")
    print("  Train period:", train_start, "→", train_end)
    print("  Test  period:", test_start,  "→", test_end)
    print("  Train shape:", len(train_idx), "rows, Test shape:", len(test_idx), "rows")

    fold_rows.append({
        "fold": fold,
        "train_start": train_start,
        "train_end": train_end,
        "test_start": test_start,
        "test_end": test_end,
    })

# ------------------------------------------------------------------
# 5. HELPER: EVALUATION (FIXED THRESHOLD 0.5)
# ------------------------------------------------------------------

def evaluate_model(name, y_true, y_pred, y_proba):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0  # Negative Predictive Value

    return {
        "Model": name,
        "ROC_AUC": roc_auc_score(y_true, y_proba),
        "PR_AUC": average_precision_score(y_true, y_proba),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred),
        "Specificity": specificity,
        "NPV": npv,
    }

# ------------------------------------------------------------------
# 6. MODELING - XGBOOST (WITH scale_pos_weight, THRESHOLD=0.5)
# ------------------------------------------------------------------

results = []

# Keep references to last-fold XGB for SHAP
last_fold_xgb = None
last_fold_X_test = None
last_fold_y_test = None
last_fold_y_pred = None
last_fold_y_proba = None
last_fold_y_train = None

threshold = 0.2  # standard decision threshold

for fold, (train_idx, test_idx) in enumerate(tscv.split(X, y), start=1):
    print(f"\n=== Fold {fold} ===")
    
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train set class balance for this fold
    train_pos = (y_train == 1).sum()
    train_neg = (y_train == 0).sum()
    train_total = len(y_train)
    train_pos_rate = (train_pos / train_total * 100).round(2)

    print(f"  Train class balance: {train_pos} positives, {train_neg} negatives "
          f"({train_pos_rate}% positives)")

    # Test set class balance
    test_pos = (y_test == 1).sum()
    test_neg = (y_test == 0).sum()
    test_total = len(y_test)
    test_pos_rate = (test_pos / test_total * 100).round(2)

    print(f"  Test class balance:  {test_pos} positives, {test_neg} negatives "
          f"({test_pos_rate}% positives)")

    test_win_rate = (y_test.mean() * 100).round(2)  # proportion of 1s

    # XGBoost with global class weight
    xgb_clf = xgb.XGBClassifier(
        eval_metric="logloss",
        random_state=42,
        scale_pos_weight=scale_pos_weight
    )
    xgb_clf.fit(X_train, y_train)

    # Probabilities on test
    y_proba_xgb = xgb_clf.predict_proba(X_test)[:, 1]

    # Fixed threshold 0.5
    y_pred_xgb = (y_proba_xgb >= threshold).astype(int)

    # Evaluate
    res_xgb = evaluate_model("XGBoost", y_test, y_pred_xgb, y_proba_xgb)
    res_xgb["fold"] = fold
    res_xgb["test_win_rate"] = test_win_rate
    res_xgb["train_pos"] = train_pos
    res_xgb["train_neg"] = train_neg
    res_xgb["train_pos_rate"] = train_pos_rate
    res_xgb["test_pos"] = test_pos
    res_xgb["test_neg"] = test_neg
    res_xgb["test_pos_rate"] = test_pos_rate
    results.append(res_xgb)

    print(f"  scale_pos_weight (global): {scale_pos_weight:.3f}")
    print("  XGBoost ROC-AUC:", res_xgb["ROC_AUC"])
    print("  XGBoost PR-AUC:", res_xgb["PR_AUC"])
    print("  XGBoost Accuracy:", res_xgb["Accuracy"])
    print("  XGBoost Recall:", res_xgb["Recall"])
    print("  XGBoost Precision:", res_xgb["Precision"])
    print("  XGBoost Specificity:", res_xgb["Specificity"])
    print("  XGBoost NPV:", res_xgb["NPV"])

    # Update fold_rows with class balance info for this fold
    fold_rows[fold - 1]["train_pos"] = train_pos
    fold_rows[fold - 1]["train_neg"] = train_neg
    fold_rows[fold - 1]["train_pos_rate"] = train_pos_rate
    fold_rows[fold - 1]["test_pos"] = test_pos
    fold_rows[fold - 1]["test_neg"] = test_neg
    fold_rows[fold - 1]["test_pos_rate"] = test_pos_rate

    # Keep last fold XGBoost for SHAP
    last_fold_xgb = xgb_clf
    last_fold_X_test = X_test.copy()
    last_fold_y_test = y_test.copy()
    last_fold_y_pred = y_pred_xgb.copy()
    last_fold_y_proba = y_proba_xgb.copy()
    last_fold_y_train = y_train.copy()

# ------------------------------------------------------------------
# 7. FULL TRAIN + FINAL HOLDOUT EVALUATION (WITH scale_pos_weight)
# ------------------------------------------------------------------

X_full_train = X
y_full_train = y

pos_count_full = (y_full_train == 1).sum()
neg_count_full = (y_full_train == 0).sum()

if pos_count_full == 0:
    raise RuntimeError("No positive samples in the full training data.")

full_train_total = len(y_full_train)
full_train_pos_rate = (pos_count_full / full_train_total * 100).round(2)

print("\nFull training set class balance:")
print(f"  Positives: {pos_count_full}")
print(f"  Negatives: {neg_count_full}")
print(f"  % positives: {full_train_pos_rate}%")
print(f"  Using same scale_pos_weight as in CV: {scale_pos_weight:.3f}")

xgb_holdout = xgb.XGBClassifier(
    eval_metric="logloss",
    random_state=42,
    scale_pos_weight=scale_pos_weight
)

xgb_holdout.fit(X_full_train, y_full_train)

y_holdout_proba = xgb_holdout.predict_proba(X_holdout)[:, 1]

# Default threshold 0.5
y_holdout_pred = (y_holdout_proba >= 0.2).astype(int)

holdout_results = evaluate_model("XGBoost_holdout", y_holdout, y_holdout_pred, y_holdout_proba)

print("\n=== HOLDOUT RESULTS (scale_pos_weight, THRESHOLD 0.5) ===")
print("Holdout win rate (% of 1s):", (y_holdout.mean() * 100).round(2))
print("Holdout ROC-AUC:", holdout_results["ROC_AUC"])
print("Holdout PR-AUC:", holdout_results["PR_AUC"])
print("Holdout Accuracy:", holdout_results["Accuracy"])
print("Holdout Recall:", holdout_results["Recall"])
print("Holdout Precision:", holdout_results["Precision"])
print("Holdout F1:", holdout_results["F1"])
print("Holdout Specificity:", holdout_results["Specificity"])
print("Holdout NPV:", holdout_results["NPV"])

# ------------------------------------------------------------------
# 8. CONFUSION MATRIX - HOLDOUT
# ------------------------------------------------------------------

cm_holdout = confusion_matrix(y_holdout, y_holdout_pred)
cm_holdout_df = pd.DataFrame(
    cm_holdout,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

print("\nHoldout confusion matrix (counts):")
print(cm_holdout_df)

plt.figure(figsize=(5, 4))
sns.heatmap(
    cm_holdout_df,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False
)
plt.title("Confusion Matrix - Holdout")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------
# 9. RESULTS TABLES & EXPORT (WITH FOLD DATES & CLASS BALANCES)
# ------------------------------------------------------------------

results_df = pd.DataFrame(results)

# Per fold (only XGBoost)
by_fold = (
    results_df
    .groupby(["fold", "Model"], as_index=False)[
        [
            "ROC_AUC",
            "PR_AUC",
            "Accuracy",
            "Recall",
            "Precision",
            "F1",
            "Specificity",
            "NPV",
            "test_win_rate",
            "train_pos",
            "train_neg",
            "train_pos_rate",
            "test_pos",
            "test_neg",
            "test_pos_rate",
        ]
    ]
    .mean()
    .round(4)
    .sort_values(["fold", "Model"])
)

display(by_fold.head())

# Average over folds
avg_metrics = (
    results_df
    .groupby("Model", as_index=False)[
        [
            "ROC_AUC",
            "PR_AUC",
            "Accuracy",
            "Recall",
            "Precision",
            "F1",
            "Specificity",
            "NPV",
        ]
    ]
    .mean()
    .round(4)
    .sort_values("ROC_AUC", ascending=False)
)

print("\nAverage metrics over all folds:")
display(avg_metrics)

# Fold metadata including dates and class balances
fold_periods_df = pd.DataFrame(fold_rows).sort_values("fold").reset_index(drop=True)

by_fold_with_dates = by_fold.merge(
    fold_periods_df,
    on="fold",
    how="left"
).sort_values(["Model", "fold"])

print("\nPer-fold summary with dates and class balances:")
display(by_fold_with_dates.head())

# Save detailed per-fold results
by_fold_with_dates.to_csv("by_fold_xgboost_100folds_with_spw_dates_and_balance.csv", index=False)
results_df.to_csv("xgboost_evaluation_results_detailed_with_spw.csv", index=False)

# Aggregate average and standard deviation of metrics
metric_cols = [
    "ROC_AUC",
    "PR_AUC",
    "Accuracy",
    "Recall",
    "Precision",
    "F1",
    "Specificity",
    "NPV",
    "test_win_rate",
    "train_pos_rate",
    "test_pos_rate",
]

summary = (
    results_df
    .groupby("Model")[metric_cols]
    .agg(["mean", "std"])
)

summary.columns = [f"{metric}_{stat}" for metric, stat in summary.columns]
summary = summary.round(4).reset_index()

print("\nAverage and standard deviation of metrics (XGBoost, with scale_pos_weight):")
print(summary.to_string(index=False))

summary.to_csv("xgboost_metrics_avg_std_with_spw.csv", index=False)

# ------------------------------------------------------------------
# 10. PLOT METRICS PER FOLD (XGBOOST ONLY)
# ------------------------------------------------------------------

metrics_to_plot = ["ROC_AUC", "PR_AUC", "Accuracy", "Precision", "Recall", "F1", "Specificity", "NPV"]

fig, axes = plt.subplots(len(metrics_to_plot), 1, figsize=(10, 18), sharex=True)

subset = by_fold_with_dates[by_fold_with_dates["Model"] == "XGBoost"]

for ax, metric in zip(axes, metrics_to_plot):
    ax.plot(subset["fold"], subset[metric], marker="o", label="XGBoost")
    ax.set_ylabel(metric)
    ax.grid(True, alpha=0.3)

axes[-1].set_xlabel("Fold")
axes[0].legend(loc="best")
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------
# 11. SHAP ANALYSIS ON HOLDOUT SET
# ------------------------------------------------------------------

print("\nRunning SHAP for final holdout model...")

explainer = shap.TreeExplainer(xgb_holdout)
shap_values_holdout = explainer.shap_values(X_holdout)

# Bar summary (global feature importance on holdout)
shap.summary_plot(shap_values_holdout, X_holdout, plot_type="bar")

# Beeswarm detailed summary (per-feature distribution on holdout)
shap.summary_plot(shap_values_holdout, X_holdout)

Series([], Name: count, dtype: int64)
(0, 26)
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 26 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   SFOpportunity ID                  0 non-null      int64         
 1   IsWon                             0 non-null      int64         
 2   IsClosed                          0 non-null      int64         
 3   Activity                          0 non-null      float64       
 4   Timestamp                         0 non-null      datetime64[ns]
 5   CloseDateNew                      0 non-null      datetime64[ns]
 6   CreatedDate                       0 non-null      datetime64[ns]
 7   Previous activity                 0 non-null      float64       
 8   Lead time from previous activity  0 non-null      float64       
 9   Activity amount so far            0 non-null      float64       
 10  Rework amount        

Unnamed: 0,Missing Values,Missing %
IsWon,0,
Activity,0,
Timestamp,0,
Previous activity,0,
Lead time from previous activity,0,
Activity amount so far,0,
Rework amount,0,
Lead time until the stage,0,
Amount in stage,0,
MarketSegmentc,0,


Unnamed: 0,Unique Values,Missing Values,Missing %,Data Type
IsWon,0,0,,int64
Industryc,0,0,,float64
CustomerPrevRevenue,0,0,,float64
Tenderflag,0,0,,float64
BigDealAlertSent,0,0,,float64
OfferingCategory,0,0,,float64
QuoteOperatingUnitc,0,0,,float64
Paymentterm,0,0,,float64
Ownerrolec,0,0,,float64
Subsegmentc,0,0,,float64



Train period:
  From: NaT To: NaT
Holdout period (last 30 days):
  From: NaT To: NaT
Train size: 0 rows; Holdout size: 0 rows

Holdout set class balance:
  Positives: 0
  Negatives: 0
  % positives: nan%

Global scale_pos_weight (neg/pos on train window): nan


  holdout_pos_rate = (holdout_pos / holdout_total * 100).round(2)
  scale_pos_weight = neg_train / pos_train


ValueError: Cannot have number of folds=101 greater than the number of samples=0.

### Model for improved precision

In [5]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    accuracy_score,
    recall_score,
    precision_score,
    f1_score,
    confusion_matrix,
)

import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
import shap

# ------------------------------------------------------------------
# 1. LOAD & PREPARE DATA 
# ------------------------------------------------------------------

# InputData CSV exported from Process mining platform 
df = pd.read_csv('InputData3.2.4.csv')
data = df.copy()

# Filtering based on cutoff date and changing the datatypes to date columns
cutoff_dateclose = pd.Timestamp("2025-1-12")
cutoff_datecreate = pd.Timestamp("2025-4-1")

data["CloseDateNew"] = pd.to_datetime(data["CloseDateNew"])
data["CreatedDate"] = pd.to_datetime(data["CreatedDate"])
data["Timestamp"] = pd.to_datetime(data["Timestamp"])

# Filter by segment if needed
# data = data[data["MarketSegmentc"] == "NAM"]

# Choose cutoff logic:
# data = data[data["CloseDateNew"] > cutoff_dateclose]
# data = data[(data["CreatedDate"] > cutoff_datecreate ) & (data["CloseDateNew"] > cutoff_dateclose)]
data = data[data["CreatedDate"] > cutoff_datecreate]  # current choice
data = data[data["Timestamp"] > cutoff_datecreate]    # current choice

# Only closed opportunities (comment out to include open ones)
data = data[data["IsClosed"] == 1]

# Values to remove from Activity as these might be leaky
to_drop = ["CloseWon", "ClosedLost", "DidNotMaterialise", "ChangeCloseDate"]
data = data[~data["Activity"].isin(to_drop)].copy()

print(data["Activity"].value_counts())
print(data.shape)
data.info()
n_unique_opps = data["SFOpportunity ID"].nunique()
print("Number of distinct opportunities:", n_unique_opps)

# ------------------------------------------------------------------
# 2. HANDLE MISSING & ENCODE
# ------------------------------------------------------------------

# Columns to explicitly drop (these are helper columns not used in training)
explicit_drop = [
    "Customer",
    "SFOpportunity ID",
    "IsClosed",
    "CloseDateNew",
    "CreatedDate",
]

# Find columns with >60% missing values
missing_pct = data.isnull().sum() / len(data) * 100
high_missing = missing_pct[missing_pct > 60].index.tolist()

# Combine all columns to drop
cols_to_drop = list(set(explicit_drop + high_missing))

data_cleaned = data.drop(columns=cols_to_drop)

# Missing values summary
missing_counts = data_cleaned.isnull().sum()
missing_pct = (missing_counts / len(data_cleaned) * 100).round(2)

missing_summary = pd.DataFrame({
    "Missing Values": missing_counts,
    "Missing %": missing_pct
})

display(missing_summary)

# Fill missing in categorical columns
for col in data_cleaned.select_dtypes(include="object").columns:
    data_cleaned[col] = data_cleaned[col].fillna("Unknown")

# Fill missing in numeric columns
for col in data_cleaned.select_dtypes(include=["float64", "int64"]).columns:
    if col == "CustomerPrevRevenue":
        # Special rule: treat missing as 0 (no previous revenue)
        data_cleaned[col] = data_cleaned[col].fillna(0)
    else:
        if data_cleaned[col].isnull().any():
            median_val = data_cleaned[col].median()
            data_cleaned[col] = data_cleaned[col].fillna(median_val)

# Data summary
summary = pd.DataFrame({
    "Unique Values": data_cleaned.nunique(dropna=True),
    "Missing Values": data_cleaned.isnull().sum(),
    "Missing %": (data_cleaned.isnull().sum() / len(data_cleaned) * 100).round(2),
    "Data Type": data_cleaned.dtypes.astype(str)
}).sort_values("Unique Values", ascending=False)

display(summary)

# One-hot encode categoricals
df_encoded = pd.get_dummies(data_cleaned, drop_first=False)

# Ensure Timestamp is datetime and sort by time
df_encoded["Timestamp"] = pd.to_datetime(df_encoded["Timestamp"])
df_encoded = df_encoded.sort_values("Timestamp").reset_index(drop=True)

# ------------------------------------------------------------------
# 3. TRAIN / HOLDOUT SPLIT (LAST 30 DAYS AS HOLDOUT)
# ------------------------------------------------------------------

holdout_horizon_days = 30
last_timestamp = df_encoded["Timestamp"].max()
holdout_cutoff = last_timestamp - pd.Timedelta(days=holdout_horizon_days)

train_mask = df_encoded["Timestamp"] < holdout_cutoff
holdout_mask = ~train_mask

train_df = df_encoded[train_mask].reset_index(drop=True)
holdout_df = df_encoded[holdout_mask].reset_index(drop=True)

print("\nTrain period:")
print("  From:", train_df["Timestamp"].min(), "To:", train_df["Timestamp"].max())
print("Holdout period (last 14 days):")
print("  From:", holdout_df["Timestamp"].min(), "To:", holdout_df["Timestamp"].max())
print("Train size:", len(train_df), "rows; Holdout size:", len(holdout_df), "rows")

# Features and target
X = train_df.drop(columns=["IsWon", "Timestamp"])
y = train_df["IsWon"]

X_holdout = holdout_df.drop(columns=["IsWon", "Timestamp"])
y_holdout = holdout_df["IsWon"]

# Holdout set class balance
holdout_pos = (y_holdout == 1).sum()
holdout_neg = (y_holdout == 0).sum()
holdout_total = len(y_holdout)
holdout_pos_rate = (holdout_pos / holdout_total * 100).round(2)

print("\nHoldout set class balance:")
print(f"  Positives: {holdout_pos}")
print(f"  Negatives: {holdout_neg}")
print(f"  % positives: {holdout_pos_rate}%")

# Time series split on training data only (change n_splits as needed)
tscv = TimeSeriesSplit(n_splits=100)

fold_rows = []   # store per-fold date ranges and class balances (based on training data)

for fold, (train_idx, test_idx) in enumerate(tscv.split(X, y), start=1):
    train_start = train_df.loc[train_idx[0], "Timestamp"]
    train_end   = train_df.loc[train_idx[-1], "Timestamp"]
    test_start  = train_df.loc[test_idx[0], "Timestamp"]
    test_end    = train_df.loc[test_idx[-1], "Timestamp"]

    print(f"Fold {fold}:")
    print("  Train period:", train_start, "→", train_end)
    print("  Test  period:", test_start,  "→", test_end)
    print("  Train shape:", len(train_idx), "rows, Test shape:", len(test_idx), "rows")

    fold_rows.append({
        "fold": fold,
        "train_start": train_start,
        "train_end": train_end,
        "test_start": test_start,
        "test_end": test_end,
    })

# ------------------------------------------------------------------
# 4. HELPER: EVALUATION (FIXED THRESHOLD 0.5)
# ------------------------------------------------------------------

def evaluate_model(name, y_true, y_pred, y_proba):
    # confusion matrix: tn, fp, fn, tp
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0  # 
    
    return {
        "Model": name,
        "ROC_AUC": roc_auc_score(y_true, y_proba),
        "PR_AUC": average_precision_score(y_true, y_proba),
        "Accuracy": accuracy_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "F1": f1_score(y_true, y_pred),
        "Specificity": specificity,
        "NPV": npv,
    }

# ------------------------------------------------------------------
# 5. MODELING - XGBOOST 
# ------------------------------------------------------------------

results = []

# Keep references to last-fold XGB for SHAP
last_fold_xgb = None
last_fold_X_test = None
last_fold_y_test = None
last_fold_y_pred = None
last_fold_y_proba = None
last_fold_y_train = None

for fold, (train_idx, test_idx) in enumerate(tscv.split(X, y), start=1):
    print(f"\n=== Fold {fold} ===")
    
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train set class balance for this fold
    train_pos = (y_train == 1).sum()
    train_neg = (y_train == 0).sum()
    train_total = len(y_train)
    train_pos_rate = (train_pos / train_total * 100).round(2)

    print(f"  Train class balance: {train_pos} positives, {train_neg} negatives "
          f"({train_pos_rate}% positives)")

    # Test set class balance
    test_pos = (y_test == 1).sum()
    test_neg = (y_test == 0).sum()
    test_total = len(y_test)
    test_pos_rate = (test_pos / test_total * 100).round(2)

    print(f"  Test class balance:  {test_pos} positives, {test_neg} negatives "
          f"({test_pos_rate}% positives)")
    test_win_rate = (y_test.mean() * 100).round(2)  # proportion of 1s
    print(f"  Test class balance: {test_win_rate}% positives")

    # Compute class weight for this fold: positive class = loss (label 1)
    pos_count = train_pos
    neg_count = train_neg

    if pos_count == 0:
        print("  WARNING: No positive (loss) samples in this training fold. Skipping fold.")
        continue

    scale_pos_weight = neg_count / pos_count
    print(f"  scale_pos_weight (loss vs win): {scale_pos_weight:.3f}")

    xgb_clf = xgb.XGBClassifier(
        eval_metric="logloss",
        scale_pos_weight=scale_pos_weight,
        random_state=42
    )

    xgb_clf.fit(X_train, y_train)

    # Probabilities on test
    y_proba_xgb = xgb_clf.predict_proba(X_test)[:, 1]

    # Default threshold 0.5
    y_pred_xgb = (y_proba_xgb >= 0.9).astype(int)

    # Evaluate
    res_xgb = evaluate_model("XGBoost", y_test, y_pred_xgb, y_proba_xgb)
    res_xgb["fold"] = fold
    res_xgb["test_win_rate"] = test_win_rate
    res_xgb["train_pos"] = train_pos
    res_xgb["train_neg"] = train_neg
    res_xgb["train_pos_rate"] = train_pos_rate
    results.append(res_xgb)

    print("  XGBoost ROC-AUC:", res_xgb["ROC_AUC"])
    print("  XGBoost PR-AUC:", res_xgb["PR_AUC"])
    print("  XGBoost Accuracy:", res_xgb["Accuracy"])
    print("  XGBoost Recall:", res_xgb["Recall"])
    print("  XGBoost Precision:", res_xgb["Precision"])
    print("  XGBoost Specificity:", res_xgb["Specificity"])
    print("  XGBoost NPV:", res_xgb["NPV"])

    # Update fold_rows with class balance info for this fold
    fold_rows[fold - 1]["train_pos"] = train_pos
    fold_rows[fold - 1]["train_neg"] = train_neg
    fold_rows[fold - 1]["train_pos_rate"] = train_pos_rate
    fold_rows[fold - 1]["test_pos_rate"] = test_win_rate

    # Keep last fold XGBoost for SHAP
    last_fold_xgb = xgb_clf
    last_fold_X_test = X_test.copy()
    last_fold_y_test = y_test.copy()
    last_fold_y_pred = y_pred_xgb.copy()
    last_fold_y_proba = y_proba_xgb.copy()
    last_fold_y_train = y_train.copy()

# ------------------------------------------------------------------
# 6. FULL TRAIN  + FINAL HOLDOUT EVALUATION
# ------------------------------------------------------------------

X_full_train = X
y_full_train = y

pos_count_full = (y_full_train == 1).sum()
neg_count_full = (y_full_train == 0).sum()

if pos_count_full == 0:
    raise RuntimeError("No positive samples in the full training data.")

full_train_total = len(y_full_train)
full_train_pos_rate = (pos_count_full / full_train_total * 100).round(2)

print("\nFull training set class balance:")
print(f"  Positives: {pos_count_full}")
print(f"  Negatives: {neg_count_full}")
print(f"  % positives: {full_train_pos_rate}%")

scale_pos_weight_full = neg_count_full / pos_count_full
print(f"Full-train scale_pos_weight (loss vs win): {scale_pos_weight_full:.3f}")

xgb_holdout = xgb.XGBClassifier(
    eval_metric="logloss",
    scale_pos_weight=scale_pos_weight_full,
    random_state=42
)

xgb_holdout.fit(X_full_train, y_full_train)

y_holdout_proba = xgb_holdout.predict_proba(X_holdout)[:, 1]

# Default threshold 0.5
y_holdout_pred = (y_holdout_proba >= 0.9).astype(int)

holdout_results = evaluate_model("XGBoost_holdout", y_holdout, y_holdout_pred, y_holdout_proba)

print("\n=== HOLDOUT RESULTS (DEFAULT THRESHOLD 0.5) ===")
print("Holdout win rate (% of 1s):", (y_holdout.mean() * 100).round(2))
print("Holdout ROC-AUC:", holdout_results["ROC_AUC"])
print("Holdout PR-AUC:", holdout_results["PR_AUC"])
print("Holdout Accuracy:", holdout_results["Accuracy"])
print("Holdout Recall:", holdout_results["Recall"])
print("Holdout Precision:", holdout_results["Precision"])
print("Holdout F1:", holdout_results["F1"])
print("Holdout Specificity:", holdout_results["Specificity"])
print("Holdout NPV:", holdout_results["NPV"])

# ------------------------------------------------------------------
# 7. CONFUSION MATRIX - HOLDOUT
# ------------------------------------------------------------------

cm_holdout = confusion_matrix(y_holdout, y_holdout_pred)
cm_holdout_df = pd.DataFrame(
    cm_holdout,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

print("\nHoldout confusion matrix (counts):")
print(cm_holdout_df)

plt.figure(figsize=(5, 4))
sns.heatmap(
    cm_holdout_df,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False
)
plt.title("Confusion Matrix - Holdout")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------
# 8. RESULTS TABLES & EXPORT (WITH FOLD DATES & CLASS BALANCES)
# ------------------------------------------------------------------

results_df = pd.DataFrame(results)

# Per fold (only XGBoost)
by_fold = (
    results_df
    .groupby(["fold", "Model"], as_index=False)[
        [
            "ROC_AUC",
            "PR_AUC",
            "Accuracy",
            "Recall",
            "Precision",
            "F1",
            "Specificity",
            "NPV",
            "test_win_rate",
            "train_pos",
            "train_neg",
            "train_pos_rate",
        ]
    ]
    .mean()
    .round(4)
    .sort_values(["fold", "Model"])
)

display(by_fold.head())

# Average over folds
avg_metrics = (
    results_df
    .groupby("Model", as_index=False)[
        [
            "ROC_AUC",
            "PR_AUC",
            "Accuracy",
            "Recall",
            "Precision",
            "F1",
            "Specificity",
            "NPV",
        ]
    ]
    .mean()
    .round(4)
    .sort_values("ROC_AUC", ascending=False)
)

print("\nAverage metrics over all folds:")
display(avg_metrics)

# Fold metadata including dates and class balances
fold_periods_df = pd.DataFrame(fold_rows).sort_values("fold").reset_index(drop=True)

by_fold_with_dates = by_fold.merge(
    fold_periods_df,
    on="fold",
    how="left"
).sort_values(["Model", "fold"])

print("\nPer-fold summary with dates and class balances:")
display(by_fold_with_dates.head())

# Save detailed per-fold results
by_fold_with_dates.to_csv("by_fold_xgboost_100folds_with_dates_and_balance.csv", index=False)
results_df.to_csv("xgboost_evaluation_results_detailed.csv", index=False)

# Aggregate average and standard deviation of metrics
metric_cols = [
    "ROC_AUC",
    "PR_AUC",
    "Accuracy",
    "Recall",
    "Precision",
    "F1",
    "Specificity",
    "NPV",
    "test_win_rate",
    "train_pos_rate",
]

summary = (
    results_df
    .groupby("Model")[metric_cols]
    .agg(["mean", "std"])
)

summary.columns = [f"{metric}_{stat}" for metric, stat in summary.columns]
summary = summary.round(4).reset_index()

print("\nAverage and standard deviation of metrics (XGBoost):")
print(summary.to_string(index=False))

summary.to_csv("xgboost_metrics_avg_std.csv", index=False)

# ------------------------------------------------------------------
# 9. PLOT METRICS PER FOLD (XGBOOST ONLY)
# ------------------------------------------------------------------

metrics_to_plot = ["ROC_AUC", "PR_AUC", "Accuracy", "Precision", "Recall", "F1", "Specificity", "NPV"]

fig, axes = plt.subplots(len(metrics_to_plot), 1, figsize=(10, 18), sharex=True)

subset = by_fold_with_dates[by_fold_with_dates["Model"] == "XGBoost"]

for ax, metric in zip(axes, metrics_to_plot):
    ax.plot(subset["fold"], subset[metric], marker="o", label="XGBoost")
    ax.set_ylabel(metric)
    ax.grid(True, alpha=0.3)

axes[-1].set_xlabel("Fold")
axes[0].legend(loc="best")
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------
# 10. SHAP ANALYSIS ON HOLDOUT SET
# ------------------------------------------------------------------

print("\nRunning SHAP for final holdout model...")

# Use the model trained on full training data
explainer = shap.TreeExplainer(xgb_holdout)

# SHAP values for the holdout feature matrix
shap_values_holdout = explainer.shap_values(X_holdout)

# Bar summary (global feature importance on holdout)
shap.summary_plot(shap_values_holdout, X_holdout, plot_type="bar")

# Beeswarm detailed summary (per-feature distribution on holdout)
shap.summary_plot(shap_values_holdout, X_holdout)

Series([], Name: count, dtype: int64)
(0, 26)
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 26 columns):
 #   Column                            Non-Null Count  Dtype         
---  ------                            --------------  -----         
 0   SFOpportunity ID                  0 non-null      int64         
 1   IsWon                             0 non-null      int64         
 2   IsClosed                          0 non-null      int64         
 3   Activity                          0 non-null      float64       
 4   Timestamp                         0 non-null      datetime64[ns]
 5   CloseDateNew                      0 non-null      datetime64[ns]
 6   CreatedDate                       0 non-null      datetime64[ns]
 7   Previous activity                 0 non-null      float64       
 8   Lead time from previous activity  0 non-null      float64       
 9   Activity amount so far            0 non-null      float64       
 10  Rework amount        

Unnamed: 0,Missing Values,Missing %
IsWon,0,
Activity,0,
Timestamp,0,
Previous activity,0,
Lead time from previous activity,0,
Activity amount so far,0,
Rework amount,0,
Lead time until the stage,0,
Amount in stage,0,
MarketSegmentc,0,


Unnamed: 0,Unique Values,Missing Values,Missing %,Data Type
IsWon,0,0,,int64
Industryc,0,0,,float64
CustomerPrevRevenue,0,0,,float64
Tenderflag,0,0,,float64
BigDealAlertSent,0,0,,float64
OfferingCategory,0,0,,float64
QuoteOperatingUnitc,0,0,,float64
Paymentterm,0,0,,float64
Ownerrolec,0,0,,float64
Subsegmentc,0,0,,float64



Train period:
  From: NaT To: NaT
Holdout period (last 14 days):
  From: NaT To: NaT
Train size: 0 rows; Holdout size: 0 rows

Holdout set class balance:
  Positives: 0
  Negatives: 0
  % positives: nan%


  holdout_pos_rate = (holdout_pos / holdout_total * 100).round(2)


ValueError: Cannot have number of folds=101 greater than the number of samples=0.