# IT Incident SLA Breach Prediction
This notebook:
- Reads raw event log `sla_incident_events.csv`
- Cleans data, resolves SLA inconsistencies
- Uses only the **first 3 events** per incident to build features
- Trains a baseline (Logistic Regression) and an advanced model (XGBoost)
- Interprets early warning signals and runs a counterfactual (what-if) analysis on one high-risk incident

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by pandas,
# scikit-learn and xgboost -- consider narrowing the filter.
warnings.filterwarnings("ignore")
# Use seaborn's "whitegrid" style for all plots drawn below.
sns.set(style="whitegrid")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, recall_score,
    confusion_matrix, classification_report, roc_curve
)
import xgboost as xgb

In [None]:
# Path of the raw incident event log.
EVENTS_CSV = "sla_incident_events.csv"

# Read the log and have pandas parse the `timestamp` column as datetimes.
events = pd.read_csv(filepath_or_buffer=EVENTS_CSV, parse_dates=["timestamp"])
print("Loaded events:", events.shape)
events.head()

In [None]:
# 1) + 2) Remove exact duplicate rows, then discard any event that lacks
#         an incident id or a timestamp.
events = events.drop_duplicates().dropna(subset=['incident_id', 'timestamp'])

# 3) Resolve SLA label inconsistencies per incident using majority vote
def resolve_sla_majority(g):
    """Return a copy of *g* whose ``sla_breached`` column holds one resolved label.

    The label is the most frequent non-null ``sla_breached`` value in the group.
    If the two most frequent values are tied the label is 1, and if the group
    has no non-null label at all it is 0.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single incident; must contain a ``sla_breached`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as *g*. The input is left
        untouched, because pandas does not support group functions that
        mutate the object they are handed.
    """
    # value_counts() drops NaN and sorts by frequency, highest first.
    counts = g['sla_breached'].value_counts()
    if counts.empty:
        chosen = 0
    elif len(counts) > 1 and counts.iloc[0] == counts.iloc[1]:
        # Tie between the two most frequent labels: choose 1.
        chosen = 1
    else:
        chosen = counts.idxmax()
    return g.assign(sla_breached=chosen)

# Resolve the label incident by incident. Only the label column is routed
# through the group-wise call: `DataFrame.groupby(...).apply` on whole groups is
# deprecated in recent pandas, and once the grouping column is no longer passed
# to the function the result would lose `incident_id`.
# `to_frame('sla_breached')` names the column explicitly because pandas may
# rename each group Series to its group key inside `transform`.
resolved_labels = events.groupby('incident_id')['sla_breached'].transform(
    lambda labels: resolve_sla_majority(labels.to_frame('sla_breached'))['sla_breached']
)
events = events.assign(sla_breached=resolved_labels)

# 4) Quick data quality
print("Unique incidents:", events['incident_id'].nunique())
print("Missing values per column:\n", events.isnull().sum())


In [None]:
# Order every incident's events chronologically and keep its earliest three.
events_sorted = events.sort_values(by=['incident_id', 'timestamp'])
first3 = events_sorted.groupby('incident_id', group_keys=False).head(3).copy()

# Number the retained events 1, 2, 3 within each incident; this becomes the pivot column.
first3['event_order'] = first3.groupby('incident_id').cumcount().add(1)

# Reshape to one row per incident with columns such as
# event_type1, actor1, timestamp1, event_type2, actor2, timestamp2, ...
wide = first3.pivot(
    index='incident_id',
    columns='event_order',
    values=['event_type', 'actor', 'timestamp'],
)

# Collapse the (field, order) column MultiIndex into flat names like "actor2".
wide.columns = [f"{field}{position}" for field, position in wide.columns]
wide = wide.reset_index()

# Attach one target value per incident (labels were made consistent earlier).
targets = events.groupby('incident_id', as_index=False)['sla_breached'].max()
data = wide.merge(targets, on='incident_id', how='inner')

print("Wide data (one row per incident):", data.shape)
data.head()


In [None]:
# Convert whichever pivoted timestamp columns are present to datetimes;
# values that cannot be parsed are coerced to NaT.
for tcol in ('timestamp1', 'timestamp2', 'timestamp3'):
    if tcol not in data.columns:
        continue
    data[tcol] = pd.to_datetime(data[tcol], errors='coerce')

# Temporal deltas in minutes
def minutes_diff(a, b):
    """Return ``a - b`` in minutes for two datetime Series.

    Positions where the difference is missing (either side is NaT) yield 0.0.
    NOTE(review): because gaps are zero-filled here, a later median fill on
    the derived columns has nothing left to fill -- confirm that is intended.
    """
    elapsed = a - b
    minutes = elapsed.dt.total_seconds().div(60.0)
    return minutes.fillna(0.0)

# Which of the three pivoted timestamp columns are available.
has_ts = {n: f'timestamp{n}' in data.columns for n in (1, 2, 3)}

# Gap between event 1 and event 2 (0.0 when either column is absent).
data['dt_1_2'] = (
    minutes_diff(data['timestamp2'], data['timestamp1'])
    if has_ts[1] and has_ts[2] else 0.0
)

# Gap between event 2 and event 3 (0.0 when either column is absent).
data['dt_2_3'] = (
    minutes_diff(data['timestamp3'], data['timestamp2'])
    if has_ts[2] and has_ts[3] else 0.0
)

# Gap between event 1 and event 3; without both columns, fall back to
# the sum of the two consecutive gaps.
if has_ts[1] and has_ts[3]:
    data['dt_1_3'] = minutes_diff(data['timestamp3'], data['timestamp1'])
else:
    data['dt_1_3'] = data['dt_1_2'] + data['dt_2_3']

# Make sure all six early-event columns exist so the flag logic can index them;
# any column that is absent is created filled with NaN.
for col in ('event_type1', 'event_type2', 'event_type3', 'actor1', 'actor2', 'actor3'):
    if col in data.columns:
        continue
    data[col] = np.nan


def _row_mentions(keyword):
    """Build a row function: 1 if any string cell, lower-cased, contains *keyword*, else 0."""
    def check(row):
        for cell in row:
            # Non-string cells (e.g. NaN) never match.
            if isinstance(cell, str) and keyword in cell.lower():
                return 1
        return 0
    return check


event_cols = ['event_type1', 'event_type2', 'event_type3']
actor_cols = ['actor1', 'actor2', 'actor3']

# Binary indicators computed over the first three events of each incident.
data['any_escalated_1_3'] = data[event_cols].apply(_row_mentions('escal'), axis=1)
data['any_assigned_1_3'] = data[event_cols].apply(_row_mentions('assign'), axis=1)
data['tier2_in_first3'] = data[actor_cols].apply(_row_mentions('tier2'), axis=1)

# Calendar features are derived from the first event's timestamp.
data['first_ts'] = data['timestamp1']
first_event_time = data['first_ts'].dt
# A missing timestamp falls back to hour 0 / weekday 0.
data['hour_of_day'] = first_event_time.hour.fillna(0).astype(int)
data['day_of_week'] = first_event_time.weekday.fillna(0).astype(int)
# weekday 5 and 6 are Saturday and Sunday.
data['is_weekend'] = data['day_of_week'].ge(5).astype(int)

# Remove the raw timestamp columns so they are not used as model inputs.
raw_time_cols = ('timestamp1', 'timestamp2', 'timestamp3', 'first_ts')
drop_cols = [c for c in raw_time_cols if c in data.columns]
data = data.drop(columns=drop_cols)

# Categorical columns: every event_type*/actor* column, plus a few named
# columns when they exist.
# NOTE(review): nothing visible in this notebook adds 'priority',
# 'incident_type' or 'affected_system' to `data` -- confirm whether they are expected.
cat_cols = [
    c for c in data.columns
    if c.startswith(('event_type', 'actor'))
    or c in ('priority', 'incident_type', 'affected_system')
]
for c in cat_cols:
    data[c] = data[c].fillna('Unknown')

# Numeric gaps: replace any remaining NaN with that column's median.
for delta_col in ('dt_1_2', 'dt_2_3', 'dt_1_3'):
    column_median = data[delta_col].median()
    data[delta_col] = data[delta_col].fillna(column_median)

# Quick check: list up to 30 feature columns and summarise the engineered ones.
feature_preview = [c for c in data.columns if c not in ('incident_id', 'sla_breached')][:30]
print("Feature columns:", feature_preview)
summary_cols = ['dt_1_2', 'dt_2_3', 'dt_1_3', 'any_escalated_1_3', 'any_assigned_1_3', 'tier2_in_first3']
data[summary_cols].describe()


In [None]:
# Write the engineered one-row-per-incident table to disk, without the index column.
FEATURES_CSV = "sla_incident_features_first3_from_raw.csv"
data.to_csv(path_or_buf=FEATURES_CSV, index=False)
print("Saved features csv:", FEATURES_CSV)


In [None]:
# Prepare X and y: the target is the resolved breach label, the inputs are
# every remaining column except the incident identifier.
y = data['sla_breached'].astype(int)
X = data.drop(columns=['incident_id', 'sla_breached'])

# Define feature groups.
# Numeric columns are detected with pandas' dtype helper rather than by
# comparing against np.float64/np.int64: `astype(int)` can produce int32 on
# some platforms, which would silently push numeric flags into the one-hot group.
# event_type*/actor* columns are always treated as categorical.
numeric_features = [
    c for c in X.columns
    if pd.api.types.is_numeric_dtype(X[c])
    and not c.startswith(('event_type', 'actor'))
]
categorical_features = [c for c in X.columns if c not in numeric_features]

print("Numeric features:", numeric_features)
print("Categorical features (sample):", categorical_features[:10])

# Train/test: stratified 75/25 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
print("Train/test sizes:", X_train.shape, X_test.shape)


In [None]:
# Dense one-hot encoder that ignores categories unseen during fit.
# scikit-learn renamed the `sparse` keyword to `sparse_output` in 1.2 and
# removed the old name in 1.4, so try the new spelling first and fall back
# for older installations.
try:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Scale numeric columns and one-hot encode categorical ones.
preprocess = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', one_hot, categorical_features)
])

# Baseline: preprocessing followed by class-weighted logistic regression.
baseline = Pipeline([
    ('preprocess', preprocess),
    ('clf', LogisticRegression(max_iter=500, class_weight='balanced'))
])

baseline.fit(X_train, y_train)
# Probability of the positive (breached) class, thresholded at 0.5.
proba_bl = baseline.predict_proba(X_test)[:,1]
pred_bl = (proba_bl >= 0.5).astype(int)

print("Baseline metrics:")
print("AUC:", round(roc_auc_score(y_test, proba_bl), 4))
print(classification_report(y_test, pred_bl))


In [None]:
# Fit the shared `preprocess` transformer on the training split and encode both splits.
X_train_trans = preprocess.fit_transform(X_train)
X_test_trans = preprocess.transform(X_test)

# Weight the positive class by the negative/positive ratio of the training labels.
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
scale_pos_weight = n_negative / n_positive

xgb_params = dict(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
)
xgb_clf = xgb.XGBClassifier(**xgb_params)
# The test split is passed as eval_set for evaluation during fitting.
xgb_clf.fit(
    X_train_trans,
    y_train,
    eval_set=[(X_test_trans, y_test)],
    verbose=False,
)

# Positive-class probability, thresholded at 0.5.
proba_xgb = xgb_clf.predict_proba(X_test_trans)[:, 1]
pred_xgb = (proba_xgb >= 0.5).astype(int)

print("XGBoost metrics:")
print("AUC:", round(roc_auc_score(y_test, proba_xgb), 4))
print(classification_report(y_test, pred_xgb))


In [None]:
# ROC curves for both models on the test split.
fpr_bl, tpr_bl, _ = roc_curve(y_test, proba_bl)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, proba_xgb)
auc_bl = roc_auc_score(y_test, proba_bl)
auc_xgb = roc_auc_score(y_test, proba_xgb)

plt.figure(figsize=(7, 5))
plt.plot(fpr_bl, tpr_bl, label=f'LogReg (AUC={auc_bl:.3f})')
plt.plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC={auc_xgb:.3f})')
# Diagonal reference line.
plt.plot([0, 1], [0, 1], '--', color='gray')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.title("ROC Curves")
plt.show()

# Confusion matrix for XGBoost at the 0.5 threshold.
class_labels = ['Not Breached', 'Breached']
cm = confusion_matrix(y_test, pred_xgb)
plt.figure(figsize=(4, 3))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (XGBoost)')
plt.show()


In [None]:
# Recover post-encoding feature names: numeric columns first, then the
# one-hot columns, matching the transformer order in `preprocess`.
ohe = preprocess.named_transformers_['cat']
ohe_names = list(ohe.get_feature_names_out(categorical_features))
feature_names = numeric_features + ohe_names

# Logistic regression: rank coefficients by absolute size and keep the top 15.
lr_coef = baseline.named_steps['clf'].coef_[0]
ranked_coefficients = sorted(
    zip(feature_names, lr_coef),
    key=lambda pair: abs(pair[1]),
    reverse=True,
)
lr_imp = ranked_coefficients[:15]
print("Top Logistic Regression signals (feature, coef):")
for name, weight in lr_imp:
    print(name, round(weight, 4))

# XGBoost: top 15 features by the model's importance scores.
xgb_importances = xgb_clf.feature_importances_
imp_df = (
    pd.DataFrame({'feature': feature_names, 'importance': xgb_importances})
    .sort_values('importance', ascending=False)
    .head(15)
)
print("\nTop XGBoost features:")
print(imp_df.to_string(index=False))


In [None]:
# Static, hand-written example takeaways; these lines are not computed from the models.
print("Plain-language takeaways (examples):")
takeaways = (
    "- Larger dt_1_2 or dt_1_3 (long idle time in early events) tends to increase breach risk.",
    "- Early escalation or Tier2 assignment presence are strong signals (direction depends on data).",
    "- High priority (Critical/High) raises baseline breach probability.",
    "- If 'Investigating' appears without an early assignment, that's often a warning sign.",
)
for line in takeaways:
    print(line)


In [None]:
# Pick the test incident that the baseline (logistic regression) pipeline
# scores as highest risk, and compare its predicted risk before and after a
# hypothetical change to its early events.
proba_bl_test = baseline.predict_proba(X_test)[:, 1]
idx_high = int(np.argmax(proba_bl_test))
# Double brackets keep a one-row DataFrame, which is what the pipeline expects.
orig = X_test.iloc[[idx_high]].copy()

print("Original sample (selected cols):")
display(orig[['event_type1','event_type2','event_type3','actor1','actor2','actor3','dt_1_2','dt_2_3']].T)

p_before = baseline.predict_proba(orig)[:, 1][0]

# Counterfactual: set event 2 to 'Assigned', actor 2 to 'Tier2', and shorten dt_1_2.
cf = orig.copy()
if 'event_type2' in cf.columns:
    cf.loc[:, 'event_type2'] = 'Assigned'
if 'actor2' in cf.columns:
    cf.loc[:, 'actor2'] = 'Tier2'
if 'dt_1_2' in cf.columns:
    # Shorten to 10% of the original gap, floored at 1 minute. The built-in
    # max() cannot compare a float with a Series (it raises "truth value of a
    # Series is ambiguous"), so clip element-wise instead.
    shortened = (cf['dt_1_2'] * 0.1).clip(lower=1.0)
    # Never lengthen a gap that was already below the 1-minute floor.
    cf['dt_1_2'] = np.minimum(shortened, cf['dt_1_2'])

p_after = baseline.predict_proba(cf)[:, 1][0]

print(f"Predicted risk BEFORE: {p_before:.3f}")
print(f"Predicted risk AFTER (counterfactual): {p_after:.3f}")
print(f"Delta: {p_after - p_before:+.3f}")


In [None]:
# Persist the fitted preprocessor, the baseline pipeline and the XGBoost model.
import joblib

artifacts = (
    (preprocess, "preprocess_pipeline.joblib"),
    (baseline, "baseline_logreg_pipeline.joblib"),
    (xgb_clf, "xgb_model.joblib"),
)
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)
print("Saved models and preprocessors.")
