In [12]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HT-cable Poisson-LSTM forecast + 9-factor health score (2018-2024)
2025-08-19 — QUARTERLY HEALTH SCORE + SEASONAL FAULT FACTOR
"""

from __future__ import annotations
import logging, math, sys, json, numpy as np, pandas as pd
import torch, torch.nn as nn
from pathlib import Path
from sklearn.metrics import confusion_matrix, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from warnings import filterwarnings
filterwarnings("ignore", category=FutureWarning)

# ─── paths ──────────────────────────────────────────────────────────────
FAULT_CSV = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/FAULT DATA/HT_fault_cable_info_processed_with_affected.csv")
CABLE_CSV = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/SWNO_MASTER_COMBINED_FULL_FINAL4.csv")
OUT_DIR = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/A_fault_model_with_health_QUARTERLY")

# ─── constants / hyper-params ───────────────────────────────────────────
MIN_YEAR, TARGET_YEAR   = 2017, 2024
TRAIN_YEARS             = list(range(2019, TARGET_YEAR))
KEEP_VOLTAGES           = {22, 33}
EXPECTED_LIFE_YEARS     = 35

# 9-factor weights
W = dict(  a= 0.01941, c= 0.03371, f= 0.12157, i= 0.10727, l= 0.25278,
           p= 0.15000, r= 0.04904, s= 0.06028, u= 0.20594 )

BATCH, EPOCHS, PATIENCE = 64, 40, 12
LR, WD, CLIP            = 1e-3, 1e-5, 1.0
HID, LAY, DROP, EMB     = 512, 2, 0.1, 16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(42); np.random.seed(42)

logging.basicConfig(format="%(asctime)s | %(message)s",
                    datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout)
log = logging.getLogger("pipeline").info

# ─── helper functions (unchanged) ───────────────────────────────────────
def v_to_num(v):
    try: return float(str(v).lower().replace("kv",""))
    except: return math.nan

def norm_sw(s):
    return (s.astype(str).str.upper().str.strip()
             .str.replace(r"^(SWNO_|SWNO|SW|S)\s*", "", regex=True)
             .str.replace(r"\D+","",regex=True)
             .replace("", np.nan)).astype("Int64")

def month_range(a,b): return pd.period_range(f"{a}-01", f"{b}-12", freq="M")

def sincos(idx):
    ang = 2*np.pi*idx.month.values/12
    return np.stack([np.sin(ang), np.cos(ang)], 1)

def robust(s, lo=5, hi=95):
    c=s.replace([np.inf,-np.inf],np.nan).dropna()
    if c.empty: return pd.Series(0., index=s.index)
    a,b=np.percentile(c,[lo,hi]); return ((s-a).clip(0,b-a)/(b-a+1e-9)).fillna(0.)

def months_to_quarters(mat):
    q=np.zeros((mat.shape[0],4))
    q[:,0]=mat[:,0:3].sum(1); q[:,1]=mat[:,3:6].sum(1)
    q[:,2]=mat[:,6:9].sum(1); q[:,3]=mat[:,9:12].sum(1)
    return q

# ─── 1 · fault history ──────────────────────────────────────────────────
log("STEP 1: Processing fault history...")
fault=pd.read_csv(FAULT_CSV,parse_dates=["TIME_OUTAGE"],low_memory=False)
sw_col="TO_SWITCH" if "TO_SWITCH" in fault.columns else fault.columns[0]
fault["SWITCH_ID"]=norm_sw(fault[sw_col])
fault=fault.dropna(subset=["SWITCH_ID","TIME_OUTAGE"])
fault["TIME_OUTAGE"] = fault["TIME_OUTAGE"].dt.tz_localize(None)
fault=fault[fault["TIME_OUTAGE"].dt.year.between(MIN_YEAR,TARGET_YEAR)]
if "VOLTAGE" in fault.columns:
    fault["VNUM"]=fault["VOLTAGE"].apply(v_to_num)
    fault=fault[fault["VNUM"].isin(KEEP_VOLTAGES)]
fault=fault.drop_duplicates(["SWITCH_ID","TIME_OUTAGE"])
fault["YM"]=fault["TIME_OUTAGE"].dt.to_period("M")
idx_full=month_range(MIN_YEAR,TARGET_YEAR)
counts=(fault[fault["TIME_OUTAGE"].dt.year<=TARGET_YEAR-1]
        .groupby(["SWITCH_ID","YM"]).size()
        .unstack(fill_value=0)
        .reindex(columns=idx_full,fill_value=0).astype(float))
switches=counts.index.tolist(); sw2idx={sw:i for i,sw in enumerate(switches)}
log(f"Processed fault history for {len(switches)} switches.")

# ─── 2 · Poisson-LSTM dataset & training ────────────────────────────────
log("\nSTEP 2: Building dataset and training Poisson-LSTM model...")
class WinDS(Dataset):
    def __init__(self, fr):
        self.x  = torch.tensor(fr["X_seq"],dtype=torch.float32)
        self.xs = torch.tensor(fr["X_season"],dtype=torch.float32)
        self.y  = torch.tensor(fr["y_seq"],dtype=torch.float32)
        self.sw = torch.tensor(fr["sw_idx"],dtype=torch.long)
    def __len__(self): return len(self.sw)
    def __getitem__(self,i): return self.x[i],self.xs[i],self.y[i],self.sw[i]
class PoissonLSTM(nn.Module):
    def __init__(self,n_sw):
        super().__init__()
        self.emb = nn.Embedding(n_sw, EMB)
        self.rnn = nn.LSTM(3,HID,LAY,batch_first=True,
                           dropout=DROP if LAY>1 else 0.)
        self.head=nn.Sequential(nn.Linear(HID+EMB,HID),nn.ReLU(),
                                nn.Linear(HID,HID//2),nn.ReLU(),
                                nn.Linear(HID//2,12))
        self.sp = nn.Softplus()
    def forward(self,x,xs,sw):
        h,_=self.rnn(torch.cat([x,xs],-1))
        h=torch.cat([h[:,-1],self.emb(sw)],1)
        return self.sp(self.head(h))
def build_frames(years, nonzero=True):
    X_seq,X_sea,y_seq,sw_idx=[],[],[],[]
    for sw,row in counts.iterrows():
        for Y in years:
            tr,tg=month_range(Y-2,Y-1),month_range(Y,Y)
            if nonzero and row[tr].sum()==0: continue
            X_seq.append(np.log1p(row[tr]).values[:,None])
            X_sea.append(sincos(tr)); y_seq.append(row[tg].values)
            sw_idx.append(sw2idx[sw])
    return dict(X_seq=np.stack(X_seq), X_season=np.stack(X_sea),
                y_seq=np.stack(y_seq), sw_idx=np.array(sw_idx,dtype=np.int64))
train_frames=build_frames(TRAIN_YEARS,True)
ds=WinDS(train_frames); perm=np.random.permutation(len(ds))
n_val=max(1,int(.1*len(ds)))
dl_tr=DataLoader(torch.utils.data.Subset(ds,perm[:-n_val]),BATCH,shuffle=True)
dl_va=DataLoader(torch.utils.data.Subset(ds,perm[-n_val:]),BATCH,shuffle=False)
model=PoissonLSTM(len(switches)).to(DEVICE)
loss_fn,opt=nn.PoissonNLLLoss(log_input=False),torch.optim.Adam(model.parameters(),lr=LR,weight_decay=WD)
best,bad,best_ep=1e9,0,0
for ep in range(1,EPOCHS+1):
    model.train()
    for xb,xs,yb,swb in dl_tr:
        xb,xs,yb,swb=[t.to(DEVICE) for t in (xb,xs,yb,swb)]
        opt.zero_grad(); loss_fn(model(xb,xs,swb),yb).backward()
        nn.utils.clip_grad_norm_(model.parameters(),CLIP); opt.step()
    with torch.no_grad():
        v=np.mean([loss_fn(model(x.to(DEVICE),xs.to(DEVICE),sw.to(DEVICE)),
                           y.to(DEVICE)).item() for x,xs,y,sw in dl_va])
    if v<best-1e-4: best,bad,best_ep,ckpt=v,0,ep,model.state_dict()
    else: bad+=1
    if bad>=PATIENCE: break
model.load_state_dict(ckpt)
log(f"Training finished in {best_ep} epochs | best val-loss={best:.4f}")

# ─── 3 · TARGET YEAR forecasts ──────────────────────────────────────────
log("\nSTEP 3: Generating forecasts for target year...")
eval_frames=build_frames([TARGET_YEAR],False)
mu,swids=[],[]
with torch.no_grad():
    for xb,xs,_,swb in DataLoader(WinDS(eval_frames),256,False):
        xb,xs,swb=[t.to(DEVICE) for t in (xb,xs,swb)]
        mu.append(model(xb,xs,swb).cpu().numpy())
        swids.append(swb.cpu().numpy())
MU,SWIDX=np.concatenate(mu),np.concatenate(swids)
QTRS=months_to_quarters(MU)
ym=month_range(TARGET_YEAR,TARGET_YEAR)
rows_q,rows_y=[],[]
for i,ix in enumerate(SWIDX):
    sw=switches[ix]
    rows_y.append(dict(SWITCH_ID=sw,YEAR=TARGET_YEAR,PRED_FULL_YEAR=float(MU[i].sum())))
    for qi,qsum in enumerate(QTRS[i],1):
        rows_q.append(dict(SWITCH_ID=sw,YEAR=TARGET_YEAR,QUARTER=qi,PRED_FAULTS_Q=float(qsum)))
q_preds = pd.DataFrame(rows_q)
y_preds = pd.DataFrame(rows_y)
log(f"Generated quarterly and yearly forecasts for {TARGET_YEAR}.")

# ─── 4 · PRE-PROCESSING FOR DYNAMIC FACTORS ─────────────────────────────
log("\nSTEP 4: Pre-processing cable master data for dynamic calculations...")
cables_master = pd.read_csv(CABLE_CSV, low_memory=False)
def parse_date_from_col(col_name, prefix):
    try:
        date_str = col_name.replace(f"{prefix}_Month_", "")
        return pd.to_datetime(date_str, format='%Y%m')
    except: return None
cycle_cols = {c: parse_date_from_col(c, "CYCLE") for c in cables_master.columns if c.startswith("CYCLE_Month_")}
var_cols = {c: parse_date_from_col(c, "VAR") for c in cables_master.columns if c.startswith("VAR_Month_")}
cycle_cols = {k: v for k, v in cycle_cols.items() if v is not None}
var_cols = {k: v for k, v in var_cols.items() if v is not None}
log(f"Found {len(cycle_cols)} monthly cycle columns and {len(var_cols)} monthly variation columns.")

# ─── 5 · QUARTERLY HEALTH SCORE WITH DYNAMIC LOADING ────────────────────
log("\nSTEP 5: Calculating Health Scores per Quarter with dynamic loading...")
QUARTERS = {
    "Q1_Jan-Mar": (1, 3), "Q2_Apr-Jun": (4, 6),
    "Q3_Jul-Sep": (7, 9), "Q4_Oct-Dec": (10, 12)
}
# MODIFIED: Updated the label for the 'u' factor
LABELS = {
    "c":"High cyclic loading (c)", "r":"Wide load-range utilisation (r)",
    "a":"Advanced cable age (a)", "f":"Many historic faults (f)",
    "s":"Numerous joints / segments (s)", "p":"High predicted faults per km (p)",
    "i":"Frequent interruptions (i)", "l":"Long circuit length (ℓ)",
    "u":"Faults in same quarter last year (u)"
}
OUT_DIR.mkdir(parents=True, exist_ok=True)

for i, (q_name, (start_month, end_month)) in enumerate(QUARTERS.items()):
    q_num = i + 1
    log(f"\n--- Processing {q_name} ({TARGET_YEAR}) ---")
    q_out_dir = OUT_DIR / q_name
    q_out_dir.mkdir(parents=True, exist_ok=True)
    q_start_date = pd.Timestamp(f"{TARGET_YEAR}-{start_month}-01")
    loading_start_date = q_start_date - pd.DateOffset(years=1)
    cab = cables_master.drop_duplicates("DESTINATION_SWITCH_ID").rename(columns={
        "DESTINATION_SWITCH_ID":"SWITCH_ID", "MEASUREDLENGTH":"LENGTH_M",
        "COMMISSIONEDDATE":"DATE_INSTALLED", "NO_OF_SEGMENT":"SEGMENTS"})
    cab["DATE_INSTALLED"] = pd.to_datetime(cab["DATE_INSTALLED"], errors="coerce", utc=True).dt.tz_localize(None)
    cab["LENGTH_KM"] = pd.to_numeric(cab["LENGTH_M"], errors="coerce") / 1000
    relevant_cycle_cols = [c for c, dt in cycle_cols.items() if loading_start_date <= dt < q_start_date]
    relevant_var_cols = [c for c, dt in var_cols.items() if loading_start_date <= dt < q_start_date]
    if relevant_cycle_cols: cab["cycle_pm"] = cab[relevant_cycle_cols].mean(axis=1)
    else: cab["cycle_pm"] = 0
    if relevant_var_cols:
        median_of_period = cab[relevant_var_cols].median(axis=1)
        cab["load_range_idx"] = cab[relevant_var_cols].mean(axis=1) / median_of_period.replace(0, np.nan)
    else: cab["load_range_idx"] = 0
    cab["c_raw"] = cab["cycle_pm"]; cab["r_raw"] = cab["load_range_idx"]
    hist = fault[fault["TIME_OUTAGE"] < q_start_date].groupby("SWITCH_ID").size()
    cab = cab.merge(hist.rename("hist_faults"), on="SWITCH_ID", how="left").fillna({"hist_faults": 0})
    cab["f_raw"] = cab["hist_faults"] / cab["LENGTH_KM"].replace(0, np.nan)
    q_pred_current = q_preds[q_preds['QUARTER'] == q_num]
    cab = cab.merge(q_pred_current[['SWITCH_ID', 'PRED_FAULTS_Q']], on="SWITCH_ID", how="left").fillna({"PRED_FAULTS_Q": 0})
    cab["p_raw"] = cab["PRED_FAULTS_Q"].replace(0, np.nan)
    fault_sorted = fault[fault["TIME_OUTAGE"] < q_start_date].sort_values(["SWITCH_ID", "TIME_OUTAGE"])
    fault_sorted["Δt_h"] = (fault_sorted.groupby("SWITCH_ID")["TIME_OUTAGE"].diff().dt.total_seconds().div(3600))
    mean_dt = fault_sorted.groupby("SWITCH_ID")["Δt_h"].mean().rename("mean_h")
    cab = cab.merge(mean_dt, on="SWITCH_ID", how="left")
    cab["i_raw"] = 1 / cab["mean_h"].clip(lower=1)
    
    # MODIFIED: 'u' factor is now faults in the same quarter of the previous year
    last_year = TARGET_YEAR - 1
    same_q_faults_last_year = fault[
        (fault["TIME_OUTAGE"].dt.year == last_year) &
        (fault["TIME_OUTAGE"].dt.month.between(start_month, end_month))
    ].groupby("SWITCH_ID").size()
    cab = cab.merge(same_q_faults_last_year.rename("faults_same_q_last_yr"), on="SWITCH_ID", how="left").fillna({"faults_same_q_last_yr": 0})
    cab["u_raw"] = cab["faults_same_q_last_yr"]
    
    cab["a_raw"] = (q_start_date - cab["DATE_INSTALLED"]).dt.days / (EXPECTED_LIFE_YEARS * 365)
    cab["s_raw"] = (cab["SEGMENTS"].fillna(1) - 1).clip(lower=0)
    cab["l_raw"] = np.log1p(cab["LENGTH_KM"])
    for col in ["a_raw", "c_raw", "f_raw", "i_raw", "l_raw", "p_raw", "r_raw", "s_raw", "u_raw"]:
        cab[col[0]] = robust(cab[col])
    risk = sum(W[k] * cab[k] for k in W.keys())
    cab["health_score"] = np.rint(100 * (1 - risk)).clip(0, 100).astype(int)
    cab["health_score_10"] = np.clip(np.ceil(cab["health_score"] / 10), 1, 10).astype(int)
    cab["health_band"] = pd.cut(cab["health_score"], [-np.inf, 40, 60, 100], labels=["Poor", "Moderate", "Good"])
    for k, wv in W.items(): cab[f"weight_{k}"] = wv
    risk_contrib = pd.DataFrame({k: W[k] * cab[k] for k in W.keys()})
    top3 = risk_contrib.apply(lambda r: r.nlargest(3).index.tolist(), axis=1)
    cab["primary_health_driver"] = top3.apply(lambda lst: LABELS[lst[0]])
    cab["top3_health_drivers"] = top3.apply(lambda lst: "; ".join(LABELS[k] for k in lst))

    # MODIFIED: Logic for "no fault" assets now only considers total historical faults
    mask_nofault = cab["hist_faults"].eq(0)
    cab.loc[mask_nofault, ["health_score", "health_score_10"]] = [100, 10]
    cab.loc[mask_nofault, "health_band"] = "Good"
    cab.loc[mask_nofault, "primary_health_driver"] = "No recorded faults"
    cab.loc[mask_nofault, "top3_health_drivers"] = "No recorded faults"

    output_cols = [c for c in cab.columns if c not in cycle_cols and c not in var_cols]
    cab[output_cols].to_csv(q_out_dir / f"cable_health_{TARGET_YEAR}_{q_name}_scored.csv", index=False)
    cab[output_cols][~mask_nofault].to_csv(q_out_dir / f"cable_health_{TARGET_YEAR}_{q_name}_predictions.csv", index=False)
    log(f"Saved results for {len(cab)} cables to {q_out_dir.name}/")
    
    actual_faults_q = fault[(fault["TIME_OUTAGE"].dt.year == TARGET_YEAR) & (fault["TIME_OUTAGE"].dt.month.between(start_month, end_month))]
    actual_switches_q = actual_faults_q["SWITCH_ID"].unique()
    cab["ACTUAL_FAIL_Q"] = cab["SWITCH_ID"].isin(actual_switches_q).astype(int)
    pred = cab["health_band"].map({"Poor": 1, "Moderate": 1, "Good": 0})
    if len(cab["ACTUAL_FAIL_Q"].unique()) > 1:
        cm = confusion_matrix(cab["ACTUAL_FAIL_Q"], pred)
        au = roc_auc_score(cab["ACTUAL_FAIL_Q"], 100 - cab["health_score"])
        log(f"{q_name} confusion matrix:\n{cm}\nAUROC = {au:.3f}")
    else:
        log(f"Skipping validation for {q_name}: Only one class present in actuals.")

# --- Final Summary ---
total_predicted_faults = y_preds['PRED_FULL_YEAR'].sum()
total_actual_faults_2024 = fault[fault['TIME_OUTAGE'].dt.year == TARGET_YEAR].shape[0]

log("\n" + "="*45)
log(f"FLEET-WIDE FAULT SUMMARY FOR FULL YEAR {TARGET_YEAR}")
log("="*45)
log(f"Total Predicted Faults (Yearly): {total_predicted_faults:.1f}")
log(f"Total Actual Faults (Yearly):    {total_actual_faults_2024}")
log("="*45)
log(f"\nPipeline complete → Quarterly results saved in: {OUT_DIR.resolve()}")

22:56:21 | STEP 1: Processing fault history...


22:56:22 | Processed fault history for 208 switches.
22:56:22 | 
STEP 2: Building dataset and training Poisson-LSTM model...
22:56:24 | Training finished in 5 epochs | best val-loss=0.2613
22:56:24 | 
STEP 3: Generating forecasts for target year...
22:56:24 | Generated quarterly and yearly forecasts for 2024.
22:56:24 | 
STEP 4: Pre-processing cable master data for dynamic calculations...
22:56:25 | Found 0 monthly cycle columns and 0 monthly variation columns.
22:56:25 | 
STEP 5: Calculating Health Scores per Quarter with dynamic loading...
22:56:25 | 
--- Processing Q1_Jan-Mar (2024) ---
22:56:25 | Saved results for 256 cables to Q1_Jan-Mar/
22:56:25 | Q1_Jan-Mar confusion matrix:
[[191  43]
 [ 10  12]]
AUROC = 0.752
22:56:25 | 
--- Processing Q2_Apr-Jun (2024) ---
22:56:25 | Saved results for 256 cables to Q2_Apr-Jun/
22:56:25 | Q2_Apr-Jun confusion matrix:
[[178  34]
 [ 24  20]]
AUROC = 0.724
22:56:25 | 
--- Processing Q3_Jul-Sep (2024) ---
22:56:25 | Saved results for 256 cables t

In [6]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HT-cable Poisson-LSTM forecast + 9-factor health score (2018-2024)
2025-08-19 — QUARTERLY HEALTH SCORE + AGGRESSIVE WEIGHTS
"""

from __future__ import annotations
import logging, math, sys, json, numpy as np, pandas as pd
import torch, torch.nn as nn
from pathlib import Path
from sklearn.metrics import confusion_matrix, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from warnings import filterwarnings
filterwarnings("ignore", category=FutureWarning)

# ─── paths ──────────────────────────────────────────────────────────────
FAULT_CSV = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/FAULT DATA/HT_fault_cable_info_processed_with_affected.csv")
CABLE_CSV = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/SWNO_MASTER_COMBINED_FULL_FINAL3.csv")
OUT_DIR = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/A_fault_model_with_health_QUARTERLY")

# ─── constants / hyper-params ───────────────────────────────────────────
MIN_YEAR, TARGET_YEAR   = 2016, 2024
TRAIN_YEARS             = list(range(2018, TARGET_YEAR))
KEEP_VOLTAGES           = {22, 33}
EXPECTED_LIFE_YEARS     = 35

# MODIFIED: Weights have been made more aggressive as per your request
W = dict(  a= 0.01941,
  c= 0.53371,
  f= 0.12157,
  i= 0.10727,
  l= 0.20278,
  p= 0.15000,
  r= 0.54904,
  s= 0.06028,
  u= 0.15594 )
BATCH, EPOCHS, PATIENCE = 64, 40, 12
LR, WD, CLIP            = 1e-3, 1e-5, 1.0
HID, LAY, DROP, EMB     = 512, 2, 0.1, 16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(42); np.random.seed(42)

logging.basicConfig(format="%(asctime)s | %(message)s",
                    datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout)
log = logging.getLogger("pipeline").info

# ─── helper functions (unchanged) ───────────────────────────────────────
def v_to_num(v):
    try: return float(str(v).lower().replace("kv",""))
    except: return math.nan

def norm_sw(s):
    return (s.astype(str).str.upper().str.strip()
             .str.replace(r"^(SWNO_|SWNO|SW|S)\s*", "", regex=True)
             .str.replace(r"\D+","",regex=True)
             .replace("", np.nan)).astype("Int64")

def month_range(a,b): return pd.period_range(f"{a}-01", f"{b}-12", freq="M")

def sincos(idx):
    ang = 2*np.pi*idx.month.values/12
    return np.stack([np.sin(ang), np.cos(ang)], 1)

def robust(s, lo=5, hi=95):
    c=s.replace([np.inf,-np.inf],np.nan).dropna()
    if c.empty: return pd.Series(0., index=s.index)
    a,b=np.percentile(c,[lo,hi]); return ((s-a).clip(0,b-a)/(b-a+1e-9)).fillna(0.)

def months_to_quarters(mat):
    q=np.zeros((mat.shape[0],4))
    q[:,0]=mat[:,0:3].sum(1); q[:,1]=mat[:,3:6].sum(1)
    q[:,2]=mat[:,6:9].sum(1); q[:,3]=mat[:,9:12].sum(1)
    return q

# ─── 1 · fault history ──────────────────────────────────────────────────
log("STEP 1: Processing fault history...")
fault=pd.read_csv(FAULT_CSV,parse_dates=["TIME_OUTAGE"],low_memory=False)
sw_col="TO_SWITCH" if "TO_SWITCH" in fault.columns else fault.columns[0]
fault["SWITCH_ID"]=norm_sw(fault[sw_col])
fault=fault.dropna(subset=["SWITCH_ID","TIME_OUTAGE"])
fault["TIME_OUTAGE"] = fault["TIME_OUTAGE"].dt.tz_localize(None)
fault=fault[fault["TIME_OUTAGE"].dt.year.between(MIN_YEAR,TARGET_YEAR)]
if "VOLTAGE" in fault.columns:
    fault["VNUM"]=fault["VOLTAGE"].apply(v_to_num)
    fault=fault[fault["VNUM"].isin(KEEP_VOLTAGES)]
fault=fault.drop_duplicates(["SWITCH_ID","TIME_OUTAGE"])
fault["YM"]=fault["TIME_OUTAGE"].dt.to_period("M")
idx_full=month_range(MIN_YEAR,TARGET_YEAR)
counts=(fault[fault["TIME_OUTAGE"].dt.year<=TARGET_YEAR-1]
        .groupby(["SWITCH_ID","YM"]).size()
        .unstack(fill_value=0)
        .reindex(columns=idx_full,fill_value=0).astype(float))
switches=counts.index.tolist(); sw2idx={sw:i for i,sw in enumerate(switches)}
log(f"Processed fault history for {len(switches)} switches.")

# ─── 2 · Poisson-LSTM dataset & training ────────────────────────────────
log("\nSTEP 2: Building dataset and training Poisson-LSTM model...")
class WinDS(Dataset):
    def __init__(self, fr):
        self.x  = torch.tensor(fr["X_seq"],dtype=torch.float32)
        self.xs = torch.tensor(fr["X_season"],dtype=torch.float32)
        self.y  = torch.tensor(fr["y_seq"],dtype=torch.float32)
        self.sw = torch.tensor(fr["sw_idx"],dtype=torch.long)
    def __len__(self): return len(self.sw)
    def __getitem__(self,i): return self.x[i],self.xs[i],self.y[i],self.sw[i]
class PoissonLSTM(nn.Module):
    def __init__(self,n_sw):
        super().__init__()
        self.emb = nn.Embedding(n_sw, EMB)
        self.rnn = nn.LSTM(3,HID,LAY,batch_first=True,
                           dropout=DROP if LAY>1 else 0.)
        self.head=nn.Sequential(nn.Linear(HID+EMB,HID),nn.ReLU(),
                                nn.Linear(HID,HID//2),nn.ReLU(),
                                nn.Linear(HID//2,12))
        self.sp = nn.Softplus()
    def forward(self,x,xs,sw):
        h,_=self.rnn(torch.cat([x,xs],-1))
        h=torch.cat([h[:,-1],self.emb(sw)],1)
        return self.sp(self.head(h))
def build_frames(years, nonzero=True):
    X_seq,X_sea,y_seq,sw_idx=[],[],[],[]
    for sw,row in counts.iterrows():
        for Y in years:
            tr,tg=month_range(Y-2,Y-1),month_range(Y,Y)
            if nonzero and row[tr].sum()==0: continue
            X_seq.append(np.log1p(row[tr]).values[:,None])
            X_sea.append(sincos(tr)); y_seq.append(row[tg].values)
            sw_idx.append(sw2idx[sw])
    return dict(X_seq=np.stack(X_seq), X_season=np.stack(X_sea),
                y_seq=np.stack(y_seq), sw_idx=np.array(sw_idx,dtype=np.int64))
train_frames=build_frames(TRAIN_YEARS,True)
ds=WinDS(train_frames); perm=np.random.permutation(len(ds))
n_val=max(1,int(.1*len(ds)))
dl_tr=DataLoader(torch.utils.data.Subset(ds,perm[:-n_val]),BATCH,shuffle=True)
dl_va=DataLoader(torch.utils.data.Subset(ds,perm[-n_val:]),BATCH,shuffle=False)
model=PoissonLSTM(len(switches)).to(DEVICE)
loss_fn,opt=nn.PoissonNLLLoss(log_input=False),torch.optim.Adam(model.parameters(),lr=LR,weight_decay=WD)
best,bad,best_ep=1e9,0,0
for ep in range(1,EPOCHS+1):
    model.train()
    for xb,xs,yb,swb in dl_tr:
        xb,xs,yb,swb=[t.to(DEVICE) for t in (xb,xs,yb,swb)]
        opt.zero_grad(); loss_fn(model(xb,xs,swb),yb).backward()
        nn.utils.clip_grad_norm_(model.parameters(),CLIP); opt.step()
    with torch.no_grad():
        v=np.mean([loss_fn(model(x.to(DEVICE),xs.to(DEVICE),sw.to(DEVICE)),
                           y.to(DEVICE)).item() for x,xs,y,sw in dl_va])
    if v<best-1e-4: best,bad,best_ep,ckpt=v,0,ep,model.state_dict()
    else: bad+=1
    if bad>=PATIENCE: break
model.load_state_dict(ckpt)
log(f"Training finished in {best_ep} epochs | best val-loss={best:.4f}")

# ─── 3 · TARGET YEAR forecasts ──────────────────────────────────────────
log("\nSTEP 3: Generating forecasts for target year...")
eval_frames=build_frames([TARGET_YEAR],False)
mu,swids=[],[]
with torch.no_grad():
    for xb,xs,_,swb in DataLoader(WinDS(eval_frames),256,False):
        xb,xs,swb=[t.to(DEVICE) for t in (xb,xs,swb)]
        mu.append(model(xb,xs,swb).cpu().numpy())
        swids.append(swb.cpu().numpy())
MU,SWIDX=np.concatenate(mu),np.concatenate(swids)
QTRS=months_to_quarters(MU)
ym=month_range(TARGET_YEAR,TARGET_YEAR)
rows_q,rows_y=[],[]
for i,ix in enumerate(SWIDX):
    sw=switches[ix]
    rows_y.append(dict(SWITCH_ID=sw,YEAR=TARGET_YEAR,PRED_FULL_YEAR=float(MU[i].sum())))
    for qi,qsum in enumerate(QTRS[i],1):
        rows_q.append(dict(SWITCH_ID=sw,YEAR=TARGET_YEAR,QUARTER=qi,PRED_FAULTS_Q=float(qsum)))
q_preds = pd.DataFrame(rows_q)
y_preds = pd.DataFrame(rows_y)
log(f"Generated quarterly and yearly forecasts for {TARGET_YEAR}.")

# ─── 4 · PRE-PROCESSING FOR DYNAMIC FACTORS ─────────────────────────────
log("\nSTEP 4: Pre-processing cable master data for dynamic calculations...")
cables_master = pd.read_csv(CABLE_CSV, low_memory=False)
def parse_date_from_col(col_name, prefix):
    try:
        date_str = col_name.replace(f"{prefix}_Month_", "")
        return pd.to_datetime(date_str, format='%Y%m')
    except: return None
cycle_cols = {c: parse_date_from_col(c, "CYCLE") for c in cables_master.columns if c.startswith("CYCLE_Month_")}
var_cols = {c: parse_date_from_col(c, "VAR") for c in cables_master.columns if c.startswith("VAR_Month_")}
cycle_cols = {k: v for k, v in cycle_cols.items() if v is not None}
var_cols = {k: v for k, v in var_cols.items() if v is not None}
log(f"Found {len(cycle_cols)} monthly cycle columns and {len(var_cols)} monthly variation columns.")

# ─── 5 · QUARTERLY HEALTH SCORE WITH DYNAMIC LOADING ────────────────────
log("\nSTEP 5: Calculating Health Scores per Quarter with dynamic loading...")
QUARTERS = {
    "Q1_Jan-Mar": (1, 3), "Q2_Apr-Jun": (4, 6),
    "Q3_Jul-Sep": (7, 9), "Q4_Oct-Dec": (10, 12)
}
LABELS = {
    "c":"High cyclic loading (c)", "r":"Wide load-range utilisation (r)",
    "a":"Advanced cable age (a)", "f":"Many historic faults (f)",
    "s":"Numerous joints / segments (s)", "p":"High predicted faults per km (p)",
    "i":"Frequent interruptions (i)", "l":"Long circuit length (ℓ)",
    "u":"Faults in same quarter last year (u)"
}
OUT_DIR.mkdir(parents=True, exist_ok=True)

for i, (q_name, (start_month, end_month)) in enumerate(QUARTERS.items()):
    q_num = i + 1
    log(f"\n--- Processing {q_name} ({TARGET_YEAR}) ---")
    q_out_dir = OUT_DIR / q_name
    q_out_dir.mkdir(parents=True, exist_ok=True)
    q_start_date = pd.Timestamp(f"{TARGET_YEAR}-{start_month}-01")
    loading_start_date = q_start_date - pd.DateOffset(years=1)
    cab = cables_master.drop_duplicates("DESTINATION_SWITCH_ID").rename(columns={
        "DESTINATION_SWITCH_ID":"SWITCH_ID", "MEASUREDLENGTH":"LENGTH_M",
        "COMMISSIONEDDATE":"DATE_INSTALLED", "NO_OF_SEGMENT":"SEGMENTS"})
    cab["DATE_INSTALLED"] = pd.to_datetime(cab["DATE_INSTALLED"], errors="coerce", utc=True).dt.tz_localize(None)
    cab["LENGTH_KM"] = pd.to_numeric(cab["LENGTH_M"], errors="coerce") / 1000
    relevant_cycle_cols = [c for c, dt in cycle_cols.items() if loading_start_date <= dt < q_start_date]
    relevant_var_cols = [c for c, dt in var_cols.items() if loading_start_date <= dt < q_start_date]
    if relevant_cycle_cols: cab["cycle_pm"] = cab[relevant_cycle_cols].mean(axis=1)
    else: cab["cycle_pm"] = 0
    if relevant_var_cols:
        median_of_period = cab[relevant_var_cols].median(axis=1)
        cab["load_range_idx"] = cab[relevant_var_cols].mean(axis=1) / median_of_period.replace(0, np.nan)
    else: cab["load_range_idx"] = 0
    cab["c_raw"] = cab["cycle_pm"]; cab["r_raw"] = cab["load_range_idx"]
    hist = fault[fault["TIME_OUTAGE"] < q_start_date].groupby("SWITCH_ID").size()
    cab = cab.merge(hist.rename("hist_faults"), on="SWITCH_ID", how="left").fillna({"hist_faults": 0})
    cab["f_raw"] = cab["hist_faults"] / cab["LENGTH_KM"].replace(0, np.nan)
    q_pred_current = q_preds[q_preds['QUARTER'] == q_num]
    cab = cab.merge(q_pred_current[['SWITCH_ID', 'PRED_FAULTS_Q']], on="SWITCH_ID", how="left").fillna({"PRED_FAULTS_Q": 0})
    cab["p_raw"] = cab["PRED_FAULTS_Q"].replace(0, np.nan)
    fault_sorted = fault[fault["TIME_OUTAGE"] < q_start_date].sort_values(["SWITCH_ID", "TIME_OUTAGE"])
    fault_sorted["Δt_h"] = (fault_sorted.groupby("SWITCH_ID")["TIME_OUTAGE"].diff().dt.total_seconds().div(3600))
    mean_dt = fault_sorted.groupby("SWITCH_ID")["Δt_h"].mean().rename("mean_h")
    cab = cab.merge(mean_dt, on="SWITCH_ID", how="left")
    cab["i_raw"] = 1 / cab["mean_h"].clip(lower=1)
    last_year = TARGET_YEAR - 1
    same_q_faults_last_year = fault[
        (fault["TIME_OUTAGE"].dt.year == last_year) &
        (fault["TIME_OUTAGE"].dt.month.between(start_month, end_month))
    ].groupby("SWITCH_ID").size()
    cab = cab.merge(same_q_faults_last_year.rename("faults_same_q_last_yr"), on="SWITCH_ID", how="left").fillna({"faults_same_q_last_yr": 0})
    cab["u_raw"] = cab["faults_same_q_last_yr"]
    cab["a_raw"] = (q_start_date - cab["DATE_INSTALLED"]).dt.days / (EXPECTED_LIFE_YEARS * 365)
    cab["s_raw"] = (cab["SEGMENTS"].fillna(1) - 1).clip(lower=0)
    cab["l_raw"] = np.log1p(cab["LENGTH_KM"])
    for col in ["a_raw", "c_raw", "f_raw", "i_raw", "l_raw", "p_raw", "r_raw", "s_raw", "u_raw"]:
        cab[col[0]] = robust(cab[col])
    risk = sum(W[k] * cab[k] for k in W.keys())
    cab["health_score"] = np.rint(100 * (1 - risk)).clip(0, 100).astype(int)
    cab["health_score_10"] = np.clip(np.ceil(cab["health_score"] / 10), 1, 10).astype(int)
    cab["health_band"] = pd.cut(cab["health_score"], [-np.inf, 40, 70, 100], labels=["Poor", "Moderate", "Good"])
    for k, wv in W.items(): cab[f"weight_{k}"] = wv
    risk_contrib = pd.DataFrame({k: W[k] * cab[k] for k in W.keys()})
    top3 = risk_contrib.apply(lambda r: r.nlargest(3).index.tolist(), axis=1)
    cab["primary_health_driver"] = top3.apply(lambda lst: LABELS[lst[0]])
    cab["top3_health_drivers"] = top3.apply(lambda lst: "; ".join(LABELS[k] for k in lst))
    mask_nofault = cab["hist_faults"].eq(0)
    cab.loc[mask_nofault, ["health_score", "health_score_10"]] = [100, 10]
    cab.loc[mask_nofault, "health_band"] = "Good"
    cab.loc[mask_nofault, "primary_health_driver"] = "No recorded faults"
    cab.loc[mask_nofault, "top3_health_drivers"] = "No recorded faults"
    output_cols = [c for c in cab.columns if c not in cycle_cols and c not in var_cols]
    cab[output_cols].to_csv(q_out_dir / f"cable_health_{TARGET_YEAR}_{q_name}_scored.csv", index=False)
    cab[output_cols][~mask_nofault].to_csv(q_out_dir / f"cable_health_{TARGET_YEAR}_{q_name}_predictions.csv", index=False)
    log(f"Saved results for {len(cab)} cables to {q_out_dir.name}/")
    actual_faults_q = fault[(fault["TIME_OUTAGE"].dt.year == TARGET_YEAR) & (fault["TIME_OUTAGE"].dt.month.between(start_month, end_month))]
    actual_switches_q = actual_faults_q["SWITCH_ID"].unique()
    cab["ACTUAL_FAIL_Q"] = cab["SWITCH_ID"].isin(actual_switches_q).astype(int)
    pred = cab["health_band"].map({"Poor": 1, "Moderate": 1, "Good": 0})
    if len(cab["ACTUAL_FAIL_Q"].unique()) > 1:
        cm = confusion_matrix(cab["ACTUAL_FAIL_Q"], pred)
        au = roc_auc_score(cab["ACTUAL_FAIL_Q"], 100 - cab["health_score"])
        log(f"{q_name} confusion matrix:\n{cm}\nAUROC = {au:.3f}")
    else:
        log(f"Skipping validation for {q_name}: Only one class present in actuals.")

# --- Final Summary ---
total_predicted_faults = y_preds['PRED_FULL_YEAR'].sum()
total_actual_faults_2024 = fault[fault['TIME_OUTAGE'].dt.year == TARGET_YEAR].shape[0]

log("\n" + "="*45)
log(f"FLEET-WIDE FAULT SUMMARY FOR FULL YEAR {TARGET_YEAR}")
log("="*45)
log(f"Total Predicted Faults (Yearly): {total_predicted_faults:.1f}")
log(f"Total Actual Faults (Yearly):    {total_actual_faults_2024}")
log("="*45)
log(f"\nPipeline complete → Quarterly results saved in: {OUT_DIR.resolve()}")

16:46:49 | STEP 1: Processing fault history...
16:46:49 | Processed fault history for 210 switches.
16:46:49 | 
STEP 2: Building dataset and training Poisson-LSTM model...
16:46:52 | Training finished in 4 epochs | best val-loss=0.1891
16:46:52 | 
STEP 3: Generating forecasts for target year...
16:46:53 | Generated quarterly and yearly forecasts for 2024.
16:46:53 | 
STEP 4: Pre-processing cable master data for dynamic calculations...
16:46:53 | Found 0 monthly cycle columns and 0 monthly variation columns.
16:46:53 | 
STEP 5: Calculating Health Scores per Quarter with dynamic loading...
16:46:53 | 
--- Processing Q1_Jan-Mar (2024) ---
16:46:53 | Saved results for 256 cables to Q1_Jan-Mar/
16:46:53 | Q1_Jan-Mar confusion matrix:
[[165  69]
 [  5  17]]
AUROC = 0.781
16:46:53 | 
--- Processing Q2_Apr-Jun (2024) ---
16:46:53 | Saved results for 256 cables to Q2_Apr-Jun/
16:46:53 | Q2_Apr-Jun confusion matrix:
[[151  61]
 [ 17  27]]
AUROC = 0.727
16:46:53 | 
--- Processing Q3_Jul-Sep (2024

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HT-cable Poisson-LSTM forecast + 9-factor health score (2018-2024)
2025-08-19 — QUARTERLY HEALTH SCORE + AGGRESSIVE WEIGHTS
"""

from __future__ import annotations
import logging, math, sys, json, numpy as np, pandas as pd
import torch, torch.nn as nn
from pathlib import Path
from sklearn.metrics import confusion_matrix, roc_auc_score
from torch.utils.data import Dataset, DataLoader
from warnings import filterwarnings
filterwarnings("ignore", category=FutureWarning)

# ── NEW: plotting (headless-safe) ────────────────────────────────────────
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

# ─── paths ──────────────────────────────────────────────────────────────
FAULT_CSV = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/FAULT DATA/HT_fault_cable_info_processed_with_affected.csv")
CABLE_CSV = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/SWNO_MASTER_COMBINED_FULL_FINAL3.csv")
OUT_DIR   = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/A_fault_model_with_health_QUARTERLY")

# ─── constants / hyper-params ───────────────────────────────────────────
MIN_YEAR, TARGET_YEAR   = 2016, 2024
TRAIN_YEARS             = list(range(2018, TARGET_YEAR))
KEEP_VOLTAGES           = {22, 33}
EXPECTED_LIFE_YEARS     = 35

# Aggressive weights
W = dict(
    a=0.01500,  # Age (decreased)
    c=0.10000,  # Cycle Count (increased)
    f=0.155000,  # Historic Faults (increased)
    i=0.14000,  # MTBF (increased)
    l=0.15000,  # Length (decreased)
    p=0.25000,  # Predicted Faults (increased)
    r=0.06500,  # Daily Variation (increased)
    s=0.05000,  # Segments (decreased)
    u=0.10000,  # Same Qtr Last Year (decreased)
)

BATCH, EPOCHS, PATIENCE = 64, 40, 12
LR, WD, CLIP            = 1e-3, 1e-5, 1.0
HID, LAY, DROP, EMB     = 512, 2, 0.1, 16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(42); np.random.seed(42)

logging.basicConfig(format="%(asctime)s | %(message)s",
                    datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout)
log = logging.getLogger("pipeline").info

# ─── helpers ────────────────────────────────────────────────────────────
def v_to_num(v):
    try: return float(str(v).lower().replace("kv",""))
    except: return math.nan

def norm_sw(s):
    return (s.astype(str).str.upper().str.strip()
             .str.replace(r"^(SWNO_|SWNO|SW|S)\s*", "", regex=True)
             .str.replace(r"\D+","",regex=True)
             .replace("", np.nan)).astype("Int64")

def month_range(a,b): return pd.period_range(f"{a}-01", f"{b}-12", freq="M")

def sincos(idx):
    ang = 2*np.pi*idx.month.values/12
    return np.stack([np.sin(ang), np.cos(ang)], 1)

def robust(s, lo=5, hi=95):
    c=s.replace([np.inf,-np.inf],np.nan).dropna()
    if c.empty: return pd.Series(0., index=s.index)
    a,b=np.percentile(c,[lo,hi]); return ((s-a).clip(0,b-a)/(b-a+1e-9)).fillna(0.)

def months_to_quarters(mat):
    q=np.zeros((mat.shape[0],4))
    q[:,0]=mat[:,0:3].sum(1); q[:,1]=mat[:,3:6].sum(1)
    q[:,2]=mat[:,6:9].sum(1); q[:,3]=mat[:,9:12].sum(1)
    return q

# ── NEW: confusion-matrix plot helper ───────────────────────────────────
def save_confusion_matrix(cm: np.ndarray, save_path: Path, title: str):
    """Save a 2×2 confusion matrix as a blue heatmap with counts."""
    fig, ax = plt.subplots(figsize=(6, 5))
    im = ax.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
    fig.colorbar(im, ax=ax)

    ax.set(
        xticks=[0, 1], yticks=[0, 1],
        xticklabels=["Pred OK", "Pred Fail"],
        yticklabels=["Actual OK", "Actual Fail"],
        xlabel="Prediction", ylabel="Reality", title=title,
    )

    vmax = cm.max() if cm.size else 1
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, f"{int(cm[i, j])}",
                    ha="center", va="center",
                    color=("white" if cm[i, j] > vmax/2 else "black"),
                    fontsize=12)
    fig.tight_layout()
    fig.savefig(save_path, dpi=200, bbox_inches="tight")
    plt.close(fig)

# ─── 1 · fault history ──────────────────────────────────────────────────
log("STEP 1: Processing fault history...")
fault=pd.read_csv(FAULT_CSV,parse_dates=["TIME_OUTAGE"],low_memory=False)
sw_col="TO_SWITCH" if "TO_SWITCH" in fault.columns else fault.columns[0]
fault["SWITCH_ID"]=norm_sw(fault[sw_col])
fault=fault.dropna(subset=["SWITCH_ID","TIME_OUTAGE"])
fault["TIME_OUTAGE"] = fault["TIME_OUTAGE"].dt.tz_localize(None)
fault=fault[fault["TIME_OUTAGE"].dt.year.between(MIN_YEAR,TARGET_YEAR)]
if "VOLTAGE" in fault.columns:
    fault["VNUM"]=fault["VOLTAGE"].apply(v_to_num)
    fault=fault[fault["VNUM"].isin(KEEP_VOLTAGES)]
fault=fault.drop_duplicates(["SWITCH_ID","TIME_OUTAGE"])
fault["YM"]=fault["TIME_OUTAGE"].dt.to_period("M")
idx_full=month_range(MIN_YEAR,TARGET_YEAR)
counts=(fault[fault["TIME_OUTAGE"].dt.year<=TARGET_YEAR-1]
        .groupby(["SWITCH_ID","YM"]).size()
        .unstack(fill_value=0)
        .reindex(columns=idx_full,fill_value=0).astype(float))
switches=counts.index.tolist(); sw2idx={sw:i for i,sw in enumerate(switches)}
log(f"Processed fault history for {len(switches)} switches.")

# ─── 2 · Poisson-LSTM dataset & training ────────────────────────────────
log("\nSTEP 2: Building dataset and training Poisson-LSTM model...")
class WinDS(Dataset):
    def __init__(self, fr):
        self.x  = torch.tensor(fr["X_seq"],dtype=torch.float32)
        self.xs = torch.tensor(fr["X_season"],dtype=torch.float32)
        self.y  = torch.tensor(fr["y_seq"],dtype=torch.float32)
        self.sw = torch.tensor(fr["sw_idx"],dtype=torch.long)
    def __len__(self): return len(self.sw)
    def __getitem__(self,i): return self.x[i],self.xs[i],self.y[i],self.sw[i]

class PoissonLSTM(nn.Module):
    def __init__(self,n_sw):
        super().__init__()
        self.emb = nn.Embedding(n_sw, EMB)
        self.rnn = nn.LSTM(3,HID,LAY,batch_first=True,
                           dropout=DROP if LAY>1 else 0.)
        self.head=nn.Sequential(nn.Linear(HID+EMB,HID),nn.ReLU(),
                                nn.Linear(HID,HID//2),nn.ReLU(),
                                nn.Linear(HID//2,12))
        self.sp = nn.Softplus()
    def forward(self,x,xs,sw):
        h,_=self.rnn(torch.cat([x,xs],-1))
        h=torch.cat([h[:,-1],self.emb(sw)],1)
        return self.sp(self.head(h))

def build_frames(years, nonzero=True):
    X_seq,X_sea,y_seq,sw_idx=[],[],[],[]
    for sw,row in counts.iterrows():
        for Y in years:
            tr,tg=month_range(Y-2,Y-1),month_range(Y,Y)
            if nonzero and row[tr].sum()==0: continue
            X_seq.append(np.log1p(row[tr]).values[:,None])
            X_sea.append(sincos(tr)); y_seq.append(row[tg].values)
            sw_idx.append(sw2idx[sw])
    return dict(X_seq=np.stack(X_seq), X_season=np.stack(X_sea),
                y_seq=np.stack(y_seq), sw_idx=np.array(sw_idx,dtype=np.int64))

train_frames=build_frames(TRAIN_YEARS,True)
ds=WinDS(train_frames); perm=np.random.permutation(len(ds))
n_val=max(1,int(.1*len(ds)))
dl_tr=DataLoader(torch.utils.data.Subset(ds,perm[:-n_val]),BATCH,shuffle=True)
dl_va=DataLoader(torch.utils.data.Subset(ds,perm[-n_val:]),BATCH,shuffle=False)

model=PoissonLSTM(len(switches)).to(DEVICE)
loss_fn,opt=nn.PoissonNLLLoss(log_input=False),torch.optim.Adam(model.parameters(),lr=LR,weight_decay=WD)
best,bad,best_ep=1e9,0,0
for ep in range(1,EPOCHS+1):
    model.train()
    for xb,xs,yb,swb in dl_tr:
        xb,xs,yb,swb=[t.to(DEVICE) for t in (xb,xs,yb,swb)]
        opt.zero_grad(); loss_fn(model(xb,xs,swb),yb).backward()
        nn.utils.clip_grad_norm_(model.parameters(),CLIP); opt.step()
    with torch.no_grad():
        v=np.mean([loss_fn(model(x.to(DEVICE),xs.to(DEVICE),sw.to(DEVICE)),
                           y.to(DEVICE)).item() for x,xs,y,sw in dl_va])
    if v<best-1e-4: best,bad,best_ep,ckpt=v,0,ep,model.state_dict()
    else: bad+=1
    if bad>=PATIENCE: break
model.load_state_dict(ckpt)
log(f"Training finished in {best_ep} epochs | best val-loss={best:.4f}")

# ─── 3 · TARGET YEAR forecasts ──────────────────────────────────────────
log("\nSTEP 3: Generating forecasts for target year...")
eval_frames=build_frames([TARGET_YEAR],False)
mu,swids=[],[]
with torch.no_grad():
    for xb,xs,_,swb in DataLoader(WinDS(eval_frames),256,False):
        xb,xs,swb=[t.to(DEVICE) for t in (xb,xs,swb)]
        mu.append(model(xb,xs,swb).cpu().numpy())
        swids.append(swb.cpu().numpy())
MU,SWIDX=np.concatenate(mu),np.concatenate(swids)
QTRS=months_to_quarters(MU)
ym=month_range(TARGET_YEAR,TARGET_YEAR)

rows_q,rows_y=[],[]
for i,ix in enumerate(SWIDX):
    sw=switches[ix]
    rows_y.append(dict(SWITCH_ID=sw,YEAR=TARGET_YEAR,PRED_FULL_YEAR=float(MU[i].sum())))
    for qi,qsum in enumerate(QTRS[i],1):
        rows_q.append(dict(SWITCH_ID=sw,YEAR=TARGET_YEAR,QUARTER=qi,PRED_FAULTS_Q=float(qsum)))
q_preds = pd.DataFrame(rows_q)
y_preds = pd.DataFrame(rows_y)
log(f"Generated quarterly and yearly forecasts for {TARGET_YEAR}.")

# ─── 4 · PRE-PROCESSING FOR DYNAMIC FACTORS ─────────────────────────────
log("\nSTEP 4: Pre-processing cable master data for dynamic calculations...")
cables_master = pd.read_csv(CABLE_CSV, low_memory=False)

def parse_date_from_col(col_name, prefix):
    try:
        date_str = col_name.replace(f"{prefix}_Month_", "")
        return pd.to_datetime(date_str, format='%Y%m')
    except: return None

cycle_cols = {c: parse_date_from_col(c, "CYCLE") for c in cables_master.columns if c.startswith("CYCLE_Month_")}
var_cols   = {c: parse_date_from_col(c, "VAR")   for c in cables_master.columns if c.startswith("VAR_Month_")}
cycle_cols = {k: v for k, v in cycle_cols.items() if v is not None}
var_cols   = {k: v for k, v in var_cols.items() if v is not None}
log(f"Found {len(cycle_cols)} monthly cycle columns and {len(var_cols)} monthly variation columns.")

# ─── 5 · QUARTERLY HEALTH SCORE WITH DYNAMIC LOADING ────────────────────
log("\nSTEP 5: Calculating Health Scores per Quarter with dynamic loading...")
QUARTERS = {
    "Q1_Jan-Mar": (1, 3), "Q2_Apr-Jun": (4, 6),
    "Q3_Jul-Sep": (7, 9), "Q4_Oct-Dec": (10, 12)
}
LABELS = {
    "c":"High cyclic loading (c)", "r":"Wide load-range utilisation (r)",
    "a":"Advanced cable age (a)", "f":"Many historic faults (f)",
    "s":"Numerous joints / segments (s)", "p":"High predicted faults per km (p)",
    "i":"Frequent interruptions (i)", "l":"Long circuit length (ℓ)",
    "u":"Faults in same quarter last year (u)"
}
OUT_DIR.mkdir(parents=True, exist_ok=True)

for i, (q_name, (start_month, end_month)) in enumerate(QUARTERS.items()):
    q_num = i + 1
    log(f"\n--- Processing {q_name} ({TARGET_YEAR}) ---")
    q_out_dir = OUT_DIR / q_name
    q_out_dir.mkdir(parents=True, exist_ok=True)

    q_start_date = pd.Timestamp(f"{TARGET_YEAR}-{start_month}-01")
    loading_start_date = q_start_date - pd.DateOffset(years=1)

    cab = (cables_master.drop_duplicates("DESTINATION_SWITCH_ID")
           .rename(columns={"DESTINATION_SWITCH_ID":"SWITCH_ID",
                            "MEASUREDLENGTH":"LENGTH_M",
                            "COMMISSIONEDDATE":"DATE_INSTALLED",
                            "NO_OF_SEGMENT":"SEGMENTS"}))

    cab["DATE_INSTALLED"] = pd.to_datetime(cab["DATE_INSTALLED"], errors="coerce", utc=True).dt.tz_localize(None)
    cab["LENGTH_KM"] = pd.to_numeric(cab["LENGTH_M"], errors="coerce") / 1000

    relevant_cycle_cols = [c for c, dt in cycle_cols.items() if loading_start_date <= dt < q_start_date]
    relevant_var_cols   = [c for c, dt in var_cols.items()   if loading_start_date <= dt < q_start_date]

    if relevant_cycle_cols: cab["cycle_pm"] = cab[relevant_cycle_cols].mean(axis=1)
    else: cab["cycle_pm"] = 0

    if relevant_var_cols:
        median_of_period = cab[relevant_var_cols].median(axis=1)
        cab["load_range_idx"] = cab[relevant_var_cols].mean(axis=1) / median_of_period.replace(0, np.nan)
    else:
        cab["load_range_idx"] = 0

    cab["c_raw"] = cab["cycle_pm"]
    cab["r_raw"] = cab["load_range_idx"]

    # Historic faults per switch (before quarter)
    hist = fault[fault["TIME_OUTAGE"] < q_start_date].groupby("SWITCH_ID").size()
    cab = cab.merge(hist.rename("hist_faults"), on="SWITCH_ID", how="left").fillna({"hist_faults": 0})
    cab["f_raw"] = cab["hist_faults"] / cab["LENGTH_KM"].replace(0, np.nan)

    # Predicted quarterly faults
    q_pred_current = q_preds[q_preds['QUARTER'] == q_num]
    cab = cab.merge(q_pred_current[['SWITCH_ID', 'PRED_FAULTS_Q']], on="SWITCH_ID", how="left").fillna({"PRED_FAULTS_Q": 0})
    cab["p_raw"] = cab["PRED_FAULTS_Q"].replace(0, np.nan)

    # MTBF proxy (inverse mean inter-fault hours)
    fault_sorted = fault[fault["TIME_OUTAGE"] < q_start_date].sort_values(["SWITCH_ID", "TIME_OUTAGE"])
    fault_sorted["Δt_h"] = (fault_sorted.groupby("SWITCH_ID")["TIME_OUTAGE"].diff().dt.total_seconds().div(3600))
    mean_dt = fault_sorted.groupby("SWITCH_ID")["Δt_h"].mean().rename("mean_h")
    cab = cab.merge(mean_dt, on="SWITCH_ID", how="left")
    cab["i_raw"] = 1 / cab["mean_h"].clip(lower=1)

    # Same quarter last year
    last_year = TARGET_YEAR - 1
    same_q_faults_last_year = fault[
        (fault["TIME_OUTAGE"].dt.year == last_year) &
        (fault["TIME_OUTAGE"].dt.month.between(start_month, end_month))
    ].groupby("SWITCH_ID").size()
    cab = cab.merge(same_q_faults_last_year.rename("faults_same_q_last_yr"), on="SWITCH_ID", how="left").fillna({"faults_same_q_last_yr": 0})
    cab["u_raw"] = cab["faults_same_q_last_yr"]

    # Other factors
    cab["a_raw"] = (q_start_date - cab["DATE_INSTALLED"]).dt.days / (EXPECTED_LIFE_YEARS * 365)
    cab["s_raw"] = (cab["SEGMENTS"].fillna(1) - 1).clip(lower=0)
    cab["l_raw"] = np.log1p(cab["LENGTH_KM"])

    # Robust scaling to [0,1]
    for col in ["a_raw","c_raw","f_raw","i_raw","l_raw","p_raw","r_raw","s_raw","u_raw"]:
        cab[col[0]] = robust(cab[col])

    # Weighted risk and health
    risk = sum(W[k] * cab[k] for k in W.keys())
    cab["health_score"]    = np.rint(100 * (1 - risk)).clip(0, 100).astype(int)
    cab["health_score_10"] = np.clip(np.ceil(cab["health_score"] / 10), 1, 10).astype(int)
    cab["health_band"]     = pd.cut(cab["health_score"], [-np.inf, 40, 60, 100], labels=["Poor", "Moderate", "Good"])

    for k, wv in W.items(): cab[f"weight_{k}"] = wv
    risk_contrib = pd.DataFrame({k: W[k] * cab[k] for k in W.keys()})
    top3 = risk_contrib.apply(lambda r: r.nlargest(3).index.tolist(), axis=1)
    cab["primary_health_driver"] = top3.apply(lambda lst: LABELS[lst[0]])
    cab["top3_health_drivers"]   = top3.apply(lambda lst: "; ".join(LABELS[k] for k in lst))

    # No-fault switches → force Good
    mask_nofault = cab["hist_faults"].eq(0)
    cab.loc[mask_nofault, ["health_score", "health_score_10"]] = [100, 10]
    cab.loc[mask_nofault, "health_band"] = "Good"
    cab.loc[mask_nofault, "primary_health_driver"] = "No recorded faults"
    cab.loc[mask_nofault, "top3_health_drivers"]   = "No recorded faults"

    # Save scored tables
    output_cols = [c for c in cab.columns if c not in cycle_cols and c not in var_cols]
    cab[output_cols].to_csv(q_out_dir / f"cable_health_{TARGET_YEAR}_{q_name}_scored.csv", index=False)
    cab[output_cols][~mask_nofault].to_csv(q_out_dir / f"cable_health_{TARGET_YEAR}_{q_name}_predictions.csv", index=False)
    log(f"Saved results for {len(cab)} cables to {q_out_dir.name}/")

    # ── Validation labels and predictions ────────────────────────────────
    actual_faults_q = fault[
        (fault["TIME_OUTAGE"].dt.year == TARGET_YEAR) &
        (fault["TIME_OUTAGE"].dt.month.between(start_month, end_month))
    ]
    actual_switches_q = actual_faults_q["SWITCH_ID"].unique()
    cab["ACTUAL_FAIL_Q"] = cab["SWITCH_ID"].isin(actual_switches_q).astype(int)

    pred = cab["health_band"].map({"Poor": 1, "Moderate": 1, "Good": 0}).fillna(0)
    y_true = cab["ACTUAL_FAIL_Q"].astype(int).values
    y_pred = pred.astype(int).values

    # Always build 2×2 CM so we can save a plot even if a class is missing
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])  # [[TN, FP],[FN, TP]]
    cm_path = q_out_dir / f"confusion_matrix_{TARGET_YEAR}_{q_name}.png"
    save_confusion_matrix(cm, cm_path, title=f"{q_name} — Confusion Matrix")
    log(f"Saved confusion-matrix plot → {cm_path}")

    # AUROC only when both classes present
    if np.unique(y_true).size > 1:
        au = roc_auc_score(y_true, 100 - cab["health_score"])
        log(f"{q_name} AUROC = {au:.3f}")
    else:
        log(f"{q_name} AUROC skipped (only one class present)")

# --- Final Summary ---
total_predicted_faults = y_preds['PRED_FULL_YEAR'].sum()
total_actual_faults_2024 = fault[fault['TIME_OUTAGE'].dt.year == TARGET_YEAR].shape[0]

log("\n" + "="*45)
log(f"FLEET-WIDE FAULT SUMMARY FOR FULL YEAR {TARGET_YEAR}")
log("="*45)
log(f"Total Predicted Faults (Yearly): {total_predicted_faults:.1f}")
log(f"Total Actual Faults (Yearly):    {total_actual_faults_2024}")
log("="*45)
log(f"\nPipeline complete → Quarterly results saved in: {OUT_DIR.resolve()}")


16:43:38 | STEP 1: Processing fault history...


16:43:38 | Processed fault history for 210 switches.
16:43:38 | 
STEP 2: Building dataset and training Poisson-LSTM model...
16:43:41 | Training finished in 4 epochs | best val-loss=0.1891
16:43:41 | 
STEP 3: Generating forecasts for target year...
16:43:41 | Generated quarterly and yearly forecasts for 2024.
16:43:41 | 
STEP 4: Pre-processing cable master data for dynamic calculations...
16:43:41 | Found 0 monthly cycle columns and 0 monthly variation columns.
16:43:41 | 
STEP 5: Calculating Health Scores per Quarter with dynamic loading...
16:43:41 | 
--- Processing Q1_Jan-Mar (2024) ---
16:43:42 | Saved results for 256 cables to Q1_Jan-Mar/
16:43:42 | Saved confusion-matrix plot → /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/A_fault_model_with_health_QUARTERLY/Q1_Jan-Mar/confusion_matrix_2024_Q1_Jan-Mar.png
16:43:42 | Q1_Jan-Mar AUROC = 0.778
16:43:42 | 
--- Processing Q2_Apr-Jun (2024) ---
16:43:42 | Saved results for 256 cables to Q2_Apr-Jun/
16:43:42 | Saved confusion-matrix 

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HT-cable Poisson-LSTM forecast + Health Score with Machine Learning Weights
2025-08-20 — 6-MONTH PERIODS + DATA-DRIVEN WEIGHTS (LOGISTIC REGRESSION) - FULL SCRIPT
"""

from __future__ import annotations
import logging, math, sys, json, numpy as np, pandas as pd
import torch, torch.nn as nn
from pathlib import Path
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from warnings import filterwarnings
filterwarnings("ignore", category=FutureWarning)

# ─── paths ──────────────────────────────────────────────────────────────
FAULT_CSV = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/FAULT DATA/HT_fault_cable_info_processed_with_affected.csv")
CABLE_CSV = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/SWNO_MASTER_COMBINED_FULL_FINAL3.csv")
OUT_DIR = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/A_fault_model_with_health_ML_WEIGHTS")

# ─── constants / hyper-params ───────────────────────────────────────────
MIN_YEAR, TARGET_YEAR   = 2016, 2024
TRAIN_YEARS             = list(range(2018, TARGET_YEAR))
HISTORY_YEARS           = list(range(2018, TARGET_YEAR)) # Years to build ML training set
KEEP_VOLTAGES           = {22, 33}
EXPECTED_LIFE_YEARS     = 35

# NOTE: The static 'W' dictionary is now replaced by a machine learning model
BATCH, EPOCHS, PATIENCE = 64, 40, 12
LR, WD, CLIP            = 1e-3, 1e-5, 1.0
HID, LAY, DROP, EMB     = 512, 2, 0.1, 16
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
torch.manual_seed(42); np.random.seed(42)

logging.basicConfig(format="%(asctime)s | %(message)s",
                    datefmt="%H:%M:%S", level=logging.INFO, stream=sys.stdout)
log = logging.getLogger("pipeline").info

# ─── helper functions ───────────────────────────────────────────────────
def v_to_num(v):
    try: return float(str(v).lower().replace("kv",""))
    except: return math.nan
def norm_sw(s):
    return (s.astype(str).str.upper().str.strip()
             .str.replace(r"^(SWNO_|SWNO|SW|S)\s*", "", regex=True)
             .str.replace(r"\D+","",regex=True)
             .replace("", np.nan)).astype("Int64")
def month_range(a,b): return pd.period_range(f"{a}-01", f"{b}-12", freq="M")
def sincos(idx):
    ang = 2*np.pi*idx.month.values/12
    return np.stack([np.sin(ang), np.cos(ang)], 1)
def months_to_halves(mat):
    h=np.zeros((mat.shape[0],2))
    h[:,0]=mat[:,0:6].sum(1); h[:,1]=mat[:,6:12].sum(1)
    return h

# ─── 1 · fault history ──────────────────────────────────────────────────
log("STEP 1: Processing fault history...")
fault=pd.read_csv(FAULT_CSV,parse_dates=["TIME_OUTAGE"],low_memory=False)
sw_col="TO_SWITCH" if "TO_SWITCH" in fault.columns else fault.columns[0]
fault["SWITCH_ID"]=norm_sw(fault[sw_col])
fault=fault.dropna(subset=["SWITCH_ID","TIME_OUTAGE"])
fault["TIME_OUTAGE"] = fault["TIME_OUTAGE"].dt.tz_localize(None)
fault=fault[fault["TIME_OUTAGE"].dt.year.between(MIN_YEAR,TARGET_YEAR)]
if "VOLTAGE" in fault.columns:
    fault["VNUM"]=fault["VOLTAGE"].apply(v_to_num)
    fault=fault[fault["VNUM"].isin(KEEP_VOLTAGES)]
fault=fault.drop_duplicates(["SWITCH_ID","TIME_OUTAGE"])
fault["YM"]=fault["TIME_OUTAGE"].dt.to_period("M")
idx_full=month_range(MIN_YEAR,TARGET_YEAR)
counts=(fault[fault["TIME_OUTAGE"].dt.year<=TARGET_YEAR-1]
        .groupby(["SWITCH_ID","YM"]).size()
        .unstack(fill_value=0)
        .reindex(columns=idx_full,fill_value=0).astype(float))
switches=counts.index.tolist(); sw2idx={sw:i for i,sw in enumerate(switches)}
log(f"Processed fault history for {len(switches)} switches.")

# ─── 2 · Poisson-LSTM dataset & training ────────────────────────────────
log("\nSTEP 2: Building dataset and training Poisson-LSTM model...")
class WinDS(Dataset):
    def __init__(self, fr):
        self.x  = torch.tensor(fr["X_seq"],dtype=torch.float32)
        self.xs = torch.tensor(fr["X_season"],dtype=torch.float32)
        self.y  = torch.tensor(fr["y_seq"],dtype=torch.float32)
        self.sw = torch.tensor(fr["sw_idx"],dtype=torch.long)
    def __len__(self): return len(self.sw)
    def __getitem__(self,i): return self.x[i],self.xs[i],self.y[i],self.sw[i]
class PoissonLSTM(nn.Module):
    def __init__(self,n_sw):
        super().__init__()
        self.emb = nn.Embedding(n_sw, EMB)
        self.rnn = nn.LSTM(3,HID,LAY,batch_first=True, dropout=DROP if LAY>1 else 0.)
        self.head=nn.Sequential(nn.Linear(HID+EMB,HID),nn.ReLU(),
                                nn.Linear(HID,HID//2),nn.ReLU(),
                                nn.Linear(HID//2,12))
        self.sp = nn.Softplus()
    def forward(self,x,xs,sw):
        h,_=self.rnn(torch.cat([x,xs],-1)); h=torch.cat([h[:,-1],self.emb(sw)],1)
        return self.sp(self.head(h))
def build_frames(years, nonzero=True):
    X_seq,X_sea,y_seq,sw_idx=[],[],[],[]
    for sw,row in counts.iterrows():
        for Y in years:
            tr,tg=month_range(Y-2,Y-1),month_range(Y,Y)
            if nonzero and row[tr].sum()==0: continue
            X_seq.append(np.log1p(row[tr]).values[:,None])
            X_sea.append(sincos(tr)); y_seq.append(row[tg].values)
            sw_idx.append(sw2idx[sw])
    return dict(X_seq=np.stack(X_seq), X_season=np.stack(X_sea),
                y_seq=np.stack(y_seq), sw_idx=np.array(sw_idx,dtype=np.int64))
train_frames=build_frames(TRAIN_YEARS,True)
ds=WinDS(train_frames); perm=np.random.permutation(len(ds))
n_val=max(1,int(.1*len(ds)))
dl_tr=DataLoader(torch.utils.data.Subset(ds,perm[:-n_val]),BATCH,shuffle=True)
dl_va=DataLoader(torch.utils.data.Subset(ds,perm[-n_val:]),BATCH,shuffle=False)
model=PoissonLSTM(len(switches)).to(DEVICE)
loss_fn,opt=nn.PoissonNLLLoss(log_input=False),torch.optim.Adam(model.parameters(),lr=LR,weight_decay=WD)
best,bad,best_ep=1e9,0,0
for ep in range(1,EPOCHS+1):
    model.train()
    for xb,xs,yb,swb in dl_tr:
        xb,xs,yb,swb=[t.to(DEVICE) for t in (xb,xs,yb,swb)]
        opt.zero_grad(); loss_fn(model(xb,xs,swb),yb).backward()
        nn.utils.clip_grad_norm_(model.parameters(),CLIP); opt.step()
    with torch.no_grad():
        v=np.mean([loss_fn(model(x.to(DEVICE),xs.to(DEVICE),sw.to(DEVICE)),
                           y.to(DEVICE)).item() for x,xs,y,sw in dl_va])
    if v<best-1e-4: best,bad,best_ep,ckpt=v,0,ep,model.state_dict()
    else: bad+=1
    if bad>=PATIENCE: break
model.load_state_dict(ckpt)
log(f"Training finished in {best_ep} epochs | best val-loss={best:.4f}")

# ─── 3 · TARGET YEAR forecasts ──────────────────────────────────────────
log("\nSTEP 3: Generating forecasts for target year...")
eval_frames=build_frames([TARGET_YEAR],False)
mu,swids=[],[]
with torch.no_grad():
    for xb,xs,_,swb in DataLoader(WinDS(eval_frames),256,False):
        xb,xs,swb=[t.to(DEVICE) for t in (xb,xs,swb)]
        mu.append(model(xb,xs,swb).cpu().numpy()); swids.append(swb.cpu().numpy())
MU,SWIDX=np.concatenate(mu),np.concatenate(swids)
HALVES=months_to_halves(MU)
rows_h,rows_y=[],[]
for i,ix in enumerate(SWIDX):
    sw=switches[ix]
    rows_y.append(dict(SWITCH_ID=sw,YEAR=TARGET_YEAR,PRED_FULL_YEAR=float(MU[i].sum())))
    for hi,hsum in enumerate(HALVES[i],1):
        rows_h.append(dict(SWITCH_ID=sw,YEAR=TARGET_YEAR,HALF=hi, PRED_FAULTS_H=float(hsum)))
h_preds = pd.DataFrame(rows_h)
y_preds = pd.DataFrame(rows_y)
log(f"Generated semi-annual and yearly forecasts for {TARGET_YEAR}.")

# ─── 4 · PRE-PROCESSING ────────────────────────────────────────────────
log("\nSTEP 4: Pre-processing cable master data...")
cables_master = pd.read_csv(CABLE_CSV, low_memory=False)
def parse_date_from_col(col_name, prefix):
    try:
        date_str = col_name.replace(f"{prefix}_Month_", ""); return pd.to_datetime(date_str, format='%Y%m')
    except: return None
cycle_cols = {c: parse_date_from_col(c, "CYCLE") for c in cables_master.columns if c.startswith("CYCLE_Month_")}
var_cols = {c: parse_date_from_col(c, "VAR") for c in cables_master.columns if c.startswith("VAR_Month_")}
cycle_cols = {k: v for k, v in cycle_cols.items() if v is not None}
var_cols = {k: v for k, v in var_cols.items() if v is not None}
log(f"Found {len(cycle_cols)} monthly cycle columns and {len(var_cols)} monthly variation columns.")

# ─── 5. NEW: Train ML Model to Find Optimal Weights ───────────────────
def train_health_model(history_years, fault_df, cable_df, cycle_cols, var_cols):
    log("\nSTEP 5A: Building training set for health score model...")
    training_data = []
    PERIODS = {"H1": (1, 6), "H2": (7, 12)}
    feature_names = ['a_raw', 'c_raw', 'f_raw', 'i_raw', 'l_raw', 'p_raw', 'r_raw', 's_raw', 'u_raw']
    
    base_cab = cable_df.drop_duplicates("DESTINATION_SWITCH_ID").rename(columns={
        "DESTINATION_SWITCH_ID":"SWITCH_ID", "MEASUREDLENGTH":"LENGTH_M",
        "COMMISSIONEDDATE":"DATE_INSTALLED", "NO_OF_SEGMENT":"SEGMENTS"})
    base_cab["DATE_INSTALLED"] = pd.to_datetime(base_cab["DATE_INSTALLED"], errors="coerce", utc=True).dt.tz_localize(None)
    base_cab["LENGTH_KM"] = pd.to_numeric(base_cab["LENGTH_M"], errors="coerce") / 1000

    for year in history_years:
        for p_name, (start_month, end_month) in PERIODS.items():
            cab = base_cab.copy()
            p_start_date = pd.Timestamp(f"{year}-{start_month}-01")
            
            # --- Calculate all 9 factors for this historical period ---
            cab["a_raw"] = (p_start_date - cab["DATE_INSTALLED"]).dt.days / (EXPECTED_LIFE_YEARS * 365)
            hist = fault_df[fault_df["TIME_OUTAGE"] < p_start_date].groupby("SWITCH_ID").size()
            cab = cab.merge(hist.rename("hist_faults"), on="SWITCH_ID", how="left").fillna(0)
            cab["f_raw"] = cab["hist_faults"] / cab["LENGTH_KM"].replace(0, np.nan)
            fault_sorted = fault_df[fault_df["TIME_OUTAGE"] < p_start_date].sort_values(["SWITCH_ID", "TIME_OUTAGE"])
            fault_sorted["Δt_h"] = (fault_sorted.groupby("SWITCH_ID")["TIME_OUTAGE"].diff().dt.total_seconds().div(3600))
            mean_dt = fault_sorted.groupby("SWITCH_ID")["Δt_h"].mean().rename("mean_h")
            cab = cab.merge(mean_dt, on="SWITCH_ID", how="left"); cab["i_raw"] = 1 / cab["mean_h"].clip(lower=1)
            cab["s_raw"] = (cab["SEGMENTS"].fillna(1) - 1).clip(lower=0)
            cab["l_raw"] = np.log1p(cab["LENGTH_KM"])
            
            last_year = year - 1
            same_period_faults = fault_df[(fault_df["TIME_OUTAGE"].dt.year == last_year) & (fault_df["TIME_OUTAGE"].dt.month.between(start_month, end_month))].groupby("SWITCH_ID").size()
            cab = cab.merge(same_period_faults.rename("u_raw"), on="SWITCH_ID", how="left").fillna(0)
            
            actual_faults_in_period = fault_df[(fault_df["TIME_OUTAGE"].dt.year == year) & (fault_df["TIME_OUTAGE"].dt.month.between(start_month, end_month))].groupby("SWITCH_ID").size()
            cab = cab.merge(actual_faults_in_period.rename("p_raw"), on="SWITCH_ID", how="left").fillna(0)
            
            loading_start_date = p_start_date - pd.DateOffset(years=1)
            rel_cyc_cols = [c for c, dt in cycle_cols.items() if loading_start_date <= dt < p_start_date]
            rel_var_cols = [c for c, dt in var_cols.items() if loading_start_date <= dt < p_start_date]
            cab["c_raw"] = cab[rel_cyc_cols].mean(axis=1) if rel_cyc_cols else 0
            if rel_var_cols:
                median = cab[rel_var_cols].median(axis=1)
                cab["r_raw"] = cab[rel_var_cols].mean(axis=1) / median.replace(0, np.nan)
            else: cab["r_raw"] = 0
            
            faulty_switches = actual_faults_in_period.index.unique()
            cab["ACTUAL_FAIL"] = cab["SWITCH_ID"].isin(faulty_switches).astype(int)
            training_data.append(cab)

    training_df = pd.concat(training_data, ignore_index=True).fillna(0)
    
    log("STEP 5B: Training Logistic Regression model...")
    X_train = training_df[feature_names].replace([np.inf, -np.inf], 0)
    y_train = training_df["ACTUAL_FAIL"]
    
    scaler = StandardScaler(); X_train_scaled = scaler.fit_transform(X_train)
    model = LogisticRegression(class_weight='balanced', random_state=42, solver='liblinear', C=0.1)
    model.fit(X_train_scaled, y_train)
    
    feature_importance = pd.DataFrame({'feature': feature_names, 'importance': np.abs(model.coef_[0])})
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    log("Data-Driven Feature Importance:\n" + str(feature_importance.round(3)))
    
    return model, scaler, feature_importance

# ─── 6. MAIN EXECUTION with ML Model ────────────────────────────────────
model_ml, scaler, feature_importance = train_health_model(HISTORY_YEARS, fault, cables_master, cycle_cols, var_cols)

log("\nSTEP 6: Calculating Health Scores for {} using the trained ML model...".format(TARGET_YEAR))
PERIODS = {"H1_Jan-Jun": (1, 6), "H2_Jul-Dec": (7, 12)}
LABELS = {
    "a_raw":"Advanced cable age (a)", "c_raw":"High cyclic loading (c)", "f_raw":"Many historic faults (f)",
    "i_raw":"Frequent interruptions (i)", "l_raw":"Long circuit length (l)", "p_raw":"High predicted faults (p)",
    "r_raw":"Wide load-range utilisation (r)", "s_raw":"Numerous joints / segments (s)", "u_raw":"Faults in same period last year (u)"
}
base_cab = cables_master.drop_duplicates("DESTINATION_SWITCH_ID").rename(columns={
    "DESTINATION_SWITCH_ID":"SWITCH_ID", "MEASUREDLENGTH":"LENGTH_M",
    "COMMISSIONEDDATE":"DATE_INSTALLED", "NO_OF_SEGMENT":"SEGMENTS"})
base_cab["DATE_INSTALLED"] = pd.to_datetime(base_cab["DATE_INSTALLED"], errors="coerce", utc=True).dt.tz_localize(None)
base_cab["LENGTH_KM"] = pd.to_numeric(base_cab["LENGTH_M"], errors="coerce") / 1000

for i, (p_name, (start_month, end_month)) in enumerate(PERIODS.items()):
    p_num = i + 1; log(f"\n--- Processing {p_name} ({TARGET_YEAR}) ---")
    p_out_dir = OUT_DIR / p_name; p_out_dir.mkdir(parents=True, exist_ok=True)
    
    cab = base_cab.copy()
    p_start_date = pd.Timestamp(f"{TARGET_YEAR}-{start_month}-01")
    
    # --- Calculate all 9 factors for the target period ---
    cab["a_raw"] = (p_start_date - cab["DATE_INSTALLED"]).dt.days/(EXPECTED_LIFE_YEARS*365)
    hist = fault[fault["TIME_OUTAGE"]<p_start_date].groupby("SWITCH_ID").size()
    cab = cab.merge(hist.rename("hist_faults"),on="SWITCH_ID",how="left").fillna(0)
    cab["f_raw"]=cab["hist_faults"]/cab["LENGTH_KM"].replace(0,np.nan)
    fault_sorted=fault[fault["TIME_OUTAGE"]<p_start_date].sort_values(["SWITCH_ID","TIME_OUTAGE"])
    fault_sorted["Δt_h"]=(fault_sorted.groupby("SWITCH_ID")["TIME_OUTAGE"].diff().dt.total_seconds().div(3600))
    mean_dt=fault_sorted.groupby("SWITCH_ID")["Δt_h"].mean().rename("mean_h")
    cab=cab.merge(mean_dt,on="SWITCH_ID",how="left");cab["i_raw"]=1/cab["mean_h"].clip(lower=1)
    cab["s_raw"]=(cab["SEGMENTS"].fillna(1)-1).clip(lower=0)
    cab["l_raw"]=np.log1p(cab["LENGTH_KM"])
    last_year=TARGET_YEAR-1
    same_period_faults=fault[(fault["TIME_OUTAGE"].dt.year==last_year)&(fault["TIME_OUTAGE"].dt.month.between(start_month,end_month))].groupby("SWITCH_ID").size()
    cab=cab.merge(same_period_faults.rename("u_raw"),on="SWITCH_ID",how="left").fillna(0)
    h_pred_current=h_preds[h_preds['HALF']==p_num]
    cab=cab.merge(h_pred_current[['SWITCH_ID','PRED_FAULTS_H']],on="SWITCH_ID",how="left",).fillna({"PRED_FAULTS_H":0})
    cab["p_raw"]=cab["PRED_FAULTS_H"]
    loading_start_date=p_start_date-pd.DateOffset(years=1)
    rel_cyc_cols=[c for c,dt in cycle_cols.items() if loading_start_date<=dt<p_start_date]
    rel_var_cols=[c for c,dt in var_cols.items() if loading_start_date<=dt<p_start_date]
    cab["c_raw"]=cab[rel_cyc_cols].mean(axis=1) if rel_cyc_cols else 0
    if rel_var_cols:
        median=cab[rel_var_cols].median(axis=1); cab["r_raw"]=cab[rel_var_cols].mean(axis=1)/median.replace(0,np.nan)
    else: cab["r_raw"]=0
    
    # --- Predict with ML model ---
    features = ['a_raw', 'c_raw', 'f_raw', 'i_raw', 'l_raw', 'p_raw', 'r_raw', 's_raw', 'u_raw']
    X_target = cab[features].fillna(0).replace([np.inf, -np.inf], 0)
    X_target_scaled = scaler.transform(X_target)
    
    fail_probability = model_ml.predict_proba(X_target_scaled)[:, 1]
    cab["health_score"] = np.rint(100 * (1 - fail_probability)).clip(0, 100).astype(int)
    
    # --- Analysis & Output ---
    cab["health_band"] = pd.cut(cab["health_score"], [-np.inf, 40, 70, 100], labels=["Poor", "Moderate", "Good"])
    risk_contrib = pd.DataFrame(X_target_scaled * model_ml.coef_[0], columns=features)
    cab["primary_health_driver"]=risk_contrib.idxmax(axis=1).map(LABELS)
    
    mask_nofault = cab["hist_faults"].eq(0)
    cab.loc[mask_nofault, "health_score"] = 100
    cab.loc[mask_nofault, "health_band"] = "Good"
    cab.loc[mask_nofault, "primary_health_driver"] = "No recorded faults"

    cab.to_csv(p_out_dir / f"cable_health_{TARGET_YEAR}_{p_name}_scored.csv", index=False)
    log(f"Saved results for {len(cab)} cables to {p_out_dir.name}/")
    
    actual_faults_p = fault[(fault["TIME_OUTAGE"].dt.year == TARGET_YEAR) & (fault["TIME_OUTAGE"].dt.month.between(start_month, end_month))]
    actual_switches_p = actual_faults_p["SWITCH_ID"].unique()
    cab["ACTUAL_FAIL_H"] = cab["SWITCH_ID"].isin(actual_switches_p).astype(int)
    pred = cab["health_band"].map({"Poor": 1, "Moderate": 1, "Good": 0})
    if len(cab["ACTUAL_FAIL_H"].unique()) > 1:
        cm = confusion_matrix(cab["ACTUAL_FAIL_H"], pred)
        au = roc_auc_score(cab["ACTUAL_FAIL_H"], 100 - cab["health_score"])
        log(f"{p_name} confusion matrix:\n{cm}\nAUROC = {au:.3f}")
    else:
        log(f"Skipping validation for {p_name}: Only one class present in actuals.")

# --- Final Summary ---
total_predicted_faults = y_preds['PRED_FULL_YEAR'].sum()
total_actual_faults_2024 = fault[fault['TIME_OUTAGE'].dt.year == TARGET_YEAR].shape[0]

log("\n" + "="*45)
log(f"FLEET-WIDE FAULT SUMMARY FOR FULL YEAR {TARGET_YEAR}")
log("="*45)
log(f"Total Predicted Faults (Yearly): {total_predicted_faults:.1f}")
log(f"Total Actual Faults (Yearly):    {total_actual_faults_2024}")
log("="*45)
log(f"\nPipeline complete → ML-driven results saved in: {OUT_DIR.resolve()}")

16:41:03 | STEP 1: Processing fault history...
16:41:03 | Processed fault history for 210 switches.
16:41:03 | 
STEP 2: Building dataset and training Poisson-LSTM model...
16:41:07 | Training finished in 4 epochs | best val-loss=0.1891
16:41:07 | 
STEP 3: Generating forecasts for target year...
16:41:08 | Generated semi-annual and yearly forecasts for 2024.
16:41:08 | 
STEP 4: Pre-processing cable master data...
16:41:08 | Found 0 monthly cycle columns and 0 monthly variation columns.
16:41:08 | 
STEP 5A: Building training set for health score model...
16:41:08 | STEP 5B: Training Logistic Regression model...
16:41:08 | Data-Driven Feature Importance:
  feature  importance
5   p_raw       4.519
7   s_raw       0.127
8   u_raw       0.094
4   l_raw       0.066
2   f_raw       0.044
3   i_raw       0.011
0   a_raw       0.007
1   c_raw       0.000
6   r_raw       0.000
16:41:08 | 
STEP 6: Calculating Health Scores for 2024 using the trained ML model...
16:41:08 | 
--- Processing H1_Jan-J