FILTER 11KV BUT  INCLUDE ALL THE VALUE OF 11KV BECAUSE OF 11Kv, 11KV, 11kV

In [44]:
# feeder_trace_latest_audit_with_rank.py
"""
Workflow
========
1. Load HTCABLE.csv, drop unused columns, remove fully-identical rows.
2. Trace every feeder edge‑by‑edge, annotate with RANK (distance from feeder start).
3. Load ENERGYAUDIT.csv, for each transformer (FUNC_LOC) compute:
   * LATEST_DT_DATE  → most‑recent SYSTEM_DATE
   * DT_LOAD         → average MD_KVA across all rows
4. Merge audit stats onto trace (DESTINATION_LOCATION = FUNC_LOC).
"""

from __future__ import annotations
import pandas as pd
from pathlib import Path
from typing import Dict, Tuple, List, Set, Optional

# ── CONFIG ────────────────────────────────────────────────────────────────────
INPUT_HT      = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/ht_cleaned.csv"
INPUT_ENERGY  = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/energyaudit_cleaned.csv"
OUTPUT_PATH   = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_with_DT.csv"


FEEDER_ID_COL  = "FEEDERID"
SRC_SWITCH_COL = "SOURCE_SWITCH_ID"
DST_SWITCH_COL = "DESTINATION_SWITCH_ID"
SRC_LOC_COL    = "SOURCE_SSFL"
DST_LOC_COL    = "DESTINATION_SSFL"   # ≡ FUNC_LOC in audit

FUNC_LOC_COL = "FUNC_LOC"
DATE_COL     = "SYSTEM_DATE"
LOAD_COL     = "MD_KVA"

REDUNDANT_COLS = [
    "COMMENTS", "GLOBALID", "MEASUREDLENGTH", "UNNAMED: 0", "OBJECTID"
]
# ─────────────────────────────────────────────────────────────────────────────

# 1️  LOAD & CLEAN HT-CABLE ---------------------------------------------------
ht_path = Path(INPUT_HT).expanduser()

ht = pd.read_csv(ht_path, low_memory=False)
ht = ht.drop(columns=[c for c in REDUNDANT_COLS if c in ht.columns], errors="ignore")
ht = ht.drop_duplicates()  # remove fully-identical rows

# helper to pull token after 2nd underscore
def _feeder_token(val: str | int | float | None) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.split("_")
    return p[2] if len(p) >= 3 and (p[1] == '11kV' or p[1]=='11Kv' or p[1]=='11KV') else None




# def extract_feeder_id(value: str | int | float | None) -> Optional[str]:
#     """
#     Return the token after the 2nd underscore only if the middle token is '11kV'.
#     Example: 'AMBVLI_11kV_19556' ➜ '19556'
#     """
#     if not isinstance(value, str):
#         value = str(value) if value is not None else ""
#     parts = value.split("_")
#     if len(parts) >= 3 :
#         return parts[2]
#     return None



ht["FEEDER_ID"] = ht[FEEDER_ID_COL].apply(_feeder_token)

for col in [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]:
    ht[col] = ht[col].astype(str)

edge_cols = [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]
source_idx: Dict[Tuple[str, str], pd.DataFrame] = {
    (k[0], k[1]): g[edge_cols]
    for k, g in ht.groupby([SRC_LOC_COL, "FEEDER_ID"], sort=False)
}

# 2️  FEEDER TRACER (with RANK) -----------------------------------------------
from collections import deque

def trace_feeder(fid: str) -> list:
    rows = []
    visited = set()
    feeder_edges = ht[ht["FEEDER_ID"] == fid][[SRC_LOC_COL, DST_LOC_COL, SRC_SWITCH_COL, DST_SWITCH_COL]].copy()
    feeder_edges[SRC_LOC_COL] = feeder_edges[SRC_LOC_COL].astype(str).str.strip()
    feeder_edges[DST_LOC_COL] = feeder_edges[DST_LOC_COL].astype(str).str.strip()

    from_loc_map = {}
    for _, r in feeder_edges.iterrows():
        from_loc_map.setdefault(r[SRC_LOC_COL], []).append(
            tuple(r[c] for c in [SRC_LOC_COL, DST_LOC_COL, SRC_SWITCH_COL, DST_SWITCH_COL])
        )

    all_from = set(feeder_edges[SRC_LOC_COL])
    all_to = set(feeder_edges[DST_LOC_COL])
    root_candidates = (all_from - all_to) or all_from or set(feeder_edges[SRC_LOC_COL].unique())

    all_edges = set((row[SRC_LOC_COL], row[DST_LOC_COL]) for _, row in feeder_edges.iterrows())
    unvisited_edges = all_edges - visited

    from collections import deque
    while unvisited_edges:
        # Find the next root (or any remaining edge)
        found = False
        for root in root_candidates:
            start_rows = [e for e in feeder_edges.to_records(index=False)
                          if e[0] == root and (e[0], e[1]) in unvisited_edges]
            if start_rows:
                found = True
                break
        if not found:
            # Just pick any edge not yet visited
            start_rows = [e for e in feeder_edges.to_records(index=False)
                          if (e[0], e[1]) in unvisited_edges]
            if not start_rows:
                break
        queue = deque()
        for srow in start_rows:
            queue.append((srow, 0))
        while queue:
            (src_loc, dst_loc, src_sw, dst_sw), rank = queue.popleft()
            if (src_loc, dst_loc) in visited:
                continue
            visited.add((src_loc, dst_loc))
            unvisited_edges.discard((src_loc, dst_loc))
            rows.append({
                "FEEDER_ID": fid,
                "FROM_TO": f"{src_sw}-{dst_sw}",
                "FROM_SWITCH": src_sw,
                "TO_SWITCH": dst_sw,
                "SOURCE_LOCATION": src_loc,
                "DESTINATION_LOCATION": dst_loc,
                "RANK": rank
            })
            for next_edge in from_loc_map.get(dst_loc, []):
                if (next_edge[0], next_edge[1]) not in visited:
                    queue.append((next_edge, rank + 1))
    return rows

# 3️  TRACE ALL FEEDERS -------------------------------------------------------
all_edges: List[dict] = []
feeder_ids = [str(f) for f in ht["FEEDER_ID"].dropna().unique()]
print(f"Tracing {len(feeder_ids)} feeders …")
for i, fid in enumerate(feeder_ids, 1):
    if i % 100 == 0 or i in {1, len(feeder_ids)}:
        print(f"  → {i}/{len(feeder_ids)}: {fid}")
    all_edges.extend(trace_feeder(fid))

trace_df = pd.DataFrame(all_edges)

# 4️ LOAD ENERGY-AUDIT & AGGREGATE -----------------------------------------
audit_path = Path(INPUT_ENERGY).expanduser()
if not audit_path.exists():
    raise FileNotFoundError(audit_path)

print("\nLoading energy-audit …")
audit = pd.read_csv(audit_path, low_memory=False, parse_dates=[DATE_COL])
audit.columns = [c.upper() for c in audit.columns]

audit[DATE_COL] = pd.to_datetime(audit[DATE_COL], errors="coerce")

audit = audit[[FUNC_LOC_COL, DATE_COL, LOAD_COL]].dropna(subset=[FUNC_LOC_COL])

agg = (audit.groupby(FUNC_LOC_COL)
           .agg(LATEST_DT_DATE=(DATE_COL, "max"),
                DT_LOAD=(LOAD_COL,  "mean"))
           .reset_index())
agg[FUNC_LOC_COL] = agg[FUNC_LOC_COL].astype(str)

# 5️  MERGE TRACE ← AUDIT -----------------------------------------------------
merged = (trace_df.merge(agg, how="left",
                 left_on="DESTINATION_LOCATION",
                 right_on=FUNC_LOC_COL).drop(columns=[FUNC_LOC_COL]))

merged["LATEST_DT_DATE"] = pd.to_datetime(merged["LATEST_DT_DATE"]).dt.date

# 6️  EXPORT ------------------------------------------------------------------
cols = ["FEEDER_ID", "FROM_TO", "SOURCE_LOCATION", "DESTINATION_LOCATION", "RANK", "LATEST_DT_DATE", "DT_LOAD"]
merged.to_csv(OUTPUT_PATH, index=False, columns=cols)
print(f"\nSaved {len(merged):,} rows → {OUTPUT_PATH}")

if __name__ == "__main__":
    try:
        from IPython.display import display
        display(merged.head())
    except Exception:
        pass


Tracing 1424 feeders …
  → 1/1424: 15454
  → 100/1424: 41897
  → 200/1424: 28223
  → 300/1424: 39624
  → 400/1424: 41709
  → 500/1424: 31267
  → 600/1424: 28674
  → 700/1424: 35873
  → 800/1424: 30135
  → 900/1424: 18093
  → 1000/1424: 30031
  → 1100/1424: 03101
  → 1200/1424: 35038
  → 1300/1424: 30886
  → 1400/1424: 19090
  → 1424/1424: BUSPT

Loading energy-audit …

Saved 16,844 rows → /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_with_DT.csv


Unnamed: 0,FEEDER_ID,FROM_TO,FROM_SWITCH,TO_SWITCH,SOURCE_LOCATION,DESTINATION_LOCATION,RANK,LATEST_DT_DATE,DT_LOAD
0,15454,15454-38196,15454,38196,1S-MH-MU-ZST-RSTN-24TH,1S-MH-MU-ZST-CL02-1238,0,2025-04-04,127.425882
1,15454,38195-34116,38195,34116,1S-MH-MU-ZST-CL02-1238,1S-MH-MU-ZST-CL02-0894,1,2025-04-04,233.628927
2,15454,38197-DT,38197,DT,1S-MH-MU-ZST-CL02-1238,1S-MH-MU-ZST-CL02-1238,1,2025-04-04,127.425882
3,15454,34114-32764,34114,32764,1S-MH-MU-ZST-CL02-0894,1S-MH-MU-ZST-CL01-0860,2,2025-04-04,134.062123
4,15454,34115-DT,34115,DT,1S-MH-MU-ZST-CL02-0894,1S-MH-MU-ZST-CL02-0894,2,2025-04-04,233.628927


In [2]:
def extract_mid_token(val: str | int | float | None) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.split("_")
    return p[1] if len(p) >= 2 else None

unique_mid_tokens = ht[FEEDER_ID_COL].apply(extract_mid_token).unique()
print("Unique mid tokens from FEEDER_ID_COL:", unique_mid_tokens)

Unique mid tokens from FEEDER_ID_COL: [None '11KV' '11kV' 'GOR0552' 'BOR00552' 'BOR00952' 'SAK1152' 'SAK0452'
 '33KV' '33kV' '22kV' 'GOI00152' '11Kv' 'REACTOR' '22KV' 'VER00152'
 'GHO01452' 'GHO01652' 'AAR01352' 'AAR01552' 'GOI00352' 'GOI00852'
 'CHE00952' 'CHE00152' 'CHE00552' 'CHE00652' 'DHN00352' '40973' '40974'
 '40976' '40977' '40978' '40980' '33360' 'AAR00452' 'AAR00552' 'AAR00652'
 'AAR00852' 'AAR01152' 'BOR00152' 'BOR00752' 'CHE00252' 'CHE00852'
 'CHE01052' 'GHD00752' 'GHD00952' 'GHD1052' 'GOI00252' 'GOI00752'
 'GOR00252' 'GOR00752' 'GOR01052' 'SAK00752' 'SAK00952' 'SAK01252'
 'VER00652' 'VER00752' 'VER00852' 'VER01052' 'VER01352' '33Kv' 'BOR00352'
 'BOR01652' 'VER01452' 'VER01552' 'SAK00652' 'SAK00352' 'GOI00652'
 'GOR00452' 'GOR00152']


In [3]:
def extract_mid_token(val: str | int | float | None) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.split("_")
    return p[0] if len(p) >= 2 else None

unique_mid_tokens = ht[FEEDER_ID_COL].apply(extract_mid_token).unique()
print("Unique mid tokens from FEEDER_ID_COL:", unique_mid_tokens)

Unique mid tokens from FEEDER_ID_COL: [None '24THRD' 'AAR01752' 'AAR01852' 'AAR01952' 'AAR1252' 'AAR1452' 'ACRO'
 'AMBVLI' 'ANDHRI' 'ANIK' 'ARY220' 'ARY' 'BANDRA' 'BBLWDI' 'BHAVANS'
 'BHAYW' 'BHYNDR' 'BKC' 'BNDRTE' 'BORIVLI' 'BORVLI' 'BOR' 'CAMA' 'CHAKALA'
 'CHBNDR' 'CHDNGR' 'CHDVLI' 'CHE' 'CHMBUR' 'CHMBU' 'CHMB' 'CHUNA' 'CHVALI'
 'CPWDMAREC' 'DAHICHNKA' 'DAHISRW' 'DAHISR' 'DEVIDAS' 'DHA' 'DINDO'
 'ERANGL' 'ESIC' 'GHO00452' 'GHOD' 'GKLDHM' 'GNSHNG' 'GODREJBKC'
 'GOI00452' 'GOI220' 'GOR220' 'GORAI' 'GOREG' 'HCC' 'HINGWALA' 'HIRANANDA'
 'HIRANA' 'HULL' 'JANKALYAN' 'JBNGR' 'JUHUN' 'JUHU' 'KADAMWADI' 'KALANR'
 'KALINA' 'KALPATARU' 'KANA' 'KANDI' 'KHAR' 'KIE' 'KOHINR' 'KURLA' 'KURL'
 'LKHWLA' 'MAHANANDA' 'MAHULSRA' 'MAKERS' 'MALAD' 'MAL' 'MANK' 'MAROL'
 'MBI00152' 'MBI00652' 'MBO00152' 'MBR00152' 'MBR00252' 'MBR00552'
 'MBR00752' 'MBR00852' 'MGHWDI' 'MHADAMANK' 'MHADASAH' 'MHADSAH' 'MIDC'
 'MINDSP' 'MIRA' 'MMRDA' 'MNR00152' 'MTR00152' 'MTR00252' 'MTR00352'
 'MVR00152' 'NAHAR SHAKTI DSS' 'NA

In [6]:
def extract_mid_token(val: str | int | float | None) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.split("_")
    return p[2] if len(p) >= 3 else None

unique_mid_tokens = ht[FEEDER_ID_COL].apply(extract_mid_token).unique()
print("Unique mid tokens from FEEDER_ID_COL:", unique_mid_tokens)

Unique mid tokens from FEEDER_ID_COL: [None '15454' '15451' ... '40877' '40878' 'BUSPT']


REMOVE DT AND USE 11KV VOATGE FILE


In [None]:
# feeder_trace_latest_audit_with_rank_updated.py
"""
Workflow
========
1. Load HTCABLE.csv, drop unused columns, remove fully‑identical rows.
2. Trace every feeder edge‑by‑edge, annotate with RANK (distance from feeder start).
3. Load ENERGYAUDIT.csv, for each transformer (FUNC_LOC) compute:
   * LATEST_DT_DATE  → most‑recent SYSTEM_DATE
   * DT_LOAD         → average MD_KVA across all rows
4. Merge audit stats onto trace (DESTINATION_LOCATION = FUNC_LOC).
"""

from __future__ import annotations
import pandas as pd
from pathlib import Path
from typing import Dict, Tuple, List, Set, Optional
import re

# ── CONFIG ────────────────────────────────────────────────────────────────────
INPUT_HT      = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/ht_cleaned.csv"
INPUT_ENERGY  = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/energyaudit_cleaned.csv"
# FEEDER_LIST_PATH = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/FEEDERDETAILS.csv"
OUTPUT_PATH = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT_connected.csv"

FEEDER_ID_COL  = "FEEDERID"
SRC_SWITCH_COL = "SOURCE_SWITCH_ID"
DST_SWITCH_COL = "DESTINATION_SWITCH_ID"
SRC_LOC_COL    = "SOURCE_SSFL"
DST_LOC_COL    = "DESTINATION_SSFL"   # ≡ FUNC_LOC in audit

FUNC_LOC_COL = "FUNC_LOC"
DATE_COL     = "SYSTEM_DATE"
LOAD_COL     = "MD_KVA"

REDUNDANT_COLS = [

]
# ─────────────────────────────────────────────────────────────────────────────

# 1️  LOAD & CLEAN HT-CABLE ---------------------------------------------------
ht_path = Path(INPUT_HT).expanduser()

ht = pd.read_csv(ht_path, low_memory=False)
ht = ht.drop(columns=[c for c in REDUNDANT_COLS if c in ht.columns], errors="ignore")
ht = ht.drop_duplicates()  # remove fully-identical rows

# helper to pull token after 2nd underscore
# ── helper to pull token after the 2nd underscore ────────────────────────────
def _feeder_token(val: str | int | float | None) -> Optional[str]:
    """
    Extract the FEEDER_ID part (token‑3 in strings like XXX_11kV_000123…)
    and drop any *leading* ‘0’ characters from that token.
    """
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    parts = val.split("_")

    # keep only rows whose middle token really marks 11 kV
    if len(parts) < 3 or parts[1].upper() != "11KV":
        return None

    token = parts[2].lstrip("0")        # ← strips leading zeros
    return token if token else None     # keep None instead of empty string

ht["FEEDER_ID"] = ht[FEEDER_ID_COL].apply(_feeder_token).astype(str).str.strip()

for col in [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]:
    ht[col] = ht[col].astype(str)

edge_cols = [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]
source_idx: Dict[Tuple[str, str], pd.DataFrame] = {
    (k[0], k[1]): g[edge_cols]
    for k, g in ht.groupby([SRC_LOC_COL, "FEEDER_ID"], sort=False)
}
#   feeder_edges = ht[ht["FEEDER_ID"] == fid][[SRC_LOC_COL, DST_LOC_COL, SRC_SWITCH_COL, DST_SWITCH_COL]].copy()
#     feeder_edges[SRC_LOC_COL] = feeder_edges[SRC_LOC_COL].astype(str).str.strip()
#     feeder_edges[DST_LOC_COL] = feeder_edges[DST_LOC_COL].astype(str).str.strip()
ht[FEEDER_ID_COL] = ht["FEEDER_ID"].astype(str).str.strip()
# 2️  FEEDER TRACER (with RANK) -----------------------------------------------
def trace_feeder(fid: str) -> List[dict]:
    rows: List[dict] = []
    visited: Set[Tuple[str, str]] = set()
    
    # queue holds tuples: (edge_tuple, rank)
    start = ht[(ht[SRC_SWITCH_COL] == fid) & (ht["FEEDER_ID"] == fid)][edge_cols]
    queue = [(row, 0) for row in start.to_records(index=False).tolist()]  # (edge, rank)
  
    while queue:
        (from_sw, to_sw, src_loc, dst_loc), rank = queue.pop(0)
        if (from_sw, to_sw) in visited:
            continue
        visited.add((from_sw, to_sw))

        rows.append({
            "FEEDER_ID": fid,
            "FROM_TO": f"{from_sw}-{to_sw}",
            "SOURCE_LOCATION": src_loc,
            "DESTINATION_LOCATION": dst_loc,
            "RANK": rank  # Level in the feeder tree
        })

        nxt = source_idx.get((dst_loc, fid))
        if nxt is not None and not nxt.empty:
            # Each downstream edge gets rank+1
            queue.extend([(row, rank + 1) for row in nxt.to_records(index=False).tolist()])

    return rows

# 3️  TRACE ALL FEEDERS -------------------------------------------------------
all_edges: List[dict] = []
feeder_ids = [str(f) for f in ht["FEEDER_ID"].dropna().unique()]
print(f"Tracing {len(feeder_ids)} feeders …")
for i, fid in enumerate(feeder_ids, 1):
    if i % 100 == 0 or i in {1, len(feeder_ids)}:
        print(f"  -> {i}/{len(feeder_ids)}: {fid}")
    all_edges.extend(trace_feeder(fid))

trace_df = pd.DataFrame(all_edges)

# 4️ LOAD ENERGY-AUDIT & AGGREGATE -----------------------------------------
audit_path = Path(INPUT_ENERGY).expanduser()
if not audit_path.exists():
    raise FileNotFoundError(audit_path)

print("\nLoading energy-audit …")
audit = pd.read_csv(audit_path, low_memory=False, parse_dates=[DATE_COL])
audit.columns = [c.upper() for c in audit.columns]

audit[DATE_COL] = pd.to_datetime(audit[DATE_COL], errors="coerce")

audit = audit[[FUNC_LOC_COL, DATE_COL, LOAD_COL]].dropna(subset=[FUNC_LOC_COL])

agg = (audit.groupby(FUNC_LOC_COL)
           .agg(LATEST_DT_DATE=(DATE_COL, "max"),
                DT_LOAD=(LOAD_COL,  "mean"))
           .reset_index())
agg[FUNC_LOC_COL] = agg[FUNC_LOC_COL].astype(str)

# 5️  MERGE TRACE ← AUDIT -----------------------------------------------------
merged = (trace_df.merge(agg, how="left",
                 left_on="DESTINATION_LOCATION",
                 right_on=FUNC_LOC_COL).drop(columns=[FUNC_LOC_COL]))

merged["LATEST_DT_DATE"] = pd.to_datetime(merged["LATEST_DT_DATE"]).dt.date

# Add LOCATION column as a copy of DESTINATION_LOCATION
merged["LOCATION"] = merged["DESTINATION_LOCATION"]

# KEEP ONLY ROWS WHERE FROM_TO IS xxxx-yyyy BOTH NUMERIC
# def from_to_is_numeric(s):
#     match = re.fullmatch(r'(\d+)-(\d+)', str(s))
#     return bool(match)
# merged = merged[merged['FROM_TO'].apply(from_to_is_numeric)]

merged_raw = merged.copy()        \

# ── 1. FROM_TO numeric ----------------------------------------
def from_to_is_numeric(s):
    return bool(re.fullmatch(r'(\d+)-(\d+)', str(s)))

mask_numeric  = merged_raw['FROM_TO'].apply(from_to_is_numeric)
removed_rows  = merged_raw.loc[~mask_numeric]     # non numeric rows
kept_rows     = merged_raw.loc[mask_numeric]      # numeric rows
# ── 2. comparision of before and after FEEDER_ID  --------------------------
all_feeders_before = set(merged_raw['FEEDER_ID'].dropna().unique())
all_feeders_after  = set(kept_rows['FEEDER_ID'].dropna().unique())
lost_feeders       = sorted(all_feeders_before - all_feeders_after)

print(f"Lost feeders: {len(lost_feeders)}")

# ── 3. LOST feeders full data  --------------------------
lost_data = merged_raw[merged_raw['FEEDER_ID'].isin(lost_feeders)].copy()

lost_ids_path   = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeder_ids.csv"
lost_data_path  = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeders_full_data.csv"
# LOST_FEEDER_ID 
pd.Series(lost_feeders, name="LOST_FEEDER_ID").to_csv(lost_ids_path, index=False)
# lost_data 
lost_data.to_csv(lost_data_path, index=False)

print(f"Lost feeder IDs saved  -> {lost_ids_path}")
print(f"Full data for lost feeders saved -> {lost_data_path}")

# ── 4.  kept_rows ------------------------
merged = kept_rows.copy()       

# 6️  EXPORT ------------------------------------------------------------------
cols = ["FEEDER_ID", "FROM_TO", "SOURCE_LOCATION", "DESTINATION_LOCATION", "LOCATION", "RANK", "LATEST_DT_DATE", "DT_LOAD"]
merged.to_csv(OUTPUT_PATH, index=False, columns=cols)
print(f"\nSaved {len(merged):,} rows -> {OUTPUT_PATH}")

if __name__ == "__main__":
    try:
        from IPython.display import display
        display(merged.head())
    except Exception:
        pass 

Tracing 1425 feeders …
  -> 1/1425: None
  -> 100/1425: 41896
  -> 200/1425: 1339
  -> 300/1425: 39622
  -> 400/1425: 41588
  -> 500/1425: 31266
  -> 600/1425: 28673
  -> 700/1425: 35872
  -> 800/1425: 30134
  -> 900/1425: 18092
  -> 1000/1425: 27084
  -> 1100/1425: 36251
  -> 1200/1425: 35037
  -> 1300/1425: 30885
  -> 1400/1425: 19088
  -> 1425/1425: BUSPT

Loading energy-audit …
Lost feeders: 301
Lost feeder IDs saved  -> /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeder_ids.csv
Full data for lost feeders saved -> /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeders_full_data.csv

Saved 8,699 rows -> /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT_connected.csv


Unnamed: 0,FEEDER_ID,FROM_TO,SOURCE_LOCATION,DESTINATION_LOCATION,RANK,LATEST_DT_DATE,DT_LOAD,LOCATION
0,15454,15454-38196,1S-MH-MU-ZST-RSTN-24TH,1S-MH-MU-ZST-CL02-1238,0,2025-04-04,127.425882,1S-MH-MU-ZST-CL02-1238
1,15454,38195-34116,1S-MH-MU-ZST-CL02-1238,1S-MH-MU-ZST-CL02-0894,1,2025-04-04,233.628927,1S-MH-MU-ZST-CL02-0894
3,15454,34114-32764,1S-MH-MU-ZST-CL02-0894,1S-MH-MU-ZST-CL01-0860,2,2025-04-04,134.062123,1S-MH-MU-ZST-CL01-0860
5,15454,32766-31556,1S-MH-MU-ZST-CL01-0860,1S-MH-MU-ZST-CL02-0815,3,2025-04-04,245.58709,1S-MH-MU-ZST-CL02-0815
7,15454,31555-4467,1S-MH-MU-ZST-CL02-0815,1S-MH-MU-ZST-CL02-0054,4,2025-04-04,364.48599,1S-MH-MU-ZST-CL02-0054


In [27]:
df = pd.read_csv(OUTPUT_PATH)
col = df['FEEDER_ID'].unique()
print("Unique FEEDER_ID values:", len(col))

Unique FEEDER_ID values: 945


REMOVING LEADING 00 FROM THE FEEDERID AND SWITCH_IDS and retain also diconnected switches
and map all the link by source sfl and find all root note and then do bfs


In [4]:
# feeder_trace_latest_audit_with_rank_updated.py
"""
Workflow
========
1. Load HTCABLE.csv, drop unused columns, remove fully‑identical rows.
2. Trace every feeder edge‑by‑edge, annotate with RANK (distance from feeder start).
3. Load ENERGYAUDIT.csv, for each transformer (FUNC_LOC) compute:
   * LATEST_DT_DATE  → most‑recent SYSTEM_DATE
   * DT_LOAD         → average MD_KVA across all rows
4. Merge audit stats onto trace (DESTINATION_LOCATION = FUNC_LOC).
"""

from __future__ import annotations
import pandas as pd
from pathlib import Path
from typing import Dict, Tuple, List, Set, Optional
import re

# ── CONFIG ────────────────────────────────────────────────────────────────────
INPUT_HT      = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/ht_cleaned.csv"
INPUT_ENERGY  = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/energyaudit_cleaned.csv"
# FEEDER_LIST_PATH = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/FEEDERDETAILS.csv"
OUTPUT_PATH = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT_disconnected_component.csv"

FEEDER_ID_COL  = "FEEDERID"
SRC_SWITCH_COL = "SOURCE_SWITCH_ID"
DST_SWITCH_COL = "DESTINATION_SWITCH_ID"
SRC_LOC_COL    = "SOURCE_SSFL"
DST_LOC_COL    = "DESTINATION_SSFL"   # ≡ FUNC_LOC in audit

FUNC_LOC_COL = "FUNC_LOC"
DATE_COL     = "SYSTEM_DATE"
LOAD_COL     = "MD_KVA"

REDUNDANT_COLS = [

]
# ─────────────────────────────────────────────────────────────────────────────

# 1️  LOAD & CLEAN HT-CABLE ---------------------------------------------------
ht_path = Path(INPUT_HT).expanduser()

ht = pd.read_csv(ht_path, low_memory=False)
ht = ht.drop(columns=[c for c in REDUNDANT_COLS if c in ht.columns], errors="ignore")
# ht = ht.drop_duplicates()  # remove fully-identical rows

# helper to pull token after 2nd underscore
def _feeder_token(val: str | int | float | None) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.split("_")
    return p[2] if len(p) >= 3 and (p[1] == '11kV' or p[1]=='11Kv' or p[1]=='11KV')  else None

ht["FEEDER_ID"] = ht[FEEDER_ID_COL].apply(_feeder_token)

for col in [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]:
    ht[col] = ht[col].astype(str)

edge_cols = [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]
source_idx: Dict[Tuple[str, str], pd.DataFrame] = {
    (k[0], k[1]): g[edge_cols]
    for k, g in ht.groupby([SRC_LOC_COL, "FEEDER_ID"], sort=False)
}

# 2️  FEEDER TRACER (with RANK) -----------------------------------------------from collections import deque
from collections import deque

def trace_feeder(fid: str) -> list:
    rows = []
    visited = set()
    feeder_edges = ht[ht["FEEDER_ID"] == fid][[SRC_LOC_COL, DST_LOC_COL, SRC_SWITCH_COL, DST_SWITCH_COL]].copy()
    feeder_edges[SRC_LOC_COL] = feeder_edges[SRC_LOC_COL].astype(str).str.strip()
    feeder_edges[DST_LOC_COL] = feeder_edges[DST_LOC_COL].astype(str).str.strip()

    from_loc_map = {}
    for _, r in feeder_edges.iterrows():
        from_loc_map.setdefault(r[SRC_LOC_COL], []).append(
            tuple(r[c] for c in [SRC_LOC_COL, DST_LOC_COL, SRC_SWITCH_COL, DST_SWITCH_COL])
        )

    all_from = set(feeder_edges[SRC_LOC_COL])
    all_to = set(feeder_edges[DST_LOC_COL])
    root_candidates = (all_from - all_to) or all_from or set(feeder_edges[SRC_LOC_COL].unique())

    all_edges = set((row[SRC_LOC_COL], row[DST_LOC_COL]) for _, row in feeder_edges.iterrows())
    unvisited_edges = all_edges - visited

    from collections import deque
    while unvisited_edges:
        # Find the next root (or any remaining edge)
        found = False
        for root in root_candidates:
            start_rows = [e for e in feeder_edges.to_records(index=False)
                          if e[0] == root and (e[0], e[1]) in unvisited_edges]
            if start_rows:
                found = True
                break
        if not found:
            # Just pick any edge not yet visited
            start_rows = [e for e in feeder_edges.to_records(index=False)
                          if (e[0], e[1]) in unvisited_edges]
            if not start_rows:
                break
        queue = deque()
        for srow in start_rows:
            queue.append((srow, 0))
        while queue:
            (src_loc, dst_loc, src_sw, dst_sw), rank = queue.popleft()
            if (src_loc, dst_loc) in visited:
                continue
            visited.add((src_loc, dst_loc))
            unvisited_edges.discard((src_loc, dst_loc))
            rows.append({
                "FEEDER_ID": fid,
                "FROM_TO": f"{src_sw}-{dst_sw}",
                "SOURCE_LOCATION": src_loc,
                "DESTINATION_LOCATION": dst_loc,
                "RANK": rank
            })
            for next_edge in from_loc_map.get(dst_loc, []):
                if (next_edge[0], next_edge[1]) not in visited:
                    queue.append((next_edge, rank + 1))
    return rows


# 3️  TRACE ALL FEEDERS -------------------------------------------------------
all_edges: List[dict] = []
feeder_ids = [str(f) for f in ht["FEEDER_ID"].dropna().unique()]
print(f"Tracing {len(feeder_ids)} feeders …")
for i, fid in enumerate(feeder_ids, 1):
    if i % 100 == 0 or i in {1, len(feeder_ids)}:
        print(f"  -> {i}/{len(feeder_ids)}: {fid}")
    all_edges.extend(trace_feeder(fid))

trace_df = pd.DataFrame(all_edges)

# 4️ LOAD ENERGY-AUDIT & AGGREGATE -----------------------------------------
audit_path = Path(INPUT_ENERGY).expanduser()
if not audit_path.exists():
    raise FileNotFoundError(audit_path)

print("\nLoading energy-audit …")
audit = pd.read_csv(audit_path, low_memory=False, parse_dates=[DATE_COL])
audit.columns = [c.upper() for c in audit.columns]

audit[DATE_COL] = pd.to_datetime(audit[DATE_COL], errors="coerce")

audit = audit[[FUNC_LOC_COL, DATE_COL, LOAD_COL]].dropna(subset=[FUNC_LOC_COL])

agg = (audit.groupby(FUNC_LOC_COL)
           .agg(LATEST_DT_DATE=(DATE_COL, "max"),
                DT_LOAD=(LOAD_COL,  "mean"))
           .reset_index())
agg[FUNC_LOC_COL] = agg[FUNC_LOC_COL].astype(str)

# 5️  MERGE TRACE ← AUDIT -----------------------------------------------------
merged = (trace_df.merge(agg, how="left",
                 left_on="DESTINATION_LOCATION",
                 right_on=FUNC_LOC_COL).drop(columns=[FUNC_LOC_COL]))

merged["LATEST_DT_DATE"] = pd.to_datetime(merged["LATEST_DT_DATE"]).dt.date

# Add LOCATION column as a copy of DESTINATION_LOCATION
merged["LOCATION"] = merged["DESTINATION_LOCATION"]

# KEEP ONLY ROWS WHERE FROM_TO IS xxxx-yyyy BOTH NUMERIC
# def from_to_is_numeric(s):
#     match = re.fullmatch(r'(\d+)-(\d+)', str(s))
#     return bool(match)
# merged = merged[merged['FROM_TO'].apply(from_to_is_numeric)]

merged_raw = merged.copy()        \

# ── 1. FROM_TO numeric ----------------------------------------
def from_to_is_numeric(s):
    return bool(re.fullmatch(r'(\d+)-(\d+)', str(s)))

mask_numeric  = merged_raw['FROM_TO'].apply(from_to_is_numeric)
removed_rows  = merged_raw.loc[~mask_numeric]     # non numeric rows
kept_rows     = merged_raw.loc[mask_numeric]      # numeric rows
# ── 2. comparision of before and after FEEDER_ID  --------------------------
all_feeders_before = set(merged_raw['FEEDER_ID'].dropna().unique())
all_feeders_after  = set(kept_rows['FEEDER_ID'].dropna().unique())
lost_feeders       = sorted(all_feeders_before - all_feeders_after)

print(f"Lost feeders: {len(lost_feeders)}")

# ── 3. LOST feeders full data  --------------------------
lost_data = merged_raw[merged_raw['FEEDER_ID'].isin(lost_feeders)].copy()

lost_ids_path   = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeder_ids.csv"
lost_data_path  = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeders_full_data.csv"
# LOST_FEEDER_ID 
pd.Series(lost_feeders, name="LOST_FEEDER_ID").to_csv(lost_ids_path, index=False)
# lost_data 
lost_data.to_csv(lost_data_path, index=False)

print(f"Lost feeder IDs saved  -> {lost_ids_path}")
print(f"Full data for lost feeders saved -> {lost_data_path}")

# ── 4.  kept_rows ------------------------
merged = kept_rows.copy()       

# 6️  EXPORT ------------------------------------------------------------------
cols = ["FEEDER_ID", "FROM_TO", "SOURCE_LOCATION", "DESTINATION_LOCATION", "LOCATION", "RANK", "LATEST_DT_DATE", "DT_LOAD"]
merged.to_csv(OUTPUT_PATH, index=False, columns=cols)
print(f"\nSaved {len(merged):,} rows -> {OUTPUT_PATH}")

if __name__ == "__main__":
    try:
        from IPython.display import display
        display(merged.head())
    except Exception:
        pass 

Tracing 1424 feeders …
  -> 1/1424: 15454
  -> 100/1424: 41897
  -> 200/1424: 28223
  -> 300/1424: 39624
  -> 400/1424: 41709
  -> 500/1424: 31267
  -> 600/1424: 28674
  -> 700/1424: 35873
  -> 800/1424: 30135
  -> 900/1424: 18093
  -> 1000/1424: 30031
  -> 1100/1424: 03101
  -> 1200/1424: 35038
  -> 1300/1424: 30886
  -> 1400/1424: 19090
  -> 1424/1424: BUSPT

Loading energy-audit …
Lost feeders: 304
Lost feeder IDs saved  -> /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeder_ids.csv
Full data for lost feeders saved -> /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeders_full_data.csv

Saved 9,308 rows -> /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/final_two_column_with_rank_11_withoutDT_disconnected_component.csv


Unnamed: 0,FEEDER_ID,FROM_TO,SOURCE_LOCATION,DESTINATION_LOCATION,RANK,LATEST_DT_DATE,DT_LOAD,LOCATION
0,15454,15454-38196,1S-MH-MU-ZST-RSTN-24TH,1S-MH-MU-ZST-CL02-1238,0,2025-04-04,127.425882,1S-MH-MU-ZST-CL02-1238
1,15454,38195-34116,1S-MH-MU-ZST-CL02-1238,1S-MH-MU-ZST-CL02-0894,1,2025-04-04,233.628927,1S-MH-MU-ZST-CL02-0894
3,15454,34114-32764,1S-MH-MU-ZST-CL02-0894,1S-MH-MU-ZST-CL01-0860,2,2025-04-04,134.062123,1S-MH-MU-ZST-CL01-0860
5,15454,32766-31556,1S-MH-MU-ZST-CL01-0860,1S-MH-MU-ZST-CL02-0815,3,2025-04-04,245.58709,1S-MH-MU-ZST-CL02-0815
7,15454,31555-4467,1S-MH-MU-ZST-CL02-0815,1S-MH-MU-ZST-CL02-0054,4,2025-04-04,364.48599,1S-MH-MU-ZST-CL02-0054


In [32]:
df = pd.read_csv(OUTPUT_PATH)
col = df['FEEDER_ID'].unique()
print("Unique FEEDER_ID values:", len(col))

Unique FEEDER_ID values: 1120


rank logic new with connected component


In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
feeder_trace_latest_audit_with_rank_updated.py
---------------------------------------------

Outputs a clean, *single‑component* feeder trace with hierarchical RANK labels,
joined to energy‑audit stats and sorted feeder‑by‑feeder by that RANK.
"""

from __future__ import annotations

import sys
import re
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Tuple, Set, Optional

import pandas as pd

# bump recursion depth so deep radials do not crash
sys.setrecursionlimit(200_000)

# ── FILE PATHS ───────────────────────────────────────────────────────────────
INPUT_HT = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/ht_cleaned.csv"
INPUT_EN = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/energyaudit_cleaned.csv"
OUTPUT    = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/11_SORTED.csv"

# debug dumps (unchanged)
LOST_IDS_OUT  = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeder_ids.csv"
LOST_DATA_OUT = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeders_full_data.csv"

# ── COLUMN NAMES ─────────────────────────────────────────────────────────────
FEEDER_ID_COL  = "FEEDERID"
SRC_SWITCH_COL = "SOURCE_SWITCH_ID"
DST_SWITCH_COL = "DESTINATION_SWITCH_ID"
SRC_LOC_COL    = "SOURCE_SSFL"
DST_LOC_COL    = "DESTINATION_SSFL"   # ≡ FUNC_LOC in audit

FUNC_LOC_COL = "FUNC_LOC"             # audit
DATE_COL     = "SYSTEM_DATE"
LOAD_COL     = "MD_KVA"

edge_cols = [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]

# ─────────────────────────────────────────────────────────────────────────────
print("► 1. load & prep ht_cleaned.csv")
ht = (
    pd.read_csv(Path(INPUT_HT).expanduser(), low_memory=False)
      .drop_duplicates()
)

def _feeder_token(val) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.upper().split("_")
    return p[2].lstrip("0") if len(p) >= 3 and p[1] == "11KV" else None

ht["FEEDER_ID"] = ht[FEEDER_ID_COL].apply(_feeder_token)

for c in [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]:
    ht[c] = ht[c].astype(str)

# ── 2. build adjacency (deduped) ─────────────────────────────────────────────
def _adjacency(df: pd.DataFrame, fid: str):
    sub = df[df["FEEDER_ID"] == fid][edge_cols].drop_duplicates()
    adj: Dict[str, List[Tuple]] = defaultdict(list)
    for tup in map(tuple, sub.to_records(index=False)):
        adj[tup[2]].append(tup)                       # tup[2] = SRC_LOC
    for k in adj:                                     # deterministic order
        adj[k].sort(key=lambda e: (e[1], e[3]))       # by DEST_SWITCH, DEST_LOC
    return adj

# ── 3. trace only the component rooted at SOURCE_SWITCH == FEEDER_ID ─────────
def trace_feeder(fid: str) -> List[dict]:
    adj = _adjacency(ht, fid)

    # roots == rows where SOURCE_SWITCH_ID == FEEDER_ID
    roots = [
        tuple(t) for t in
        ht[(ht[SRC_SWITCH_COL] == fid) & (ht["FEEDER_ID"] == fid)]
        [edge_cols].to_records(index=False)
    ]
    if not roots:                   # no such row → feeder is malformed → skip
        return []

    visited: Set[Tuple] = set()
    rows:    List[dict] = []
    side_counter: Dict[str, int] = {}
    global_idx = 0

    def dfs(edge: Tuple, prefix: str, spine: bool):
        nonlocal global_idx
        if edge in visited:
            return                    # loop guard
        visited.add(edge)

        f_sw, t_sw, s_loc, d_loc = edge

        # hierarchical RANK assignment
        if prefix == "":
            global_idx += 1
            rank = str(global_idx)
        elif spine:
            *base, last = map(int, prefix.split("."))
            rank = ".".join([*map(str, base), str(last + 1)]) if base else str(last + 1)
        else:
            n = side_counter[prefix] = side_counter.get(prefix, 0) + 1
            rank = f"{prefix}.{n}"

        rows.append({
            "FEEDER_ID": fid,
            "FROM_TO": f"{f_sw}-{t_sw}",
            "SOURCE_LOCATION": s_loc,
            "DESTINATION_LOCATION": d_loc,
            "RANK": rank,
        })

        kids = adj.get(d_loc, [])
        if not kids:
            return
        first, *rest = kids
        dfs(first,  rank, True)       # continue spine
        for ch in rest:               # side branches
            dfs(ch,  rank, False)

    # walk every root (there could be >1 if the feeder splits immediately)
    for r in roots:
        dfs(r, "", True)

    return rows

# ── 4. collect traces for every feeder ───────────────────────────────────────
print("► 2. trace feeders")
traces: List[dict] = []
for i, fid in enumerate(ht["FEEDER_ID"].dropna().unique(), 1):
    if i % 100 == 0 or i == 1:
        print(f"    {i}  FEEDER {fid}")
    traces.extend(trace_feeder(str(fid)))

trace_df = pd.DataFrame(traces)

# ── 5. merge energy‑audit stats ──────────────────────────────────────────────
print("► 3. merge audit")
audit = pd.read_csv(Path(INPUT_EN).expanduser(),
                    low_memory=False,
                    parse_dates=[DATE_COL])
audit.columns = [c.upper() for c in audit.columns]
audit[DATE_COL] = pd.to_datetime(audit[DATE_COL], errors="coerce")
agg = (audit[[FUNC_LOC_COL, DATE_COL, LOAD_COL]]
          .dropna(subset=[FUNC_LOC_COL])
          .groupby(FUNC_LOC_COL)
          .agg(LATEST_DT_DATE=(DATE_COL, "max"), DT_LOAD=(LOAD_COL, "mean"))
          .reset_index())
agg[FUNC_LOC_COL] = agg[FUNC_LOC_COL].astype(str)

merged = (trace_df
          .merge(agg, how="left",
                 left_on="DESTINATION_LOCATION",
                 right_on=FUNC_LOC_COL)
          .drop(columns=[FUNC_LOC_COL]))
merged["LATEST_DT_DATE"] = pd.to_datetime(merged["LATEST_DT_DATE"]).dt.date
merged["LOCATION"] = merged["DESTINATION_LOCATION"]

# ── 6. keep only FROM_TO = digits‑digits & write debug info ─────────────────
def _digdig(s): return bool(re.fullmatch(r"\d+-\d+", str(s)))
mask = merged["FROM_TO"].apply(_digdig)
kept  = merged[mask]
dropped = merged[~mask]

lost = sorted(set(merged["FEEDER_ID"]) - set(kept["FEEDER_ID"]))
pd.Series(lost, name="LOST_FEEDER_ID").to_csv(LOST_IDS_OUT, index=False)
dropped.to_csv(LOST_DATA_OUT, index=False)

# # ── 7. sort by RANK within each feeder ───────────────────────────────────────
def _rank_key(r: str) -> Tuple[int, ...]:
    return tuple(int(x) for x in r.split("."))

kept["_RKEY"] = kept["RANK"].map(_rank_key)
kept = kept.sort_values(by=["FEEDER_ID", "_RKEY"]).drop(columns="_RKEY")


# ── 8. export CSV ────────────────────────────────────────────────────────────
cols_out = ["FEEDER_ID", "FROM_TO",
            "SOURCE_LOCATION", "DESTINATION_LOCATION", "LOCATION",
            "RANK", "LATEST_DT_DATE", "DT_LOAD"]
kept.to_csv(OUTPUT, index=False, columns=cols_out)
print(f"► 4. saved {len(kept):,} rows → {OUTPUT}")

if __name__ == "__main__":          # show a quick peek when run in notebooks
    try:
        from IPython.display import display
        display(kept.head(15))
    except Exception:
        pass


► 1. load & prep ht_cleaned.csv
► 2. trace feeders
    1  FEEDER 15454
    100  FEEDER 41897
    200  FEEDER 28223
    300  FEEDER 39624
    400  FEEDER 41709
    500  FEEDER 31267
    600  FEEDER 28674
    700  FEEDER 35873
    800  FEEDER 30135
    900  FEEDER 18093
    1000  FEEDER 30031
    1100  FEEDER 3101
    1200  FEEDER 35038
    1300  FEEDER 30886
    1400  FEEDER 19090
► 3. merge audit
► 4. saved 8,764 rows → /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/11_SORTED.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kept["_RKEY"] = kept["RANK"].map(_rank_key)


Unnamed: 0,FEEDER_ID,FROM_TO,SOURCE_LOCATION,DESTINATION_LOCATION,RANK,LATEST_DT_DATE,DT_LOAD,LOCATION
205,10205,10205-18556,1S-MH-MU-ZSC-RSTN-AMBI,1S-MH-MU-ZSC-CL09-2382,1,2025-04-04,616.32,1S-MH-MU-ZSC-CL09-2382
206,10205,18558-18559,1S-MH-MU-ZSC-CL09-2382,1S-MH-MU-ZSC-CL09-2383,2,2025-04-04,674.080488,1S-MH-MU-ZSC-CL09-2383
208,10205,10205-10634,1S-MH-MU-ZSC-RSTN-AMBI,1S-MH-MU-ZSC-CL09-2723,2,2025-03-06,473.442574,1S-MH-MU-ZSC-CL09-2723
214,10205,10632-573,1S-MH-MU-ZSC-CL09-2723,1S-MH-MU-ZSC-CL09-2074,2.1,2025-01-03,207.080874,1S-MH-MU-ZSC-CL09-2074
222,10205,572-39962,1S-MH-MU-ZSC-CL09-2074,1S-MH-MU-ZSC-CL06-3404,2.1.1,NaT,,1S-MH-MU-ZSC-CL06-3404
223,10205,39964-39632,1S-MH-MU-ZSC-CL06-3404,1S-MH-MU-ZSC-CL06-3489,2.1.2,2025-04-04,203.070423,1S-MH-MU-ZSC-CL06-3489
224,10205,39633-5821,1S-MH-MU-ZSC-CL06-3489,1S-MH-MU-ZSC-CL06-2088,2.1.3,2025-04-04,508.627873,1S-MH-MU-ZSC-CL06-2088
215,10205,574-39130,1S-MH-MU-ZSC-CL09-2074,1S-MH-MU-ZSC-CL09-3373,2.2,2025-04-04,50.304,1S-MH-MU-ZSC-CL09-3373
216,10205,39131-28802,1S-MH-MU-ZSC-CL09-3373,1S-MH-MU-ZSC-CL09-3058,2.3,2025-03-06,126.298109,1S-MH-MU-ZSC-CL09-3058
217,10205,28803-32963,1S-MH-MU-ZSC-CL09-3058,1S-MH-MU-ZSC-CL08-3241,2.4,2025-04-04,103.742577,1S-MH-MU-ZSC-CL08-3241


In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
feeder_trace_latest_audit_with_rank_updated.py
---------------------------------------------

Safer, non‑recursive feeder trace with hierarchical RANK labels
(no gaps, longest‐chain first), joined to energy‑audit stats.
"""

from __future__ import annotations
import sys, re
from collections import defaultdict, deque
from pathlib import Path
from typing import Dict, List, Tuple, Set, Optional

import pandas as pd

# bump recursion limit just in case pandas etc. need it
sys.setrecursionlimit(200_000)

# ── FILE PATHS ───────────────────────────────────────────────────────────────
INPUT_HT      = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/ht_cleaned.csv"
INPUT_EN      = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/energyaudit_cleaned.csv"
OUTPUT        = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/11_KV_FEEDER_ALL DATA/11_WITHOUTDT_CONNECTED_RANKED.csv"
LOST_IDS_OUT  = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeder_ids.csv"
LOST_DATA_OUT = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/lost_feeders_full_data.csv"

# ── COLUMN NAMES ─────────────────────────────────────────────────────────────
FEEDER_ID_COL  = "FEEDERID"
SRC_SWITCH_COL = "SOURCE_SWITCH_ID"
DST_SWITCH_COL = "DESTINATION_SWITCH_ID"
SRC_LOC_COL    = "SOURCE_SSFL"
DST_LOC_COL    = "DESTINATION_SSFL"       # ≡ FUNC_LOC
FUNC_LOC_COL   = "FUNC_LOC"
DATE_COL       = "SYSTEM_DATE"
LOAD_COL       = "MD_KVA"

edge_cols = [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]


# ── 1. load & prep ht_cleaned.csv ─────────────────────────────────────────────
print("1. load & prep ht_cleaned.csv")
ht = (
    pd.read_csv(Path(INPUT_HT).expanduser(), low_memory=False)
      .drop_duplicates()
)

def _feeder_token(val) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.upper().split("_")
    return p[2].lstrip("0") if len(p) >= 3 and p[1] == "11KV" else None

ht["FEEDER_ID"] = ht[FEEDER_ID_COL].apply(_feeder_token)
for c in edge_cols:
    ht[c] = ht[c].astype(str)



def trace_feeder(fid: str) -> List[dict]:
    # 1) grab only this feeder’s edges
    sub = ht[ht["FEEDER_ID"] == fid][edge_cols].drop_duplicates()
    if sub.empty:
        return []

    # 2) build directed adjacency and reverse‐adjacency
    edges: List[Tuple[str,str,str,str]] = [
        tuple(r) for r in sub.itertuples(index=False, name=None)
    ]
    adj: Dict[str, List[Tuple]] = defaultdict(list)
    # rev_adj: Dict[str, List[str]]  = defaultdict(list)
    for src_sw, dst_sw, src_loc, dst_loc in edges:
        adj[src_loc].append((src_sw, dst_sw, src_loc, dst_loc))
        # rev_adj[dst_loc].append(src_loc)

    # 3) detect root‐edges (SOURCE_SWITCH == feeder_id)
    roots = [e for e in edges if e[0] == fid]
    # if not roots:
    #     # fallback → nodes with zero in‑degree
    #     indeg: Dict[str,int] = defaultdict(int)
    #     nodes: Set[str] = set()
    #     for _,_,s_loc,d_loc in edges:
    #         nodes |= {s_loc, d_loc}
    #         indeg[d_loc] += 1
    #         indeg.setdefault(s_loc, 0)
    #     zeros = [n for n in nodes if indeg[n] == 0]
    #     if zeros:
    #         roots = [e for e in edges if e[2] in zeros]
    #     else:
    #         # truly cyclical/disconnected → just pick the first edge
    #         roots = [edges[0]]

    # 4) prune to the *reachable*, *acyclic* subgraph
    pruned: Dict[str, List[Tuple]] = defaultdict(list)
    visited_locs: Set[str] = set()
    for _,_,start_loc,_ in roots:
        stack = [start_loc]
        while stack:
            loc = stack.pop()
            if loc in visited_locs:
                continue
            visited_locs.add(loc)
            for edge in adj.get(loc, []):
                _,_,_,child = edge
                if child in visited_locs:
                    # skipping would‐be cycle
                    continue
                pruned[loc].append(edge)
                stack.append(child)

    # 5) compute subtree‐depth via Kahn’s algorithm on pruned DAG
    all_nodes = set(pruned) | {e[3] for edges in pruned.values() for e in edges}
    out_deg = {n: len(pruned.get(n, [])) for n in all_nodes}
    rev2: Dict[str,List[str]] = defaultdict(list)
    for parent, child_edges in pruned.items():
        for *_, child in child_edges:
            rev2[child].append(parent)

    depth: Dict[str,int] = {}
    q = deque([n for n,d in out_deg.items() if d == 0])
    for leaf in q:
        depth[leaf] = 1
    while q:
        node = q.popleft()
        for parent in rev2.get(node, []):
            nd = depth[node] + 1
            if depth.get(parent, 0) < nd:
                depth[parent] = nd
            out_deg[parent] -= 1
            if out_deg[parent] == 0:
                q.append(parent)

    # 6) explicit‐stack traversal: longest‐chain first
    rows: List[dict] = []
    side_cnt: Dict[str,int] = defaultdict(int)
    global_cnt = 0
    stack: List[Tuple[Tuple,str,bool]] = [
        (r, "", True) for r in reversed(roots)
    ]

    while stack:
        edge, prefix, spine = stack.pop()
        src_sw, dst_sw, src_loc, dst_loc = edge

        # assign temporary (possibly‐gappy) rank
        if prefix == "":
            global_cnt += 1
            rank_t = str(global_cnt)
        elif spine:
            *parts, last = prefix.split(".")
            rank_t = ".".join([*parts, str(int(last) + 1)]) if parts else str(int(last) + 1)
        else:
            side_cnt[prefix] += 1
            rank_t = f"{prefix}.{side_cnt[prefix]}"

        rows.append({
            "FEEDER_ID": fid,
            "FROM_TO": f"{src_sw}-{dst_sw}",
            "SOURCE_LOCATION": src_loc,
            "DESTINATION_LOCATION": dst_loc,
            "RANK": rank_t,
        })

        # get pruned children, sort by depth desc
        kids = pruned.get(dst_loc, [])
        if not kids:
            continue
        kids_sorted = sorted(
            kids,
            key=lambda e: depth.get(e[3], 1),
            reverse=True
        )
        # push side‐branches *first* (so they execute *after* the spine)
        for side in reversed(kids_sorted[1:]):
            stack.append((side, rank_t, False))
        # then the main trunk
        stack.append((kids_sorted[0], rank_t, True))

    return rows




# ── 4. collect traces ────────────────────────────────────────────────────────
print(" 2. trace feeders")
all_traces: List[dict] = []
for i, fid in enumerate(ht["FEEDER_ID"].dropna().unique(), 1):
    if i == 1 or i % 100 == 0:
        print(f"    {i}. FEEDER {fid}")
    all_traces.extend(trace_feeder(str(fid)))

trace_df = pd.DataFrame(all_traces)


# ── 5. merge audit stats ─────────────────────────────────────────────────────
print("► . merge audit")
audit = pd.read_csv(Path(INPUT_EN).expanduser(),
                    low_memory=False,
                    parse_dates=[DATE_COL])
audit.columns = [c.upper() for c in audit.columns]
audit[DATE_COL] = pd.to_datetime(audit[DATE_COL], errors="coerce")

agg = (
    audit[[FUNC_LOC_COL, DATE_COL, LOAD_COL]]
      .dropna(subset=[FUNC_LOC_COL])
      .groupby(FUNC_LOC_COL)
      .agg(LATEST_DT_DATE=(DATE_COL, "max"),
           DT_LOAD=(LOAD_COL, "mean"))
      .reset_index()
)
agg[FUNC_LOC_COL] = agg[FUNC_LOC_COL].astype(str)

merged = (
    trace_df
      .merge(agg, how="left",
             left_on="DESTINATION_LOCATION",
             right_on=FUNC_LOC_COL)
      .drop(columns=[FUNC_LOC_COL])
)
merged["LATEST_DT_DATE"] = pd.to_datetime(merged["LATEST_DT_DATE"]).dt.date
merged["LOCATION"]        = merged["DESTINATION_LOCATION"]


# ── 6. filter valid FROM_TO, debug dumps ────────────────────────────────────
def _digdig(x): return bool(re.fullmatch(r"\d+-\d+", str(x)))
mask    = merged["FROM_TO"].apply(_digdig)
kept    = merged[mask]
dropped = merged[~mask]

lost = sorted(set(merged["FEEDER_ID"]) - set(kept["FEEDER_ID"]))
pd.Series(lost, name="LOST_FEEDER_ID").to_csv(LOST_IDS_OUT, index=False)
dropped.to_csv(LOST_DATA_OUT, index=False)


# ── 7A. gap‑free renumber per feeder ──────────────────────────────────────────
def _renumber_ranks(df: pd.DataFrame) -> pd.DataFrame:
    mapping: Dict[str,str]        = {}
    counters: Dict[str,int]       = defaultdict(int)
    new_ranks: List[str]          = []

    for old in df["RANK"]:
        prefix_old = ".".join(old.split(".")[:-1])
        prefix_new = mapping.get(prefix_old, "")
        counters[prefix_new] += 1
        idx = counters[prefix_new]
        new = f"{prefix_new}.{idx}" if prefix_new else str(idx)
        mapping[old] = new
        new_ranks.append(new)

    out = df.copy()
    out["RANK"] = new_ranks
    return out

# ── 7. final sort & renumber ─────────────────────────────────────────────────
def _rk(r: str) -> Tuple[int,...]:
    return tuple(int(x) for x in r.split("."))

kept["_RKEY"] = kept["RANK"].map(_rk)
kept = (
    kept.sort_values(["FEEDER_ID","_RKEY"])
        .drop(columns="_RKEY")
        .groupby("FEEDER_ID", group_keys=False)
        .apply(_renumber_ranks)
)


# ── 8. export ────────────────────────────────────────────────────────────────
OUT_COLS = [
    "FEEDER_ID","FROM_TO",
    "SOURCE_LOCATION","DESTINATION_LOCATION","LOCATION",
    "RANK","LATEST_DT_DATE","DT_LOAD"
]
kept.to_csv(OUTPUT, index=False, columns=OUT_COLS)
print(f"► 4. saved {len(kept):,} rows → {OUTPUT}")

if __name__ == "__main__":
    try:
        from IPython.display import display
        display(kept.head(15))
    except ImportError:
        pass


1. load & prep ht_cleaned.csv
 2. trace feeders
    1. FEEDER 15454
    100. FEEDER 41897
    200. FEEDER 28223
    300. FEEDER 39624
    400. FEEDER 41709
    500. FEEDER 31267
    600. FEEDER 28674
    700. FEEDER 35873
    800. FEEDER 30135
    900. FEEDER 18093
    1000. FEEDER 30031
    1100. FEEDER 3101
    1200. FEEDER 35038
    1300. FEEDER 30886
    1400. FEEDER 19090
► . merge audit


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  kept["_RKEY"] = kept["RANK"].map(_rk)


► 4. saved 8,584 rows → /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/11_KV_FEEDER_ALL DATA/11_WITHOUTDT_CONNECTED_RANKED.csv


  .apply(_renumber_ranks)


Unnamed: 0,FEEDER_ID,FROM_TO,SOURCE_LOCATION,DESTINATION_LOCATION,RANK,LATEST_DT_DATE,DT_LOAD,LOCATION
119,10205,10205-18556,1S-MH-MU-ZSC-RSTN-AMBI,1S-MH-MU-ZSC-CL09-2382,1.0,2025-04-04,616.32,1S-MH-MU-ZSC-CL09-2382
120,10205,18558-18559,1S-MH-MU-ZSC-CL09-2382,1S-MH-MU-ZSC-CL09-2383,2.0,2025-04-04,674.080488,1S-MH-MU-ZSC-CL09-2383
121,10205,10205-10634,1S-MH-MU-ZSC-RSTN-AMBI,1S-MH-MU-ZSC-CL09-2723,3.0,2025-03-06,473.442574,1S-MH-MU-ZSC-CL09-2723
130,10205,16951-4853,1S-MH-MU-ZSC-CL09-2723,1S-MH-MU-ZSC-CL09-2366,3.1,2025-03-06,148.855367,1S-MH-MU-ZSC-CL09-2366
131,10205,4855-6089,1S-MH-MU-ZSC-CL09-2366,1S-MH-MU-ZSC-CL09-2348,3.2,2025-03-06,400.698507,1S-MH-MU-ZSC-CL09-2348
132,10205,6088-25212,1S-MH-MU-ZSC-CL09-2348,1S-MH-MU-ZSC-CL09-2019,3.3,2025-03-06,212.974384,1S-MH-MU-ZSC-CL09-2019
122,10205,10632-573,1S-MH-MU-ZSC-CL09-2723,1S-MH-MU-ZSC-CL09-2074,4.0,2025-01-03,207.080874,1S-MH-MU-ZSC-CL09-2074
127,10205,572-39962,1S-MH-MU-ZSC-CL09-2074,1S-MH-MU-ZSC-CL06-3404,4.1,NaT,,1S-MH-MU-ZSC-CL06-3404
128,10205,39964-39632,1S-MH-MU-ZSC-CL06-3404,1S-MH-MU-ZSC-CL06-3489,4.2,2025-04-04,203.070423,1S-MH-MU-ZSC-CL06-3489
129,10205,39633-5821,1S-MH-MU-ZSC-CL06-3489,1S-MH-MU-ZSC-CL06-2088,4.3,2025-04-04,508.627873,1S-MH-MU-ZSC-CL06-2088


NEW CODE EWMOVW DUPICATE ROWS AND HANDLE DTATA TYPE

In [10]:
# ONE-CELL PIPELINE  ─────────────────────────────────────────────────────────
from __future__ import annotations
import sys, re
from collections import defaultdict, deque
from pathlib import Path
from typing import Dict, List, Tuple, Set

import pandas as pd

sys.setrecursionlimit(200_000)

# ───────────────────────── CONFIG ──────────────────────────────────────────
BASE = "/media/sagark24/New Volume/MERGE CDIS"

INPUT_HT      = f"{BASE}/2-Year-data/CLEANED_DATA/ht_cleaned.csv"
INPUT_EN      = f"{BASE}/2-Year-data/CLEANED_DATA/energyaudit_cleaned.csv"
OUTPUT        = f"{BASE}/IPYNB_FILE/DATA_GENERATION/11_KV_FEEDER_ALL DATA/11_WITHOUTDT_CONNECTED_RANKED.csv"
LOST_IDS_OUT  = f"{BASE}/IPYNB_FILE/DATA_GENERATION/lost_feeder_ids.csv"
LOST_DATA_OUT = f"{BASE}/IPYNB_FILE/DATA_GENERATION/lost_feeders_full_data.csv"

FEEDER_ID_COL  = "FEEDERID"
SRC_SWITCH_COL = "SOURCE_SWITCH_ID"
DST_SWITCH_COL = "DESTINATION_SWITCH_ID"
SRC_LOC_COL    = "SOURCE_SSFL"
DST_LOC_COL    = "DESTINATION_SSFL"
FUNC_LOC_COL   = "FUNC_LOC"
DATE_COL       = "SYSTEM_DATE"
LOAD_COL       = "MD_KVA"
edge_cols      = [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]

# ─────────────────────── helper: canonicalise ALL string cols ──────────────
def canon_all(df: pd.DataFrame) -> pd.DataFrame:
    obj_cols = df.select_dtypes("object").columns
    for c in obj_cols:
        df[c] = (df[c].astype(str)          # make sure dtype=object
                       .str.strip()          # remove leading/trailing spaces
                       .str.upper()        # <-- uncomment for case-insensitive
                )
    return df

# ─────────────────────── helper: 11 kV feeder token  ───────────────────────
def feeder_token(val) -> str | None:
    s = str(val).upper().split("_")
    return s[2].lstrip("0") if len(s) >= 3 and s[1] == "11KV" else None

# ─────────────────────── load HT cleaned  ──────────────────────────────────
print("1. load + clean HT")
ht = canon_all(pd.read_csv(INPUT_HT, low_memory=False)).drop_duplicates()
ht["FEEDER_ID"] = ht[FEEDER_ID_COL].apply(feeder_token)


# ── 2) TRACE A SINGLE FEEDER → rows list ──────────────────────────────────
def trace_feeder(fid: str) -> List[dict]:
    sub = ht[ht["FEEDER_ID"] == fid][edge_cols].drop_duplicates()
    if sub.empty:
        return []

    # build adjacency
    edges: List[Tuple[str, str, str, str]] = [tuple(r) for r in sub.itertuples(index=False, name=None)]
    adj: Dict[str, List[Tuple]] = defaultdict(list)
    rev_adj: Dict[str, List[str]] = defaultdict(list)
    for s_sw, d_sw, s_loc, d_loc in edges:
        adj[s_loc].append((s_sw, d_sw, s_loc, d_loc))
        rev_adj[d_loc].append(s_loc)

    # pick roots whose SOURCE_SWITCH == feeder-id
    roots_raw = [e for e in edges if e[0] == fid]
    roots     = list({e[1]: e for e in roots_raw}.values()) or [edges[0]]

    # prune to reachable DAG (skip cycles)
    pruned: Dict[str, List[Tuple]] = defaultdict(list)
    visited: Set[str] = set()
    for _, _, start_loc, _ in roots:
        stack = [start_loc]
        while stack:
            loc = stack.pop()
            if loc in visited:
                continue
            visited.add(loc)
            for edge in adj.get(loc, []):
                _, _, _, child = edge
                if child in visited:
                    continue
                pruned[loc].append(edge)
                stack.append(child)

    # compute depth for trunk-first ordering
    all_nodes = set(pruned) | {e[3] for edges in pruned.values() for e in edges}
    out_deg = {n: len(pruned.get(n, [])) for n in all_nodes}
    rev2: Dict[str, List[str]] = defaultdict(list)
    for parent, child_edges in pruned.items():
        for *_, child in child_edges:
            rev2[child].append(parent)

    depth: Dict[str, int] = {}
    q = deque([n for n, d in out_deg.items() if d == 0])
    for leaf in q:
        depth[leaf] = 1
    while q:
        node = q.popleft()
        for parent in rev2.get(node, []):
            nd = depth[node] + 1
            if depth.get(parent, 0) < nd:
                depth[parent] = nd
            out_deg[parent] -= 1
            if out_deg[parent] == 0:
                q.append(parent)

    rows: List[dict] = []
    side_cnt: Dict[str, int] = defaultdict(int)
    global_cnt = 0
    stack: List[Tuple[Tuple, str, bool]] = [(r, "", True) for r in reversed(roots)]

    while stack:
        edge, prefix, spine = stack.pop()
        src_sw, dst_sw, src_loc, dst_loc = edge

        if prefix == "":
            global_cnt += 1
            rank = str(global_cnt)
        elif spine:
            parts = prefix.split(".")
            rank = ".".join([*parts[:-1], str(int(parts[-1]) + 1)]) if parts[:-1] else str(int(parts[-1]) + 1)
        else:
            side_cnt[prefix] += 1
            rank = f"{prefix}.{side_cnt[prefix]}"

        rows.append(
            dict(
                FEEDER_ID=fid,
                FROM_TO=f"{src_sw}-{dst_sw}",
                SOURCE_LOCATION=src_loc,
                DESTINATION_LOCATION=dst_loc,
                RANK=rank,
                FROM_SWITCH=src_sw,
                TO_SWITCH=dst_sw,
            )
        )

        kids = sorted(pruned.get(dst_loc, []), key=lambda e: depth.get(e[3], 1), reverse=True)
        for s in reversed(kids[1:]):  # side branches first
            stack.append((s, rank, False))
        if kids:
            stack.append((kids[0], rank, True))

    return rows
# ─────────────────────── trace every feeder  ───────────────────────────────
print("2. trace feeders")
traces=[]
for i,fid in enumerate(ht["FEEDER_ID"].dropna().unique(),1):
    if i==1 or i%100==0: print(f"   {i}. FEEDER {fid}")
    traces.extend(trace_feeder(fid))
trace_df=pd.DataFrame(traces)
trace_df= trace_df.drop_duplicates()
# ─────────────────────── load + prep audit  ────────────────────────────────
print("3. merge energy audit")
audit = canon_all(pd.read_csv(INPUT_EN, low_memory=False, parse_dates=[DATE_COL]))
audit = audit.drop_duplicates([FUNC_LOC_COL, DATE_COL, LOAD_COL])
audit.columns=[c.upper() for c in audit.columns]

agg=(audit[[FUNC_LOC_COL,DATE_COL,LOAD_COL]]
        .dropna(subset=[FUNC_LOC_COL])
        .groupby(FUNC_LOC_COL)
        .agg(LATEST_DT_DATE=(DATE_COL,"max"),DT_LOAD=(LOAD_COL,"mean"))
        .reset_index())

merged=(trace_df.merge(agg,how="left",
                       left_on="DESTINATION_LOCATION",
                       right_on=FUNC_LOC_COL)
                 .drop(columns=[FUNC_LOC_COL]))
merged["LATEST_DT_DATE"]=pd.to_datetime(merged["LATEST_DT_DATE"]).dt.date
merged["LOCATION"]=merged["DESTINATION_LOCATION"]

# ─────────────────────── filter malformed FROM_TO  ────────────────────────
is_pair=lambda x: bool(re.fullmatch(r"\d+-\d+",str(x)))
kept   = merged[merged["FROM_TO"].apply(is_pair)]
dropped= merged[~merged["FROM_TO"].apply(is_pair)]

lost=sorted(set(merged["FEEDER_ID"]) - set(kept["FEEDER_ID"]))
pd.Series(lost,name="LOST_FEEDER_ID").to_csv(LOST_IDS_OUT,index=False)
dropped.to_csv(LOST_DATA_OUT,index=False)

# ─────────────────────── gap-free renumber per feeder  ─────────────────────
# def renumber(df: pd.DataFrame) -> pd.DataFrame:
#     mapping,counters,new={},defaultdict(int),[]
#     for old in df["RANK"]:
#         pre=".".join(old.split(".")[:-1]); pre_new=mapping.get(pre,"")
#         counters[pre_new]+=1; idx=counters[pre_new]
#         new_rank=f"{pre_new}.{idx}" if pre_new else str(idx)
#         mapping[old]=new_rank; new.append(new_rank)
#     out=df.copy(); out["RANK"]=new; return out

# kept["_RK"]=kept["RANK"].map(lambda r: tuple(int(x) for x in r.split(".")))
# kept=(kept.sort_values(["FEEDER_ID","_RK"]).drop(columns="_RK")
#           .groupby("FEEDER_ID",group_keys=False).apply(renumber))

# ─────────────────────── export  ───────────────────────────────────────────
OUT_COLS=["FEEDER_ID","FROM_TO","SOURCE_LOCATION","DESTINATION_LOCATION",
          "LOCATION","RANK","LATEST_DT_DATE","DT_LOAD"]
kept = kept.sort_values(by=["FEEDER_ID", "RANK"], na_position="last")
kept.to_csv(OUTPUT,index=False,columns=OUT_COLS)
print(f" saved {len(kept):,} rows → {OUTPUT}")

# preview
from IPython.display import display
display(kept.head(5))


1. load + clean HT
2. trace feeders
   1. FEEDER 15454
   100. FEEDER 41897
   200. FEEDER 28223
   300. FEEDER 39624
   400. FEEDER 41709
   500. FEEDER 31267
   600. FEEDER 28674
   700. FEEDER 35873
   800. FEEDER 30135
   900. FEEDER 18093
   1000. FEEDER 30031
   1100. FEEDER 3101
   1200. FEEDER 35038
   1300. FEEDER 30886
   1400. FEEDER 19090
3. merge energy audit
 saved 8,481 rows → /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/11_KV_FEEDER_ALL DATA/11_WITHOUTDT_CONNECTED_RANKED.csv


Unnamed: 0,FEEDER_ID,FROM_TO,SOURCE_LOCATION,DESTINATION_LOCATION,RANK,FROM_SWITCH,TO_SWITCH,LATEST_DT_DATE,DT_LOAD,LOCATION
119,10205,10205-18556,1S-MH-MU-ZSC-RSTN-AMBI,1S-MH-MU-ZSC-CL09-2382,1.0,10205,18556,2025-04-04,616.32,1S-MH-MU-ZSC-CL09-2382
120,10205,18558-18559,1S-MH-MU-ZSC-CL09-2382,1S-MH-MU-ZSC-CL09-2383,2.0,18558,18559,2025-04-04,674.080488,1S-MH-MU-ZSC-CL09-2383
121,10205,10205-10634,1S-MH-MU-ZSC-RSTN-AMBI,1S-MH-MU-ZSC-CL09-2723,2.0,10205,10634,2025-03-06,473.442574,1S-MH-MU-ZSC-CL09-2723
130,10205,16951-4853,1S-MH-MU-ZSC-CL09-2723,1S-MH-MU-ZSC-CL09-2366,2.1,16951,4853,2025-03-06,148.855367,1S-MH-MU-ZSC-CL09-2366
131,10205,4855-6089,1S-MH-MU-ZSC-CL09-2366,1S-MH-MU-ZSC-CL09-2348,2.2,4855,6089,2025-03-06,400.698507,1S-MH-MU-ZSC-CL09-2348


In [2]:
df= pd.read_csv(OUTPUT)
col = df['FEEDER_ID'].unique()
print("Unique FEEDER_ID values:", len(col))


Unique FEEDER_ID values: 1099


In [3]:
# After your 'kept' DataFrame is ready...

# 1. Identify feeders with only main chain (no sub-branches)
def is_main_chain_only(subdf):
    return all('.' not in r for r in subdf['RANK'])

# 2. Group by FEEDER_ID, check for main chain only
main_chain_flags = (
    kept.groupby('FEEDER_ID')
        .apply(is_main_chain_only)
        .rename('IS_MAIN_CHAIN_ONLY')
        .reset_index()
)

# 3. Calculate statistics
n_total = main_chain_flags.shape[0]
n_main  = main_chain_flags['IS_MAIN_CHAIN_ONLY'].sum()
percent_main = 100 * n_main / n_total if n_total else 0

print(f"Feeders with only main chain (no sub-chains): {n_main} of {n_total} ({percent_main:.2f}%)")


Feeders with only main chain (no sub-chains): 467 of 1099 (42.49%)


  .apply(is_main_chain_only)
