BASIC DATA

In [None]:
# feeder_trace_latest_audit_with_rank.py  ✧  July 2025
"""
Workflow (HT-cable only)
========================
1. Load HTCABLE.csv, drop unused columns, remove fully-identical rows.
2. Trace every feeder edge-by-edge, annotate with RANK (distance from feeder start).
3. Add SOURCE_SS / DESTINATION_SS and split FROM_TO → FROM_SWITCH & TO_SWITCH.
4. Export to Excel – nothing from the energy-audit file is touched.
"""
from __future__ import annotations
import pandas as pd
from pathlib import Path
from typing import Dict, Tuple, List, Set, Optional

# ── CONFIG ────────────────────────────────────────────────────────────────────
INPUT_HT    = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/ht_cleaned.csv"                     # adjust path if needed
OUTPUT_PATH = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL.csv"

FEEDER_ID_COL  = "FEEDERID"
SRC_SWITCH_COL = "SOURCE_SWITCH_ID"
DST_SWITCH_COL = "DESTINATION_SWITCH_ID"
SRC_LOC_COL    = "SOURCE_SSFL"
DST_LOC_COL    = "DESTINATION_SSFL"
FEEDER_ID_COL2  = "FEEDERID"
DATE = "DATECREATED"
# ─────────────────────────────────────────────────────────────────────────────
def _feeder_token(val: str | int | float | None) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.split("_")
    return p[2] if len(p) >= 3 and (p[1] == '11kV' or p[1]=='11Kv' or p[1]=='11KV') else None

def _feeder_token2(val: str | int | float | None) -> Optional[str]:
    if not isinstance(val, str):
        val = str(val) if val is not None else ""
    p = val.split("_")
    return val if len(p) >= 3 and (p[1] == '11kV' or p[1]=='11Kv' or p[1]=='11KV') else None


# 1️  LOAD & CLEAN HT-CABLE ---------------------------------------------------
ht = (pd.read_csv(Path(INPUT_HT).expanduser(), low_memory=False)
        
        .drop_duplicates())
ht["FEEDERID_FULL"] = ht[FEEDER_ID_COL2].apply(_feeder_token2).dropna()
ht["FEEDER_ID"] = ht[FEEDER_ID_COL].apply(_feeder_token)
for col in [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]:
    ht[col] = ht[col].astype(str)

# Ensure SOURCE_SS / DESTINATION_SS exist
if {"SOURCE_SS", "DESTINATION_SS"}.issubset(ht.columns):
    pass
else:
    ht["SOURCE_SS"] = ht[SRC_LOC_COL]
    ht["DESTINATION_SS"] = ht[DST_LOC_COL]

edge_cols = [SRC_SWITCH_COL, DST_SWITCH_COL, SRC_LOC_COL, DST_LOC_COL]
source_idx: Dict[Tuple[str, str], pd.DataFrame] = {
    (k[0], k[1]): g[edge_cols]
    for k, g in ht.groupby([SRC_LOC_COL, "FEEDER_ID"], sort=False)
}

# 2️  FEEDER TRACER -----------------------------------------------------------
# def trace_feeder(fid: str) -> List[dict]:
#     rows: List[dict] = []
#     visited: Set[Tuple[str, str]] = set()

#     start = ht[(ht[SRC_SWITCH_COL] == fid) & (ht["FEEDER_ID"] == fid)][edge_cols]
#     queue = [(row, 0) for row in start.to_records(index=False).tolist()]

#     while queue:
#         (from_sw, to_sw, src_loc, dst_loc), rank = queue.pop(0)
#         if (from_sw, to_sw) in visited:
#             continue
#         visited.add((from_sw, to_sw))

#         rows.append({
#             "FEEDER_ID": fid,
#             "FROM_SWITCH": from_sw,
#             "TO_SWITCH": to_sw,
         
#             "SOURCE_LOCATION": src_loc,
#             "DESTINATION_LOCATION": dst_loc,
#             "RANK": rank
#         })

#         nxt = source_idx.get((dst_loc, fid))
#         if nxt is not None and not nxt.empty:
#             queue.extend([(row, rank + 1)
#                           for row in nxt.to_records(index=False).tolist()])
#     return rows

# from collections import deque

# def trace_feeder(fid: str) -> list:
#     rows = []
#     visited = set()
#     feeder_edges = ht[ht["FEEDER_ID"] == fid][[SRC_LOC_COL, DST_LOC_COL, SRC_SWITCH_COL, DST_SWITCH_COL]].copy()
#     feeder_edges[SRC_LOC_COL] = feeder_edges[SRC_LOC_COL].astype(str).str.strip()
#     feeder_edges[DST_LOC_COL] = feeder_edges[DST_LOC_COL].astype(str).str.strip()

#     from_loc_map = {}
#     for _, r in feeder_edges.iterrows():
#         from_loc_map.setdefault(r[SRC_LOC_COL], []).append(
#             tuple(r[c] for c in [SRC_LOC_COL, DST_LOC_COL, SRC_SWITCH_COL, DST_SWITCH_COL])
#         )

#     all_from = set(feeder_edges[SRC_LOC_COL])
#     all_to = set(feeder_edges[DST_LOC_COL])
#     root_candidates = (all_from - all_to) or all_from or set(feeder_edges[SRC_LOC_COL].unique())

#     all_edges = set((row[SRC_LOC_COL], row[DST_LOC_COL]) for _, row in feeder_edges.iterrows())
#     unvisited_edges = all_edges - visited

#     from collections import deque
#     while unvisited_edges:
#         # Find the next root (or any remaining edge)
#         found = False
#         for root in root_candidates:
#             start_rows = [e for e in feeder_edges.to_records(index=False)
#                           if e[0] == root and (e[0], e[1]) in unvisited_edges]
#             if start_rows:
#                 found = True
#                 break
#         if not found:
#             # Just pick any edge not yet visited
#             start_rows = [e for e in feeder_edges.to_records(index=False)
#                           if (e[0], e[1]) in unvisited_edges]
#             if not start_rows:
#                 break
#         queue = deque()
#         for srow in start_rows:
#             queue.append((srow, 0))
#         while queue:
#             (src_loc, dst_loc, src_sw, dst_sw), rank = queue.popleft()
#             if (src_loc, dst_loc) in visited:
#                 continue
#             visited.add((src_loc, dst_loc))
#             unvisited_edges.discard((src_loc, dst_loc))
#             rows.append({
#                 "FEEDER_ID": fid,
#                 "FROM_TO": f"{src_sw}-{dst_sw}",
#                 "FROM_SWITCH": src_sw,
#                 "TO_SWITCH": dst_sw,
#                 "SOURCE_LOCATION": src_loc,
#                 "DESTINATION_LOCATION": dst_loc,
#                 "RANK": rank
#             })
#             for next_edge in from_loc_map.get(dst_loc, []):
#                 if (next_edge[0], next_edge[1]) not in visited:
#                     queue.append((next_edge, rank + 1))
#     return rows
from collections import defaultdict, deque
from typing import List, Tuple, Dict, Set

def trace_feeder(fid: str) -> List[dict]:
    # 1) grab only this feeder’s edges
    sub = ht[ht["FEEDER_ID"] == fid][edge_cols].drop_duplicates()
    if sub.empty:
        return []

    # 2) build directed adjacency and reverse‐adjacency
    edges: List[Tuple[str,str,str,str]] = [
        tuple(r) for r in sub.itertuples(index=False, name=None)
    ]
    adj: Dict[str, List[Tuple]] = defaultdict(list)
    rev_adj: Dict[str, List[str]]  = defaultdict(list)
    for src_sw, dst_sw, src_loc, dst_loc in edges:
        adj[src_loc].append((src_sw, dst_sw, src_loc, dst_loc))
        rev_adj[dst_loc].append(src_loc)

    # 3) detect root‐edges (SOURCE_SWITCH == feeder_id)
    roots = [e for e in edges if e[0] == fid]
    if not roots:
        # fallback → nodes with zero in‑degree
        indeg: Dict[str,int] = defaultdict(int)
        nodes: Set[str] = set()
        for _,_,s_loc,d_loc in edges:
            nodes |= {s_loc, d_loc}
            indeg[d_loc] += 1
            indeg.setdefault(s_loc, 0)
        zeros = [n for n in nodes if indeg[n] == 0]
        if zeros:
            roots = [e for e in edges if e[2] in zeros]
        else:
            # truly cyclical/disconnected → just pick the first edge
            roots = [edges[0]]

    # 4) prune to the *reachable*, *acyclic* subgraph
    pruned: Dict[str, List[Tuple]] = defaultdict(list)
    visited_locs: Set[str] = set()
    for _,_,start_loc,_ in roots:
        stack = [start_loc]
        while stack:
            loc = stack.pop()
            if loc in visited_locs:
                continue
            visited_locs.add(loc)
            for edge in adj.get(loc, []):
                _,_,_,child = edge
                if child in visited_locs:
                    # skipping would‐be cycle
                    continue
                pruned[loc].append(edge)
                stack.append(child)

    # 5) compute subtree‐depth via Kahn’s algorithm on pruned DAG
    all_nodes = set(pruned) | {e[3] for edges in pruned.values() for e in edges}
    out_deg = {n: len(pruned.get(n, [])) for n in all_nodes}
    rev2: Dict[str,List[str]] = defaultdict(list)
    for parent, child_edges in pruned.items():
        for *_, child in child_edges:
            rev2[child].append(parent)

    depth: Dict[str,int] = {}
    q = deque([n for n,d in out_deg.items() if d == 0])
    for leaf in q:
        depth[leaf] = 1
    while q:
        node = q.popleft()
        for parent in rev2.get(node, []):
            nd = depth[node] + 1
            if depth.get(parent, 0) < nd:
                depth[parent] = nd
            out_deg[parent] -= 1
            if out_deg[parent] == 0:
                q.append(parent)

    # 6) explicit‐stack traversal: longest‐chain first
    rows: List[dict] = []
    side_cnt: Dict[str,int] = defaultdict(int)
    global_cnt = 0
    stack: List[Tuple[Tuple,str,bool]] = [
        (r, "", True) for r in reversed(roots)
    ]

    while stack:
        edge, prefix, spine = stack.pop()
        src_sw, dst_sw, src_loc, dst_loc = edge

        # assign temporary (possibly‐gappy) rank
        if prefix == "":
            global_cnt += 1
            rank_t = str(global_cnt)
        elif spine:
            *parts, last = prefix.split(".")
            rank_t = ".".join([*parts, str(int(last) + 1)]) if parts else str(int(last) + 1)
        else:
            side_cnt[prefix] += 1
            rank_t = f"{prefix}.{side_cnt[prefix]}"

        rows.append({
            "FEEDER_ID": fid,
            "FROM_TO": f"{src_sw}-{dst_sw}",
            "SOURCE_LOCATION": src_loc,
            "DESTINATION_LOCATION": dst_loc,
            "RANK": rank_t,
            "FROM_SWITCH": src_sw,
            "TO_SWITCH": dst_sw,
         
        })

        # get pruned children, sort by depth desc
        kids = pruned.get(dst_loc, [])
        if not kids:
            continue
        kids_sorted = sorted(
            kids,
            key=lambda e: depth.get(e[3], 1),
            reverse=True
        )
        # push side‐branches *first* (so they execute *after* the spine)
        for side in reversed(kids_sorted[1:]):
            stack.append((side, rank_t, False))
        # then the main trunk
        stack.append((kids_sorted[0], rank_t, True))

    return rows

# 3️  TRACE ALL FEEDERS -------------------------------------------------------
trace_df = pd.DataFrame([row
                         for fid in ht["FEEDER_ID"].dropna().unique()
                         for row in trace_feeder(str(fid))])

# 4️  ADD SS COLUMNS ----------------------------------------------------------
trace_df = (trace_df.merge(ht[[SRC_LOC_COL, DST_LOC_COL,
                               "SOURCE_SS", "DESTINATION_SS" ,"FEEDERID_FULL", "NEUTRALMATERIAL" , "CABLECONDUCTORMATERIAL" , "DATECREATED","CABLETYPE" ,"COMMENTS" ,"REMARKS" ,"MEASUREDLENGTH"]]
                           .rename(columns={SRC_LOC_COL: "SOURCE_LOCATION",
                                            DST_LOC_COL: "DESTINATION_LOCATION"})
                           .drop_duplicates(),
                           how="left",
                           on=["SOURCE_LOCATION", "DESTINATION_LOCATION"]))

# 5️  EXPORT ------------------------------------------------------------------
out_cols = ["FEEDER_ID","FEEDERID_FULL",
            "FROM_SWITCH", "TO_SWITCH", 
            "SOURCE_SS", "DESTINATION_SS",
            "SOURCE_LOCATION", "DESTINATION_LOCATION",
            "RANK","DATECREATED", "COMMENTS","NEUTRALMATERIAL" , "CABLECONDUCTORMATERIAL" , "CABLETYPE","REMARKS","MEASUREDLENGTH"]

trace_df.to_csv(OUTPUT_PATH, index=False, columns=out_cols)
print(f"\nSaved {len(trace_df):,} rows → {OUTPUT_PATH}")

# Preview for interactive sessions
if __name__ == "__main__":
    try:
        from IPython.display import display
        display(trace_df.head())
    except Exception:
        pass



Saved 40,142 rows → /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL.csv


Unnamed: 0,FEEDER_ID,FROM_TO,SOURCE_LOCATION,DESTINATION_LOCATION,RANK,FROM_SWITCH,TO_SWITCH,SOURCE_SS,DESTINATION_SS,FEEDERID_FULL,NEUTRALMATERIAL,CABLECONDUCTORMATERIAL,DATECREATED,CABLETYPE,COMMENTS,REMARKS,MEASUREDLENGTH
0,15454,15454-38196,1S-MH-MU-ZST-RSTN-24TH,1S-MH-MU-ZST-CL02-1238,1,15454,38196,24TH ROAD REC-STN,GANGA JAMUNA SANGAM,24THRD_11KV_15454,AL,AL,2009-04-13 00:00:00+00:00,XLPE,24TH ROAD REC-STN TO GANGA JAMUNA CHS (FROM SW...,,64.0
1,15454,15454-38196,1S-MH-MU-ZST-RSTN-24TH,1S-MH-MU-ZST-CL02-1238,1,15454,38196,24TH ROAD REC-STN,GANGA JAMUNA SANGAM,24THRD_11KV_15454,AL,AL,2016-07-12 00:00:00+00:00,XLPE,24TH ROAD REC-STN TO GANGA JAMUNA CHS (JT.NO.2...,,36.9
2,15454,15454-38196,1S-MH-MU-ZST-RSTN-24TH,1S-MH-MU-ZST-CL02-1238,1,15454,38196,24TH ROAD REC-STN,GANGA JAMUNA SANGAM,24THRD_11KV_15454,AL,AL,2009-04-13 00:00:00+00:00,XLPE,24TH ROAD REC-STN TO GANGA JAMUNA CHS (JT.NO.2...,,98.8
3,15454,15454-38196,1S-MH-MU-ZST-RSTN-24TH,1S-MH-MU-ZST-CL02-1238,1,15454,38196,24TH ROAD REC-STN,GANGA JAMUNA SANGAM,24THRD_11KV_15454,AL,AL,2016-09-12 00:00:00+00:00,XLPE,24TH ROAD REC-STN TO GANGA JAMUNA CHS (JT.NO.2...,,43.2
4,15454,38195-34116,1S-MH-MU-ZST-CL02-1238,1S-MH-MU-ZST-CL02-0894,2,38195,34116,GANGA JAMUNA SANGAM,FORTUNE ENCLAVE,24THRD_11KV_15454,AL,AL,2016-09-12 00:00:00+00:00,XLPE,GANGA JAMUNA CHS TO FORTUNE ENCLAVE (FROM SWNO...,,34.0


next file generation

In [3]:
import pandas as pd
from pathlib import Path

# --- Paths ---
AFINAL_PATH        = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL.csv")
ENERGY_AUDIT_PATH  = Path("/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/energyaudit_cleaned.csv")
OUTPUT_CSV_PATH    = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv")

# --- 1) LOAD DATA ---
df = pd.read_csv(AFINAL_PATH, low_memory=False)
df.columns = [c.upper() for c in df.columns]

audit = pd.read_csv(
    ENERGY_AUDIT_PATH,
    usecols=["SWITCH_NO", "CLUSTER_TYPE"],
    low_memory=False
).rename(str.upper, axis=1)

audit["SWITCH_NO"] = audit["SWITCH_NO"].astype(str).str.strip()

# Keep only the first CLUSTER_TYPE for each switch, if there are duplicates
audit_cluster = audit.drop_duplicates(subset=["SWITCH_NO"])

# --- 2) MERGE CLUSTER_TYPE ON FROM_SWITCH ---
df["FROM_SWITCH"] = df["FROM_SWITCH"].astype(str).str.strip()

df_cluster = df.merge(
    audit_cluster[["SWITCH_NO", "CLUSTER_TYPE"]],
    how="left",
    left_on="FROM_SWITCH",
    right_on="SWITCH_NO"
).drop(columns=["SWITCH_NO"])

# --- 3) SAVE OUTPUT ---
df_cluster.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Cluster type added: {OUTPUT_CSV_PATH}")


Cluster type added: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv


In [4]:
# import pandas as pd
# from pathlib import Path

# # ── PATHS ────────────────────────────────────────────────────────────────────
# AFINAL_PATH        = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL.csv")
# ENERGY_AUDIT_PATH  = Path("//media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/energyaudit_cleaned.csv")
# FEEDERDETAIL_PATH  = Path("//media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/feederdetails_cleaned.csv")
# OUTPUT_CSV_PATH    = Path("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv")

# # ── 1) LOAD AFINAL -----------------------------------------------------------
# df = pd.read_csv(AFINAL_PATH, low_memory=False)
# df.columns = [c.upper() for c in df.columns]

# # ── 2) SWITCH-LEVEL STATS (ENERGYAUDIT) -------------------------------------
# audit = (pd.read_csv(
#             ENERGY_AUDIT_PATH,
#             usecols=["SWITCH_NO", "LOAD_FACTOR", "Y_INST_VOLTAGE", "CLUSTER_TYPE"],
#             low_memory=False
#          ).rename(str.upper, axis=1))

# audit["SWITCH_NO"]      = audit["SWITCH_NO"].astype(str).str.strip()
# audit["LOAD_FACTOR"]    = pd.to_numeric(audit["LOAD_FACTOR"],    errors="coerce")
# audit["Y_INST_VOLTAGE"] = pd.to_numeric(audit["Y_INST_VOLTAGE"], errors="coerce")

# def first_non_null(x):
#     y = x.dropna()
#     return y.iloc[0] if len(y) else None

# switch_stats = (audit.dropna(subset=["SWITCH_NO"])
#                      .groupby("SWITCH_NO")
#                      .agg(
#                          # LOAD_FACTOR
#                          SWITCH_LOAD_FACTOR_MEAN   = ("LOAD_FACTOR", "mean"),
#                          SWITCH_LOAD_FACTOR_MEDIAN = ("LOAD_FACTOR", "median"),
#                          SWITCH_LOAD_FACTOR_STD    = ("LOAD_FACTOR", "std"),
#                          SWITCH_LOAD_FACTOR_MIN    = ("LOAD_FACTOR", "min"),
#                          SWITCH_LOAD_FACTOR_MAX    = ("LOAD_FACTOR", "max"),
#                          # Y_INST_VOLTAGE
#                          SWITCH_Y_INST_VOLTAGE_MEAN   = ("Y_INST_VOLTAGE", "mean"),
#                          SWITCH_Y_INST_VOLTAGE_MEDIAN = ("Y_INST_VOLTAGE", "median"),
#                          SWITCH_Y_INST_VOLTAGE_STD    = ("Y_INST_VOLTAGE", "std"),
#                          SWITCH_Y_INST_VOLTAGE_MIN    = ("Y_INST_VOLTAGE", "min"),
#                          SWITCH_Y_INST_VOLTAGE_MAX    = ("Y_INST_VOLTAGE", "max"),
#                          # categorical
#                          CLUSTER_TYPE = ("CLUSTER_TYPE", first_non_null)
#                      )
#                      .reset_index())

# # switch_stats[["SWITCH_LOAD_FACTOR_STD","SWITCH_Y_INST_VOLTAGE_STD"]] = \
# #     switch_stats[["SWITCH_LOAD_FACTOR_STD","SWITCH_Y_INST_VOLTAGE_STD"]].fillna(0)

# df["FROM_SWITCH"] = df["FROM_SWITCH"].astype(str).str.strip()
# df = (df.merge(switch_stats, how="left",
#                left_on="FROM_SWITCH", right_on="SWITCH_NO")
#         .drop(columns=["SWITCH_NO"]))

# # ── 3) FEEDER-LEVEL STATS (FEEDERDETAILS) -----------------------------------
# feeder = (pd.read_csv(
#             FEEDERDETAIL_PATH,
#             usecols=["SWITCHID", "FEEDERLOAD", "LOADFACTOR", "LOADLOSSFACTOR"],
#             low_memory=False
#          ).rename(str.upper, axis=1))

# feeder["SWITCHID"]        = feeder["SWITCHID"].astype(str).str.strip()
# for col in ["FEEDERLOAD", "LOADFACTOR", "LOADLOSSFACTOR"]:
#     feeder[col] = pd.to_numeric(feeder[col], errors="coerce")

# feeder_stats = (feeder.dropna(subset=["SWITCHID"])
#                       .groupby("SWITCHID")
#                       .agg(
#                           # FEEDERLOAD
#                           FEEDER_LOAD_MEAN   = ("FEEDERLOAD", "mean"),
#                           FEEDER_LOAD_MEDIAN = ("FEEDERLOAD", "median"),
#                           FEEDER_LOAD_STD    = ("FEEDERLOAD", "std"),
#                           FEEDER_LOAD_MIN    = ("FEEDERLOAD", "min"),
#                           FEEDER_LOAD_MAX    = ("FEEDERLOAD", "max"),
#                           # LOADFACTOR
#                           FEEDER_LOAD_FACTOR_MEAN   = ("LOADFACTOR", "mean"),
#                           FEEDER_LOAD_FACTOR_MEDIAN = ("LOADFACTOR", "median"),
#                           FEEDER_LOAD_FACTOR_STD    = ("LOADFACTOR", "std"),
#                           FEEDER_LOAD_FACTOR_MIN    = ("LOADFACTOR", "min"),
#                           FEEDER_LOAD_FACTOR_MAX    = ("LOADFACTOR", "max"),
#                           # LOADLOSSFACTOR
#                           FEEDER_LOSS_FACTOR_MEAN   = ("LOADLOSSFACTOR", "mean"),
#                           FEEDER_LOSS_FACTOR_MEDIAN = ("LOADLOSSFACTOR", "median"),
#                           FEEDER_LOSS_FACTOR_STD    = ("LOADLOSSFACTOR", "std"),
#                           FEEDER_LOSS_FACTOR_MIN    = ("LOADLOSSFACTOR", "min"),
#                           FEEDER_LOSS_FACTOR_MAX    = ("LOADLOSSFACTOR", "max")
#                       )
#                       .reset_index())

# std_cols = [c for c in feeder_stats.columns if c.endswith("_STD")]
# feeder_stats[std_cols] = feeder_stats[std_cols].fillna(0)

# # ...  merge feeder_stats ---------------------------------------
# df["FEEDER_ID"] = df["FEEDER_ID"].astype(str).str.strip()
# df = (df.merge(feeder_stats, how="left",
#                left_on="FEEDER_ID", right_on="SWITCHID")
#         .drop(columns=["SWITCHID"]))

# # ── KEEP FEEDER-STAT COLUMNS ONLY ON THE FIRST ROW OF EACH FEEDER ───────────
# # (place this right after the merge with feeder_stats and BEFORE step 4)

# # all columns that start with FEEDER_  *except* the ID itself
# feeder_stat_cols = [
#     c for c in df.columns
#     if c.startswith("FEEDER_") and c != "FEEDER_ID"
# ]

# # mask: True on very first row for each FEEDER_ID
# first_row_mask = ~df.duplicated(subset="FEEDER_ID", keep="first")

# # set stats to NA on later rows; FEEDER_ID column is left alone
# df.loc[~first_row_mask, feeder_stat_cols] = pd.NA

# # ── 4) APPEND GLOBAL MIN / MAX ROWS ----------------------------
# num_cols = df.select_dtypes(include="number").columns
# min_row  = df[num_cols].min().rename("GLOBAL_MIN")
# max_row  = df[num_cols].max().rename("GLOBAL_MAX")


# for col in df.columns:
#     if col not in num_cols:
#         min_row[col] = ""
#         max_row[col] = ""

# df_full = pd.concat([df, min_row.to_frame().T, max_row.to_frame().T],
#                     ignore_index=True)

# # ── 5) WRITE SINGLE CSV ------------------------------------------------------
# df_full.to_csv(OUTPUT_CSV_PATH, index=False)
# print(f"  All data   {OUTPUT_CSV_PATH}")


ADD NETWORKDETAILS 

In [5]:
# import pandas as pd

# csv_main    = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv"
# csv_network = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/CLEANED_DATA/networkdetails_cleaned.csv"

# df = pd.read_csv(csv_main, low_memory=False)
# df.columns = [c.upper() for c in df.columns]

# # ---------------------------------------------------------------------------
# wanted = [
#     "FROM_SWITCHID",        # string
#     # 
#     "NOOFJOINTS", "NOOFSUBSTATION",      # **counts → sum**
   
# ]

# present  = pd.read_csv(csv_network, nrows=0).columns.str.upper()
# use_cols = [c for c in wanted if c in present]

# network = (pd.read_csv(csv_network, usecols=use_cols, low_memory=False)
#              .rename(str.upper, axis=1))

# network["FROM_SWITCHID"] = network["FROM_SWITCHID"].astype(str).str.strip()
# df["FROM_SWITCH"]        = df["FROM_SWITCH"].astype(str).str.strip()

# # force numerics where appropriate
# num_try = []
# for col in num_try + ["NOOFJOINTS","NOOFSUBSTATION"]:
#     if col in network.columns:
#         network[col] = pd.to_numeric(network[col], errors="coerce")

# # ----- aggregation rules ----------------------------------------------------
# def first_non_null(s):
#     x = s.dropna()
#     return x.iloc[0] if len(x) else None

# agg_dict = {
#     # default rule for all numeric columns we coerced earlier
#     **{c: "mean" for c in num_try if c in network.columns},
#     # counts → SUM so they remain integers
#     **{c: "sum"  for c in ["NOOFJOINTS","NOOFSUBSTATION"] if c in network.columns},
# }


# net_agg = (network.groupby("FROM_SWITCHID", dropna=False)
#                   .agg(agg_dict)
#                   .reset_index())

# # join
# df = (df.merge(net_agg, how="left",
#                left_on="FROM_SWITCH", right_on="FROM_SWITCHID")
#         .drop(columns=["FROM_SWITCHID"]))

# # cast summed counts to nullable Int64 for cleanliness
# for c in ["NOOFJOINTS","NOOFSUBSTATION"]:
#     if c in df.columns:
#         df[c] = df[c].round().astype("Int64")

# # save
# df.to_csv(csv_main, index=False)
# print(f"  Updated file written  {csv_main}")

# if "CABLESIZE" in df.columns:
#     print("Rows with CABLESIZE filled:", df["CABLESIZE"].notna().sum())


FAULT DATA 

In [6]:
# import pandas as pd
# from collections import Counter

# # ── FILE LOCATIONS ───────────────────────────────────────────────────────────
# CABLE_PATH  = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv"
# FAULT_PATH  = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed2.csv"

# OUT_PATH    = CABLE_PATH   # overwrite
# # FEEDER_SUMMARY_PATH = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/FEEDER_FAULT_SUMMARY.csv"

# # ── 1) LOAD ------------------------------------------------------------------
# cables = pd.read_csv(CABLE_PATH, low_memory=False)
# faults = pd.read_csv(FAULT_PATH, low_memory=False)

# cables.columns = [c.upper() for c in cables.columns]
# faults.columns = [c.upper() for c in faults.columns]

# # ── 2) NORMALISE ID STRINGS --------------------------------------------------
# def norm_id(s: pd.Series) -> pd.Series:
#     return (
#         s.astype(str)
#          .str.strip()
#          .str.upper()
#          .str.replace(r"\.0+$", "", regex=True)        # 15454.0 → 15454
#          .str.replace(r"[^A-Z0-9]", "", regex=True)    # keep A-Z,0-9
#          .str.lstrip("0")                              # drop leading zeros
#     )

# cables["FROM_SWITCH_N"] = norm_id(cables["FROM_SWITCH"])
# cables["FEEDER_ID_N"]   = norm_id(cables["FEEDER_ID"])
# faults["FROM_SWITCH_N"] = norm_id(faults["FROM_SWITCH"])
# faults["SWITCH_NO_N"]   = norm_id(faults["SWITCH_NO"])

# # ── 3) PREP FAULT NUMERICS ---------------------------------------------------
# faults["TIME_OUTAGE"]           = pd.to_datetime(faults["TIME_OUTAGE"], errors="coerce")
# faults["TIME_DIFFERENCE_HOURS"] = pd.to_numeric(faults["TIME_DIFFERENCE_HOURS"], errors="coerce")

# def mode(series):
#     nn = series.dropna()
#     return Counter(nn).most_common(1)[0][0] if len(nn) else None

# # ── 4-A) CABLE-LEVEL STATS  (prefix CBL_) -----------------------------------
# cbl_stats = (
#     faults.groupby("FROM_SWITCH_N")
#           .agg(
#               CBL_FAULT_COUNT          = ("FROM_SWITCH_N", "size"),
#               CBL_AVG_REPAIR_HRS       = ("TIME_DIFFERENCE_HOURS", "mean"),
#               CBL_MAX_REPAIR_HRS       = ("TIME_DIFFERENCE_HOURS", "max"),
#               CBL_LATEST_OUTAGE        = ("TIME_OUTAGE", "max"),
#               CBL_COMMON_REASON_CAT    = ("REASON_CATEGORY", mode),
#               CBL_COMMON_REASON_TEXT   = ("REASON_TEXT",    mode),
#               CBL_COMMON_RELAY_FUSE    = ("RELAY_FUSE",     mode),
#           )
#           .reset_index()
# )
# cbl_stats[["CBL_AVG_REPAIR_HRS","CBL_MAX_REPAIR_HRS"]] = (
#     cbl_stats[["CBL_AVG_REPAIR_HRS","CBL_MAX_REPAIR_HRS"]].round(2)
# )

# # ── 4-B) FEEDER-LEVEL STATS  (prefix FDR_) ----------------------------------
# fdr_stats = (
#     faults.groupby("SWITCH_NO_N")
#           .agg(
#               FDR_FAULT_COUNT          = ("SWITCH_NO_N", "size"),
#               FDR_AVG_REPAIR_HRS       = ("TIME_DIFFERENCE_HOURS", "mean"),
#               FDR_MAX_REPAIR_HRS       = ("TIME_DIFFERENCE_HOURS", "max"),
#               FDR_FIRST_OUTAGE         = ("TIME_OUTAGE", "min"),
#               FDR_LAST_OUTAGE          = ("TIME_OUTAGE", "max"),
#               FDR_COMMON_REASON_CAT    = ("REASON_CATEGORY", mode),
#               FDR_COMMON_REASON_TEXT   = ("REASON_TEXT",    mode),
#               FDR_COMMON_RELAY_FUSE    = ("RELAY_FUSE",     mode),
#           )
#           .reset_index()
# )
# fdr_stats[["FDR_AVG_REPAIR_HRS","FDR_MAX_REPAIR_HRS"]] = (
#     fdr_stats[["FDR_AVG_REPAIR_HRS","FDR_MAX_REPAIR_HRS"]].round(2)
# )

 

# # ── 5) MERGE NEW COLUMNS -----------------------------------------------------
# cables = cables.merge(cbl_stats, how="left", on="FROM_SWITCH_N")
# cables = cables.merge(
#             fdr_stats.rename(columns={"SWITCH_NO_N": "FEEDER_ID_N"}),
#             how="left", on="FEEDER_ID_N"
# )

# # ---- NEW: FEEDER-WISE TOTAL FAULTS COLUMN ------------------------------------
# # For each FEEDER_ID, sum all CBL_FAULT_COUNT for cables on that feeder
# # Use the pre-merge cables (to ensure all cables are included)
# cables['FEEDER_FAULT_SUM'] = (
#     cables.groupby('FEEDER_ID')['CBL_FAULT_COUNT']
#           .transform(lambda x: x.fillna(0).sum())
# )

# # Optionally: also provide a count of cables per feeder
# cables['CABLES_PER_FEEDER'] = (
#     cables.groupby('FEEDER_ID')['FROM_SWITCH'].transform('count')
# )

# # ── 6) KEEP FEEDER_… COLUMNS ONLY ON FIRST CABLE OF EACH FEEDER -------------
# feeder_cols = [c for c in cables.columns if c.startswith("FDR_")]
# first_row_mask = ~cables.duplicated(subset="FEEDER_ID_N", keep="first")
# cables.loc[~first_row_mask, feeder_cols] = pd.NA   # blanks after first row

# # ── 7) CLEAN-UP & SAVE -------------------------------------------------------
# cables = cables.drop(columns=["FROM_SWITCH_N","FEEDER_ID_N"])
# cables.to_csv(OUT_PATH, index=False)

# print(" cable rows with CBL_FAULT_COUNT :", cables['CBL_FAULT_COUNT'].notna().sum())
# print(" feeders with FDR_FAULT_COUNT    :", fdr_stats.shape[0])
# print(" updated file                    :", OUT_PATH)



In [7]:
import pandas as pd
import re

# ── FILE LOCATIONS ───────────────────────────────────────────────────────────
CABLE_PATH  = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv"
FAULT_PATH  = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed_without_affected.csv"
OUT_PATH    = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full_path.csv"

# ── 1) LOAD ------------------------------------------------------------------
cables = pd.read_csv(CABLE_PATH, low_memory=False)
faults = pd.read_csv(FAULT_PATH, low_memory=False)

cables.columns = [c.upper() for c in cables.columns]
faults.columns = [c.upper() for c in faults.columns]

# ── 2) NORMALISE ID STRINGS --------------------------------------------------
def norm_id(s: pd.Series) -> pd.Series:
    return (
        s.astype(str)
         .str.strip()
         .str.upper()
         .str.replace(r"\.0+$", "", regex=True)
         .str.replace(r"[^A-Z0-9]", "", regex=True)
         .str.lstrip("0")
    )

cables["FROM_SWITCH_N"] = norm_id(cables["FROM_SWITCH"])
cables["FEEDER_ID_N"]   = norm_id(cables["FEEDER_ID"])
faults["FROM_SWITCH_N"] = norm_id(faults["FROM_SWITCH"])
faults["TO_SWITCH_N"]   = norm_id(faults["TO_SWITCH"])

# ── 2A) PATH COLUMN FROM GROUPED COMMENTS ------------------------------------
def extract_path_from_comments(comments):
    paths = []
    for comment in comments:
        if not isinstance(comment, str) or not comment.strip():
            continue
        comment_up = comment.upper()
        matches = re.findall(r'(SWNO_\w+|JT\.NO\.\d+[A-Z]?)', comment_up)
        if matches:
            paths.append(" -> ".join(matches))
        else:
            paths.append(comment.strip())
    unique_paths = [p for p in pd.unique(paths) if p]
    return " | ".join(unique_paths)

if set(["COMMENTS", "FROM_SWITCH", "TO_SWITCH"]).issubset(cables.columns):
    cables["PATH"] = (
        cables.groupby(['FROM_SWITCH', 'TO_SWITCH'])['COMMENTS']
        .transform(extract_path_from_comments)
    )
    cables = cables.drop(columns=["COMMENTS"])
else:
    cables["PATH"] = ""

# --- SEGMENT COUNT (NO_OF_SEGMENT) --------------------------------------------
segment_counts = (
    cables
    .groupby(['FROM_SWITCH', 'TO_SWITCH'])
    .size()
    .reset_index(name='NO_OF_SEGMENT')
)
cables = cables.merge(segment_counts, on=['FROM_SWITCH', 'TO_SWITCH'], how='left')

# --- AGGREGATE MEASUREDLENGTH ------------------------------------------------
def is_alphanumeric(s):
    return bool(re.search('[A-Z]', str(s)))

def get_dedup_key(row):
    to_switch = row['TO_SWITCH']
    if is_alphanumeric(to_switch):
        return f"{row['FROM_SWITCH']}|{to_switch}"
    else:
        return str(to_switch)

cables['DEDUP_KEY'] = cables.apply(get_dedup_key, axis=1)

# Ensure MEASUREDLENGTH is numeric
if 'MEASUREDLENGTH' in cables.columns:
    cables['MEASUREDLENGTH'] = pd.to_numeric(cables['MEASUREDLENGTH'], errors='coerce')
else:
    cables['MEASUREDLENGTH'] = float('nan')

agg_lengths = (
    cables.groupby('DEDUP_KEY', dropna=False)['MEASUREDLENGTH']
    .sum()
    .reset_index()
    .rename(columns={'MEASUREDLENGTH': 'AGG_MEASUREDLENGTH'})
)

cables = cables.merge(agg_lengths, on='DEDUP_KEY', how='left')

# --- Fault stats per FROM_SWITCH (as earlier) --------------------------------
cables['FROM_SWITCH'] = cables['FROM_SWITCH'].astype(str).str.strip().str.upper()
cables['TO_SWITCH'] = cables['TO_SWITCH'].astype(str).str.strip().str.upper()
faults['FROM_SWITCH'] = faults['FROM_SWITCH'].astype(str).str.strip().str.upper()
faults['TO_SWITCH']   = faults['TO_SWITCH'].astype(str).str.strip().str.upper()

def stats_for_switch(switch):
    match_faults = faults[(faults['FROM_SWITCH'] == switch) | (faults['TO_SWITCH'] == switch)]
    count = len(match_faults)
    latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
    avg_repair = match_faults['TIME_DIFFERENCE_HOURS'].mean() if count else float('nan')
    max_repair = match_faults['TIME_DIFFERENCE_HOURS'].max() if count else float('nan')
    return pd.Series([count, latest, avg_repair, max_repair])

cables[['FAULT_SWITCH_COUNT', 'CBL_LATEST_OUTAGE', 'CBL_AVG_REPAIR_HRS', 'CBL_MAX_REPAIR_HRS']] = \
    cables['FROM_SWITCH'].apply(stats_for_switch)

# Remove unneeded columns if present
cols_to_remove = [c for c in ['FAULT_SWITCH_MATCH', 'FDR_FAULT_COUNT', 'FDR_COMMON_REASON_CAT', 'FDR_COMMON_REASON_TEXT'] if c in cables.columns]
cables = cables.drop(columns=cols_to_remove)

# KEEP FEEDER_… COLUMNS ONLY ON FIRST CABLE OF EACH FEEDER
feeder_cols = [c for c in cables.columns if c.startswith("FDR_")]
first_row_mask = ~cables.duplicated(subset="FEEDER_ID_N", keep="first")
cables.loc[~first_row_mask, feeder_cols] = pd.NA   # blanks after first row


# DEDUPLICATION LOGIC (with alphanumeric TO_SWITCH vs numeric TO_SWITCH key)
if 'DATECREATED' in cables.columns:
    cables['DATECREATED'] = pd.to_datetime(cables['DATECREATED'], errors='coerce')
    min_dates = cables.groupby('DEDUP_KEY', dropna=False)['DATECREATED'].min().reset_index()
    min_dates = min_dates.rename(columns={'DATECREATED': 'O_DATECREATED'})
    # >>> DO NOT SORT <<<
    cables_nodup = cables.drop_duplicates(subset=['DEDUP_KEY'], keep='first').copy()
    cables_nodup = cables_nodup.merge(min_dates, on='DEDUP_KEY', how='left')
    cables_nodup = cables_nodup.drop(columns=['DATECREATED', 'DEDUP_KEY'])
    cables_final = cables_nodup
else:
    cables_final = cables.drop_duplicates(subset=['DEDUP_KEY'], keep='first').drop(columns=['DEDUP_KEY'])
cables_final = cables_final.drop(columns=['MEASUREDLENGTH'], errors='ignore')
# ── 7) CLEAN-UP & SAVE -------------------------------------------------------

cables_final.to_csv(OUT_PATH, index=False)
print(" cable rows with FAULT_SWITCH_COUNT > 0 :", (cables_final['FAULT_SWITCH_COUNT']>0).sum())
print(" updated file                    :", OUT_PATH)



  unique_paths = [p for p in pd.unique(paths) if p]
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=True).max() if count else pd.NaT
  latest = pd.to_datetime(match_faults['TIME_OUTAGE'], errors='coerce', utc=

 cable rows with FAULT_SWITCH_COUNT > 0 : 2949
 updated file                    : /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full_path.csv


In [8]:
import pandas as pd
import numpy as np
import os, glob, re
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor

# ---- PATHS ----
CABLE_PATH = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full_path.csv"
BASE = "/media/sagark24/New Volume/MERGE CDIS"
SCADA_FOLDERS = [
    f"{BASE}/2-Year-data/200/200",
    f"{BASE}/2-Year-data/200-400/200-400",
    f"{BASE}/2-Year-data/400-600/400-600",
    f"{BASE}/2-Year-data/600-759/600-759",
    f"{BASE}/2-Year-data/SCADA_JAN_24_TO_APR_25",
]
OUT_FILE = Path(f"{BASE}/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full_path_with_load.csv")
def norm(x): 
    return np.nan if pd.isna(x) else str(x).strip().upper().lstrip("0")
def norm_volt(v):
    v=str(v).upper().replace(' ','')
    return "11KV" if v in ("11","11KV") else "11KV" if v in ("11","11KV") else v
def mcols(tag): return [f"{tag}_Month_{i:02}" for i in range(1,13)]
cables = pd.read_csv(CABLE_PATH, dtype=str)
cables["FROM_SWITCH"] = cables["FROM_SWITCH"].apply(norm)
SWNO_SET = set(cables["FROM_SWITCH"])
def scada_worker(path):
    try:
        df = pd.read_csv(path,
            usecols=['SYSTIME','SWNO','VOLTAGE','PARA','VALUE'],
            dtype={'SYSTIME':str,'SWNO':str,'VOLTAGE':'category',
                   'PARA':'category','VALUE':'float32'},
            low_memory=True)
        df["SWNO"] = df["SWNO"].astype(str).str.strip().apply(norm)
        df = df[df["SWNO"].isin(SWNO_SET)]
        if df.empty: return None
        df["VOLTAGE"] = df["VOLTAGE"].map(norm_volt)
        df = df[df["VOLTAGE"].isin(["11KV"])]
        df["PARA"] = df["PARA"].str.upper().str.strip()
        df = df[df["PARA"] == "I"]
        df["VALUE"] = pd.to_numeric(df["VALUE"], errors="coerce")
        df = df[df["VALUE"] > 0]
        df["TS"] = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True)
        df["MONTH"] = df["TS"].dt.month.astype("Int8")
        df.dropna(subset=["VALUE","MONTH"], inplace=True)
        if df.empty: return None
        return df
    except Exception:
        return None

files = [f for fld in SCADA_FOLDERS for f in glob.glob(os.path.join(fld, '*.csv'))]
print(" Total SCADA files:", len(files))
parts = []
with ProcessPoolExecutor(max_workers=10) as pool:
    for part in pool.map(scada_worker, files):
        if part is not None:
            parts.append(part)
print(" SCADA files with data:", len(parts))
print(" Total SCADA rows:", sum(len(p) for p in parts))
df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=['SWNO','VOLTAGE','VALUE','MONTH','TS'])
def pivot_metric(metric, tag):
    p = metric.pivot(index=["SWNO", "VOLTAGE"], columns="MONTH", values=tag)
    p = p.reindex(columns=range(1, 13), fill_value=np.nan)
    p.columns = mcols(tag)
    return p

if not df.empty:
    i_df = df.copy()
    i_df["DATE"] = i_df["TS"].dt.date

    # Monthly mean current
    pI = (i_df.groupby(["SWNO","VOLTAGE","MONTH"], observed=True)["VALUE"]
            .mean().unstack().reindex(columns=range(1,13)).fillna(np.nan))
    pI.columns = mcols("I")
    pI_long = pI.reset_index().melt(id_vars=["SWNO","VOLTAGE"], var_name="MONTH_COL", value_name="AVG")
    pI_long["MONTH"] = pI_long["MONTH_COL"].str.extract(r"(\d+)$").astype(int)
    i_df = i_df.merge(pI_long.drop(columns="MONTH_COL"), on=["SWNO", "VOLTAGE", "MONTH"], how="left")

    # PEAK
    peak = i_df.groupby(["SWNO", "VOLTAGE", "MONTH"], observed=True)["VALUE"].max().reset_index(name="PEAK")

    # CYCLE (ramp-up events)
    day_max = i_df.groupby(["SWNO", "VOLTAGE", "DATE"], observed=True)["VALUE"].max().reset_index()
    day_max.sort_values(["SWNO", "VOLTAGE", "DATE"], inplace=True)
    day_max["PREV"] = day_max.groupby(["SWNO", "VOLTAGE"])["VALUE"].shift()
    day_max["MONTH"] = pd.to_datetime(day_max["DATE"]).dt.month.astype("Int8")
    day_max = day_max.merge(pI_long.drop(columns="MONTH_COL"), on=["SWNO", "VOLTAGE", "MONTH"], how="left")
    day_max["RAMP"] = ((day_max["PREV"] < day_max["AVG"]) & (day_max["VALUE"] >= day_max["AVG"])).astype("int8")
    cycle = day_max.groupby(["SWNO", "VOLTAGE", "MONTH"], observed=True)["RAMP"].sum().reset_index(name="CYCLE")

    # OVR (overload > 2x AVG)
    i_df["OVR"] = (i_df["VALUE"] > 2 * i_df["AVG"]).astype("int8")
    ovr = i_df.groupby(["SWNO", "VOLTAGE", "MONTH"], observed=True)["OVR"].sum().reset_index(name="OVR")

    # Pivot all to wide form
    pPEAK = pivot_metric(peak, "PEAK")
    pCYCLE = pivot_metric(cycle, "CYCLE")
    pOVR = pivot_metric(ovr, "OVR")

    sc_wide = pI.join([pPEAK, pCYCLE, pOVR]).reset_index()
else:
    sc_wide = pd.DataFrame(columns=["SWNO","VOLTAGE"])
cables_with_load = cables.merge(
    sc_wide, how="left", left_on=["FROM_SWITCH"], right_on=["SWNO"]
).drop(columns=["SWNO"])  # remove extra merge column
cables_with_load.to_csv(OUT_FILE, index=False, float_format="%.3f")
print(f" Saved: {OUT_FILE} | Rows: {len(cables_with_load)} | Columns: {len(cables_with_load.columns)}")


 Total SCADA files: 763


  df["TS"] = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True)


 SCADA files with data: 755
 Total SCADA rows: 46807477
 Saved: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full_path_with_load.csv | Rows: 9074 | Columns: 73


In [9]:
# import pandas as pd
# from geopy.geocoders import Nominatim
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import time

# df = pd.read_csv("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv")
# ss_names = pd.unique(df[["SOURCE_SS", "DESTINATION_SS"]].values.ravel())
# ss_names = [ss for ss in ss_names if pd.notna(ss) and str(ss).strip()]

# def geocode_ss(ss):
#     geolocator = Nominatim(user_agent="mumbai_cable_map_parallel")
#     try:
#         loc = geolocator.geocode(f"{ss}, Mumbai, India", timeout=10)
#         # Comply with 1 req/sec limit
#         time.sleep(1)
#         if loc:
#             return ss, loc.latitude, loc.longitude
#         else:
#             return ss, None, None
#     except Exception as e:
#         return ss, None, None

# results = []
# with ThreadPoolExecutor(max_workers=4) as executor:
#     futures = {executor.submit(geocode_ss, ss): ss for ss in ss_names}
#     for future in as_completed(futures):
#         results.append(future.result())

# ss_coords = pd.DataFrame(results, columns=["SS", "lat", "lon"])
# ss_coords.to_csv("mumbai_substations_coords.csv", index=False)
# print("Saved to mumbai_substations_coords.csv")


In [10]:
# df = df.merge(ss_coords.rename(columns={'SS':'SOURCE_SS','lat':'SRC_LAT','lon':'SRC_LON'}), on='SOURCE_SS', how='left')
# df = df.merge(ss_coords.rename(columns={'SS':'DESTINATION_SS','lat':'DST_LAT','lon':'DST_LON'}), on='DESTINATION_SS', how='left')


In [11]:
# import folium

# # Set Mumbai as the map center
# mumbai_map = folium.Map(location=[19.0760, 72.8777], zoom_start=12, tiles="CartoDB positron")  # or "Stamen Terrain", "OpenStreetMap", etc.

# # Add substations as markers
# for idx, row in ss_coords.iterrows():
#     if pd.notnull(row['lat']) and pd.notnull(row['lon']):
#         folium.Marker(
#             [row['lat'], row['lon']],
#             popup=row['SS'],
#             icon=folium.Icon(color='blue', icon='bolt')
#         ).add_to(mumbai_map)

# # Add cables as lines
# for idx, row in df.iterrows():
#     if all(pd.notnull([row['SRC_LAT'], row['SRC_LON'], row['DST_LAT'], row['DST_LON']])):
#         # Color by Feeder_ID or any column
#         color = "red"  # Or use a color map by feeder_id
#         # Popup includes Feeder ID, Switch info, and Comments
#         popup_text = (
#             f"<b>Feeder:</b> {row['FEEDER_ID']}<br>"
#             f"<b>From Switch:</b> {row['FROM_SWITCH']}<br>"
#             f"<b>To Switch:</b> {row['TO_SWITCH']}<br>"
#             f"<b>Comment:</b> {row['COMMENTS']}"
#         )
#         folium.PolyLine(
#             locations=[(row['SRC_LAT'], row['SRC_LON']), (row['DST_LAT'], row['DST_LON'])],
#             color=color,
#             weight=5,
#             tooltip=popup_text
#         ).add_to(mumbai_map)

# # Save to HTML (open in browser)
# mumbai_map.save("mumbai_cable_network_map.html")


In [12]:
import pandas as pd
df = pd.read_csv("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv")
col = df['SOURCE_SS'].unique().tolist() + df['DESTINATION_SS'].unique().tolist()
col = [c for c in col if pd.notna(c) and str(c).strip()]
col = list(set(col))  # remove duplicates
print("Substation names:", col)
print("Total substations:", len(col))

  df = pd.read_csv("/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv")


Substation names: ['LAKE HOMES 2', 'CRYSTAL DIAMOND', 'ANAND NIWAS', 'MMRDA R/S', 'JOGESHWARI NAVPADA NO.3', 'STERLING CITY', 'MAROL POLICE TRAINING CAMP', 'MAMATA INDUSTRY', 'DAHANUKAR WADI NO.2', 'UNITED PARK', 'VINAMRA COMPLEX NO 4', 'KARTIK ESTATE', 'CHEMBUR KALPANA PADMA', 'EMINENT', 'IRANI WADI NO.3', 'GOREGAON SHOPPING CENTRE', 'SUBEDAR', 'OMKAR SRA BUILDING NO.2A', 'GORAI MHADA NO 2', 'BACON FACTORY', 'NAGARI NIWARA NO.3', 'MEDONA COLONY NO 2', 'RAJ OIL MILL NO.1', 'JEEVAN UMANG', 'SWASTIK CO-OP 2', 'AWESOME HEIGHT', 'GANESH WADI', 'WADHVAN VILLAGE NO.1', 'POLYCHEM NO.2', 'KATYANI HEIGHTS', 'LAXMI INDUSTRIAL ESTATE NO.1', 'KIA PARK', 'HINDUSTAN KOHINOOR 1', 'PRATHMESH RESIDENCY', 'SANT JANABAI ROAD', 'JAI BHARAT SOCIETY', 'PRABHAT CHS', 'KRISHNA GARDEN NO 2', 'VICKMAN STEEL', 'KHARDEO NGR 1', 'INDRAPRASTA TOWER', 'GOREGAON HOUSING NO.2', 'KANAKIA INTERNATIONAL', 'AMRUTWANI ROAD', 'MODEL INDUSTRIAL COLONY', 'RAJARAM TAWDE ROAD', 'NARAYAN NAGAR', 'CRSC 1', 'EXCEL', 'SAGAR APARTME

In [13]:
col = df['DESTINATION_SS'].unique().tolist()
col = [c for c in col if pd.notna(c) and str(c).strip()]
col = list(set(col))  # remove duplicates
print("Substation names:", col)
print("Total substations:", len(col))

Substation names: ['LAKE HOMES 2', 'CRYSTAL DIAMOND', 'ANAND NIWAS', 'MMRDA R/S', 'JOGESHWARI NAVPADA NO.3', 'STERLING CITY', 'MAROL POLICE TRAINING CAMP', 'MAMATA INDUSTRY', 'DAHANUKAR WADI NO.2', 'UNITED PARK', 'VINAMRA COMPLEX NO 4', 'KARTIK ESTATE', 'CHEMBUR KALPANA PADMA', 'EMINENT', 'IRANI WADI NO.3', 'GOREGAON SHOPPING CENTRE', 'SUBEDAR', 'OMKAR SRA BUILDING NO.2A', 'GORAI MHADA NO 2', 'BACON FACTORY', 'NAGARI NIWARA NO.3', 'MEDONA COLONY NO 2', 'RAJ OIL MILL NO.1', 'JEEVAN UMANG', 'SWASTIK CO-OP 2', 'AWESOME HEIGHT', 'GANESH WADI', 'WADHVAN VILLAGE NO.1', 'POLYCHEM NO.2', 'KATYANI HEIGHTS', 'LAXMI INDUSTRIAL ESTATE NO.1', 'KIA PARK', 'HINDUSTAN KOHINOOR 1', 'PRATHMESH RESIDENCY', 'SANT JANABAI ROAD', 'JAI BHARAT SOCIETY', 'PRABHAT CHS', 'KRISHNA GARDEN NO 2', 'VICKMAN STEEL', 'KHARDEO NGR 1', 'INDRAPRASTA TOWER', 'GOREGAON HOUSING NO.2', 'KANAKIA INTERNATIONAL', 'AMRUTWANI ROAD', 'MODEL INDUSTRIAL COLONY', 'RAJARAM TAWDE ROAD', 'NARAYAN NAGAR', 'CRSC 1', 'EXCEL', 'SAGAR APARTME