In [1]:
import pandas as pd

columns = [
    "Cable_ID","Voltage_Level","Feeder_ID","Sub_Feeder_ID","From_Switch","To_Switch",
    "Cable_Type","Cable_Age_Years","Length_m","Installation_Environment",
    "Soil_Type","Humidity","Proximity_to_Water","Load_History_Avg_Load","Load_History_Peak_Load", 
    "Loading_Cycles","Overload_Events","IR_Measurement_MOhm","Tan_Delta","Partial_Discharge_Frequency",
    "Partial_Discharge_Intensity","Thermal_History_Excursions","Num_Faults","Fault_Type","Repairs_Count",
    "Joint_History","Corrosivity","Water_Ingress","Remarks"
]

# Sample data for different voltage levels. You can add or import your actual data here.
data = [
    [
        "33KV-SW1-SW2", "33kV", "", "", "SW1", "SW2", "XLPE", 7, 1200, "Underground",
        "Sandy", "Medium", "Far", 60, 100, 150, 1, 500, 0.001, 0, 0, 0, 0, "None", 0, "Original", "Low", "No", "Healthy"
    ],
    [
        "22KV-SW3-SW4", "22kV", "", "", "SW3", "SW4", "PILC", 3, 800, "Overhead",
        "Clay", "High", "Near", 50, 90, 100, 0, 400, 0.002, 0, 0, 1, 1, "Earth Fault", 1, "Repaired once", "Medium", "No", "Monitor"
    ],
    [
        "FDR1-DT001", "11kV", "FDR1", "", "", "", "XLPE", 8, 400, "Underground",
        "Loam", "Medium", "Near", 40, 80, 120, 2, 120, 0.003, 1, 3, 1, 2, "Earth Fault", 2, "Multiple joints", "High", "Yes", "Healthy"
    ],
    [
        "FDR2-SUB1-DT005", "11kV", "FDR2", "SUB1", "", "", "XLPE", 4, 220, "Underground",
        "Rocky", "Low", "Far", 30, 50, 90, 0, 200, 0.001, 0, 0, 0, 0, "None", 0, "Original", "Low", "No", ""
    ],
]

df = pd.DataFrame(data, columns=columns)

# Save as Excel and CSV for use
# df.to_excel("/media/sagarkumar/New Volume/SAGAR/DATA_GENERATION/master_cable_data_final.xlsx", index=False)
df.to_csv("/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/master_cable_data_final.csv", index=False)

df.head()  # Display the first few rows of the DataFrame


Unnamed: 0,Cable_ID,Voltage_Level,Feeder_ID,Sub_Feeder_ID,From_Switch,To_Switch,Cable_Type,Cable_Age_Years,Length_m,Installation_Environment,...,Partial_Discharge_Frequency,Partial_Discharge_Intensity,Thermal_History_Excursions,Num_Faults,Fault_Type,Repairs_Count,Joint_History,Corrosivity,Water_Ingress,Remarks
0,33KV-SW1-SW2,33kV,,,SW1,SW2,XLPE,7,1200,Underground,...,0,0,0,0,,0,Original,Low,No,Healthy
1,22KV-SW3-SW4,22kV,,,SW3,SW4,PILC,3,800,Overhead,...,0,0,1,1,Earth Fault,1,Repaired once,Medium,No,Monitor
2,FDR1-DT001,11kV,FDR1,,,,XLPE,8,400,Underground,...,1,3,1,2,Earth Fault,2,Multiple joints,High,Yes,Healthy
3,FDR2-SUB1-DT005,11kV,FDR2,SUB1,,,XLPE,4,220,Underground,...,0,0,0,0,,0,Original,Low,No,


FOR 22 AND 33 MASTER DATA


In [13]:
import pandas as pd
df_22_33 = pd.read_csv("/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/monthly_SWNO_matrix_22KV_33KV.csv")

In [19]:
col = df_22_33['SWNO']

In [20]:
col_df = pd.DataFrame(col)

In [21]:
col_df.to_csv("/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/DATA_GENERATION/22_33_SWNO.csv", index=False)

DATA

SCADA LOGIC

SCADA WITHH ONLY CYCLY COUNT AND LOAD RANGE

CHANGE THE LOGIC FOR THE 7 AND 8 MONTH

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate wide-format SCADA summary containing **only**
1) Monthly mean daily peak-to-peak variation (VAR_Month_XX)
2) Monthly cycle count using median ×/÷ 1.6 threshold (CYCLE_Month_XX)
for every (SWNO, VOLTAGE).

Corrections:
- Robust timestamp parsing (DD-MM-YYYY first, then fallback)
- Recursive, case-insensitive CSV discovery
- Proper cycle counting = number of *entries* into out-of-band region per month
- Sort by DATE before cycle counting (chronology matters)
"""

import pandas as pd, numpy as np
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

# ─────────────────────────────── CONFIG ────────────────────────────────
BASE = "/media/sagark24/New Volume/MERGE CDIS"
FILE_SWNO   = f"{BASE}/IPYNB_FILE/DATA_GENERATION/22_33_SWNO.csv"
SCADA_FOLDERS = [
    f"{BASE}/2-Year-data/200/200",
    f"{BASE}/2-Year-data/200-400/200-400",
    f"{BASE}/2-Year-data/400-600/400-600",
    f"{BASE}/2-Year-data/600-759/600-759",
    f"{BASE}/2-Year-data/SCADA_JAN_24_TO_APR_25",
    f"{BASE}/SCADA_2023",
]
SCADA_OUT = f"{BASE}/IPYNB_FILE/DATA_GENERATION/SCADA_CYCLE_VARIATION.csv"

# ─────────────────────────────── HELPERS ───────────────────────────────
def norm_swno(x: str) -> str:
    """Upper-case, strip, and drop leading 0s. Return NaN for missing."""
    return np.nan if pd.isna(x) else str(x).strip().upper().lstrip("0")

def norm_volt(v: str) -> str:
    v = str(v).upper().replace(" ", "")
    return {"22": "22KV", "22KV": "22KV", "33": "33KV", "33KV": "33KV"}.get(v, v)

def make_month_cols(tag: str):
    return [f"{tag}_Month_{i:02}" for i in range(1, 13)]

def pivot_monthly(df_: pd.DataFrame, col: str) -> pd.DataFrame:
    wide = df_.pivot(index=["SWNO", "VOLTAGE"], columns="MONTH", values=col)
    wide = wide.reindex(columns=range(1, 13))
    wide.columns = make_month_cols(col)
    return wide

# Proper “cycle” definition:
# Count number of 0→1 transitions of the out-of-band indicator where
#   out_of_band = (x >= median*mult) or (x <= median/mult),
# computed on the *daily_max* sequence ordered by DATE within a month.
def count_cycles_ordered(daily_max_series: pd.Series, mult: float = 1.6) -> int:
    x = daily_max_series.to_numpy()
    if x.size == 0 or not np.isfinite(x).any():
        return 0
    med = np.nanmedian(x)
    if not np.isfinite(med) or med <= 0:
        return 0
    upper = med * mult
    lower = med / mult
    out = (x >= upper) | (x <= lower)
    # Count entries into out-of-band (0 -> 1 transitions)
    edges = np.diff(out.astype(np.int8)) == 1
    return int(edges.sum())

# ───────────────────────────── PREP MASTER LIST ────────────────────────
SWNO_SET = set(pd.read_csv(FILE_SWNO, dtype=str)["SWNO"].apply(norm_swno))

# ────────────────────────────── PARSER WORKER ──────────────────────────
def scada_worker(path: str):
    try:
        df = pd.read_csv(
            path,
            usecols=["SYSTIME", "SWNO", "VOLTAGE", "PARA", "VALUE"],
            dtype={"SYSTIME": str, "SWNO": str, "VOLTAGE": "category",
                   "PARA": "category", "VALUE": "float32"},
            low_memory=True,
        )

        # Normalize & filter early
        df["SWNO"] = df["SWNO"].map(norm_swno)
        df = df[df["SWNO"].isin(SWNO_SET)]
        if df.empty:
            return None

        df["VOLTAGE"] = df["VOLTAGE"].map(norm_volt)
        df = df[df["VOLTAGE"].isin(["22KV", "33KV"])]
        df = df[df["PARA"].astype(str).str.strip().str.upper() == "I"]

        # Ensure numeric values and keep positive
        df["VALUE"] = pd.to_numeric(df["VALUE"], errors="coerce")
        df = df[df["VALUE"] > 0]

        # Robust timestamp parsing
        ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
        mask = ts.isna()
        if mask.any():
            ts.loc[mask] = pd.to_datetime(
                df.loc[mask, "SYSTIME"], errors="coerce", utc=True, dayfirst=False
            )
        df["TS"] = ts
        df.dropna(subset=["TS"], inplace=True)

        #  Keep only rows with TS ≤ 2024-12-31
        cutoff = pd.Timestamp("2024-12-31", tz="UTC")
        df = df[df["TS"] <= cutoff]
        if df.empty:
            return None

        df["DATE"]  = df["TS"].dt.date
        df["MONTH"] = df["TS"].dt.month.astype("int8")

        return df[["SWNO", "VOLTAGE", "VALUE", "DATE", "MONTH"]]
    except Exception:
        return None


# ────────────────────────────── LOAD ALL CSVs ──────────────────────────
files = [str(p) for folder in SCADA_FOLDERS for p in Path(folder).rglob("*.[cC][sS][vV]")]

parts = []
with ProcessPoolExecutor(max_workers=8) as pool:
    for part in pool.map(scada_worker, files):
        if part is not None and not part.empty:
            parts.append(part)

df = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame(columns=["SWNO","VOLTAGE","VALUE","DATE","MONTH"])

# ──────────────────────────── AGGREGATIONS ─────────────────────────────
if not df.empty:
    # Daily peak & trough ➜ variation
    daily = (
        df.groupby(["SWNO", "VOLTAGE", "DATE", "MONTH"], observed=True)
          .agg(daily_max=("VALUE", "max"), daily_min=("VALUE", "min"))
          .reset_index()
    )
    daily["variation"] = daily["daily_max"] - daily["daily_min"]

    # Sort by DATE for correct cycle counting within month
    daily.sort_values(["SWNO", "VOLTAGE", "MONTH", "DATE"], inplace=True)

    # Monthly stats: VAR = mean(daily peak-to-peak), CYCLE = entries into out-of-band
    def monthly_agg(g: pd.DataFrame) -> pd.Series:
        return pd.Series(
            {
                "VAR": float(g["variation"].mean()) if len(g) else np.nan,
                "CYCLE": count_cycles_ordered(g["daily_max"])
            }
        )

    monthly = (
        daily.groupby(["SWNO", "VOLTAGE", "MONTH"], observed=True)
             .apply(monthly_agg)
             .reset_index()
    )

    # Pivot to wide
    wide_df = pivot_monthly(monthly, "VAR").join(pivot_monthly(monthly, "CYCLE")).reset_index()
else:
    wide_df = pd.DataFrame(columns=["SWNO","VOLTAGE"] + make_month_cols("VAR") + make_month_cols("CYCLE"))

# ─────────────────────────────── OUTPUT ────────────────────────────────
wide_df.to_csv(SCADA_OUT, index=False, float_format="%.3f")
print(f"Saved: {SCADA_OUT} | Rows: {len(wide_df)} | Columns: {len(wide_df.columns)}")


  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce", utc=True, dayfirst=True)
  ts = pd.to_datetime(df["SYSTIME"], errors="coerce"

Saved: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/SCADA_CYCLE_VARIATION.csv | Rows: 281 | Columns: 26


  .apply(monthly_agg)


new 2023 data

In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Generate wide-format SCADA summary containing **only**
1) Monthly mean daily peak-to-peak variation (VAR_Month_XX)
2) Monthly cycle count using median ×/÷ 1.6 threshold (CYCLE_Month_XX)
for every (SWNO, VOLTAGE).

Corrections:
- Robust timestamp parsing (DD-MM-YYYY first, then fallback)
- Recursive, case-insensitive CSV discovery
- Proper cycle counting = number of *entries* into out-of-band region per month
- Sort by DATE before cycle counting (chronology matters)
- Do **not** create or carry a separate VALUE or PARA column—use AVG_I directly.
"""

import pandas as pd
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path

# ─────────────────────────────── CONFIG ────────────────────────────────
BASE = "/media/sagark24/New Volume/MERGE CDIS"
FILE_SWNO     = f"{BASE}/IPYNB_FILE/DATA_GENERATION/22_33_SWNO.csv"
SCADA_FOLDERS = [
    f"{BASE}/SCADA_2023",
    # add more folders if desired...
]
SCADA_OUT = f"{BASE}/IPYNB_FILE/DATA_GENERATION/SCADA_CYCLE_VARIATION.csv"

# ─────────────────────────────── HELPERS ───────────────────────────────
def norm_swno(x: str) -> str:
    """Upper-case, strip, and drop leading 0s. Return NaN for missing."""
    return np.nan if pd.isna(x) else str(x).strip().upper().lstrip("0")

def norm_volt(v: str) -> str:
    v = str(v).upper().replace(" ", "")
    return {"22": "22KV", "22KV": "22KV", "33": "33KV", "33KV": "33KV"}.get(v, v)

def make_month_cols(tag: str):
    return [f"{tag}_Month_{i:02}" for i in range(1, 13)]

def pivot_monthly(df_: pd.DataFrame, col: str) -> pd.DataFrame:
    wide = df_.pivot(index=["SWNO", "VOLTAGE"], columns="MONTH", values=col)
    wide = wide.reindex(columns=range(1, 13))
    wide.columns = make_month_cols(col)
    return wide

def count_cycles_ordered(daily_max_series: pd.Series, mult: float = 1.6) -> int:
    """
    Count the number of 0→1 transitions of the out-of-band indicator where:
      out_of_band = (x >= median*mult) or (x <= median/mult)
    Computed on the daily_max sequence ordered by DATE within a month.
    """
    x = daily_max_series.to_numpy()
    if x.size == 0 or not np.isfinite(x).any():
        return 0
    med = np.nanmedian(x)
    if not np.isfinite(med) or med <= 0:
        return 0
    upper = med * mult
    lower = med / mult
    out = (x >= upper) | (x <= lower)
    edges = np.diff(out.astype(np.int8)) == 1
    return int(edges.sum())

# ───────────────────────────── PREP MASTER LIST ────────────────────────
SWNO_SET = set(
    pd.read_csv(FILE_SWNO, dtype=str)["SWNO"]
      .apply(norm_swno)
)

# ────────────────────────────── PARSER WORKER ──────────────────────────
def scada_worker(path: str):
    try:
        df = pd.read_csv(
            path,
            usecols=["SYSDATE", "SWNO", "VOLTAGE", "VALUE"],
            dtype={"SYSDATE": str, "SWNO": str, "VOLTAGE": "category", "VALUE": "float32"},
            low_memory=True,
        )

        # Normalize & filter SWNO
        df["SWNO"] = df["SWNO"].map(norm_swno)
        df = df[df["SWNO"].isin(SWNO_SET)]
        if df.empty:
            return None

        # Normalize voltage and filter
        df["VOLTAGE"] = df["VOLTAGE"].map(norm_volt)
        df = df[df["VOLTAGE"].isin(["22KV", "33KV"])]
        if df.empty:
            return None

        # Ensure AVG_I is numeric and keep positive readings
        df["VALUE"] = pd.to_numeric(df["VALUE"], errors="coerce")
        df = df[df["VALUE"] > 0]
        if df.empty:
            return None

        # Robust timestamp parsing: dayfirst then fallback
        ts = pd.to_datetime(df["SYSDATE"], errors="coerce", utc=True, dayfirst=True)
        mask = ts.isna()
        if mask.any():
            ts.loc[mask] = pd.to_datetime(
                df.loc[mask, "SYSDATE"], errors="coerce", utc=True, dayfirst=False
            )
        df["TS"] = ts
        df.dropna(subset=["TS"], inplace=True)

        df["DATE"]  = df["TS"].dt.date
        df["MONTH"] = df["TS"].dt.month.astype("int8")

        # Return only needed columns
        return df[["SWNO", "VOLTAGE", "VALUE", "DATE", "MONTH"]]
    except Exception:
        return None

# ────────────────────────────── LOAD ALL CSVs ──────────────────────────
files = [
    str(p)
    for folder in SCADA_FOLDERS
    for p in Path(folder).rglob("*.[cC][sS][vV]")
]

parts = []
with ProcessPoolExecutor(max_workers=8) as pool:
    for part in pool.map(scada_worker, files):
        if part is not None and not part.empty:
            parts.append(part)

if parts:
    df = pd.concat(parts, ignore_index=True)
else:
    df = pd.DataFrame(columns=["SWNO","VOLTAGE","VALUE","DATE","MONTH"])

# ──────────────────────────── AGGREGATIONS ─────────────────────────────
if not df.empty:
    # Daily peak & trough ➜ variation
    daily = (
        df.groupby(["SWNO", "VOLTAGE", "DATE", "MONTH"], observed=True)
          .agg(daily_max=("VALUE", "max"), daily_min=("VALUE", "min"))
          .reset_index()
    )
    daily["variation"] = daily["daily_max"] - daily["daily_min"]

    # Sort by DATE for correct cycle counting
    daily.sort_values(["SWNO","VOLTAGE","MONTH","DATE"], inplace=True)

    # Monthly stats: VAR = mean(daily variation), CYCLE = count_cycles_ordered
    def monthly_agg(g: pd.DataFrame) -> pd.Series:
        return pd.Series({
            "VAR": float(g["variation"].mean()) if len(g) else np.nan,
            "CYCLE": count_cycles_ordered(g["daily_max"])
        })

    monthly = (
        daily.groupby(["SWNO","VOLTAGE","MONTH"], observed=True)
             .apply(monthly_agg)
             .reset_index()
    )

    # Pivot to wide and drop any stray columns
    wide_df = (
        pivot_monthly(monthly, "VAR")
        .join(pivot_monthly(monthly, "CYCLE"))
        .reset_index()
    )
else:
    wide_df = pd.DataFrame(columns=["SWNO","VOLTAGE"]
                           + make_month_cols("VAR")
                           + make_month_cols("CYCLE"))

# ─────────────────────────────── OUTPUT ────────────────────────────────
wide_df.to_csv(SCADA_OUT, index=False, float_format="%.3f")
print(f"Saved: {SCADA_OUT} | Rows: {len(wide_df)} | Columns: {len(wide_df.columns)}")


Saved: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/SCADA_CYCLE_VARIATION.csv | Rows: 113 | Columns: 26


  .apply(monthly_agg)


In [4]:
col = df['SWNO']
col_uniq = col.unique()
print(len(col_uniq))

113


In [5]:
BASE = "/media/sagark24/New Volume/MERGE CDIS"
FILE_SWNO     = f"{BASE}/IPYNB_FILE/DATA_GENERATION/22_33_SWNO.csv"
SCADA_FOLDERS = [
    f"{BASE}/SCADA_2023",
    # add more folders if desired...
]
files = [
    str(p)
    for folder in SCADA_FOLDERS
    for p in Path(folder).rglob("*.[cC][sS][vV]")
]
df = pd.read_csv(files[0])
print(df.columns)

Index(['SYSDATE', 'RSNAME', 'SWNO', 'VOLTAGE', 'MAX_I', 'MIN_I', 'VALUE',
       'MAX_KW', 'MIN_KW', 'AVG_KW', 'MAX_V', 'MIN_V', 'AVG_V', 'MAX_MVA',
       'MIN_MVA', 'AVG_MVA', 'MAX_PF', 'MIN_PF', 'AVG_PF'],
      dtype='object')


In [7]:
df["SWNO"]= pd.to_numeric(df["SWNO"], errors="coerce")
col = df['SWNO']
col_uniq = col.unique()
print(len(col_uniq))

120


AFTER THE SCADA FILE USED ATO THE AGRREGATION

USING THE SECONDA SCADA DATA ONLY 

In [None]:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build SWNO‑level master table.
Joins:
• SCADA → on `SWNO` (equal to `DESTINATION_SWITCH_ID`)
• Faults → on `DESTINATION_SWITCH_ID`

Note → No chained‑assignment warnings: we avoid `.fillna(..., inplace=True)` on a Series.
"""
import pandas as pd, numpy as np, re
from pathlib import Path

# ─────────────────────────── CONFIG ────────────────────────────
BASE = "/media/sagark24/New Volume/MERGE CDIS"
FILE_SWNO  = Path(f"{BASE}/IPYNB_FILE/DATA_GENERATION/22_33_SWNO.csv")
FILE_CABLE = Path(f"{BASE}/2-Year-data/CLEANED_DATA/ht_cleaned.csv")
FAULT_FILE = Path(f"{BASE}/IPYNB_FILE/DATA_GENERATION/FAULT DATA/HT_fault_cable_info_processed_without_affected.csv")
SCADA_OUT  = Path(f"{BASE}/IPYNB_FILE/DATA_GENERATION/SCADA_CYCLE_VARIATION.csv")
FINAL_OUT  = Path(f"{BASE}/IPYNB_FILE/DATA_GENERATION/22_33KV CABLE FULL DATA/SWNO_MASTER_COMBINED_FULL_FINAL3.csv")

# ─────────────────────────── HELPERS ───────────────────────────

def norm(x):
    return np.nan if pd.isna(x) else str(x).strip().upper().lstrip("0")

def extract_ordered_path(comment: str) -> str:
    if not isinstance(comment, str):
        return ""
    m = re.findall(r"(SWNO_\w+|JT\.NO\.\d+[A-Z]?)", comment.upper())
    return " → ".join(m) if m else comment.strip()

def drop_all_empty_strict(df: pd.DataFrame) -> pd.DataFrame:
    def empty(v):
        s = str(v).strip().lower()
        return s in {"", "0", "nan"} or pd.isna(v)
    return df.loc[~df.apply(lambda r: all(empty(x) for x in r), axis=1)].reset_index(drop=True)

# ─────────────────────────── LOAD SWNO MASTER ──────────────────
swno_master = pd.read_csv(FILE_SWNO, dtype=str)
swno_master = drop_all_empty_strict(swno_master)
swno_master["SWNO"] = swno_master["SWNO"].apply(norm)
SWNO_SET = set(swno_master["SWNO"])

# ─────────────────────────── CABLE TABLE ──────────────────────
cable = pd.read_csv(FILE_CABLE, dtype=str)
cable = drop_all_empty_strict(cable)

cable.rename(columns={
    "CABEL_ID": "CABLEID",
    "CABLECOUNTUCTOR": "CABLECONDUCTORMATERIAL",
    "NO.OFCORES": "NUMBEROFCORES",
    "NEUTRAL_MATERIAL": "NEUTRALMATERIAL",
    "coments": "COMMENTS",
}, inplace=True)

cable["DESTINATION_SWITCH_ID"] = cable["DESTINATION_SWITCH_ID"].apply(norm)
cable["SOURCE_SWITCH_ID"]      = cable["SOURCE_SWITCH_ID"].apply(norm)

cable = cable[
    cable["DESTINATION_SWITCH_ID"].notna() &
    (cable["DESTINATION_SWITCH_ID"].str.strip() != "") &
    cable["DESTINATION_SWITCH_ID"].isin(SWNO_SET)
]

# COMMISSIONEDDATE fallback
if "COMMISSIONEDDATE" not in cable.columns and "DATECREATED" in cable.columns:
    cable["COMMISSIONEDDATE"] = cable["DATECREATED"]

KEEP = [
    "DESTINATION_SWITCH_ID", "SOURCE_SWITCH_ID", "COMMENTS", "CABLETYPE", "MEASUREDLENGTH",
    "NUMBEROFCORES", "ARMOURED", "NEUTRALMATERIAL", "CABLEID", "CABLECONDUCTORMATERIAL",
    "DIVISIONCODE", "ZONECODE", "REMARKS", "SOURCE_SS", "DESTINATION_SS", "COMMISSIONEDDATE"
]
cable = cable[[c for c in KEEP if c in cable.columns]].copy()

cable["COMMENTS"].fillna("", inplace=True)
cable["PATH"] = cable["COMMENTS"].apply(extract_ordered_path)

segment_comment_counts = (
    cable.assign(COMMENTS=cable["COMMENTS"].str.strip())
          .groupby(["SOURCE_SWITCH_ID", "DESTINATION_SWITCH_ID"], dropna=False)["COMMENTS"]
          .apply(lambda s: (s != "").sum())
          .reset_index(name="NO_OF_SEGMENT")
)

cable["MEASUREDLENGTH"] = pd.to_numeric(cable["MEASUREDLENGTH"], errors="coerce")
agg = {
    "MEASUREDLENGTH": "sum", "PATH": lambda x: " -> ".join(sorted(set(x))),
    "CABLETYPE": "first", "NUMBEROFCORES": "first", "ARMOURED": "first",
    "NEUTRALMATERIAL": "first", "CABLEID": "first", "CABLECONDUCTORMATERIAL": "first",
    "DIVISIONCODE": "first", "ZONECODE": "first", "REMARKS": "first",
    "SOURCE_SS": "first", "DESTINATION_SS": "first", "COMMISSIONEDDATE": "first"
}

cable = (cable.groupby(["SOURCE_SWITCH_ID", "DESTINATION_SWITCH_ID"], dropna=False)
               .agg(agg).reset_index())

cable = cable.merge(segment_comment_counts, on=["SOURCE_SWITCH_ID", "DESTINATION_SWITCH_ID"], how="left")
cable["SWNO"] = cable["DESTINATION_SWITCH_ID"]

# ─────────────────────────── FAULT TABLE ──────────────────────
fault = pd.read_csv(FAULT_FILE, dtype=str)
fault = drop_all_empty_strict(fault)
fault["SWNO"] = fault["TO_SWITCH"].apply(norm)

DEST_SWITCH_SET = set(cable["DESTINATION_SWITCH_ID"].dropna())
fault = fault[fault["SWNO"].isin(DEST_SWITCH_SET)]

raw_fault_cols = ["TIME_OUTAGE", "FROM_SWITCH", "TO_SWITCH", "VOLTAGE", "TIME_DIFFERENCE_HOURS", "DIVISION"]
fault.rename(columns={c: f"FAULT_{c}" for c in raw_fault_cols}, inplace=True)
FAULT_COLS = [f"FAULT_{c}" for c in raw_fault_cols]

fault["FAULT_TIME_DIFFERENCE_HOURS"] = pd.to_numeric(fault["FAULT_TIME_DIFFERENCE_HOURS"], errors="coerce")
fault["FAULT_TIME_OUTAGE"]          = pd.to_datetime(fault["FAULT_TIME_OUTAGE"], errors="coerce")

fault_cnt  = fault.groupby("SWNO", observed=True).size().reset_index(name="Num_Faults")
fault_agg  = fault.groupby("SWNO", observed=True).agg(lambda s: " | ".join(pd.unique(s.dropna().astype(str)))).reset_index()
mean_td    = fault.groupby("SWNO")["FAULT_TIME_DIFFERENCE_HOURS"].mean().reset_index(name="FAULT_TIME_DIFFERENCE_HOURS_AVG")
latest_out = fault.groupby("SWNO")["FAULT_TIME_OUTAGE"].max().reset_index(name="FAULT_LATEST_OUTAGE_TIME")

fault_data = fault_cnt.merge(fault_agg, on="SWNO", how="left")
fault_data = fault_data.merge(mean_td,    on="SWNO", how="left")
fault_data = fault_data.merge(latest_out,  on="SWNO", how="left")

fault_data.rename(columns={"SWNO": "DESTINATION_SWITCH_ID"}, inplace=True)

# ─────────────────────────── SCADA TABLE ──────────────────────
scada_df = pd.read_csv(SCADA_OUT, dtype=str)
scada_df = drop_all_empty_strict(scada_df)

# ─────────────────────────── MERGES ───────────────────────────
full = cable.merge(scada_df, on="SWNO", how="left")
full = full.merge(fault_data, on="DESTINATION_SWITCH_ID", how="left")

# ─────────────────────────── CLEANUP ─────────────────────────
full["Num_Faults"] = full["Num_Faults"].fillna(0).astype("int16")  # no chained inplace
for col in FAULT_COLS:
    if col in full.columns:
        full[col] = full[col].fillna("")



full.drop(columns=["DATECREATED"], errors="ignore", inplace=True)
UNWANTED = ["STD_CABLE_SIZE", "CABLE_TYPE", "RELAY_FUSE_B", "RELAY_FUSE_N", "RELAY_FUSE_Y", "RELAY_FUSE", "AFFECTED_STATION", "AFFECTED_SWITCH", "REASON_CATEGORY", "REASON_TEXT", "STATION_NAME", "FAULT_TIME_OUTAGE", "FAULT_TIME_DIFFERENCE_HOURS"]
full.drop(columns=[c for c in UNWANTED if c in full.columns], inplace=True)

full = drop_all_empty_strict(full)
if "MEASUREDLENGTH" in full.columns:
    full = full[pd.to_numeric(full["MEASUREDLENGTH"], errors="coerce") != 0]

full.reset_index(drop=True, inplace=True)

# ─────────────────────────── SAVE ────────────────────────────
full.to_csv(FINAL_OUT, index=False, float_format="%.3f")
print(f"Saved: {FINAL_OUT} | Rows: {len(full)} | Columns: {len(full.columns)}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  cable["COMMENTS"].fillna("", inplace=True)


Saved: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/22_33KV CABLE FULL DATA/SWNO_MASTER_COMBINED_FULL_FINAL3.csv | Rows: 318 | Columns: 57


REMOVE FAULT FILE BEACUSE THAT IS NOT NEEDED IN THIS FILE IN THIS WE NEEDED ONLY SCADA DATA

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Build SWNO-level master table (no Fault CSV).
• SCADA → join on `SWNO`
• COMMISSIONEDDATE → take the OLDEST date per SWNO from ht_cleaned.csv
"""
import pandas as pd, numpy as np, re
from pathlib import Path

# ─────────────────────────── CONFIG ────────────────────────────
BASE = "/media/sagark24/New Volume/MERGE CDIS"
FILE_SWNO  = Path(f"{BASE}/IPYNB_FILE/DATA_GENERATION/22_33_SWNO.csv")
FILE_CABLE = Path(f"{BASE}/2-Year-data/CLEANED_DATA/ht_cleaned.csv")
SCADA_OUT  = Path(f"{BASE}/IPYNB_FILE/DATA_GENERATION/SCADA_CYCLE_VARIATION.csv")
FINAL_OUT  = Path(f"{BASE}/IPYNB_FILE/DATA_GENERATION/22_33KV CABLE FULL DATA/FINAL_22_33KV_TILL_SCADA.csv")

# ─────────────────────────── HELPERS ───────────────────────────
def norm(x):
    return np.nan if pd.isna(x) else str(x).strip().upper().lstrip("0")

def extract_ordered_path(comment: str) -> str:
    if not isinstance(comment, str):
        return ""
    m = re.findall(r"(SWNO_\w+|JT\.NO\.\d+[A-Z]?)", comment.upper())
    return " → ".join(m) if m else comment.strip()

def drop_all_empty_strict(df: pd.DataFrame) -> pd.DataFrame:
    def empty(v):
        s = str(v).strip().lower()
        return s in {"", "0", "nan"} or pd.isna(v)
    return df.loc[~df.apply(lambda r: all(empty(x) for x in r), axis=1)].reset_index(drop=True)

def parse_date_any(s):
    """Robust date parser → pandas.Timestamp or NaT."""
    if pd.isna(s):
        return pd.NaT
    s = str(s).strip()
    if not s:
        return pd.NaT
    # First try dayfirst (common in these sheets), then fallback
    dt = pd.to_datetime(s, errors="coerce", dayfirst=True)
    if pd.isna(dt):
        dt = pd.to_datetime(s, errors="coerce", dayfirst=False)
    return dt

# ─────────────────────────── LOAD SWNO MASTER ──────────────────
swno_master = pd.read_csv(FILE_SWNO, dtype=str)
swno_master = drop_all_empty_strict(swno_master)
swno_master["SWNO"] = swno_master["SWNO"].apply(norm)
SWNO_SET = set(swno_master["SWNO"])

# ─────────────────────────── CABLE TABLE ──────────────────────
cable = pd.read_csv(FILE_CABLE, dtype=str)
cable = drop_all_empty_strict(cable)

cable.rename(columns={
    "CABEL_ID": "CABLEID",
    "CABLECOUNTUCTOR": "CABLECONDUCTORMATERIAL",
    "NO.OFCORES": "NUMBEROFCORES",
    "NEUTRAL_MATERIAL": "NEUTRALMATERIAL",
    "coments": "COMMENTS",
}, inplace=True)

cable["DESTINATION_SWITCH_ID"] = cable["DESTINATION_SWITCH_ID"].apply(norm)
cable["SOURCE_SWITCH_ID"]      = cable["SOURCE_SWITCH_ID"].apply(norm)

cable = cable[
    cable["DESTINATION_SWITCH_ID"].notna() &
    (cable["DESTINATION_SWITCH_ID"].str.strip() != "") &
    cable["DESTINATION_SWITCH_ID"].isin(SWNO_SET)
]

# COMMISSIONEDDATE fallback
if "COMMISSIONEDDATE" not in cable.columns and "DATECREATED" in cable.columns:
    cable["COMMISSIONEDDATE"] = cable["DATECREATED"]

# Parse commission dates now; keep original string, store parsed in helper col
if "COMMISSIONEDDATE" in cable.columns:
    cable["COMMISSIONEDDATE_DT"] = cable["COMMISSIONEDDATE"].apply(parse_date_any)
else:
    cable["COMMISSIONEDDATE_DT"] = pd.NaT

KEEP = [
    "DESTINATION_SWITCH_ID", "SOURCE_SWITCH_ID", "COMMENTS", "CABLETYPE", "MEASUREDLENGTH",
    "NUMBEROFCORES", "ARMOURED", "NEUTRALMATERIAL", "CABLEID", "CABLECONDUCTORMATERIAL",
    "DIVISIONCODE", "ZONECODE", "REMARKS", "SOURCE_SS", "DESTINATION_SS",
    "COMMISSIONEDDATE", "COMMISSIONEDDATE_DT"
]
cable = cable[[c for c in KEEP if c in cable.columns]].copy()

cable["COMMENTS"] = cable["COMMENTS"].fillna("")
cable["PATH"] = cable["COMMENTS"].apply(extract_ordered_path)

segment_comment_counts = (
    cable.assign(COMMENTS=cable["COMMENTS"].str.strip())
          .groupby(["SOURCE_SWITCH_ID", "DESTINATION_SWITCH_ID"], dropna=False)["COMMENTS"]
          .apply(lambda s: (s != "").sum())
          .reset_index(name="NO_OF_SEGMENT")
)

cable["MEASUREDLENGTH"] = pd.to_numeric(cable["MEASUREDLENGTH"], errors="coerce")

# Aggregate per (SOURCE,S_Dest) — note we take MIN of parsed date within this pair
agg = {
    "MEASUREDLENGTH": "sum",
    "PATH": lambda x: " -> ".join(sorted(set(x))),
    "CABLETYPE": "first", "NUMBEROFCORES": "first", "ARMOURED": "first",
    "NEUTRALMATERIAL": "first", "CABLEID": "first", "CABLECONDUCTORMATERIAL": "first",
    "DIVISIONCODE": "first", "ZONECODE": "first", "REMARKS": "first",
    "SOURCE_SS": "first", "DESTINATION_SS": "first",
    "COMMISSIONEDDATE_DT": "min",
}

cable = (cable.groupby(["SOURCE_SWITCH_ID", "DESTINATION_SWITCH_ID"], dropna=False)
               .agg(agg).reset_index())

cable = cable.merge(segment_comment_counts, on=["SOURCE_SWITCH_ID", "DESTINATION_SWITCH_ID"], how="left")

# Now compute OLDEST commission date across ALL rows sharing the same SWNO
cable["SWNO"] = cable["DESTINATION_SWITCH_ID"]
oldest_per_swno = (
    cable.groupby("SWNO", dropna=False)["COMMISSIONEDDATE_DT"]
         .min()
         .rename("COMMISSIONEDDATE_DT_OLDEST")
         .reset_index()
)

cable = cable.merge(oldest_per_swno, on="SWNO", how="left")

# Present COMMISSIONEDDATE as string (YYYY-MM-DD); blank if NaT
def fmt_date(d):
    if pd.isna(d):
        return ""
    return pd.Timestamp(d).date().isoformat()

cable["COMMISSIONEDDATE"] = cable["COMMISSIONEDDATE_DT_OLDEST"].apply(fmt_date)

# ─────────────────────────── SCADA TABLE ──────────────────────
scada_df = pd.read_csv(SCADA_OUT, dtype=str)
scada_df = drop_all_empty_strict(scada_df)

# ─────────────────────────── MERGE (no faults) ─────────────────
full = cable.merge(scada_df, on="SWNO", how="left")

# ─────────────────────────── CLEANUP ─────────────────────────
# Drop helper datetime cols; keep final COMMISSIONEDDATE (oldest per SWNO)
full.drop(columns=["DATECREATED", "COMMISSIONEDDATE_DT", "COMMISSIONEDDATE_DT_OLDEST"],
          errors="ignore", inplace=True)

UNWANTED = [
    "STD_CABLE_SIZE", "CABLE_TYPE",
    "RELAY_FUSE_B", "RELAY_FUSE_N", "RELAY_FUSE_Y", "RELAY_FUSE",
    "AFFECTED_STATION", "AFFECTED_SWITCH",
    "REASON_CATEGORY", "REASON_TEXT",
]
full.drop(columns=[c for c in UNWANTED if c in full.columns], inplace=True)

full = drop_all_empty_strict(full)
if "MEASUREDLENGTH" in full.columns:
    full = full[pd.to_numeric(full["MEASUREDLENGTH"], errors="coerce") != 0]

full.reset_index(drop=True, inplace=True)

# ─────────────────────────── SAVE ────────────────────────────
full.to_csv(FINAL_OUT, index=False, float_format="%.3f")
print(f"Saved: {FINAL_OUT} | Rows: {len(full)} | Columns: {len(full.columns)}")


  dt = pd.to_datetime(s, errors="coerce", dayfirst=True)


Saved: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/22_33KV CABLE FULL DATA/FINAL_22_33KV_TILL_SCADA.csv | Rows: 312 | Columns: 43
