In [3]:
#!/usr/bin/env python3
# monthly_avg_loader.py  –  add MONTHLY_AVG_LOAD once per feeder
# -------------------------------------------------------------
#  • Reads consolidated feeder file (with FEEDER_ID column)
#  • Scans all CSVs in data_folders
#      – UTF-8 → Latin-1 fallback
#      – skips rows with broken quotes, etc.
#      – keeps only PARA == 'I'
#  • SWNO == FEEDER_ID match
#  • Averages VALUE month-wise → overall monthly average
#  • Writes that number in first row for each feeder
#  • Saves final_with_monthly_avg.csv

from __future__ import annotations
import pandas as pd
from pathlib import Path
from glob import glob

# ── paths ────────────────────────────────────────────────────────
CONSOLIDATED_CSV = "final_two_column_with_rank_11_withoutDT.csv"

DATA_FOLDERS = [
    "/media/sagarkumar/New Volume/SAGAR/200/200",
    "/media/sagarkumar/New Volume/SAGAR/200-400/200-400",
    "/media/sagarkumar/New Volume/SAGAR/400-600/400-600",
    "/media/sagarkumar/New Volume/SAGAR/600-759/600-759",
]

OUTPUT_CSV = "final_with_monthly_avg.csv"

# ── 1. consolidated feeder table ────────────────────────────────
feeder_df = pd.read_csv(CONSOLIDATED_CSV)
feeder_df["FEEDER_ID"] = feeder_df["FEEDER_ID"].astype(str)

# ── 2. load all CSVs with encoding + parser fallback ─────────────
needed = {"SWNO", "PARA", "VALUE", "SYSTIME"}
frames = []

for folder in DATA_FOLDERS:
    for csv_path in glob(str(Path(folder) / "*.csv")):
        loaded = None
        for enc in ("utf-8", "latin-1"):
            for engine in ("c", "python"):
                try:
                    print(f"Loading {csv_path} with encoding {enc} and engine {engine}...")
                    loaded = pd.read_csv(
                        csv_path,
                        low_memory=False,
                        usecols=lambda c: c.upper() in needed,
                        encoding=enc,
                       
                        on_bad_lines="skip",   # drop malformed rows
                    )
                    break
                except (UnicodeDecodeError, pd.errors.ParserError):
                    continue
            if loaded is not None:
                break
        if loaded is None:
            print(f"⚠️  skipped (decode/parse errors): {csv_path}")
            continue
        if "PARA" not in loaded.columns:
            continue

        mask = loaded["PARA"].str.upper() == "I"
        frames.append(loaded.loc[mask, ["SWNO", "VALUE", "SYSTIME"]])

if not frames:
    raise RuntimeError("No usable load rows (PARA == 'I') found in folders.")

load = pd.concat(frames, ignore_index=True)
load["SWNO"] = load["SWNO"].astype(str)
load["SYSTIME"] = pd.to_datetime(load["SYSTIME"], errors="coerce", utc=True)
load = load.dropna(subset=["SYSTIME"])

# ── 3. monthly means → overall average per feeder ───────────────
load["YEAR_MONTH"] = load["SYSTIME"].dt.to_period("M")

month_means = (
    load.groupby(["SWNO", "YEAR_MONTH"])["VALUE"]
        .mean()
        .reset_index()
)

overall_avg = (
    month_means.groupby("SWNO")["VALUE"]
        .mean()
        .reset_index()
        .rename(columns={"SWNO": "FEEDER_ID",
                         "VALUE": "MONTHLY_AVG_LOAD"})
)

overall_avg["FEEDER_ID"] = overall_avg["FEEDER_ID"].astype(str)

# ── 4. merge → write once per feeder ────────────────────────────
merged = feeder_df.merge(overall_avg, how="left", on="FEEDER_ID")

merged["MONTHLY_AVG_LOAD"] = (
    merged.groupby("FEEDER_ID")["MONTHLY_AVG_LOAD"]
          .transform(lambda s: [s.iat[0]] + [pd.NA] * (len(s) - 1))
)

# ── 5. export ────────────────────────────────────────────────────
merged.to_csv(OUTPUT_CSV, index=False)
print(f"Saved {len(merged):,} rows → {OUTPUT_CSV}")


Loading /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000182.csv with encoding utf-8 and engine c...
Loading /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000183.csv with encoding utf-8 and engine c...
Loading /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000184.csv with encoding utf-8 and engine c...
Loading /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000185.csv with encoding utf-8 and engine c...
Loading /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000186.csv with encoding utf-8 and engine c...
Loading /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000187.csv with encoding utf-8 and engine c...
Loading /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000188.csv with encoding utf-8 and engine c...
Loading /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000189.csv with encoding utf-8 and engine c...
Loading /media/sagarkumar/New Volume/SAG

: 

In [None]:
import pandas as pd
from pathlib import Path
import multiprocessing as mp

# Update these paths
feeder_file = "/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/feeder_ids_list_11.csv"  # This is your input file with FEEDERID column
output_file = "scada_dara_matched_with_feeder.csv"

data_folders = [
    "/media/sagarkumar/New Volume/SAGAR/200/200",
    "/media/sagarkumar/New Volume/SAGAR/200-400/200-400",
    "/media/sagarkumar/New Volume/SAGAR/400-600/400-600",
    "/media/sagarkumar/New Volume/SAGAR/600-759/600-759",
]

# Step 1: Load FEEDERIDs as a set of strings (for fast matching)
feeder_ids = pd.read_csv(feeder_file, dtype=str)['FEEDER'].dropna().astype(str).unique().tolist()
feeder_ids_set = set(feeder_ids)

def process_file(file):
    try:
        matches = []
        for chunk in pd.read_csv(file, chunksize=200_000, dtype=str, encoding='utf-8', engine='python', on_bad_lines='skip'):
            filtered = chunk[chunk['SWNO'].astype(str).isin(feeder_ids_set)]
            if not filtered.empty:
                matches.append(filtered)
        if matches:
            return pd.concat(matches, ignore_index=True)
    except Exception as e:
        print(f"{file} skipped: {e}")
    return None

if __name__ == '__main__':
    # Step 2: Collect all csv files
    all_files = []
    for folder in data_folders:
        all_files.extend(Path(folder).glob("*.csv"))

    # Step 3: Multiprocessing for fast parallel reads
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(process_file, all_files)

    # Step 4: Combine results, add FEEDERID column, and sort
    dfs = [df for df in results if df is not None and not df.empty]
    if dfs:
        final_df = pd.concat(dfs, ignore_index=True)
        # Merge to add the FEEDERID column (optional, if you want to keep the mapping)
        final_df = final_df.merge(
            pd.DataFrame({'FEEDERID': feeder_ids}),
            left_on='SWNO', right_on='FEEDERID', how='left'
        )
        final_df = final_df.sort_values('FEEDERID')
        final_df.to_csv(output_file, index=False)
        print(f"Matched and sorted rows saved to {output_file}")
    else:
        print("No matches found.")


/media/sagarkumar/New Volume/SAGAR/200-400/200-400/2025-05-07_SCADA000000000231.csv skipped: line contains NUL
/media/sagarkumar/New Volume/SAGAR/200-400/200-400/2025-05-07_SCADA000000000233.csv skipped: line contains NUL
/media/sagarkumar/New Volume/SAGAR/400-600/400-600/2025-05-07_SCADA000000000410.csv skipped: 'utf-8' codec can't decode byte 0xe6 in position 4099: invalid continuation byte


free(): invalid pointer


/media/sagarkumar/New Volume/SAGAR/400-600/400-600/2025-05-07_SCADA000000000414.csv skipped: 'utf-8' codec can't decode byte 0xc8 in position 4099: invalid continuation byte
/media/sagarkumar/New Volume/SAGAR/400-600/400-600/2025-05-07_SCADA000000000415.csv skipped: 'utf-8' codec can't decode byte 0xe9 in position 3: invalid continuation byte
