USE EDA ON THE DATA

In [1]:
import pandas as pd
import glob
import os
import gc
from concurrent.futures import ProcessPoolExecutor

folders = [
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200/200',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200-400/200-400',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/400-600/400-600',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/600-759/600-759',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/SCADA_JAN_24_TO_APR_25'
]

def process_file(file):
    voltage_counts = {}
    min_date, max_date = None, None
    year_month_counts = {}
    voltage_month_counts = {}
    unique_para = set()   # <--- Added for PARA column
    try:
        df = pd.read_csv(
            file,
            usecols=['SYSTIME', 'VOLTAGE', 'PARA'],  # <--- Added PARA column
            dtype={'VOLTAGE': 'category', 'SYSTIME': str, 'PARA': str},
            low_memory=True,
            memory_map=True
        )
        # Unique PARA values
        unique_para.update(df['PARA'].dropna().unique())
        # Normalize VOLTAGE
        df['VOLTAGE'] = df['VOLTAGE'].str.upper().str.strip()
        df['VOLTAGE'] = df['VOLTAGE'].apply(
            lambda v: f"{v}KV" if (pd.notna(v) and v in ['11','22','33']) else v
        )
        df = df.dropna(subset=['VOLTAGE'])
        # Parse SYSTIME and drop missing
        df['SYSTIME'] = pd.to_datetime(df['SYSTIME'], errors='coerce')
        df = df.dropna(subset=['SYSTIME'])
        file_min, file_max = None, None
        if not df.empty:
            file_min, file_max = df['SYSTIME'].min(), df['SYSTIME'].max()
        # Count voltages
        for v, count in df['VOLTAGE'].value_counts().items():
            voltage_counts[v] = count
        # Year/month
        df['YEAR'] = df['SYSTIME'].dt.year.astype('int16')
        df['MONTH'] = df['SYSTIME'].dt.month.astype('int8')
        for y, m in zip(df['YEAR'], df['MONTH']):
            year_month_counts[(y, m)] = year_month_counts.get((y, m), 0) + 1
        # Voltage/month combo
        for v, m in zip(df['VOLTAGE'], df['MONTH']):
            voltage_month_counts[(v, m)] = voltage_month_counts.get((v, m), 0) + 1
        del df
        gc.collect()
        return voltage_counts, file_min, file_max, year_month_counts, voltage_month_counts, unique_para
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return {}, None, None, {}, {}, set()

# Gather all files
all_files = []
for folder in folders:
    all_files.extend(glob.glob(os.path.join(folder, '*.csv')))
print(f"Found {len(all_files)} files to process.")

results = []
with ProcessPoolExecutor(max_workers=10) as executor:
    for result in executor.map(process_file, all_files):
        results.append(result)

# Aggregate all results
voltage_counts = {}
min_date, max_date = None, None
year_month_counts = {}
voltage_month_counts = {}
all_para_values = set()   # <--- Store all PARA unique values

for vcounts, file_min, file_max, ymc, vmc, unique_para in results:
    for v, c in vcounts.items():
        voltage_counts[v] = voltage_counts.get(v, 0) + c
    if file_min is not None:
        min_date = file_min if min_date is None else min(file_min, min_date)
    if file_max is not None:
        max_date = file_max if max_date is None else max(file_max, max_date)
    for k, c in ymc.items():
        year_month_counts[k] = year_month_counts.get(k, 0) + c
    for k, c in vmc.items():
        voltage_month_counts[k] = voltage_month_counts.get(k, 0) + c
    all_para_values.update(unique_para)   # <--- Add unique PARA values

# Print EDA results
print("\nUnique VOLTAGE values and counts:")
for v, c in voltage_counts.items():
    print(f"  {v}: {c}")

print(f"\nDate range: {min_date} to {max_date}")

print("\nRecord counts per YEAR-MONTH (first 10):")
for (y, m), c in list(year_month_counts.items())[:10]:
    print(f"  {y}-{m:02}: {c}")

print("\nRecord counts per VOLTAGE and MONTH (first 10):")
for (v, m), c in list(voltage_month_counts.items())[:10]:
    print(f"  {v}, month {m:02}: {c}")

print("\nMonths missing for each VOLTAGE (among all files):")
all_months = set(range(1, 13))
voltages = list(voltage_counts.keys())
for v in voltages:
    months_present = set([m for (volt, m) in voltage_month_counts if volt == v])
    missing = sorted(all_months - months_present)
    if missing:
        print(f"{v}: missing months {missing}")
    else:
        print(f"{v}: no missing months")

# PARA column unique values and count
print(f"\nUnique PARA values count: {len(all_para_values)}")
print("Some PARA values:")
for val in list(all_para_values)[:20]:
    print("  ", val)

gc.collect()


Found 763 files to process.


  df['SYSTIME'] = pd.to_datetime(df['SYSTIME'], errors='coerce')



Unique VOLTAGE values and counts:
  11KV: 252537820
  33KV: 168273218
  22KV: 28334840
  14: 22442

Date range: 2024-01-01 00:00:00+00:00 to 2025-04-18 23:45:00+00:00

Record counts per YEAR-MONTH (first 10):
  2025-03: 30544286
  2024-03: 46339573
  2025-02: 26543330
  2024-04: 42862642
  2024-08: 3060444
  2024-10: 27462412
  2024-06: 37997242
  2024-05: 43640867
  2025-04: 2901799
  2024-07: 5279796

Record counts per VOLTAGE and MONTH (first 10):
  11KV, month 03: 42827054
  33KV, month 03: 28965273
  22KV, month 03: 5088472
  14, month 03: 3060
  11KV, month 02: 38034644
  33KV, month 02: 26914674
  22KV, month 02: 4643755
  11KV, month 04: 19486669
  22KV, month 04: 3528327
  33KV, month 04: 22746585

Months missing for each VOLTAGE (among all files):
11KV: no missing months
33KV: no missing months
22KV: no missing months
14: missing months [9, 10, 11, 12]

Unique PARA values count: 6
Some PARA values:
   I
   Temp
   MVA
   PF
   KW
   V


0

In [1]:
import pandas as pd
import glob
import os
import re
from collections import Counter
from concurrent.futures import ProcessPoolExecutor

folders = [
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200/200',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200-400/200-400',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/400-600/400-600',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/600-759/600-759',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/SCADA_JAN_24_TO_APR_25'
]

def guess_format(dt_str):
    if re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", dt_str):
        return "%Y-%m-%d %H:%M:%S"
    if re.match(r"\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}", dt_str):
        return "%Y/%m/%d %H:%M:%S"
    if re.match(r"\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}", dt_str):
        return "%d-%m-%Y %H:%M:%S"
    if re.match(r"\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2}", dt_str):
        return "%d/%m/%Y %H:%M:%S"
    if re.match(r"\d{4}-\d{2}-\d{2}", dt_str):
        return "%Y-%m-%d"
    if re.match(r"\d{2}/\d{2}/\d{4}", dt_str):
        return "%d/%m/%Y"
    if re.match(r"\d{2}-\d{2}-\d{4}", dt_str):
        return "%d-%m-%Y"
    return "unknown"

def process_file(file):
    try:
        df = pd.read_csv(file, usecols=['SYSTIME'], dtype=str, low_memory=True, memory_map=True)
        systimes = df['SYSTIME'].dropna()
        format_counter = Counter()
        total = 0
        unknown = 0
        unknowns = set()
        for val in systimes:
            fmt = guess_format(val)
            format_counter[fmt] += 1
            total += 1
            if fmt == "unknown":
                unknown += 1
                unknowns.add(val)
        return format_counter, total, unknown, list(unknowns)[:20]
    except Exception as e:
        print(f"Error reading {file}: {e}")
        return Counter(), 0, 0, []

# Gather all files
all_files = []
for folder in folders:
    all_files.extend(glob.glob(os.path.join(folder, '*.csv')))
print(f"Found {len(all_files)} files to process.")

overall_format_counter = Counter()
total_systime = 0
total_unknown = 0
all_unknowns = set()

# Multiprocessing!
with ProcessPoolExecutor(max_workers=10) as executor:
    for fmt_counter, total, unknown, unknown_list in executor.map(process_file, all_files):
        overall_format_counter.update(fmt_counter)
        total_systime += total
        total_unknown += unknown
        all_unknowns.update(unknown_list)

print("\nOverall detected SYSTIME formats:")
for fmt, count in overall_format_counter.most_common():
    print(f"  {fmt}: {count}")

print(f"\nTotal SYSTIME entries: {total_systime}")
print(f"Unknown SYSTIME formats: {total_unknown} ({100*total_unknown/total_systime:.2f}%)")

print("\nSample unknown SYSTIME values:")
for v in list(all_unknowns)[:20]:
    print(" ", v)


Found 763 files to process.

Overall detected SYSTIME formats:
  %Y-%m-%d %H:%M:%S: 449382883

Total SYSTIME entries: 449382883
Unknown SYSTIME formats: 0 (0.00%)

Sample unknown SYSTIME values:


FOR THE 11KV VOLTAGE VALUE

In [1]:
import pandas as pd
import numpy as np
import glob
import os
from concurrent.futures import ProcessPoolExecutor
import gc

folders = [
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200/200',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200-400/200-400',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/400-600/400-600',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/600-759/600-759',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/SCADA_JAN_24_TO_APR_25'
]
output_csv = '/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/monthly_SWNO_matrix_11KV.csv'

def process_file(file):
    try:
        dtype_dict = {
            'SYSTIME': str,
            'SWNO': str,
            'VOLTAGE': 'category',
            'PARA': 'category',
            'VALUE': 'float32'
        }
        df = pd.read_csv(
            file,
            usecols=['SYSTIME', 'SWNO', 'VOLTAGE', 'PARA', 'VALUE'],
            dtype=dtype_dict,
            low_memory=True,
            memory_map=True
        )
        # Clean columns
        df['VOLTAGE'] = df['VOLTAGE'].str.upper().str.strip()
        df['PARA'] = df['PARA'].str.upper().str.strip()
        df['SWNO'] = df['SWNO'].astype(str).str.strip()

        # Map all voltage '11' and '11KV' to '11KV'
        df.loc[df['VOLTAGE'].isin(['11', '11KV']), 'VOLTAGE'] = '11KV'

        # Filter
        df = df[(df['VOLTAGE'] == '11KV') & (df['PARA'] == 'I')]
        if df.empty:
            return None
        df['VALUE'] = pd.to_numeric(df['VALUE'], errors='coerce')
        df['MONTH'] = pd.to_datetime(df['SYSTIME'], errors='coerce').dt.month.astype('Int8')
        df = df.dropna(subset=['MONTH', 'SWNO', 'VALUE'])
        grouped = df.groupby(['SWNO', 'MONTH'], observed=True)['VALUE'].mean().reset_index()
        del df
        gc.collect()
        return grouped
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None

# Gather all files
all_files = []
for folder in folders:
    all_files.extend(glob.glob(os.path.join(folder, '*.csv')))
print(f"Found {len(all_files)} files to process.")

results = []
with ProcessPoolExecutor(max_workers=10) as executor:
    for grouped in executor.map(process_file, all_files):
        if grouped is not None and not grouped.empty:
            results.append(grouped)
            del grouped
            gc.collect()

if results:
    final_df = pd.concat(results, ignore_index=True)
    del results
    gc.collect()
    # Group by SWNO and MONTH again (all files together)
    grouped = final_df.groupby(['SWNO', 'MONTH'], observed=True)['VALUE'].mean().reset_index()
    del final_df
    gc.collect()
    # Pivot to get SWNO x 12 months
    pivot = grouped.pivot(index='SWNO', columns='MONTH', values='VALUE')
    # Ensure all 12 months are present as columns
    for i in range(1, 13):
        if i not in pivot.columns:
            pivot[i] = np.nan
    pivot = pivot[[i for i in range(1, 13)]]
    pivot.columns = [f'Month_{i:02}' for i in range(1, 13)]
    pivot.index.name = 'SWNO'
    pivot.to_csv(output_csv, float_format='%.3f')
    print(f"Saved matrix to: {output_csv}")
    del pivot
    gc.collect()
else:
    print("No data processed.")


Found 763 files to process.


  df['MONTH'] = pd.to_datetime(df['SYSTIME'], errors='coerce').dt.month.astype('Int8')


Saved matrix to: /media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/monthly_SWNO_matrix_11KV.csv


FOR 22 AND 33 KV

In [None]:
import pandas as pd
import numpy as np
import glob
import os
from concurrent.futures import ProcessPoolExecutor
import gc

folders = [
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200/200',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200-400/200-400',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/400-600/400-600',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/600-759/600-759',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/SCADA_JAN_24_TO_APR_25'
]
output_csv = '/media/sagark24/New Volume/MERGE CDIS/DATA_GENERATION/monthly_SWNO_matrix_22KV_33KV.csv'

def normalize_voltage(v):
    v = str(v).upper().replace(" ", "")
    if v in ['22', '22KV']:
        return '22KV'
    if v in ['33', '33KV']:
        return '33KV'
    return v

def process_file(file):
    try:
        dtype_dict = {
            'SYSTIME': str,
            'SWNO': str,
            'VOLTAGE': 'category',
            'PARA': 'category',
            'VALUE': 'float32'
        }
        df = pd.read_csv(
            file,
            usecols=['SYSTIME', 'SWNO', 'VOLTAGE', 'PARA', 'VALUE'],
            dtype=dtype_dict,
            low_memory=True,
            memory_map=True
        )
        # Clean and normalize
        df['VOLTAGE'] = df['VOLTAGE'].map(normalize_voltage)
        df['PARA'] = df['PARA'].str.upper().str.strip()
        df['SWNO'] = df['SWNO'].astype(str).str.strip()
        # Only for 22KV and 33KV and PARA I
        df = df[df['VOLTAGE'].isin(['22KV', '33KV']) & (df['PARA'] == 'I')]
        if df.empty:
            return None
        df['VALUE'] = pd.to_numeric(df['VALUE'], errors='coerce')
        df['MONTH'] = pd.to_datetime(df['SYSTIME'], errors='coerce').dt.month.astype('Int8')
        df = df.dropna(subset=['MONTH', 'SWNO', 'VALUE', 'VOLTAGE'])
        grouped = df.groupby(['SWNO', 'VOLTAGE', 'MONTH'], observed=True)['VALUE'].mean().reset_index()
        del df
        gc.collect()
        return grouped
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None

# Gather all files
all_files = []
for folder in folders:
    all_files.extend(glob.glob(os.path.join(folder, '*.csv')))
print(f"Found {len(all_files)} files to process.")

results = []
with ProcessPoolExecutor(max_workers=10) as executor:
    for grouped in executor.map(process_file, all_files):
        if grouped is not None and not grouped.empty:
            results.append(grouped)
            del grouped
            gc.collect()

if results:
    final_df = pd.concat(results, ignore_index=True)
    del results
    gc.collect()
    grouped = final_df.groupby(['SWNO', 'VOLTAGE', 'MONTH'], observed=True)['VALUE'].mean().reset_index()
    del final_df
    gc.collect()
    pivot = grouped.pivot(index=['SWNO', 'VOLTAGE'], columns='MONTH', values='VALUE')
    # Ensure all months are present
    for i in range(1, 13):
        if i not in pivot.columns:
            pivot[i] = np.nan
    pivot = pivot[[i for i in range(1, 13)]]
    pivot.columns = [f'Month_{i:02}' for i in range(1, 13)]
    pivot.reset_index(inplace=True)
    pivot.to_csv(output_csv, float_format='%.3f', index=False)
    print(f"Saved matrix to: {output_csv}")
    del pivot
    gc.collect()
else:
    print("No data processed.")


Found 763 files to process.


  df['MONTH'] = pd.to_datetime(df['SYSTIME'], errors='coerce').dt.month.astype('Int8')


Saved matrix to: monthly_SWNO_matrix_22KV_33KV.csv


EVERY DAY DATA AVERAGE FROM SCADA

In [2]:
import pandas as pd
import numpy as np
import glob
import os
from concurrent.futures import ProcessPoolExecutor
import gc

folders = [
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200/200',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200-400/200-400',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/400-600/400-600',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/600-759/600-759',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/SCADA_JAN_24_TO_APR_25'
]
output_csv_template = '/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/11_KV_FEEDER_ALL DATA/daily_SWNO_matrix_11KV_YEAR{}.csv'

def process_file(file):
    try:
        dtype_dict = {
            'SYSTIME': str,
            'VOLTAGE': str,
            'PARA': str,
            'VALUE': 'float32',
            # SWNO as string, if present in file
            'SWNO': str,
        }
        usecols = ['SYSTIME', 'SWNO', 'VOLTAGE', 'PARA', 'VALUE']
        df = pd.read_csv(file, usecols=usecols, dtype=dtype_dict, low_memory=True, memory_map=True)
        df['VOLTAGE'] = df['VOLTAGE'].str.upper().str.strip()
        df['PARA'] = df['PARA'].str.upper().str.strip()
        df['SWNO'] = df['SWNO'].astype(str).str.strip()

        # Standardize voltage
        df.loc[df['VOLTAGE'].isin(['11', '11KV']), 'VOLTAGE'] = '11KV'
        df = df[(df['VOLTAGE'] == '11KV') & (df['PARA'] == 'I')]
        if df.empty:
            return None

        # Convert to datetime, extract YEAR and DAYOFYEAR (1–366)
        df['SYSTIME'] = pd.to_datetime(df['SYSTIME'], errors='coerce')
        df = df.dropna(subset=['SYSTIME', 'SWNO', 'VALUE'])
        df['YEAR'] = df['SYSTIME'].dt.year.astype('int32')
        df['DAY'] = df['SYSTIME'].dt.dayofyear.astype('int16')
        # Ignore day 361+ (for non-leap years), only keep day 1–360
        df = df[(df['DAY'] >= 1) & (df['DAY'] <= 360)]

        # Make value positive
        df['VALUE'] = df['VALUE'].abs()

        grouped = df.groupby(['YEAR', 'SWNO', 'DAY'], observed=True)['VALUE'].mean().reset_index()
        del df
        gc.collect()
        return grouped
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None

# Gather all files
all_files = []
for folder in folders:
    all_files.extend(glob.glob(os.path.join(folder, '*.csv')))
print(f"Found {len(all_files)} files to process.")

results = []
with ProcessPoolExecutor(max_workers=10) as executor:
    for grouped in executor.map(process_file, all_files):
        if grouped is not None and not grouped.empty:
            results.append(grouped)
            del grouped
            gc.collect()

if results:
    final_df = pd.concat(results, ignore_index=True)
    del results
    gc.collect()

    # Group again to handle duplicates across files: for each YEAR, SWNO, DAY, take mean
    grouped = final_df.groupby(['YEAR', 'SWNO', 'DAY'], observed=True)['VALUE'].mean().reset_index()
    del final_df
    gc.collect()

    # Now for each YEAR, pivot to SWNO x 360 days
    for year, group in grouped.groupby('YEAR'):
        pivot = group.pivot(index='SWNO', columns='DAY', values='VALUE')
        # Ensure all 360 days are present as columns
        for d in range(1, 361):
            if d not in pivot.columns:
                pivot[d] = np.nan
        pivot = pivot[[d for d in range(1, 361)]]
        pivot.columns = [f'Day_{d:03d}' for d in range(1, 361)]
        pivot.index.name = 'SWNO'
        output_csv = output_csv_template.format(year)
        pivot.to_csv(output_csv, float_format='%.3f')
        print(f"Saved {pivot.shape[0]} SWNO × {pivot.shape[1]} days for year {year} to: {output_csv}")
        del pivot
        gc.collect()
else:
    print("No data processed.")



Found 763 files to process.


  df['SYSTIME'] = pd.to_datetime(df['SYSTIME'], errors='coerce')


Saved 1876 SWNO × 360 days for year 2024 to: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/11_KV_FEEDER_ALL DATA/daily_SWNO_matrix_11KV_YEAR2024.csv


  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan
  pivot[d] = np.nan


Saved 1745 SWNO × 360 days for year 2025 to: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/11_KV_FEEDER_ALL DATA/daily_SWNO_matrix_11KV_YEAR2025.csv
