In [1]:
import pandas as pd
import glob
import os
import gc
from concurrent.futures import ProcessPoolExecutor

# Folders containing your data
folders = [
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200/200',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200-400/200-400',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/400-600/400-600',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/600-759/600-759',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/SCADA_JAN_24_TO_APR_25'
]
output_path = '/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/average_by_swno_voltage_ALL.csv'

# Keys identifying one averaged value: switch number, voltage class, calendar month.
GROUP_KEYS = ['SWNO', 'VOLTAGE', 'MONTH_NUM']
# Voltage classes kept by this cell (matched after upper-casing and stripping).
TARGET_VOLTAGES = ['22KV', '33KV']
# Number of worker processes used to read the CSV files in parallel.
MAX_WORKERS = 9


def process_file(file):
    """Reduce one CSV file to partial sums per SWNO / VOLTAGE / MONTH_NUM.

    Only rows whose VOLTAGE is in ``TARGET_VOLTAGES`` and whose PARA is ``'I'``
    (both compared after upper-casing and stripping) are kept. Rows whose
    SYSTIME cannot be parsed are dropped; VALUE entries that are not numeric
    become NaN and are excluded from both the sum and the count.

    MONTH_NUM is the calendar month only (1-12), so the same month of
    different years is pooled together.

    Parameters
    ----------
    file : str
        Path of the CSV file. It must contain the columns SYSTIME, SWNO,
        VOLTAGE, PARA and VALUE; any other columns are ignored.

    Returns
    -------
    pandas.DataFrame or None
        One row per group with columns SWNO, VOLTAGE, MONTH_NUM, VALUE_SUM
        and VALUE_COUNT. Sum and count are returned instead of a mean so that
        results from several files can be combined into a correct overall
        mean. ``None`` is returned when the file could not be processed; the
        error is printed rather than raised so one bad file does not abort
        the whole run.
    """
    try:
        dtype_dict = {
            'SYSTIME': str,
            'SWNO': str,
            'VOLTAGE': str,
            'PARA': str,
            'VALUE': str  # Always as string, convert to float later
        }
        df = pd.read_csv(
            file,
            usecols=['SYSTIME', 'SWNO', 'VOLTAGE', 'PARA', 'VALUE'],
            dtype=dtype_dict,
            low_memory=False
        )
        df['SYSTIME'] = pd.to_datetime(df['SYSTIME'], errors='coerce')
        df = df.dropna(subset=['SYSTIME'])
        df['MONTH_NUM'] = df['SYSTIME'].dt.month
        df['VOLTAGE'] = df['VOLTAGE'].str.upper().str.strip()
        df['PARA'] = df['PARA'].str.upper().str.strip()
        df['SWNO'] = df['SWNO'].str.strip()  # Remove spaces in SWNO
        # Convert VALUE to float, errors become NaN
        df['VALUE'] = pd.to_numeric(df['VALUE'], errors='coerce')
        filtered = df[
            df['VOLTAGE'].isin(TARGET_VOLTAGES) &
            (df['PARA'] == 'I')
        ]
        # 'count' ignores NaN, so VALUE_SUM / VALUE_COUNT is the mean of the
        # valid readings only (same NaN handling as a plain .mean()).
        grouped = filtered.groupby(GROUP_KEYS, as_index=False).agg(
            VALUE_SUM=('VALUE', 'sum'),
            VALUE_COUNT=('VALUE', 'count'),
        )
        del df, filtered
        gc.collect()
        return grouped
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None


def merge_partial_means(partials):
    """Combine per-file partial sums into one true mean per group.

    Averaging per-file means would give every file the same weight no matter
    how many readings it contributed. Summing the sums and the counts first
    and dividing once weights every reading equally.

    Parameters
    ----------
    partials : list of pandas.DataFrame
        Non-empty frames as returned by ``process_file``.

    Returns
    -------
    pandas.DataFrame
        Columns SWNO, VOLTAGE, MONTH_NUM and VALUE (the mean). Groups with no
        valid reading at all get NaN.
    """
    combined = pd.concat(partials, ignore_index=True)
    totals = combined.groupby(GROUP_KEYS, as_index=False)[['VALUE_SUM', 'VALUE_COUNT']].sum()
    # A group with zero valid readings has sum 0.0 and count 0 -> 0/0 -> NaN.
    totals['VALUE'] = totals['VALUE_SUM'] / totals['VALUE_COUNT']
    return totals[GROUP_KEYS + ['VALUE']]


def build_month_pivot(monthly_means):
    """Reshape long monthly means into one row per SWNO / VOLTAGE.

    Parameters
    ----------
    monthly_means : pandas.DataFrame
        Frame with columns SWNO, VOLTAGE, MONTH_NUM and VALUE, one row per
        group (as produced by ``merge_partial_means``).

    Returns
    -------
    pandas.DataFrame
        Columns SWNO, VOLTAGE, followed by one column per month present in the
        data, labelled with the month number as a string.
    """
    wide = monthly_means.pivot_table(
        index=['SWNO', 'VOLTAGE'],
        columns='MONTH_NUM',
        values='VALUE',
        aggfunc='mean'
    ).reset_index()
    # Clean up column names: drop the axis label and use string month labels.
    wide.columns.name = None
    month_map = {i: str(i) for i in range(1, 13)}
    # The pivot index is already unique per SWNO/VOLTAGE, so no de-duplication
    # step is needed afterwards.
    return wide.rename(columns=month_map)


# Gather all files (sorted so every run processes them in the same order)
all_files = []
for folder in folders:
    all_files.extend(glob.glob(os.path.join(folder, '*.csv')))
all_files.sort()
print(f"Found {len(all_files)} files to process.")

# Process the files in parallel using MAX_WORKERS worker processes
results = []
if all_files:
    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for grouped in executor.map(process_file, all_files):
            if grouped is not None and not grouped.empty:
                results.append(grouped)

if results:
    # One row per SWNO/VOLTAGE/MONTH_NUM, weighted by the number of readings
    final_result = merge_partial_means(results)
    # Pivot table to have months 1..12 as columns
    pivot = build_month_pivot(final_result)
    pivot.to_csv(output_path, index=False)
    print(f"All done! Result saved to {output_path}")
else:
    print("No data processed.")


Found 763 files to process.


  df['SYSTIME'] = pd.to_datetime(df['SYSTIME'], errors='coerce')


All done! Result saved to /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/average_by_swno_voltage_ALL.csv


In [1]:
import pandas as pd
import glob
import os
import gc
from concurrent.futures import ProcessPoolExecutor

folders = [
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200/200',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200-400/200-400',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/400-600/400-600',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/600-759/600-759',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/SCADA_JAN_24_TO_APR_25'
]
output_path = '/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/average_by_swno_voltage_11kv.csv'

# Keys identifying one averaged value: switch number, voltage class, calendar month.
GROUP_KEYS = ['SWNO', 'VOLTAGE', 'MONTH_NUM']
# Voltage classes kept by this cell (matched after upper-casing and stripping).
TARGET_VOLTAGES = ['11KV']
# Number of worker processes used to read the CSV files in parallel.
MAX_WORKERS = 9


def process_file(file):
    """Reduce one CSV file to partial sums per SWNO / VOLTAGE / MONTH_NUM.

    Only rows whose VOLTAGE is in ``TARGET_VOLTAGES`` and whose PARA is ``'I'``
    (both compared after upper-casing and stripping) are kept. Rows whose
    SYSTIME cannot be parsed are dropped; VALUE entries that are not numeric
    become NaN and are excluded from both the sum and the count.

    MONTH_NUM is the calendar month only (1-12), so the same month of
    different years is pooled together.

    Parameters
    ----------
    file : str
        Path of the CSV file. It must contain the columns SYSTIME, SWNO,
        VOLTAGE, PARA and VALUE; any other columns are ignored.

    Returns
    -------
    pandas.DataFrame or None
        One row per group with columns SWNO, VOLTAGE, MONTH_NUM, VALUE_SUM
        and VALUE_COUNT. Sum and count are returned instead of a mean so that
        results from several files can be combined into a correct overall
        mean. ``None`` is returned when the file could not be processed; the
        error is printed rather than raised so one bad file does not abort
        the whole run.
    """
    try:
        dtype_dict = {
            'SYSTIME': str,
            'SWNO': str,
            'VOLTAGE': str,
            'PARA': str,
            'VALUE': str  # Read as text; converted to float below
        }
        df = pd.read_csv(
            file,
            usecols=['SYSTIME', 'SWNO', 'VOLTAGE', 'PARA', 'VALUE'],
            dtype=dtype_dict,
            low_memory=False
        )
        df['SYSTIME'] = pd.to_datetime(df['SYSTIME'], errors='coerce')
        df = df.dropna(subset=['SYSTIME'])
        df['MONTH_NUM'] = df['SYSTIME'].dt.month
        df['VOLTAGE'] = df['VOLTAGE'].str.upper().str.strip()
        df['PARA'] = df['PARA'].str.upper().str.strip()
        df['SWNO'] = df['SWNO'].str.strip()
        # Non-numeric VALUE entries become NaN
        df['VALUE'] = pd.to_numeric(df['VALUE'], errors='coerce')
        filtered = df[
            df['VOLTAGE'].isin(TARGET_VOLTAGES) &
            (df['PARA'] == 'I')
        ]
        # 'count' ignores NaN, so VALUE_SUM / VALUE_COUNT is the mean of the
        # valid readings only (same NaN handling as a plain .mean()).
        grouped = filtered.groupby(GROUP_KEYS, as_index=False).agg(
            VALUE_SUM=('VALUE', 'sum'),
            VALUE_COUNT=('VALUE', 'count'),
        )
        del df, filtered
        gc.collect()
        return grouped
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None


def merge_partial_means(partials):
    """Combine per-file partial sums into one true mean per group.

    Averaging per-file means would give every file the same weight no matter
    how many readings it contributed. Summing the sums and the counts first
    and dividing once weights every reading equally.

    Parameters
    ----------
    partials : list of pandas.DataFrame
        Non-empty frames as returned by ``process_file``.

    Returns
    -------
    pandas.DataFrame
        Columns SWNO, VOLTAGE, MONTH_NUM and VALUE (the mean). Groups with no
        valid reading at all get NaN.
    """
    combined = pd.concat(partials, ignore_index=True)
    totals = combined.groupby(GROUP_KEYS, as_index=False)[['VALUE_SUM', 'VALUE_COUNT']].sum()
    # A group with zero valid readings has sum 0.0 and count 0 -> 0/0 -> NaN.
    totals['VALUE'] = totals['VALUE_SUM'] / totals['VALUE_COUNT']
    return totals[GROUP_KEYS + ['VALUE']]


def build_month_pivot(monthly_means):
    """Reshape long monthly means into one row per SWNO / VOLTAGE.

    Parameters
    ----------
    monthly_means : pandas.DataFrame
        Frame with columns SWNO, VOLTAGE, MONTH_NUM and VALUE, one row per
        group (as produced by ``merge_partial_means``).

    Returns
    -------
    pandas.DataFrame
        Columns SWNO, VOLTAGE, followed by one column per month present in the
        data, labelled with the month number as a string.
    """
    wide = monthly_means.pivot_table(
        index=['SWNO', 'VOLTAGE'],
        columns='MONTH_NUM',
        values='VALUE',
        aggfunc='mean'
    ).reset_index()
    # Drop the axis label and use string month labels.
    wide.columns.name = None
    month_map = {i: str(i) for i in range(1, 13)}
    # The pivot index is already unique per SWNO/VOLTAGE, so no de-duplication
    # step is needed afterwards.
    return wide.rename(columns=month_map)


# Gather all files (sorted so every run processes them in the same order)
all_files = []
for folder in folders:
    all_files.extend(glob.glob(os.path.join(folder, '*.csv')))
all_files.sort()
print(f"Found {len(all_files)} files to process.")

# Process the files in parallel using MAX_WORKERS worker processes
results = []
if all_files:
    with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        for grouped in executor.map(process_file, all_files):
            if grouped is not None and not grouped.empty:
                results.append(grouped)

if results:
    # One row per SWNO/VOLTAGE/MONTH_NUM, weighted by the number of readings
    final_result = merge_partial_means(results)
    pivot = build_month_pivot(final_result)
    pivot.to_csv(output_path, index=False)
    print(f"All done! Result saved to {output_path}")
else:
    print("No data processed.")


Found 763 files to process.


  df['SYSTIME'] = pd.to_datetime(df['SYSTIME'], errors='coerce')


All done! Result saved to /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/average_by_swno_voltage_11kv.csv


In [2]:
import pandas as pd
import glob
import os
import gc
from concurrent.futures import ProcessPoolExecutor


folders = [
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/



200/200',    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/200-400/200-400',
    '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/400-600/400-600',
    '/media/sagark      

SyntaxError: unterminated string literal (detected at line 9) (2208624628.py, line 9)