In [2]:
import pandas as pd
from pathlib import Path
import multiprocessing as mp

data_folders = [
    "/media/sagarkumar/New Volume/SAGAR/200/200",
    "/media/sagarkumar/New Volume/SAGAR/200-400/200-400",
    "/media/sagarkumar/New Volume/SAGAR/400-600/400-600",
    "/media/sagarkumar/New Volume/SAGAR/600-759/600-759",
]

# ──────────────────────────────────────────────────────────
# build the flat list of CSV paths
# ──────────────────────────────────────────────────────────
csv_files = [p for folder in data_folders
               for p in Path(folder).glob("*.csv")
               if Path(folder).is_dir()]





In [3]:
import gc

# Create sets to hold unique PARA and VOLTAGE values
unique_para = set()
unique_voltage = set()


for folder in data_folders:
    folder_path = Path(folder)
    for csv_path in folder_path.glob("*.csv"):
        try:
            df = pd.read_csv(csv_path, dtype=str, low_memory=False)

            if "PARA" in df.columns:
                unique_para.update(df["PARA"].dropna().unique())
                print(f"Processed {csv_path} for PARA values.")
            if "VOLTAGE" in df.columns:
                unique_voltage.update(df["VOLTAGE"].dropna().unique())
              
        except Exception as e:
            print(f"Error reading {csv_path}: {e}")
     

print("Unique PARA values:")
print(unique_para)
print("\nUnique VOLTAGE values:")
print(unique_voltage)

Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000182.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000183.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000184.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000185.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000186.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000187.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000188.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000189.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000190.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA0000

In [2]:
# load the master list of SWNO values
import pandas as pd
df_all = pd.read_csv('/media/sagarkumar/New Volume/SAGAR/all_unique_SWNO.csv', dtype=str)


# load the per-file SWNO list (each row contains semicolon-separated SWNO strings)
df_each = pd.read_csv('/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/final_two_column_with_rank_11_withoutDT.csv')
set_feeder = set(df_each["FEEDER_ID"].dropna().unique())

In [13]:
df_each["feeder_match"] = df_all["SWNO"].isin(set_feeder)

print(f"Rows with matching FEEDER_ID: {df_each['feeder_match'].sum()}")

Rows with matching FEEDER_ID: 1091


In [3]:
len(df_each["FEEDER_ID"].unique())

1100

USING PANDAS

In [None]:
import pandas as pd
from pathlib import Path
import concurrent.futures
import os

# List of folders containing the CSV files
data_folders = [
    "/media/sagarkumar/New Volume/SAGAR/200/200",
    "/media/sagarkumar/New Volume/SAGAR/200-400/200-400",
    "/media/sagarkumar/New Volume/SAGAR/400-600/400-600",
    "/media/sagarkumar/New Volume/SAGAR/600-759/600-759",
]

# --- Configuration ---
# The name of the column that contains the switch numbers.
SWNO_COLUMN_NAME = 'SWNO'

# Path for the output file
output_file = "/media/sagarkumar/New Volume/SAGAR/unique_swno_faster.csv"


def process_file(file_path):
    """
    Worker function to be run in parallel.
    Reads a single CSV file and returns a set of unique 'swno' values.
    """
    try:
        # Use pandas to read only the specific column. This is memory-efficient.
        df = pd.read_csv(file_path, usecols=[SWNO_COLUMN_NAME], on_bad_lines='skip', engine='c',low_memeory = False)
        
        # Return a set of unique, non-null values from the column
        return set(df[SWNO_COLUMN_NAME].dropna().unique())
    except ValueError:
        # This error happens if the SWNO_COLUMN_NAME is not in the CSV
        print(f"Warning: Column '{SWNO_COLUMN_NAME}' not found in {file_path.name}. Skipping file.")
    except Exception as e:
        # Catch any other potential errors during file processing
        print(f"Error processing file {file_path.name}: {e}")
    # Return an empty set if there was an error or no values were found
    return set()


if __name__ == "__main__":
    print("Gathering all CSV file paths...")
    # First, collect a list of all files to be processed.
    all_files_to_process = []
    for folder_path in data_folders:
        folder = Path(folder_path)
        if folder.is_dir():
            all_files_to_process.extend(list(folder.glob('*.csv')))
        else:
            print(f"Warning: Folder not found at {folder_path}. Skipping.")
    
    if not all_files_to_process:
        print("No CSV files found to process. Exiting.")
    else:
        print(f"Found {len(all_files_to_process)} total files. Starting parallel processing...")
        
        # This master set will hold all unique values from all files
        master_unique_swno_set = set()

        # Use ProcessPoolExecutor to run the 'process_file' function in parallel
        # It will automatically use the available CPU cores
        with concurrent.futures.ProcessPoolExecutor() as executor:
            # map() applies the function to each item in the list and returns results as they complete
            results = executor.map(process_file, all_files_to_process)
            
            # As each worker process finishes, its resulting set is merged into the master set
            for unique_set in results:
                if unique_set: # Ensure the set is not empty
                    master_unique_swno_set.update(unique_set)

        print("\nAll files processed. Consolidating and saving results...")

        # Convert the final set of unique values to a DataFrame
        unique_df = pd.DataFrame(list(master_unique_swno_set), columns=[SWNO_COLUMN_NAME])
        
        # Sort the values for cleaner output
        unique_df = unique_df.sort_values(by=SWNO_COLUMN_NAME).reset_index(drop=True)

        # Save the DataFrame to the specified output CSV file
        try:
            unique_df.to_csv(output_file, index=False)
            print(f"\n✅ Success! All unique 'swno' values have been saved to:")
            print(output_file)
            print(f"Total unique values found: {len(unique_df)}")
        except Exception as e:
            print(f"\n❌ Error: Could not save the output file. Reason: {e}")

