In [2]:
import pandas as pd
from pathlib import Path
import multiprocessing as mp

data_folders = [
    "/media/sagarkumar/New Volume/SAGAR/200/200",
    "/media/sagarkumar/New Volume/SAGAR/200-400/200-400",
    "/media/sagarkumar/New Volume/SAGAR/400-600/400-600",
    "/media/sagarkumar/New Volume/SAGAR/600-759/600-759",
]

# ──────────────────────────────────────────────────────────
# build the flat list of CSV paths
# ──────────────────────────────────────────────────────────
csv_files = [p for folder in data_folders
               for p in Path(folder).glob("*.csv")
               if Path(folder).is_dir()]





In [3]:
import gc

# Create sets to hold unique PARA and VOLTAGE values
unique_para = set()
unique_voltage = set()


for folder in data_folders:
    folder_path = Path(folder)
    for csv_path in folder_path.glob("*.csv"):
        try:
            df = pd.read_csv(csv_path, dtype=str, low_memory=False)

            if "PARA" in df.columns:
                unique_para.update(df["PARA"].dropna().unique())
                print(f"Processed {csv_path} for PARA values.")
            if "VOLTAGE" in df.columns:
                unique_voltage.update(df["VOLTAGE"].dropna().unique())
              
        except Exception as e:
            print(f"Error reading {csv_path}: {e}")
     

print("Unique PARA values:")
print(unique_para)
print("\nUnique VOLTAGE values:")
print(unique_voltage)

Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000182.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000183.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000184.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000185.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000186.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000187.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000188.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000189.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA000000000190.csv for PARA values.
Processed /media/sagarkumar/New Volume/SAGAR/200/200/2025-05-07_SCADA0000

In [2]:
# load the master list of SWNO values
import pandas as pd
df_all = pd.read_csv('/media/sagarkumar/New Volume/SAGAR/all_unique_SWNO.csv', dtype=str)


# load the per-file SWNO list (each row contains semicolon-separated SWNO strings)
df_each = pd.read_csv('/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/final_two_column_with_rank_11_withoutDT.csv')
set_feeder = set(df_each["FEEDER_ID"].dropna().unique())

In [13]:
df_each["feeder_match"] = df_all["SWNO"].isin(set_feeder)

print(f"Rows with matching FEEDER_ID: {df_each['feeder_match'].sum()}")

Rows with matching FEEDER_ID: 1091


In [3]:
len(df_each["FEEDER_ID"].unique())

1100

USING PANDAS

In [17]:
import pandas as pd
df_all = pd.read_csv('/media/sagarkumar/New Volume/SAGAR/unique_swno_from200-759.csv', dtype=str)


# load the per-file SWNO list (each row contains semicolon-separated SWNO strings)
df_each = pd.read_csv('/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/final_two_column_with_rank_11_withoutDT.csv')
set_feeder = set(df_each["FEEDER_ID"].dropna().unique())

In [18]:
df_each["feeder_match"] = df_all["SWNO"].isin(set_feeder)

print(f"Rows with matching FEEDER_ID: {df_each['feeder_match'].sum()}")

Rows with matching FEEDER_ID: 0


In [19]:
len(df_all)

286

In [20]:
len(set_feeder)

1100

In [23]:
import pandas as pd

# --- Configuration ---
# Please specify the full paths to your two input files below.
file1_path = "/media/sagarkumar/New Volume/SAGAR/all_unique_SWNO.csv"
file2_path = "/media/sagarkumar/New Volume/SAGAR/unique_swno_from200-759.csv"

# Specify the path for the final combined and sorted output file.
output_file_path = "combined_unique_swno.csv"

# The name of the column you want to combine. This is case-insensitive.
COLUMN_NAME_TARGET = 'SWNO'
# --- End of Configuration ---


def find_actual_column_name(columns, target_name):
    """Finds the actual column name in a case-insensitive way."""
    for col in columns:
        if col.lower() == target_name.lower():
            return col
    return None

def read_swno_from_file(filepath):
    """Reads the target column from a single file, returning a set of values."""
    try:
        print(f"Reading file: {filepath}")
        # Read only the header first to find the correct column name
        header_df = pd.read_csv(filepath, nrows=0, on_bad_lines='skip')
        actual_col_name = find_actual_column_name(header_df.columns, COLUMN_NAME_TARGET)

        if not actual_col_name:
            print(f"  - Warning: Column '{COLUMN_NAME_TARGET}' not found in this file. Skipping.")
            return set()

        # Read the full column using the correct name
        df = pd.read_csv(filepath, usecols=[actual_col_name], on_bad_lines='skip')
        
        # Clean the data: convert to numeric, dropping non-numeric values
        s = pd.to_numeric(df[actual_col_name], errors='coerce')
        cleaned_values = s.dropna()
        
        print(f"  - Found {len(cleaned_values)} valid entries.")
        return set(cleaned_values)

    except FileNotFoundError:
        print(f"  - Error: File not found at {filepath}. Please check the path.")
        return set()
    except Exception as e:
        print(f"  - An unexpected error occurred while reading {filepath}: {e}")
        return set()

# Use a set to automatically handle uniqueness as we combine the files
combined_swno_set = set()

# Read and process the first file
swno_from_file1 = read_swno_from_file(file1_path)
combined_swno_set.update(swno_from_file1)

# Read and process the second file
swno_from_file2 = read_swno_from_file(file2_path)
combined_swno_set.update(swno_from_file2)

print("\nCombining and sorting results...")

if not combined_swno_set:
    print("No valid SWNO data was found in either file. Output file will not be created.")
else:
    # Convert the final set to a DataFrame for sorting and saving
    unique_df = pd.DataFrame(list(combined_swno_set), columns=[COLUMN_NAME_TARGET])
    
    # Ensure the column is integer type for correct sorting
    unique_df[COLUMN_NAME_TARGET] = unique_df[COLUMN_NAME_TARGET].astype(int)
    
    # Sort the values in ascending order
    unique_df = unique_df.sort_values(by=COLUMN_NAME_TARGET).reset_index(drop=True)
    
    # Save the final DataFrame to a new CSV file
    try:
        unique_df.to_csv(output_file_path, index=False)
        print(f"\n✅ Success! Combined unique values have been saved to:")
        print(output_file_path)
        print(f"Total unique values found: {len(unique_df)}")
    except Exception as e:
        print(f"\n❌ Error: Could not save the output file. Reason: {e}")



Reading file: /media/sagarkumar/New Volume/SAGAR/all_unique_SWNO.csv
  - Found 286 valid entries.
Reading file: /media/sagarkumar/New Volume/SAGAR/unique_swno_from200-759.csv
  - Found 2236 valid entries.

Combining and sorting results...

✅ Success! Combined unique values have been saved to:
combined_unique_swno.csv
Total unique values found: 2240


In [None]:
import pandas as pd

# --- CONFIGURATION: PLEASE EDIT THIS SECTION ---

# File 1 Details
file1_path = "/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/combined_unique_swno.csv"
file1_column_name = "SWNO" 

# File 2 Details
file2_path = "/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/final_two_column_with_rank_11_withoutDT.csv"
file2_column_name = "FEEDER_ID" 

# Output File Path
# This file will contain only the values that exist in BOTH files.
output_file_path = "/path/to/your/output/matching_values.csv"

# --- END OF CONFIGURATION ---


def find_actual_column_name(columns, target_name):
    """Helper function to find a column name, ignoring case."""
    for col in columns:
        if str(col).lower() == str(target_name).lower():
            return col
    return None

def get_unique_values_from_file(filepath, column_name):
    """
    Reads a file, extracts unique values from a specific column,
    cleans them, and returns them as a set.
    """
    print(f"Processing file: {filepath}...")
    try:
        # Read just the header to find the correct column name (case-insensitive)
        header_df = pd.read_csv(filepath, nrows=0, on_bad_lines='skip')
        actual_col_name = find_actual_column_name(header_df.columns, column_name)

        if not actual_col_name:
            print(f"  - Error: Column '{column_name}' not found. Please check the column name.")
            return set()

        # Read the full column using the correct name
        df = pd.read_csv(filepath, usecols=[actual_col_name], on_bad_lines='skip')
        
        # --- Data Cleaning ---
        # Convert all values to string, extract digits, and then convert to numbers.
        # This handles mixed data types (e.g., '123' vs 123) and text prefixes (e.g., 'SW-123').
        s = pd.Series(df[actual_col_name].dropna().unique(), dtype=str)
        s = s.str.extract('(\d+)').iloc[:, 0]
        s = pd.to_numeric(s, errors='coerce')
        
        cleaned_values = set(s.dropna().astype(int))
        
        print(f"  - Found {len(cleaned_values)} unique, clean values.")
        return cleaned_values

    except FileNotFoundError:
        print(f"  - Error: File not found. Please check the path: {filepath}")
        return set()
    except Exception as e:
        print(f"  - An unexpected error occurred: {e}")
        return set()

# --- Main Script ---

# Get the unique values from both files
unique_values_from_file1 = get_unique_values_from_file(file1_path, file1_column_name)
unique_values_from_file2 = get_unique_values_from_file(file2_path, file2_column_name)

# --- Comparison and Reporting ---

print("\n--- Comparison Report ---")
if not unique_values_from_file1 or not unique_values_from_file2:
    print("Could not perform comparison because one of the files could not be processed or contained no valid data.")
else:
    # Use set intersection to find the values that exist in both sets
    matching_values = unique_values_from_file1.intersection(unique_values_from_file2)
    
    # Print the final report
    print(f"Unique values in '{file1_path}' (Column: {file1_column_name}): {len(unique_values_from_file1)}")
    print(f"Unique values in '{file2_path}' (Column: {file2_column_name}): {len(unique_values_from_file2)}")
    print("-" * 25)
    print(f"Number of values that matched: {len(matching_values)}")
    print("-" * 25)


Processing file: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/combined_unique_swno.csv...
  - Found 2240 unique, clean values.
Processing file: /media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/final_two_column_with_rank_11_withoutDT.csv...
  - Found 1100 unique, clean values.

--- Comparison Report ---
Unique values in '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/combined_unique_swno.csv' (Column: SWNO): 2240
Unique values in '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/final_two_column_with_rank_11_withoutDT.csv' (Column: FEEDER_ID): 1100
-------------------------
Number of values that matched: 1091
-------------------------


In [39]:
import csv
import re
import os

def find_and_save_matching_rows(input_file_path, output_file_path, column_name='FROM_TO'):
    """
    Reads a CSV file, finds rows where the 'FROM_TO' column matches
    a 'xxxx-A...' format (four digits, a hyphen, then an alphabet character),
    and saves them to a new CSV file.

    Args:
        input_file_path (str): The path to the input CSV file.
        output_file_path (str): The path to the output CSV file.
        column_name (str): The name of the column to check.
    """
    # This regular expression checks for a string that starts with four digits,
    # followed by a hyphen, and then followed by an alphabet character.
    # ^\d{4}   - starts with exactly four digits
    # -        - followed by a literal hyphen
    # [a-zA-Z] - followed by any single uppercase or lowercase letter
    # .* - matches any character (except for line terminators) afterward
    pattern = re.compile(r'^\d{4}-[a-zA-Z].*')

    matching_rows = []

    try:
        with open(input_file_path, mode='r', newline='', encoding='utf-8') as csvfile:
            reader = csv.reader(csvfile)

            # Read the header row to find the index of the target column
            header = next(reader)
            if column_name not in header:
                print(f"Error: Column '{column_name}' not found in the CSV file.")
                return

            from_to_index = header.index(column_name)
            
            # Prepare the header for the output file, including a column for the original line number
            output_header = ['Original_Line'] + header
            matching_rows.append(output_header)

            # Iterate over each row in the CSV file
            for i, row in enumerate(reader):
                line_number = i + 2  # Line numbers are 1-based, and we skipped the header
                
                if len(row) > from_to_index:
                    cell_value = row[from_to_index].strip()

                    # Check if the cell value MATCHES the specified pattern
                    if pattern.match(cell_value):
                        # Prepend the original line number and add it to our list
                        row_with_line_num = [line_number] + row
                        matching_rows.append(row_with_line_num)

    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    # Save the matching rows to the output file
    if len(matching_rows) > 1:  # Check if any matches were found besides the header
        try:
            with open(output_file_path, mode='w', newline='', encoding='utf-8') as outfile:
                writer = csv.writer(outfile)
                writer.writerows(matching_rows)
            print(f"Found {len(matching_rows) - 1} matching rows. Results saved to '{output_file_path}'.")
        except Exception as e:
            print(f"An error occurred while writing to the output file: {e}")
    else:
        print(f"No rows matching the pattern were found. No output file was created.")


# --- How to use this script --

# 2. Specify the path to your input and output CSV files.
input_csv = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/final_two_column_with_rank_11_full.csv'
output_csv = 'mismatched_rows.csv'

# 3. Run the function.
find_and_save_mismatched_rows(input_csv, output_csv)


Found 16540 mismatched rows. Results saved to 'mismatched_rows.csv'.


In [48]:
import pandas as pd
import re

# Function to check if a string is strictly digit-digit (e.g., 123-456)
def from_to_is_numeric(s):
    return bool(re.fullmatch(r'\d+-\d+', str(s)))

# Path to your input and output CSV files
input_csv = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/final_two_column_with_rank_11_full.csv'
output_csv = 'from_to_not_numeric_rows.csv'

# Read the CSV into a DataFrame
merged = pd.read_csv(input_csv)

# Remove rows where FROM_TO is digit-digit
filtered = merged[~merged['FROM_TO'].apply(from_to_is_numeric)]

# Save the result to a new CSV
filtered.to_csv(output_csv, index=False)

print(f"Rows where FROM_TO is digit-digit are removed. Output saved to '{output_csv}'.")


Rows where FROM_TO is digit-digit are removed. Output saved to 'from_to_not_numeric_rows.csv'.


In [50]:
df_n = pd.read_csv('/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/from_to_not_numeric_rows.csv')

In [56]:
import pandas as pd

# Load your DataFrame
df_n = pd.read_csv('/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/from_to_not_numeric_rows.csv')

# --- Replace 'column_A' and 'column_B' with your actual column names ---
column1_name = 'SOURCE_LOCATION'
column2_name = 'DESTINATION_LOCATION'

# Count the number of rows where the values in the two columns are the same
same_value_count = (df_n[column1_name] == df_n[column2_name]).sum()

print(f"The number of rows where '{column1_name}' and '{column2_name}' have the same value is: {same_value_count}")

The number of rows where 'SOURCE_LOCATION' and 'DESTINATION_LOCATION' have the same value is: 7196


In [None]:
import pandas as pd

# Define your input file path
input_file = '/media/sagarkumar/New Volume/SAGAR/IPYNB_FILE/from_to_not_numeric_rows.csv'
# Define where you want to save the output file
output_file = 'non_matching_rows.csv'


try:
    # Load your DataFrame
    df = pd.read_csv(input_file)

    # --- Replace 'column_A' and 'column_B' with your actual column names ---
    column1_name = 'SOURCE_LOCATION'
    column2_name = 'DESTINATION_LOCATION'

    # Check if the columns exist in the DataFrame
    if column1_name not in df.columns or column2_name not in df.columns:
        print(f"Error: One or both columns ('{column1_name}', '{column2_name}') not found in the file.")
        print(f"Available columns are: {list(df.columns)}")
    else:
        # Find the rows that do NOT match
        # The `!=` operator checks for inequality
        non_matching_rows_df = df[df[column1_name] != df[column2_name]]

        # Save the resulting DataFrame to a new CSV file
        # index=False prevents pandas from writing the DataFrame index as a column
        non_matching_rows_df.to_csv(output_file, index=False)

        print(f"Found {len(non_matching_rows_df)} non-matching rows.")
        print(f"These rows have been saved to '{output_file}'.")

except FileNotFoundError:
    print(f"Error: The file was not found at '{input_file}'")
except Exception as e:


    print(f"An error occurred: {e}")

Found 652 non-matching rows.
These rows have been saved to 'non_matching_rows.csv'.


In [9]:
import pandas as pd
import os

def find_unique_swno_in_folder(folder_path):
    """
    Finds and prints the unique values in the 'SWNO' column from all CSV files
    in a specified folder.

    Args:
        folder_path (str): The full path to the folder containing the CSV files.
    """
    # Check if the provided path is a valid directory
    if not os.path.isdir(folder_path):
        print(f"Error: The directory '{folder_path}' does not exist.")
        return

    # Get a list of all files in the directory
    all_files = os.listdir(folder_path)

    # Filter for files that end with .csv
    csv_files = [f for f in all_files if f.endswith('.csv')]

    if not csv_files:
        print(f"No CSV files were found in '{folder_path}'.")
        return

    # A set to store all the unique SWNO values
    unique_swno_values = set()

    print(f"Reading CSV files from: {folder_path}")

    # Loop through each CSV file
    for csv_file in csv_files:
        file_path = os.path.join(folder_path, csv_file)
        try:
            # Read the CSV file into a pandas DataFrame
            df = pd.read_csv(file_path)

            # Check if 'SWNO' column exists in the DataFrame
            if 'SWNO' in df.columns:
                # Get the unique values from the column and add them to our set
                unique_swno_values.update(df['SWNO'].unique())
            else:
                print(f"  - Warning: 'SWNO' column not found in {csv_file}")

        except Exception as e:
            print(f"  - Error processing file {csv_file}: {e}")

    # After checking all files, print the unique values
    if unique_swno_values:
        print("\n--- Unique SWNO Values Found ---")
        for swno in sorted(list(unique_swno_values)):
            print(swno)
        print("---------------------------------")
    else:
        print("\nNo 'SWNO' values were found in any of the files.")


if __name__ == '__main__':
    # --- IMPORTANT ---
    # YOU MUST REPLACE THE TEXT BELOW WITH THE PATH TO YOUR FOLDER
    data_folder = "/media/sagarkumar/New Volume/SAGAR/Newdata"
    # -----------------

    find_unique_swno_in_folder(data_folder)

Reading CSV files from: /media/sagarkumar/New Volume/SAGAR/Newdata

--- Unique SWNO Values Found ---
1
11
21
1175
1285
33001
33002
33003
33004
33006
33008
33009
33010
33011
33012
33015
33018
33019
33020
33021
33022
33024
33026
33027
33030
33031
33032
33035
33036
33037
33038
33039
33040
33041
33043
33044
33045
33049
33050
33051
33052
33055
33056
33059
33060
33065
33066
33067
33070
33074
33075
33080
33081
33082
33084
33086
33087
33088
33090
33093
33095
33096
33097
33098
33099
33101
33102
33104
33105
33106
33107
33109
33112
33115
33116
33118
33119
33121
33122
33125
33126
33127
33128
33129
33131
33132
33133
33134
33135
33138
33139
33140
33150
33151
33152
33155
33156
33157
33158
33159
33163
33166
33167
33168
33169
33170
33171
33172
33173
33174
33175
33176
33177
33178
33179
33181
33184
33185
33186
33188
33189
33190
33199
33200
33201
33203
33204
33205
33206
33208
33211
33212
33213
33215
33218
33219
33222
33223
33224
33226
33227
33228
33229
33230
33231
33232
33233
33234
33235
33236
33238
33239