MATCHING ENERGY AUDIT SWITCH_NO VS AFINAL FROM_SWITCH

In [None]:
import pandas as pd

# --- CONFIGURATION: PLEASE EDIT THIS SECTION ---

# File 1: CSV path, the column to compare, and the text encoding used to read it.
file1_path = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/AFINAL.csv"
file1_column_name = "FROM_SWITCH"  # matched ignoring case by find_actual_column_name
# This file processed correctly before, so 'utf-8' is likely correct.
file1_encoding = 'utf-8'

# File 2: the same three settings for the second CSV.
file2_path = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/ENERGYAUDIT.csv"
file2_column_name = "SWITCH_NO"
# Currently read as 'utf-8'. If this file fails to decode, 'latin1' or
# 'windows-1252' are the fallbacks to try.
file2_encoding = 'utf-8'

# Output file path: intended destination for the values that exist in BOTH files.
# A bare filename is resolved against the current working directory.
# NOTE(review): the other cells of this notebook configure the same filename;
# use a distinct name per comparison if the results must be kept separately.
output_file_path = "matching_values.csv"


# --- END OF CONFIGURATION ---


def find_actual_column_name(columns, target_name):
    """Return the first entry of `columns` equal to `target_name` ignoring case, else None."""
    wanted = str(target_name).lower()
    return next((name for name in columns if str(name).lower() == wanted), None)

def get_unique_values_from_file(filepath, column_name, encoding, extract_digits=True):
    """
    Read one column of a CSV file and return its distinct values as a set of ints.

    The column is located case-insensitively via ``find_actual_column_name``.
    Missing cells are dropped. With ``extract_digits=True`` the first run of
    digits in each value is used (e.g. 'SW-123' -> 123); otherwise the whole
    value must parse as a number. Values that yield no number are dropped.

    Args:
        filepath: Path of the CSV file to read.
        column_name: Column to extract; matched ignoring case.
        encoding: Text encoding passed to ``pd.read_csv``.
        extract_digits: Pull the first digit run out of each value before the
            numeric conversion. Defaults to True.

    Returns:
        set: The unique integer values. An empty set is returned (after
        printing a message) when the file is missing, the column is not
        found, or any other error occurs while reading or cleaning.
    """
    print(f"Processing file: {filepath}...")
    try:
        # Read only the header row to resolve the real spelling of the column.
        header_df = pd.read_csv(filepath, nrows=0, on_bad_lines='skip', encoding=encoding)
        actual_col_name = find_actual_column_name(header_df.columns, column_name)

        # Compare with None explicitly so a falsy-but-valid label is not rejected.
        if actual_col_name is None:
            print(f"  - Error: Column '{column_name}' not found. Please check the column name.")
            return set()

        # Load just that column as text. Every value is turned into a string
        # below anyway, and reading it as text keeps pandas from inferring
        # different dtypes for different parts of a large file.
        df = pd.read_csv(
            filepath,
            usecols=[actual_col_name],
            dtype=str,
            on_bad_lines='skip',
            encoding=encoding,
        )

        # --- Data Cleaning ---
        s = pd.Series(df[actual_col_name].dropna().unique(), dtype=str)
        if extract_digits:
            # Raw string: '\d' in a normal string literal is an invalid escape sequence.
            s = s.str.extract(r'(\d+)').iloc[:, 0]
        # Anything that is still not numeric becomes NaN and is dropped below.
        s = pd.to_numeric(s, errors='coerce')

        cleaned_values = set(s.dropna().astype(int))

        print(f"  - Found {len(cleaned_values)} unique, clean values.")
        return cleaned_values

    except FileNotFoundError:
        print(f"  - Error: File not found. Please check the path: {filepath}")
        return set()
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see an empty set.
        print(f"  - An unexpected error occurred: {e}")
        return set()

# --- Main Script ---

# Build the cleaned set of unique values for the configured column of each file.
unique_values_from_file1 = get_unique_values_from_file(file1_path, file1_column_name, file1_encoding)
unique_values_from_file2 = get_unique_values_from_file(file2_path, file2_column_name, file2_encoding)

# --- Comparison and Reporting ---

print("\n--- Comparison Report ---")
if not unique_values_from_file1 or not unique_values_from_file2:
    # An empty set means the file/column could not be read or held no usable values.
    print("Could not perform comparison because one of the files could not be processed or contained no valid data.")
else:
    # Values present in both files.
    matching_values = unique_values_from_file1.intersection(unique_values_from_file2)

    # Print the final report
    print(f"Unique values in '{file1_path}' (Column: {file1_column_name}): {len(unique_values_from_file1)}")
    print(f"Unique values in '{file2_path}' (Column: {file2_column_name}): {len(unique_values_from_file2)}")
    print("-" * 25)
    print(f"Number of values that matched: {len(matching_values)}")
    print("-" * 25)

    # --- Save the results to the output file ---
    if matching_values:
        # One sorted column of matches; sorted() accepts the set directly.
        matching_df = pd.DataFrame(sorted(matching_values), columns=['Matching_Switch_Numbers'])
        # Persist the matches first so the success message below is truthful.
        matching_df.to_csv(output_file_path, index=False)
        print(f"Success! Matching values have been saved to: {output_file_path}")
    else:
        print("No matching values were found to save.")

Processing file: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/AFINAL.csv...
  - Found 13868 unique, clean values.
Processing file: /media/sagark24/New Volume/MERGE CDIS/2-Year-data/ENERGYAUDIT.csv...
  - Found 9954 unique, clean values.

--- Comparison Report ---
Unique values in '/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/AFINAL.csv' (Column: FROM_SWITCH): 13868
Unique values in '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/ENERGYAUDIT.csv' (Column: SWITCH_NO): 9954
-------------------------
Number of values that matched: 6355
-------------------------
Success! Matching values have been saved to: matching_values.csv


  df = pd.read_csv(filepath, usecols=[actual_col_name], on_bad_lines='skip', encoding=encoding) # <--- CHANGED


In [None]:
import pandas as pd

# --- CONFIGURATION: PLEASE EDIT THIS SECTION ---

# File 1: CSV path, the column to compare, and the text encoding used to read it.
file1_path = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv"
file1_column_name = "FROM_SWITCH"  # matched ignoring case by find_actual_column_name
# This file processed correctly before, so 'utf-8' is likely correct.
file1_encoding = 'utf-8'

# File 2: the same three settings for the second CSV.
file2_path = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/NETWORKDETAILS.csv"
file2_column_name = "FROM_SWITCHID"
# Currently read as 'utf-8'. If this file fails to decode, 'latin1' or
# 'windows-1252' are the fallbacks to try.
file2_encoding = 'utf-8'

# Output file path: intended destination for the values that exist in BOTH files.
# A bare filename is resolved against the current working directory.
# NOTE(review): the other cells of this notebook configure the same filename;
# use a distinct name per comparison if the results must be kept separately.
output_file_path = "matching_values.csv"


# --- END OF CONFIGURATION ---


def find_actual_column_name(columns, target_name):
    """Look up `target_name` in `columns` without regard to case; None if absent."""
    needle = str(target_name).lower()
    hits = [candidate for candidate in columns if str(candidate).lower() == needle]
    # The first hit wins, mirroring a left-to-right scan of the columns.
    return hits[0] if hits else None

def get_unique_values_from_file(filepath, column_name, encoding, extract_digits=True):
    """
    Read one column of a CSV file and return its distinct values as a set of ints.

    The column is located case-insensitively via ``find_actual_column_name``.
    Missing cells are dropped. With ``extract_digits=True`` the first run of
    digits in each value is used (e.g. 'SW-123' -> 123); otherwise the whole
    value must parse as a number. Values that yield no number are dropped.

    Args:
        filepath: Path of the CSV file to read.
        column_name: Column to extract; matched ignoring case.
        encoding: Text encoding passed to ``pd.read_csv``.
        extract_digits: Pull the first digit run out of each value before the
            numeric conversion. Defaults to True.

    Returns:
        set: The unique integer values. An empty set is returned (after
        printing a message) when the file is missing, the column is not
        found, or any other error occurs while reading or cleaning.
    """
    print(f"Processing file: {filepath}...")
    try:
        # Read only the header row to resolve the real spelling of the column.
        header_df = pd.read_csv(filepath, nrows=0, on_bad_lines='skip', encoding=encoding)
        actual_col_name = find_actual_column_name(header_df.columns, column_name)

        # Compare with None explicitly so a falsy-but-valid label is not rejected.
        if actual_col_name is None:
            print(f"  - Error: Column '{column_name}' not found. Please check the column name.")
            return set()

        # Load just that column as text. Every value is turned into a string
        # below anyway, and reading it as text keeps pandas from inferring
        # different dtypes for different parts of a large file.
        df = pd.read_csv(
            filepath,
            usecols=[actual_col_name],
            dtype=str,
            on_bad_lines='skip',
            encoding=encoding,
        )

        # --- Data Cleaning ---
        s = pd.Series(df[actual_col_name].dropna().unique(), dtype=str)
        if extract_digits:
            # Raw string: '\d' in a normal string literal is an invalid escape sequence.
            s = s.str.extract(r'(\d+)').iloc[:, 0]
        # Anything that is still not numeric becomes NaN and is dropped below.
        s = pd.to_numeric(s, errors='coerce')

        cleaned_values = set(s.dropna().astype(int))

        print(f"  - Found {len(cleaned_values)} unique, clean values.")
        return cleaned_values

    except FileNotFoundError:
        print(f"  - Error: File not found. Please check the path: {filepath}")
        return set()
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see an empty set.
        print(f"  - An unexpected error occurred: {e}")
        return set()

# --- Main Script ---

# Build the cleaned set of unique values for the configured column of each file.
unique_values_from_file1 = get_unique_values_from_file(file1_path, file1_column_name, file1_encoding)
unique_values_from_file2 = get_unique_values_from_file(file2_path, file2_column_name, file2_encoding)

# --- Comparison and Reporting ---

print("\n--- Comparison Report ---")
if not unique_values_from_file1 or not unique_values_from_file2:
    # An empty set means the file/column could not be read or held no usable values.
    print("Could not perform comparison because one of the files could not be processed or contained no valid data.")
else:
    # Values present in both files.
    matching_values = unique_values_from_file1.intersection(unique_values_from_file2)

    # Print the final report
    print(f"Unique values in '{file1_path}' (Column: {file1_column_name}): {len(unique_values_from_file1)}")
    print(f"Unique values in '{file2_path}' (Column: {file2_column_name}): {len(unique_values_from_file2)}")
    print("-" * 25)
    print(f"Number of values that matched: {len(matching_values)}")
    print("-" * 25)

    # --- Save the results to the output file ---
    if matching_values:
        # One sorted column of matches; sorted() accepts the set directly.
        matching_df = pd.DataFrame(sorted(matching_values), columns=['Matching_Switch_Numbers'])
        # Persist the matches first so the success message below is truthful.
        matching_df.to_csv(output_file_path, index=False)
        print(f"Success! Matching values have been saved to: {output_file_path}")
    else:
        print("No matching values were found to save.")

Processing file: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv...
  - Found 13663 unique, clean values.
Processing file: /media/sagark24/New Volume/MERGE CDIS/2-Year-data/NETWORKDETAILS.csv...
  - Found 15423 unique, clean values.

--- Comparison Report ---
Unique values in '/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv' (Column: FROM_SWITCH): 13663
Unique values in '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/NETWORKDETAILS.csv' (Column: FROM_SWITCHID): 15423
-------------------------
Number of values that matched: 8030
-------------------------
Success! Matching values have been saved to: matching_values.csv


  df = pd.read_csv(filepath, usecols=[actual_col_name], on_bad_lines='skip', encoding=encoding) # <--- CHANGED


In [None]:
import pandas as pd

# --- CONFIGURATION: PLEASE EDIT THIS SECTION ---

# File 1: CSV path, the column to compare, and the text encoding used to read it.
file1_path = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv"
file1_column_name = "SOURCE_SS"  # matched ignoring case by find_actual_column_name
# This file processed correctly before, so 'utf-8' is likely correct.
file1_encoding = 'utf-8'

# File 2: the same three settings for the second CSV.
file2_path = "/media/sagark24/New Volume/MERGE CDIS/2-Year-data/NETWORKDETAILS.csv"
file2_column_name = "FROM_STN"
# Currently read as 'utf-8'. If this file fails to decode, 'latin1' or
# 'windows-1252' are the fallbacks to try.
file2_encoding = 'utf-8'

# Output file path: intended destination for the values that exist in BOTH files.
# A bare filename is resolved against the current working directory.
# NOTE(review): the other cells of this notebook configure the same filename;
# use a distinct name per comparison if the results must be kept separately.
output_file_path = "matching_values.csv"


# --- END OF CONFIGURATION ---


def find_actual_column_name(columns, target_name):
    """Case-insensitive column lookup: the matching entry of `columns`, or None."""
    lowered_target = str(target_name).lower()
    found = None
    for label in columns:
        if str(label).lower() != lowered_target:
            continue
        # Stop at the first label that matches.
        found = label
        break
    return found

def get_unique_values_from_file(filepath, column_name, encoding, extract_digits=True):
    """
    Read one column of a CSV file and return its distinct values as a set of ints.

    The column is located case-insensitively via ``find_actual_column_name``.
    Missing cells are dropped. With ``extract_digits=True`` the first run of
    digits in each value is used (e.g. 'SW-123' -> 123); otherwise the whole
    value must parse as a number. Values that yield no number are dropped.

    Args:
        filepath: Path of the CSV file to read.
        column_name: Column to extract; matched ignoring case.
        encoding: Text encoding passed to ``pd.read_csv``.
        extract_digits: Pull the first digit run out of each value before the
            numeric conversion. Defaults to True.

    Returns:
        set: The unique integer values. An empty set is returned (after
        printing a message) when the file is missing, the column is not
        found, or any other error occurs while reading or cleaning.
    """
    print(f"Processing file: {filepath}...")
    try:
        # Read only the header row to resolve the real spelling of the column.
        header_df = pd.read_csv(filepath, nrows=0, on_bad_lines='skip', encoding=encoding)
        actual_col_name = find_actual_column_name(header_df.columns, column_name)

        # Compare with None explicitly so a falsy-but-valid label is not rejected.
        if actual_col_name is None:
            print(f"  - Error: Column '{column_name}' not found. Please check the column name.")
            return set()

        # Load just that column as text. Every value is turned into a string
        # below anyway, and reading it as text keeps pandas from inferring
        # different dtypes for different parts of a large file.
        df = pd.read_csv(
            filepath,
            usecols=[actual_col_name],
            dtype=str,
            on_bad_lines='skip',
            encoding=encoding,
        )

        # --- Data Cleaning ---
        s = pd.Series(df[actual_col_name].dropna().unique(), dtype=str)
        if extract_digits:
            # Raw string: '\d' in a normal string literal is an invalid escape sequence.
            s = s.str.extract(r'(\d+)').iloc[:, 0]
        # Anything that is still not numeric becomes NaN and is dropped below.
        s = pd.to_numeric(s, errors='coerce')

        cleaned_values = set(s.dropna().astype(int))

        print(f"  - Found {len(cleaned_values)} unique, clean values.")
        return cleaned_values

    except FileNotFoundError:
        print(f"  - Error: File not found. Please check the path: {filepath}")
        return set()
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see an empty set.
        print(f"  - An unexpected error occurred: {e}")
        return set()

# --- Main Script ---

# Build the cleaned set of unique values for the configured column of each file.
unique_values_from_file1 = get_unique_values_from_file(file1_path, file1_column_name, file1_encoding)
unique_values_from_file2 = get_unique_values_from_file(file2_path, file2_column_name, file2_encoding)

# --- Comparison and Reporting ---

print("\n--- Comparison Report ---")
if not unique_values_from_file1 or not unique_values_from_file2:
    # An empty set means the file/column could not be read or held no usable values.
    print("Could not perform comparison because one of the files could not be processed or contained no valid data.")
else:
    # Values present in both files.
    matching_values = unique_values_from_file1.intersection(unique_values_from_file2)

    # Print the final report
    print(f"Unique values in '{file1_path}' (Column: {file1_column_name}): {len(unique_values_from_file1)}")
    print(f"Unique values in '{file2_path}' (Column: {file2_column_name}): {len(unique_values_from_file2)}")
    print("-" * 25)
    print(f"Number of values that matched: {len(matching_values)}")
    print("-" * 25)

    # --- Save the results to the output file ---
    if matching_values:
        # One sorted column of matches; sorted() accepts the set directly.
        matching_df = pd.DataFrame(sorted(matching_values), columns=['Matching_Switch_Numbers'])
        # Persist the matches first so the success message below is truthful.
        matching_df.to_csv(output_file_path, index=False)
        print(f"Success! Matching values have been saved to: {output_file_path}")
    else:
        print("No matching values were found to save.")

Processing file: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv...
  - Found 63 unique, clean values.
Processing file: /media/sagark24/New Volume/MERGE CDIS/2-Year-data/NETWORKDETAILS.csv...
  - Found 74 unique, clean values.

--- Comparison Report ---
Unique values in '/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv' (Column: SOURCE_SS): 63
Unique values in '/media/sagark24/New Volume/MERGE CDIS/2-Year-data/NETWORKDETAILS.csv' (Column: FROM_STN): 74
-------------------------
Number of values that matched: 60
-------------------------
Success! Matching values have been saved to: matching_values.csv


FAULT DATA: MATCHING HT FAULT SWITCH_NO VS AFINAL FEEDER_ID

In [None]:
import pandas as pd

# --- CONFIGURATION: PLEASE EDIT THIS SECTION ---

# File 1: CSV path, the column to compare, and the text encoding used to read it.
file1_path = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv"
file1_column_name = "FEEDER_ID"  # matched ignoring case by find_actual_column_name
# This file processed correctly before, so 'utf-8' is likely correct.
file1_encoding = 'utf-8'

# File 2: the same three settings for the second CSV.
file2_path = "/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed2.csv"
file2_column_name = "SWITCH_NO"
# Currently read as 'utf-8'. If this file fails to decode, 'latin1' or
# 'windows-1252' are the fallbacks to try.
file2_encoding = 'utf-8'

# Output file path: intended destination for the values that exist in BOTH files.
# A bare filename is resolved against the current working directory.
# NOTE(review): the other cells of this notebook configure the same filename;
# use a distinct name per comparison if the results must be kept separately.
output_file_path = "matching_values.csv"


# --- END OF CONFIGURATION ---


def find_actual_column_name(columns, target_name):
    """Return the first column whose lower-cased text equals that of `target_name`, or None."""
    target_key = str(target_name).lower()
    same_name = (entry for entry in columns if str(entry).lower() == target_key)
    return next(same_name, None)

def get_unique_values_from_file(filepath, column_name, encoding, extract_digits=False):
    """
    Read one column of a CSV file and return its distinct values as a set of ints.

    The column is located case-insensitively via ``find_actual_column_name``.
    Missing cells are dropped. By default each value must parse as a number in
    its entirety, so prefixed identifiers such as 'SW-123' are discarded. With
    ``extract_digits=True`` the first run of digits in each value is used
    instead (e.g. 'SW-123' -> 123). Values that yield no number are dropped.

    Args:
        filepath: Path of the CSV file to read.
        column_name: Column to extract; matched ignoring case.
        encoding: Text encoding passed to ``pd.read_csv``.
        extract_digits: Pull the first digit run out of each value before the
            numeric conversion. Defaults to False in this cell.

    Returns:
        set: The unique integer values. An empty set is returned (after
        printing a message) when the file is missing, the column is not
        found, or any other error occurs while reading or cleaning.
    """
    print(f"Processing file: {filepath}...")
    try:
        # Read only the header row to resolve the real spelling of the column.
        header_df = pd.read_csv(filepath, nrows=0, on_bad_lines='skip', encoding=encoding)
        actual_col_name = find_actual_column_name(header_df.columns, column_name)

        # Compare with None explicitly so a falsy-but-valid label is not rejected.
        if actual_col_name is None:
            print(f"  - Error: Column '{column_name}' not found. Please check the column name.")
            return set()

        # Load just that column as text. Every value is turned into a string
        # below anyway, and reading it as text keeps pandas from inferring
        # different dtypes for different parts of a large file.
        df = pd.read_csv(
            filepath,
            usecols=[actual_col_name],
            dtype=str,
            on_bad_lines='skip',
            encoding=encoding,
        )

        # --- Data Cleaning ---
        s = pd.Series(df[actual_col_name].dropna().unique(), dtype=str)
        if extract_digits:
            # Raw string: '\d' in a normal string literal is an invalid escape sequence.
            s = s.str.extract(r'(\d+)').iloc[:, 0]
        # Anything that is not numeric becomes NaN and is dropped below.
        s = pd.to_numeric(s, errors='coerce')

        cleaned_values = set(s.dropna().astype(int))

        print(f"  - Found {len(cleaned_values)} unique, clean values.")
        return cleaned_values

    except FileNotFoundError:
        print(f"  - Error: File not found. Please check the path: {filepath}")
        return set()
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller see an empty set.
        print(f"  - An unexpected error occurred: {e}")
        return set()

# --- Main Script ---

# Build the cleaned set of unique values for the configured column of each file.
unique_values_from_file1 = get_unique_values_from_file(file1_path, file1_column_name, file1_encoding)
unique_values_from_file2 = get_unique_values_from_file(file2_path, file2_column_name, file2_encoding)

# --- Comparison and Reporting ---

print("\n--- Comparison Report ---")
if not unique_values_from_file1 or not unique_values_from_file2:
    # An empty set means the file/column could not be read or held no usable values.
    print("Could not perform comparison because one of the files could not be processed or contained no valid data.")
else:
    # Values present in both files.
    matching_values = unique_values_from_file1.intersection(unique_values_from_file2)

    # Print the final report
    print(f"Unique values in '{file1_path}' (Column: {file1_column_name}): {len(unique_values_from_file1)}")
    print(f"Unique values in '{file2_path}' (Column: {file2_column_name}): {len(unique_values_from_file2)}")
    print("-" * 25)
    print(f"Number of values that matched: {len(matching_values)}")
    print("-" * 25)

    # --- Save the results to the output file ---
    if matching_values:
        # One sorted column of matches; sorted() accepts the set directly.
        matching_df = pd.DataFrame(sorted(matching_values), columns=['Matching_Switch_Numbers'])
        # Persist the matches first so the success message below is truthful.
        matching_df.to_csv(output_file_path, index=False)
        print(f"Success! Matching values have been saved to: {output_file_path}")
    else:
        print("No matching values were found to save.")

Processing file: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv...
  - Found 1199 unique, clean values.
Processing file: /media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed2.csv...
  - Found 1263 unique, clean values.

--- Comparison Report ---
Unique values in '/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/11_KV_FINAL_HEALTH/AFINAL_full.csv' (Column: FEEDER_ID): 1199
Unique values in '/media/sagark24/New Volume/MERGE CDIS/IPYNB_FILE/DATA_GENERATION/HT_fault_cable_info_processed2.csv' (Column: SWITCH_NO): 1263
-------------------------
Number of values that matched: 826
-------------------------
Success! Matching values have been saved to: matching_values.csv
