SAMPLE OUTPUT OF 100 LINES

In [6]:
import pandas as pd
import os

# Define folder path where SCADA .csv is stored
folder_path = "2-Year-data/200/200"

# Automatically fetch the SCADA CSV file
scada_file = None
for file in os.listdir(folder_path):
    if file.lower().endswith(".csv"):
        scada_file = os.path.join(folder_path, file)
        break

if scada_file is None:
    raise FileNotFoundError("No SCADA .csv file found in the folder!")

# Load SCADA and HT cable files
scada = pd.read_csv(scada_file)
htcable = pd.read_csv("HTCABLE.csv")

# Filter SCADA by voltage containing "11"
scada_filtered = scada[scada['VOLTAGE'].astype(str).str.contains("11", case=False)]

# Create a DataFrame for results
columns = ['SOURCE_SWITCH_ID', 'DESTINATION_SWITCH_ID', 'SOURCE_SSFL', 'DESTINATION_SSFL']
results = pd.DataFrame(columns=columns)

# Track processed switches and total rows added
processed_switches = set()
row_limit = 200

# Traverse each Swno
for swno in scada_filtered['SWNO'].unique():
    if swno in processed_switches or len(results) >= row_limit:
        break

    current_from_switch = swno
    first_match = htcable[htcable['SOURCE_SWITCH_ID'] == current_from_switch]

    if first_match.empty:
        continue

    for _, row in first_match.iterrows():
        temp_chain = []

        source = row['SOURCE_SSFL']
        dest = row['DESTINATION_SSFL']
        to_switch = row['DESTINATION_SWITCH_ID'] if 'DESTINATION_SWITCH_ID' in row else None
        temp_chain.append([current_from_switch, to_switch, source, dest])
        processed_switches.add(current_from_switch)

        # Continue chain traversal
    visited_ssfl = set()  # Add this before the while loop

    # Inside the loop:
  # Inside the loop:
    visited_ssfl = set()

    while len(results) + len(temp_chain) < row_limit:
        next_match = htcable[htcable['SOURCE_SSFL'] == dest]

        if next_match.empty or dest in visited_ssfl:
            break

        next_row = next_match.iloc[0]
        source = next_row['SOURCE_SSFL']
        dest = next_row['DESTINATION_SSFL']

        #  Prevent self-loop
        if source == dest:
            break

        visited_ssfl.add(source)

        current_from_switch = next_row['SOURCE_SWITCH_ID']
        to_switch = next_row['DESTINATION_SWITCH_ID'] if 'DESTINATION_SWITCH_ID' in next_row else None

        temp_chain.append([current_from_switch, to_switch, source, dest])
        processed_switches.add(current_from_switch)

        # Append chain, checking if limit exceeded
        for item in temp_chain:
            if len(results) >= row_limit:
                break
            results.loc[len(results)] = item

        if len(results) >= row_limit:
            break

# Save results
results.to_excel("scada_htcable_chain.xlsx", index=False)
print("Saved first 100 rows to scada_htcable_chain.xlsx ")


  scada = pd.read_csv(scada_file)
  htcable = pd.read_csv("HTCABLE.csv")


Saved first 100 rows to scada_htcable_chain.xlsx 


IT FILTERS THE SCADA DATA BY VOLTAGE AND FULL SCADA DATA LOADS

In [4]:
import pandas as pd
import os

# List all SCADA folders you want to process
target_folders = [
    "2-Year-data/200/200",
    "2-Year-data/200-400/200-400",
    "2-Year-data/400-600/400-600",
    "2-Year-data/600-759/600-759"
]

# Load HT cable file once
htcable = pd.read_csv("HTCABLE.csv")

# Prepare final results DataFrame with updated column names
columns = ['SOURCE_SWITCH_ID-DESTINATION_SWITCH_ID', 'SOURCE_SSFL', 'DESTINATION_SSFL', 'SFL']
final_results = pd.DataFrame(columns=columns)

# Process each SCADA folder
for folder_path in target_folders:
    print(f" Processing folder: {folder_path}")

    # Loop through all SCADA CSV files in the folder
    for file in os.listdir(folder_path):
        if not file.lower().endswith(".csv"):
            continue  # Skip non-CSV files

        scada_file = os.path.join(folder_path, file)
        print(f"   Processing SCADA file: {file}")
        
        try:
            scada = pd.read_csv(scada_file)
        except Exception as e:
            print(f"    Error reading {file}: {e}")
            continue

        if 'VOLTAGE' not in scada.columns or 'SWNO' not in scada.columns:
            print(f"    Missing required columns in {file}")
            continue

        # Filter SCADA by voltage containing "11"
        scada_filtered = scada[scada['VOLTAGE'].astype(str).str.contains("11", case=False)]

        processed_switches = set()

        # Traverse each Swno
        for swno in scada_filtered['SWNO'].unique():
            if swno in processed_switches:
                continue

            current_from_switch = swno
            first_match = htcable[htcable['SOURCE_SWITCH_ID'] == current_from_switch]

            if first_match.empty:
                continue

            for _, row in first_match.iterrows():
                temp_chain = []

                source = row['SOURCE_SSFL']
                dest = row['DESTINATION_SSFL']
                to_switch = row.get('DESTINATION_SWITCH_ID', None)
                temp_chain.append([current_from_switch, to_switch, source, dest])
                processed_switches.add(current_from_switch)

                # Begin chain traversal
                visited_ssfl = set()

                while True:
                    next_match = htcable[htcable['SOURCE_SSFL'] == dest]

                    if next_match.empty or dest in visited_ssfl:
                        break

                    next_row = next_match.iloc[0]
                    source = next_row['SOURCE_SSFL']
                    dest = next_row['DESTINATION_SSFL']

                    if source == dest:
                        break  # Prevent self-loop

                    visited_ssfl.add(source)

                    current_from_switch = next_row['SOURCE_SWITCH_ID']
                    to_switch = next_row.get('DESTINATION_SWITCH_ID', None)

                    temp_chain.append([current_from_switch, to_switch, source, dest])
                    processed_switches.add(current_from_switch)

                # Add this chain to final results
                for item in temp_chain:
                    source_switch = item[0]
                    dest_switch = item[1]
                    source_ssfl = item[2]
                    dest_ssfl = item[3]
                    combined_id = f"{source_switch}-{dest_switch}"
                    final_results.loc[len(final_results)] = [combined_id, source_ssfl, dest_ssfl, dest_ssfl]

# Save combined results
final_results.to_excel("scada_htcable_chain_all_folders.xlsx", index=False)
print(f"\n Done! Saved {len(final_results)} rows from all folders to scada_htcable_chain_all_folders.xlsx")


  htcable = pd.read_csv("HTCABLE.csv")


📁 Processing folder: 2-Year-data/200/200
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000182.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000183.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000184.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000185.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000186.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000187.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000188.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000189.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000190.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000191.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000192.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000193.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000194.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000195.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000196.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000197.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000198.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000199.csv


  scada = pd.read_csv(scada_file)


   🔍 Processing SCADA file: 2025-05-07_SCADA000000000000.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000001.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000002.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000003.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000004.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000005.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000006.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000007.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000008.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000009.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000010.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000011.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000013.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000014.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000015.csv
   🔍 Processing SCADA file: 2025-05-07_SCADA000000000016.csv


KeyboardInterrupt: 

REMOVE THE SCADA DATA FILTER

In [None]:
import pandas as pd
import os

# List all SCADA folders you want to process
target_folders = [
    "2-Year-data/200/200",
    "2-Year-data/200-400/200-400",
    "2-Year-data/400-600/400-600",
    "2-Year-data/600-759/600-759"
]

# Load HT cable file once
htcable = pd.read_csv("HTCABLE.csv")

# Prepare final results DataFrame with updated column names
columns = ['SOURCE_SWITCH_ID-DESTINATION_SWITCH_ID', 'SOURCE_SSFL', 'DESTINATION_SSFL', 'SFL']
final_results = pd.DataFrame(columns=columns)

# Process each SCADA folder
for folder_path in target_folders:
    print(f" Processing folder: {folder_path}")

    # Loop through all SCADA CSV files in the folder
    for file in os.listdir(folder_path):
        if not file.lower().endswith(".csv"):
            continue  # Skip non-CSV files

        scada_file = os.path.join(folder_path, file)
        print(f"    Processing SCADA file: {file}")
        
        try:
            scada = pd.read_csv(scada_file)
        except Exception as e:
            print(f"    Error reading {file}: {e}")
            continue

        if 'SWNO' not in scada.columns:
            print(f"    Missing 'SWNO' column in {file}")
            continue

        processed_switches = set()

        # Traverse each Swno
        for swno in scada['SWNO'].unique():
            if swno in processed_switches:
                continue

            current_from_switch = swno
            first_match = htcable[htcable['SOURCE_SWITCH_ID'] == current_from_switch]

            if first_match.empty:
                continue

            for _, row in first_match.iterrows():
                temp_chain = []

                source = row['SOURCE_SSFL']
                dest = row['DESTINATION_SSFL']
                to_switch = row.get('DESTINATION_SWITCH_ID', None)
                temp_chain.append([current_from_switch, to_switch, source, dest])
                processed_switches.add(current_from_switch)

                # Begin chain traversal
                visited_ssfl = set()

                while True:
                    next_match = htcable[htcable['SOURCE_SSFL'] == dest]

                    if next_match.empty or dest in visited_ssfl:
                        break

                    next_row = next_match.iloc[0]
                    source = next_row['SOURCE_SSFL']
                    dest = next_row['DESTINATION_SSFL']

                    if source == dest:
                        break  # Prevent self-loop

                    visited_ssfl.add(source)

                    current_from_switch = next_row['SOURCE_SWITCH_ID']
                    to_switch = next_row.get('DESTINATION_SWITCH_ID', None)

                    temp_chain.append([current_from_switch, to_switch, source, dest])
                    processed_switches.add(current_from_switch)

                # Add this chain to final results
                for item in temp_chain:
                    source_switch = item[0]
                    dest_switch = item[1]
                    source_ssfl = item[2]
                    dest_ssfl = item[3]
                    combined_id = f"{source_switch}-{dest_switch}"
                    final_results.loc[len(final_results)] = [combined_id, source_ssfl, dest_ssfl, dest_ssfl]

# Save combined results
final_results.to_excel("scada_htcable_chain_all_folders.xlsx", index=False)
print(f"\n Done! Saved {len(final_results)} rows from all folders to scada_htcable_chain_all_folders.xlsx")


CREATE UNIQUE CHAINS TO REMOVE DUPLICATES 

In [5]:
import pandas as pd
import os

# List all SCADA folders you want to process
target_folders = [
    "2-Year-data/200/200",
    "2-Year-data/200-400/200-400",
    "2-Year-data/400-600/400-600",
    "2-Year-data/600-759/600-759"
]

# Load HT cable file once and drop any exact duplicates
htcable = pd.read_csv("HTCABLE.csv").drop_duplicates()

# Prepare final results DataFrame
columns = ['SOURCE_SWITCH_ID-DESTINATION_SWITCH_ID', 'SOURCE_SSFL', 'DESTINATION_SSFL', 'SFL']
final_results = pd.DataFrame(columns=columns)

# To store only unique chains (row-level uniqueness)
unique_chains = set()

# Process each SCADA folder
for folder_path in target_folders:
    print(f" Processing folder: {folder_path}")

    # Loop through all SCADA CSV files in the folder
    for file in os.listdir(folder_path):
        if not file.lower().endswith(".csv"):
            continue  # Skip non-CSV files

        scada_file = os.path.join(folder_path, file)
        print(f"    Processing SCADA file: {file}")
        
        try:
            scada = pd.read_csv(scada_file)
        except Exception as e:
            print(f"    Error reading {file}: {e}")
            continue

        if 'SWNO' not in scada.columns:
            print(f"    Missing 'SWNO' column in {file}")
            continue

        processed_switches = set()

        # Traverse each Swno
        for swno in scada['SWNO'].unique():
            if swno in processed_switches:
                continue

            current_from_switch = swno
            first_match = htcable[htcable['SOURCE_SWITCH_ID'] == current_from_switch]

            if first_match.empty:
                continue

            for _, row in first_match.iterrows():
                temp_chain = []

                source = row['SOURCE_SSFL']
                dest = row['DESTINATION_SSFL']
                to_switch = row.get('DESTINATION_SWITCH_ID', None)
                temp_chain.append([current_from_switch, to_switch, source, dest])
                processed_switches.add(current_from_switch)

                # Begin chain traversal
                visited_ssfl = set()

                while True:
                    next_match = htcable[htcable['SOURCE_SSFL'] == dest]

                    if next_match.empty or dest in visited_ssfl:
                        break

                    next_row = next_match.iloc[0]
                    source = next_row['SOURCE_SSFL']
                    dest = next_row['DESTINATION_SSFL']

                    if source == dest:
                        break  # Prevent self-loop

                    visited_ssfl.add(source)

                    current_from_switch = next_row['SOURCE_SWITCH_ID']
                    to_switch = next_row.get('DESTINATION_SWITCH_ID', None)

                    temp_chain.append([current_from_switch, to_switch, source, dest])
                    processed_switches.add(current_from_switch)

                # Add only unique chain links to final results
                for item in temp_chain:
                    chain_tuple = tuple(item)
                    if chain_tuple in unique_chains:
                        continue  # Skip duplicate
                    unique_chains.add(chain_tuple)

                    source_switch = item[0]
                    dest_switch = item[1]
                    source_ssfl = item[2]
                    dest_ssfl = item[3]
                    combined_id = f"{source_switch}-{dest_switch}"

                    final_results.loc[len(final_results)] = [combined_id, source_ssfl, dest_ssfl, dest_ssfl]

# Save combined results
final_results.to_excel("scada_htcable_chain_all_folders_deduped.xlsx", index=False)
print(f"\n Done! Saved {len(final_results)} unique rows to scada_htcable_chain_all_folders_deduped.xlsx")


  htcable = pd.read_csv("HTCABLE.csv").drop_duplicates()


 Processing folder: 2-Year-data/200/200
    Processing SCADA file: 2025-05-07_SCADA000000000182.csv


  scada = pd.read_csv(scada_file)


    Processing SCADA file: 2025-05-07_SCADA000000000183.csv


  scada = pd.read_csv(scada_file)


    Processing SCADA file: 2025-05-07_SCADA000000000184.csv


  scada = pd.read_csv(scada_file)


KeyboardInterrupt: 

SANMPLE OF ABOVE SCRIPT IN 100 LINES

In [7]:
import pandas as pd
import os

# List all SCADA folders you want to process
target_folders = [
    "2-Year-data/200/200",
    "2-Year-data/200-400/200-400",
    "2-Year-data/400-600/400-600",
    "2-Year-data/600-759/600-759"
]

# Load HT cable file once and drop any exact duplicates
htcable = pd.read_csv("HTCABLE.csv").drop_duplicates()

# Prepare final results DataFrame
columns = ['SOURCE_SWITCH_ID-DESTINATION_SWITCH_ID', 'SOURCE_SSFL', 'DESTINATION_SSFL', 'SFL']
final_results = pd.DataFrame(columns=columns)

# Track unique chain links
unique_chains = set()

# Row limit
ROW_LIMIT = 200

# Flag to exit outer loops
done = False

for folder_path in target_folders:
    if done:
        break
    print(f"Processing folder: {folder_path}")

    for file in os.listdir(folder_path):
        if done:
            break
        if not file.lower().endswith(".csv"):
            continue

        scada_file = os.path.join(folder_path, file)
        print(f"    Processing SCADA file: {file}")

        try:
            scada = pd.read_csv(scada_file)
        except Exception as e:
            print(f"   Error reading {file}: {e}")
            continue

        if 'SWNO' not in scada.columns:
            print(f"   Missing 'SWNO' column in {file}")
            continue

        processed_switches = set()

        for swno in scada['SWNO'].unique():
            if done:
                break
            if swno in processed_switches:
                continue

            current_from_switch = swno
            first_match = htcable[htcable['SOURCE_SWITCH_ID'] == current_from_switch]

            if first_match.empty:
                continue

            for _, row in first_match.iterrows():
                temp_chain = []

                source = row['SOURCE_SSFL']
                dest = row['DESTINATION_SSFL']
                to_switch = row.get('DESTINATION_SWITCH_ID', None)
                temp_chain.append([current_from_switch, to_switch, source, dest])
                processed_switches.add(current_from_switch)

                visited_ssfl = set()

                while True:
                    next_match = htcable[htcable['SOURCE_SSFL'] == dest]

                    if next_match.empty or dest in visited_ssfl:
                        break

                    next_row = next_match.iloc[0]
                    source = next_row['SOURCE_SSFL']
                    dest = next_row['DESTINATION_SSFL']

                    if source == dest:
                        break

                    visited_ssfl.add(source)

                    current_from_switch = next_row['SOURCE_SWITCH_ID']
                    to_switch = next_row.get('DESTINATION_SWITCH_ID', None)
                    temp_chain.append([current_from_switch, to_switch, source, dest])
                    processed_switches.add(current_from_switch)

                # Add unique items to result
                for item in temp_chain:
                    chain_tuple = tuple(item)
                    if chain_tuple in unique_chains:
                        continue

                    unique_chains.add(chain_tuple)
                    source_switch = item[0]
                    dest_switch = item[1]
                    source_ssfl = item[2]
                    dest_ssfl = item[3]
                    combined_id = f"{source_switch}-{dest_switch}"

                    final_results.loc[len(final_results)] = [combined_id, source_ssfl, dest_ssfl, dest_ssfl]

                    # Stop after 200 rows
                    if len(final_results) >= ROW_LIMIT:
                        done = True
                        break

# Save the limited results
final_results.to_excel("scada_htcable_chain_200_rows.xlsx", index=False)
print(f"\n Done! Saved {len(final_results)} rows to scada_htcable_chain_200_rows.xlsx")


  htcable = pd.read_csv("HTCABLE.csv").drop_duplicates()


Processing folder: 2-Year-data/200/200
    Processing SCADA file: 2025-05-07_SCADA000000000182.csv


  scada = pd.read_csv(scada_file)



 Done! Saved 201 rows to scada_htcable_chain_200_rows.xlsx


NEW LOGIC HANDLE THE MULTOLE SAME ROWS THA ARE IN THE HTCABLE FILE

In [None]:
import pandas as pd
import os


# 1.  PRE-PROCESS THE HT-CABLE FILE

cols_to_drop = ["COMMENTS", "GLOBALID", "MEASUREDLENGTH",
                "Unnamed: 0", "OBJECTID"]

htcable = (
    pd.read_csv("HTCABLE.csv", low_memory=False)
      .drop(columns=cols_to_drop, errors="ignore")            # throw away noisy cols
      .drop_duplicates(subset=["SOURCE_SWITCH_ID",
                               "DESTINATION_SWITCH_ID",
                               "SOURCE_SSFL",
                               "DESTINATION_SSFL"])           # keep only unique edges
      .reset_index(drop=True)
)


# 2.  PREPARE OUTPUT CONTAINER

out_cols = ["SOURCE_SWITCH_ID-DESTINATION_SWITCH_ID",
            "SOURCE_SSFL", "DESTINATION_SSFL", "SFL"]
final_results = pd.DataFrame(columns=out_cols)
unique_edges = set()              # guarantees row-level uniqueness


# 3.  SCADA → HT-CABLE WALK

target_folders = [
    "/media/sagarkumar/New Volume1/SAGAR/200/200",
    "/media/sagarkumar/New Volume1/SAGAR/200-400/200-400",
    "/media/sagarkumar/New Volume1/SAGAR/400-600/400-600",
    "/media/sagarkumar/New Volume1/SAGAR/600-759/600-759"
]

for folder in target_folders:
    print(f"  {folder}")
    for f in filter(lambda x: x.lower().endswith(".csv"), os.listdir(folder)):
        scada_path = os.path.join(folder, f)
        try:
            scada = pd.read_csv(scada_path, low_memory=False)
        except Exception as e:
            print(f"   ↳ skipped {f}: {e}")
            continue
        if "SWNO" not in scada.columns:
            print(f"   ↳ skipped {f}: no SWNO column")
            continue

        seen_swno = set()         # don’t start the same chain twice

        # ── for each SW in the SCADA file
        for swno in scada["SWNO"].unique():
            if swno in seen_swno:
                continue
            seen_swno.add(swno)

            # first leg(s) : SOURCE_SWITCH_ID == SCADA swno
            first_legs = htcable[htcable["SOURCE_SWITCH_ID"] == swno]
            if first_legs.empty:        # nothing to start with
                continue

            for _, first in first_legs.iterrows():
                chain = []                              # holds one complete path
                visited_ssfl = set()                    # loop guard

                # push the very first edge
                chain.append(first)

                # walk forward
                current_dest_ssfl = first["DESTINATION_SSFL"]
                while True:
                    if current_dest_ssfl in visited_ssfl:
                        break
                    visited_ssfl.add(current_dest_ssfl)

                    # Grab *all* next legs that start from this SSFL
                    next_edges = (htcable
                                  [htcable["SOURCE_SSFL"] == current_dest_ssfl]
                                  .sort_values(["SOURCE_SWITCH_ID",
                                                "DESTINATION_SWITCH_ID"]))
                    if next_edges.empty:
                        break

                    for _, edge in next_edges.iterrows():
                        chain.append(edge)
                        current_dest_ssfl = edge["DESTINATION_SSFL"]

                    # and loop again with the **last** destination just appended

                # dump chain to output, edge by edge
                for edge in chain:
                    tup = (edge["SOURCE_SWITCH_ID"],
                           edge["DESTINATION_SWITCH_ID"],
                           edge["SOURCE_SSFL"],
                           edge["DESTINATION_SSFL"])
                    if tup in unique_edges:     # skip dup across all folders
                        continue
                    unique_edges.add(tup)

                    final_results.loc[len(final_results)] = [
                        f"{edge['SOURCE_SWITCH_ID']}-{edge['DESTINATION_SWITCH_ID']}",
                        edge["SOURCE_SSFL"],
                        edge["DESTINATION_SSFL"],
                        edge["DESTINATION_SSFL"]   # per your original spec
                    ]


# 4.  SAVE THE MERGED CHAINS

out_file = "scada_htcable_chain_all_folders_deduped.xlsx"
final_results.to_excel(out_file, index=False)
print(f"\n  Done – {len(final_results)} unique rows written to {out_file}")


SAMLE OF ABOVE CODE

In [None]:
import pandas as pd
import os

# 1.  PRE-PROCESS THE HT-CABLE FILE
cols_to_drop = ["COMMENTS", "GLOBALID", "MEASUREDLENGTH",
                "Unnamed: 0", "OBJECTID"]

try:
    htcable = (
        pd.read_csv("HTCABLE.csv", low_memory=False)
          .drop(columns=cols_to_drop, errors="ignore")      # throw away noisy cols
          .drop_duplicates(subset=["SOURCE_SWITCH_ID",
                                    "DESTINATION_SWITCH_ID",
                                    "SOURCE_SSFL",
                                    "DESTINATION_SSFL"])      # keep only unique edges
          .reset_index(drop=True)
    )
except FileNotFoundError:
    print("Error: HTCABLE.csv not found. Please ensure the file is in the correct directory.")
    exit()
except Exception as e:
    print(f"Error reading or processing HTCABLE.csv: {e}")
    exit()


# 2.  PREPARE OUTPUT CONTAINER
out_cols = ["SOURCE_SWITCH_ID-DESTINATION_SWITCH_ID",
            "SOURCE_SSFL", "DESTINATION_SSFL", "SFL"]
final_results = pd.DataFrame(columns=out_cols)
unique_edges = set()          # guarantees row-level uniqueness
output_limit = 10000 # Define the output limit
limit_reached = False # Flag to indicate if the limit has been reached

# 3.  SCADA → HT-CABLE WALK
target_folders = [
    "/media/sagarkumar/New Volume/SAGAR/200/200",
    "/media/sagarkumar/New Volume/SAGAR/200-400/200-400",
    "/media/sagarkumar/New Volume/SAGAR/400-600/400-600",
    "/media/sagarkumar/New Volume/SAGAR/600-759/600-759"
]

for folder in target_folders:
    if limit_reached:
        break
    print(f"folder  {folder}")
    # Check if the folder exists
    if not os.path.isdir(folder):
        print(f"  ↳ skipped folder {folder}: does not exist or is not a directory")
        continue

    for f in filter(lambda x: x.lower().endswith(".csv"), os.listdir(folder)):
        if limit_reached:
            break
        scada_path = os.path.join(folder, f)
        try:
            scada = pd.read_csv(scada_path, low_memory=False)
        except Exception as e:
            print(f"    ↳ skipped {f}: {e}")
            continue
        if "SWNO" not in scada.columns:
            print(f"    ↳ skipped {f}: no SWNO column")
            continue

        seen_swno = set()      # don’t start the same chain twice

        # ── for each SW in the SCADA file
        for swno in scada["SWNO"].unique():
            if limit_reached:
                break
            if swno in seen_swno:
                continue
            seen_swno.add(swno)

            # first leg(s) : SOURCE_SWITCH_ID == SCADA swno
            first_legs = htcable[htcable["SOURCE_SWITCH_ID"] == swno]
            if first_legs.empty:      # nothing to start with
                continue

            for _, first in first_legs.iterrows():
                if limit_reached:
                    break
                chain = []                                  # holds one complete path
                visited_ssfl = set()                        # loop guard

                # push the very first edge
                chain.append(first)

                # walk forward
                current_dest_ssfl = first["DESTINATION_SSFL"]
                while True: # Inner loop for walking the chain
                    if current_dest_ssfl in visited_ssfl:
                        break # Break from inner while loop (loop guard)
                    visited_ssfl.add(current_dest_ssfl)

                    # Grab *all* next legs that start from this SSFL
                    next_edges = (htcable
                                  [htcable["SOURCE_SSFL"] == current_dest_ssfl]
                                  .sort_values(["SOURCE_SWITCH_ID",
                                                "DESTINATION_SWITCH_ID"]))
                    if next_edges.empty:
                        break # Break from inner while loop (no more edges)

                    # In the original script, it iterates through all next_edges and appends the last one's destination.
                    # This logic implies that a chain can branch, but the original code only follows the last branch.
                    # For simplicity and to match the original apparent intent, we'll take the last edge from next_edges.
                    # If multiple branches need to be explored independently, the logic here would need significant changes (e.g., recursion or a stack).

                    # Append all edges found from this SSFL (if this was the intent for multiple branches)
                    # For now, let's stick to the original logic where current_dest_ssfl is updated by the last edge in next_edges.
                    last_edge_appended_to_chain = False
                    for _, edge in next_edges.iterrows():
                        chain.append(edge)
                        current_dest_ssfl = edge["DESTINATION_SSFL"] # This will be updated multiple times if next_edges has more than one row.
                                                                    # The while loop then continues with the DESTINATION_SSFL of the *last* edge.
                        last_edge_appended_to_chain = True

                    if not last_edge_appended_to_chain: # Should not happen if next_edges was not empty
                        break
                    # and loop again with the **last** destination just appended

                # dump chain to output, edge by edge
                for edge in chain:
                    if len(final_results) >= output_limit:
                        limit_reached = True
                        break # Break from chain dump loop

                    tup = (edge["SOURCE_SWITCH_ID"],
                           edge["DESTINATION_SWITCH_ID"],
                           edge["SOURCE_SSFL"],
                           edge["DESTINATION_SSFL"])
                    if tup in unique_edges:      # skip dup across all folders
                        continue
                    unique_edges.add(tup)

                    final_results.loc[len(final_results)] = [
                        f"{edge['SOURCE_SWITCH_ID']}-{edge['DESTINATION_SWITCH_ID']}",
                        edge["SOURCE_SSFL"],
                        edge["DESTINATION_SSFL"],
                        edge["DESTINATION_SSFL"]  # per your original spec
                    ]
                if limit_reached: # propagate break
                    break # from first_legs loop
            if limit_reached: # propagate break
                break # from swno loop
        if limit_reached: # propagate break
            break # from files loop
    if limit_reached: # propagate break
        break # from folders loop


# 4.  SAVE THE MERGED CHAINS
out_file = "scada_htcable_chain_limited_output.xlsx"
try:
    final_results.to_excel(out_file, index=False)
    print(f"\n  Done – {len(final_results)} rows written to {out_file}")
    if limit_reached and len(final_results) >= output_limit:
        print(f"Output was limited to {output_limit} rows.")
except Exception as e:
    print(f"Error writing to Excel file {out_file}: {e}")



folder  /media/sagarkumar/New Volume/SAGAR/200/200


In [None]:
import pandas as pd
import os
from concurrent.futures import ProcessPoolExecutor, as_completed

# (Assume `htcable` is already loaded as before.)

def main():
    data_folders = [
        "/media/sagarkumar/New Volume/SAGAR/200/200",
        "/media/sagarkumar/New Volume/SAGAR/200-400/200-400",
        "/media/sagarkumar/New Volume/SAGAR/400-600/400-600",
        "/media/sagarkumar/New Volume/SAGAR/600-759/600-759",
    ]

    # 1) Gather all CSV paths
    all_scada_paths = []
    for folder in data_folders:
        if not os.path.isdir(folder):
            print(f"↳ Skipping nonexistent folder {folder}")
            continue
        for fname in os.listdir(folder):
            if fname.lower().endswith(".csv"):
                all_scada_paths.append(os.path.join(folder, fname))

    if not all_scada_paths:
        print("No SCADA files found. Exiting.")
        return

    # 2) Kick off parallel workers
    max_workers = min(os.cpu_count() or 1, 8)
    futures = []
    with ProcessPoolExecutor(max_workers=max_workers) as exe:
        for scada_path in all_scada_paths:
            futures.append(
                exe.submit(process_single_scada_file, scada_path, htcable)
            )

        # 3) As each worker finishes, collect its rows
        all_partial = []
        for fut in as_completed(futures):
            try:
                rows = fut.result()
            except Exception as e:
                print("  ↳ Worker error:", e)
                continue
            if rows:
                all_partial.extend(rows)

    # 4) Globally dedupe + enforce 20 000‐row limit
    unique_global = set()
    final_rows = []
    limit = 20000
    for row in all_partial:
        if len(final_rows) >= limit:
            break
        src, dst = row["SOURCE_SWITCH_ID-DESTINATION_SWITCH_ID"].split("-")
        tup = (int(src), int(dst),
               float(row["SOURCE_SSFL"]),
               float(row["DESTINATION_SSFL"]))
        if tup in unique_global:
            continue
        unique_global.add(tup)
        final_rows.append(row)

    # 5) Build DataFrame & write Excel
    df_out = pd.DataFrame(final_rows, columns=[
        "SOURCE_SWITCH_ID-DESTINATION_SWITCH_ID",
        "SOURCE_SSFL", "DESTINATION_SSFL", "SFL"
    ])
    df_out.to_excel("scada_htcable_chain_limited_outputp.xlsx", index=False)
    print(f"Exported {len(df_out)} rows.{' (LIMITED to 20 000.)' if len(df_out) >= limit else ''}")

if __name__ == "__main__":
    main()
