Restarted .conda (Python 3.9.19)

In [1]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [2]:
try:
    # pylint: disable=undefined-variable line-too-long invalid-name
    fileAddr = __vsc_ipynb_file__
    wd = os.path.dirname(fileAddr)
    print("We seem to be working in a JuPyteR Notebook")
except ImportError:
    wd = os.getcwd()
    print("We seem to be working in a regular .py file")


rawDataFolder = os.path.join(wd, "rawData")
processedDataFolder = os.path.join(wd, "processedData/")

We seem to be working in a JuPyteR Notebook


In [3]:
tadsFileAddr = os.path.join(rawDataFolder, "TADS 2024 AC Inventory.csv")
dfTads0 = pd.read_csv(tadsFileAddr)
sizeTads0 = dfTads0.shape
print(f"Size of TADS db before filtering: {sizeTads0[0]}, {sizeTads0[1]}")
companyNamesTads0 = set(dfTads0.CompanyName)
numCompaniesTads0 = len(companyNamesTads0)
print(f"There are {numCompaniesTads0} unique companies owning tlines in the entire TADS database.")
# display(dftads)

Size of TADS db before filtering: 301152, 47
There are 304 unique companies owning tlines in the entire TADS database.


  dfTads0 = pd.read_csv(tadsFileAddr)


In [4]:
location = "chicago-ohare"
veloFileAddr = os.path.join(rawDataFolder, "tlines-near-chicago-ohare-raw.xlsx") # tlines which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\rawData\tlines-near-chicago-ohare-raw.xlsx
Size of velocity suite db before any filtering: 524, 21


In [5]:
# Filter rows with 'Undetermined Company`
# dfVelo = dfVelo0[ dfVelo0['Company Name'] != 'Undetermined Company' ]
# Filter tlines with less than 100kV voltage
dfVelo = dfVelo0.copy()
dfVelo = dfVelo[ dfVelo['Voltage kV'] >= 100 ]
# Filter tlines not currently in service
dfVelo = dfVelo[ dfVelo['Proposed'] == 'In Service']

sizeVelo = dfVelo.shape
print(f"Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': {sizeVelo[0]}, {sizeVelo[1]}")
companyNamesVelo = set(dfVelo['Company Name'])
numCompaniesVelo = len(companyNamesVelo)
print(f"There are {numCompaniesVelo} named companies owning the tlines near {location}")
print(f"Their names are:")
print(companyNamesVelo)
# dfVelo

Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': 459, 21
There are 6 named companies owning the tlines near chicago-ohare
Their names are:
{'Undetermined Company', 'Northern Indiana Public Service Co LLC', 'Northern Municipal Power Agency', 'AmerenIP', 'American Transmission Co LLC', 'Commonwealth Edison Co'}


In [6]:
print(f"Now let's see how many tlines are owned by these {numCompaniesVelo} "       "companies in the entire TADS database:")

print(""f"But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.")

companyNamesVelo2Tads = companyNamesVelo.copy()  # Create a copy to avoid modifying the original

# Replace the element using the 'discard' method (more efficient for sets)
companyNamesVelo2Tads.discard("Commonwealth Edison Co")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
companyNamesVelo2Tads.discard("AmerenIP")
companyNamesVelo2Tads.add("Ameren Services Company")
companyNamesVelo2Tads.discard("American Transmission Co LLC")
companyNamesVelo2Tads.add("American Transmission Company")
companyNamesVelo2Tads.discard("Northern Indiana Public Service Co LLC")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Northern Municipal Power Agency")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Undetermined Company")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
print(companyNamesVelo2Tads)

dfVeloSorted = sort_and_shift_columns_dfVelo(dfVelo)

veloSortedAddr = os.path.join(processedDataFolder, "dfVelo-Chicago-Ohare-Sorted.xlsx")
dfVeloSorted.to_excel(veloSortedAddr)

Now let's see how many tlines are owned by these 6 companies in the entire TADS database:
But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.
{'American Transmission Company', 'Commonwealth Edison Company', 'Ameren Services Company', 'Northern Indiana Public Service Company [BA'}


In [7]:
dfTads = dfTads0.copy()
dfTads = dfTads[dfTads['CompanyName'].isin(companyNamesVelo2Tads)]
voltageClassesTads0 = set(dfTads['VoltageClassCodeName'])
print(voltageClassesTads0)
voltageClassesAllowedTads = voltageClassesTads0.copy()
voltageClassesAllowedTads.discard("0-99 kV")

dfTads = dfTads[dfTads['VoltageClassCodeName'].isin(voltageClassesAllowedTads)]

sizeTads = dfTads.shape
print(f"Size of TADS db after filtering: {sizeTads[0]}, {sizeTads[1]}")

dfTadsSorted = sort_and_shift_columns(dfTads)

tadsSortedAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Sorted.xlsx")

dfTadsSorted.to_excel(tadsSortedAddr, index=False)

# dfTadsLatest = filter_tlines_by_latest_reported_year(dfTadsSorted)
dfTadsLatest = get_latest_entries(dfTadsSorted)

sizeTadsLatest = dfTadsLatest.shape

print(f"Size of TADS db after filtering for only latest reported year: {sizeTadsLatest[0]}, {sizeTadsLatest[1]}")

tadsLatestAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Latest.xlsx")

dfTadsLatest.to_excel(tadsLatestAddr)

{'200-299 kV', '100-199 kV', '600-799 kV', '300-399 kV'}
Size of TADS db after filtering: 16052, 47
Size of TADS db after filtering for only latest reported year: 1705, 47


In [8]:
dfMatch = get_matched_entries(dfVeloSorted, dfTadsLatest)
matchAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Matched.xlsx")
dfMatch.to_excel(matchAddr)

In [9]:
dfMatch

Unnamed: 0,FromBus,ToBus,ReportingYearNbr,InventoryDataDetailID,InventoryDataID,CompanyName,CompanyCode,NERCID,NERCID_AliasID,RegionCode,...,ExtractionDT,UpdateDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,Slicer,AliasID,IsCurrent,Rec_ID
28729,Aetna,Lake George,2024,113936,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,636757,1,9259 | 113936 | 2024,0x10ED02D26825C003EE3B8BB374B3D856,1,60847
28739,Aetna,Miller,2024,113983,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,642142,1,9259 | 113983 | 2024,0xBB81DBE68DB618667B9650E57C7267EA,1,22650
184865,Libertyville,Aptakisic,2024,118818,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650912,1,9400 | 118818 | 2024,0xAF15167B2979EF5C2EDF9A7BA84F1C01,1,2486
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,55397
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,Zion,Northbrook,2014,31789,5803,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,10:05.5,08:34.2,,00:00.9,216720,1,5803 | 31789 | 2014,0xD7BAD6B5D071292FD40F898CABBC9677,1,55407
292165,Waukegan,Zion,2024,118840,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650849,1,9400 | 118840 | 2024,0x128D3A78E37B1B206191C78D9B5D7C4C,1,60900
300914,Zion,Waukegan,2024,119042,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625328,1,9402 | 119042 | 2024,0xD7BAD6B5D071292FD40F898CABBC9677,1,60900
184890,Libertyville,Zion Energy Center,2024,119100,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625419,1,9402 | 119100 | 2024,0xC81DBDBE35DD273C67BADAA182719325,1,2431


In [10]:
dfMatch['Rec_ID']

28729     60847
28739     22650
184865     2486
118743    55397
118743    69508
          ...  
300894    55407
292165    60900
300914    60900
184890     2431
300927     2430
Name: Rec_ID, Length: 127, dtype: int64

Connected to .conda (Python 3.9.19)

In [1]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [2]:
try:
    # pylint: disable=undefined-variable line-too-long invalid-name
    fileAddr = __vsc_ipynb_file__
    wd = os.path.dirname(fileAddr)
    print("We seem to be working in a JuPyteR Notebook")
except ImportError:
    wd = os.getcwd()
    print("We seem to be working in a regular .py file")


rawDataFolder = os.path.join(wd, "rawData")
processedDataFolder = os.path.join(wd, "processedData/")

We seem to be working in a JuPyteR Notebook


In [3]:
tadsFileAddr = os.path.join(rawDataFolder, "TADS 2024 AC Inventory.csv")
dfTads0 = pd.read_csv(tadsFileAddr)
sizeTads0 = dfTads0.shape
print(f"Size of TADS db before filtering: {sizeTads0[0]}, {sizeTads0[1]}")
companyNamesTads0 = set(dfTads0.CompanyName)
numCompaniesTads0 = len(companyNamesTads0)
print(f"There are {numCompaniesTads0} unique companies owning tlines in the entire TADS database.")
# display(dftads)

Size of TADS db before filtering: 301152, 47
There are 304 unique companies owning tlines in the entire TADS database.


  dfTads0 = pd.read_csv(tadsFileAddr)


In [4]:
location = "chicago-ohare"
veloFileAddr = os.path.join(rawDataFolder, "tlines-near-chicago-ohare-raw.xlsx") # tlines which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\rawData\tlines-near-chicago-ohare-raw.xlsx
Size of velocity suite db before any filtering: 524, 21


In [5]:
# Filter rows with 'Undetermined Company`
# dfVelo = dfVelo0[ dfVelo0['Company Name'] != 'Undetermined Company' ]
# Filter tlines with less than 100kV voltage
dfVelo = dfVelo0.copy()
dfVelo = dfVelo[ dfVelo['Voltage kV'] >= 100 ]
# Filter tlines not currently in service
dfVelo = dfVelo[ dfVelo['Proposed'] == 'In Service']

sizeVelo = dfVelo.shape
print(f"Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': {sizeVelo[0]}, {sizeVelo[1]}")
companyNamesVelo = set(dfVelo['Company Name'])
numCompaniesVelo = len(companyNamesVelo)
print(f"There are {numCompaniesVelo} named companies owning the tlines near {location}")
print(f"Their names are:")
print(companyNamesVelo)
# dfVelo

Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': 459, 21
There are 6 named companies owning the tlines near chicago-ohare
Their names are:
{'American Transmission Co LLC', 'Undetermined Company', 'Northern Municipal Power Agency', 'AmerenIP', 'Northern Indiana Public Service Co LLC', 'Commonwealth Edison Co'}


In [6]:
print(f"Now let's see how many tlines are owned by these {numCompaniesVelo} "       "companies in the entire TADS database:")

print(""f"But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.")

companyNamesVelo2Tads = companyNamesVelo.copy()  # Create a copy to avoid modifying the original

# Replace the element using the 'discard' method (more efficient for sets)
companyNamesVelo2Tads.discard("Commonwealth Edison Co")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
companyNamesVelo2Tads.discard("AmerenIP")
companyNamesVelo2Tads.add("Ameren Services Company")
companyNamesVelo2Tads.discard("American Transmission Co LLC")
companyNamesVelo2Tads.add("American Transmission Company")
companyNamesVelo2Tads.discard("Northern Indiana Public Service Co LLC")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Northern Municipal Power Agency")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Undetermined Company")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
print(companyNamesVelo2Tads)

dfVeloSorted = sort_and_shift_columns_dfVelo(dfVelo)

veloSortedAddr = os.path.join(processedDataFolder, "dfVelo-Chicago-Ohare-Sorted.xlsx")
dfVeloSorted.to_excel(veloSortedAddr)

Now let's see how many tlines are owned by these 6 companies in the entire TADS database:
But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.
{'Northern Indiana Public Service Company [BA', 'American Transmission Company', 'Ameren Services Company', 'Commonwealth Edison Company'}


In [7]:
dfTads = dfTads0.copy()
dfTads = dfTads[dfTads['CompanyName'].isin(companyNamesVelo2Tads)]
voltageClassesTads0 = set(dfTads['VoltageClassCodeName'])
print(voltageClassesTads0)
voltageClassesAllowedTads = voltageClassesTads0.copy()
voltageClassesAllowedTads.discard("0-99 kV")

dfTads = dfTads[dfTads['VoltageClassCodeName'].isin(voltageClassesAllowedTads)]

sizeTads = dfTads.shape
print(f"Size of TADS db after filtering: {sizeTads[0]}, {sizeTads[1]}")

dfTadsSorted = sort_and_shift_columns(dfTads)

tadsSortedAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Sorted.xlsx")

dfTadsSorted.to_excel(tadsSortedAddr, index=False)

# dfTadsLatest = filter_tlines_by_latest_reported_year(dfTadsSorted)
dfTadsLatest = get_latest_entries(dfTadsSorted)

sizeTadsLatest = dfTadsLatest.shape

print(f"Size of TADS db after filtering for only latest reported year: {sizeTadsLatest[0]}, {sizeTadsLatest[1]}")

tadsLatestAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Latest.xlsx")

dfTadsLatest.to_excel(tadsLatestAddr)

{'600-799 kV', '200-299 kV', '300-399 kV', '100-199 kV'}
Size of TADS db after filtering: 16052, 47
Size of TADS db after filtering for only latest reported year: 1705, 47


In [8]:
dfMatch = get_matched_entries(dfVeloSorted, dfTadsLatest)
matchAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Matched.xlsx")
dfMatch.to_excel(matchAddr)

In [9]:
dfMatch

Unnamed: 0,FromBus,ToBus,ReportingYearNbr,InventoryDataDetailID,InventoryDataID,CompanyName,CompanyCode,NERCID,NERCID_AliasID,RegionCode,...,ExtractionDT,UpdateDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,Slicer,AliasID,IsCurrent,Rec_ID
28729,Aetna,Lake George,2024,113936,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,636757,1,9259 | 113936 | 2024,0x10ED02D26825C003EE3B8BB374B3D856,1,60847
28739,Aetna,Miller,2024,113983,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,642142,1,9259 | 113983 | 2024,0xBB81DBE68DB618667B9650E57C7267EA,1,22650
184865,Libertyville,Aptakisic,2024,118818,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650912,1,9400 | 118818 | 2024,0xAF15167B2979EF5C2EDF9A7BA84F1C01,1,2486
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,55397
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,Zion,Northbrook,2014,31789,5803,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,10:05.5,08:34.2,,00:00.9,216720,1,5803 | 31789 | 2014,0xD7BAD6B5D071292FD40F898CABBC9677,1,55407
292165,Waukegan,Zion,2024,118840,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650849,1,9400 | 118840 | 2024,0x128D3A78E37B1B206191C78D9B5D7C4C,1,60900
300914,Zion,Waukegan,2024,119042,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625328,1,9402 | 119042 | 2024,0xD7BAD6B5D071292FD40F898CABBC9677,1,60900
184890,Libertyville,Zion Energy Center,2024,119100,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625419,1,9402 | 119100 | 2024,0xC81DBDBE35DD273C67BADAA182719325,1,2431


In [10]:
dfMatch['Rec_ID']

28729     60847
28739     22650
184865     2486
118743    55397
118743    69508
          ...  
300894    55407
292165    60900
300914    60900
184890     2431
300927     2430
Name: Rec_ID, Length: 127, dtype: int64

In [11]:
dfMatch["RetirementDate"]

28729     NaN
28739     NaN
184865    NaN
118743    NaN
118743    NaN
         ... 
300894    NaN
292165    NaN
300914    NaN
184890    NaN
300927    NaN
Name: RetirementDate, Length: 127, dtype: object

In [12]:
dfMatch['CompanyName']

28729     Northern Indiana Public Service Company [BA
28739     Northern Indiana Public Service Company [BA
184865                    Commonwealth Edison Company
118743                    Commonwealth Edison Company
118743                    Commonwealth Edison Company
                             ...                     
300894                    Commonwealth Edison Company
292165                    Commonwealth Edison Company
300914                    Commonwealth Edison Company
184890                    Commonwealth Edison Company
300927                    Commonwealth Edison Company
Name: CompanyName, Length: 127, dtype: object

In [13]:
dfMatch['RegionCode']

28729     RFC
28739     RFC
184865    RFC
118743    RFC
118743    RFC
         ... 
300894    RFC
292165    RFC
300914    RFC
184890    RFC
300927    RFC
Name: RegionCode, Length: 127, dtype: object

In [14]:
dfMatch['FromBus']

28729                 Aetna
28739                 Aetna
184865         Libertyville
118743    Electric Junction
118743    Electric Junction
                ...        
300894                 Zion
292165             Waukegan
300914                 Zion
184890         Libertyville
300927                 Zion
Name: FromBus, Length: 127, dtype: object

In [15]:
dfMatch['ToBus']

28729            Lake George
28739                 Miller
184865             Aptakisic
118743                Aurora
118743                Aurora
                 ...        
300894            Northbrook
292165                  Zion
300914              Waukegan
184890    Zion Energy Center
300927    Zion Energy Center
Name: ToBus, Length: 127, dtype: object

In [16]:
dfMatch['TertiaryBus']

28729     NaN
28739     NaN
184865    NaN
118743    NaN
118743    NaN
         ... 
300894    NaN
292165    NaN
300914    NaN
184890    NaN
300927    NaN
Name: TertiaryBus, Length: 127, dtype: object

In [17]:
import pandas as pd
import os

import pandas as pd


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing only specific columns.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with 'ElementIdentifierName' values)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - 'RegionCode'
            - 'FromBus'
            - 'ToBus'
            - 'TertiaryBus'
            - 'Miles'
            - 'BESExemptedFlag'
            - 'NumberOfTerminals'
            - 'CircuitTypeCode'
            - 'VoltageClassCodeName'
            - 'ParentCode'
            - 'ConductorsPerPhaseCode'
            - 'OverheadGroundWireCode'
            - 'InsulatorTypeCode'
            - 'CableTypeCode'
            - 'StructureMaterialCode'
            - 'StructureTypeCode'
            - 'CircuitsPerStructureCode'
            - 'TerrainCode'
            - 'ElevationCode'
            - 'InServiceDate'
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Fill the 'combo' column with 'ElementIdentifierName' values
    df_reduced["combo"] = df_reduced["ElementIdentifierName"]

    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [18]:
get_reduced_df(dfMatch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced["combo"] = df_reduced["ElementIdentifierName"]


Unnamed: 0,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,CircuitTypeCode,...,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID,combo
28729,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60847,138054
28739,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,22650,138102
184865,15410,Commonwealth Edison Company,RFC,Libertyville,Aptakisic,,10.133,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,2486,15410
118743,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,55397,11119
118743,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,69508,11119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/13 0:00,,55407,2218
292165,1609,Commonwealth Edison Company,RFC,Waukegan,Zion,,12.275,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60900,1609
300914,2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60900,2218
184890,15423,Commonwealth Edison Company,RFC,Libertyville,Zion Energy Center,,12.300,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,2431,15423


In [19]:
import pandas as pd
import os

import pandas as pd


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [20]:
get_reduced_df(dfMatch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced["combo"] = df_reduced.apply(


Unnamed: 0,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,CircuitTypeCode,...,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID,combo
28729,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60847,ACO - AC Overhead - Aetna - Lake George - 138054
28739,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,22650,ACO - AC Overhead - Aetna - Miller - 138102
184865,15410,Commonwealth Edison Company,RFC,Libertyville,Aptakisic,,10.133,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,2486,ACO - AC Overhead - Libertyville - Aptakisic -...
118743,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,55397,ACO - AC Overhead - Electric Junction - Aurora...
118743,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,69508,ACO - AC Overhead - Electric Junction - Aurora...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/13 0:00,,55407,ACO - AC Overhead - Zion - Northbrook - 2218
292165,1609,Commonwealth Edison Company,RFC,Waukegan,Zion,,12.275,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60900,ACO - AC Overhead - Waukegan - Zion - 1609
300914,2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60900,ACO - AC Overhead - Zion - Waukegan - 2218
184890,15423,Commonwealth Edison Company,RFC,Libertyville,Zion Energy Center,,12.300,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,2431,ACO - AC Overhead - Libertyville - Zion Energy...


In [21]:
import pandas as pd
import os

import pandas as pd


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Reorder columns with 'combo' as the first column
    df_reduced = df_reduced[["combo"] + list(df_reduced.filter(like='^((?!combo).)*$'))]

    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [22]:
get_reduced_df(dfMatch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced["combo"] = df_reduced.apply(


Unnamed: 0,combo
28729,ACO - AC Overhead - Aetna - Lake George - 138054
28739,ACO - AC Overhead - Aetna - Miller - 138102
184865,ACO - AC Overhead - Libertyville - Aptakisic -...
118743,ACO - AC Overhead - Electric Junction - Aurora...
118743,ACO - AC Overhead - Electric Junction - Aurora...
...,...
300894,ACO - AC Overhead - Zion - Northbrook - 2218
292165,ACO - AC Overhead - Waukegan - Zion - 1609
300914,ACO - AC Overhead - Zion - Waukegan - 2218
184890,ACO - AC Overhead - Libertyville - Zion Energy...


In [23]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Reorder columns with 'combo' as the first column
    df_reduced = df_reduced[["combo"] + list(df_reduced.filter(like='^((?!combo).)*$'))]

    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [24]:
get_reduced_df(dfMatch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced["combo"] = df_reduced.apply(


Unnamed: 0,combo
28729,ACO - AC Overhead - Aetna - Lake George - 138054
28739,ACO - AC Overhead - Aetna - Miller - 138102
184865,ACO - AC Overhead - Libertyville - Aptakisic -...
118743,ACO - AC Overhead - Electric Junction - Aurora...
118743,ACO - AC Overhead - Electric Junction - Aurora...
...,...
300894,ACO - AC Overhead - Zion - Northbrook - 2218
292165,ACO - AC Overhead - Waukegan - Zion - 1609
300914,ACO - AC Overhead - Zion - Waukegan - 2218
184890,ACO - AC Overhead - Libertyville - Zion Energy...


In [25]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Reorder columns with 'combo' as the first column
    df_reduced.insert(0, "combo", df_reduced.pop("combo"))

    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [26]:
df_reduced = get_reduced_df(dfMatch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced["combo"] = df_reduced.apply(


In [27]:
dfMatch1 = copy(dfMatch)

NameError: name 'copy' is not defined

In [28]:
dfMatch1 = dfMatch.copy

In [29]:
dfMatch1

<bound method NDFrame.copy of                   FromBus               ToBus  ReportingYearNbr  \
28729               Aetna         Lake George              2024   
28739               Aetna              Miller              2024   
184865       Libertyville           Aptakisic              2024   
118743  Electric Junction              Aurora              2024   
118743  Electric Junction              Aurora              2024   
...                   ...                 ...               ...   
300894               Zion          Northbrook              2014   
292165           Waukegan                Zion              2024   
300914               Zion            Waukegan              2024   
184890       Libertyville  Zion Energy Center              2024   
300927               Zion  Zion Energy Center              2024   

        InventoryDataDetailID  InventoryDataID  \
28729                  113936             9259   
28739                  113983             9259   
184865         

In [30]:
dfMatch1['Rec_ID']

TypeError: 'method' object is not subscriptable

In [31]:
class(dfMatch1)

SyntaxError: invalid syntax (<ipython-input-31-4ca358aa3b3b>, line 1)

In [32]:
type(dfMatch1)

method

In [33]:
dfMatch1 = dfMatch.copy()

In [34]:
dfMatch1

Unnamed: 0,FromBus,ToBus,ReportingYearNbr,InventoryDataDetailID,InventoryDataID,CompanyName,CompanyCode,NERCID,NERCID_AliasID,RegionCode,...,ExtractionDT,UpdateDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,Slicer,AliasID,IsCurrent,Rec_ID
28729,Aetna,Lake George,2024,113936,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,636757,1,9259 | 113936 | 2024,0x10ED02D26825C003EE3B8BB374B3D856,1,60847
28739,Aetna,Miller,2024,113983,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,642142,1,9259 | 113983 | 2024,0xBB81DBE68DB618667B9650E57C7267EA,1,22650
184865,Libertyville,Aptakisic,2024,118818,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650912,1,9400 | 118818 | 2024,0xAF15167B2979EF5C2EDF9A7BA84F1C01,1,2486
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,55397
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,Zion,Northbrook,2014,31789,5803,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,10:05.5,08:34.2,,00:00.9,216720,1,5803 | 31789 | 2014,0xD7BAD6B5D071292FD40F898CABBC9677,1,55407
292165,Waukegan,Zion,2024,118840,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650849,1,9400 | 118840 | 2024,0x128D3A78E37B1B206191C78D9B5D7C4C,1,60900
300914,Zion,Waukegan,2024,119042,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625328,1,9402 | 119042 | 2024,0xD7BAD6B5D071292FD40F898CABBC9677,1,60900
184890,Libertyville,Zion Energy Center,2024,119100,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625419,1,9402 | 119100 | 2024,0xC81DBDBE35DD273C67BADAA182719325,1,2431


In [35]:
dfMatch1.pop('combo')

KeyError: 'combo'

In [36]:
dfMatch1.pop("combo")

KeyError: 'combo'

In [37]:
dfMatch1.pop('Rec_ID')

28729     60847
28739     22650
184865     2486
118743    55397
118743    69508
          ...  
300894    55407
292165    60900
300914    60900
184890     2431
300927     2430
Name: Rec_ID, Length: 127, dtype: int64

In [38]:
dfMatch1

Unnamed: 0,FromBus,ToBus,ReportingYearNbr,InventoryDataDetailID,InventoryDataID,CompanyName,CompanyCode,NERCID,NERCID_AliasID,RegionCode,...,CreationDT,ExtractionDT,UpdateDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,Slicer,AliasID,IsCurrent
28729,Aetna,Lake George,2024,113936,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,59:52.2,05:07.9,00:01.0,,01:21.7,636757,1,9259 | 113936 | 2024,0x10ED02D26825C003EE3B8BB374B3D856,1
28739,Aetna,Miller,2024,113983,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,59:52.2,05:07.9,00:01.0,,01:21.7,642142,1,9259 | 113983 | 2024,0xBB81DBE68DB618667B9650E57C7267EA,1
184865,Libertyville,Aptakisic,2024,118818,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,33:22.9,05:07.9,00:01.0,,01:21.7,650912,1,9400 | 118818 | 2024,0xAF15167B2979EF5C2EDF9A7BA84F1C01,1
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,33:22.9,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,33:22.9,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,Zion,Northbrook,2014,31789,5803,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,14:43.7,10:05.5,08:34.2,,00:00.9,216720,1,5803 | 31789 | 2014,0xD7BAD6B5D071292FD40F898CABBC9677,1
292165,Waukegan,Zion,2024,118840,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,33:22.9,05:07.9,00:01.0,,01:21.7,650849,1,9400 | 118840 | 2024,0x128D3A78E37B1B206191C78D9B5D7C4C,1
300914,Zion,Waukegan,2024,119042,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,33:22.9,05:07.9,00:01.0,,01:21.7,625328,1,9402 | 119042 | 2024,0xD7BAD6B5D071292FD40F898CABBC9677,1
184890,Libertyville,Zion Energy Center,2024,119100,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,33:22.9,05:07.9,00:01.0,,01:21.7,625419,1,9402 | 119100 | 2024,0xC81DBDBE35DD273C67BADAA182719325,1


In [39]:
dfMatch

Unnamed: 0,FromBus,ToBus,ReportingYearNbr,InventoryDataDetailID,InventoryDataID,CompanyName,CompanyCode,NERCID,NERCID_AliasID,RegionCode,...,ExtractionDT,UpdateDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,Slicer,AliasID,IsCurrent,Rec_ID
28729,Aetna,Lake George,2024,113936,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,636757,1,9259 | 113936 | 2024,0x10ED02D26825C003EE3B8BB374B3D856,1,60847
28739,Aetna,Miller,2024,113983,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,642142,1,9259 | 113983 | 2024,0xBB81DBE68DB618667B9650E57C7267EA,1,22650
184865,Libertyville,Aptakisic,2024,118818,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650912,1,9400 | 118818 | 2024,0xAF15167B2979EF5C2EDF9A7BA84F1C01,1,2486
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,55397
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,Zion,Northbrook,2014,31789,5803,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,10:05.5,08:34.2,,00:00.9,216720,1,5803 | 31789 | 2014,0xD7BAD6B5D071292FD40F898CABBC9677,1,55407
292165,Waukegan,Zion,2024,118840,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650849,1,9400 | 118840 | 2024,0x128D3A78E37B1B206191C78D9B5D7C4C,1,60900
300914,Zion,Waukegan,2024,119042,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625328,1,9402 | 119042 | 2024,0xD7BAD6B5D071292FD40F898CABBC9677,1,60900
184890,Libertyville,Zion Energy Center,2024,119100,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625419,1,9402 | 119100 | 2024,0xC81DBDBE35DD273C67BADAA182719325,1,2431


In [40]:
dfMatch1 = dfMatch.copy()

In [41]:
col = dfMatch1.pop('Rec_ID')

In [42]:
dfMatch1.insert(0, col)

TypeError: insert() missing 1 required positional argument: 'value'

In [43]:
dfMatch1.insert(0, 'Rec_ID', col)

In [44]:
dfMatch1

Unnamed: 0,Rec_ID,FromBus,ToBus,ReportingYearNbr,InventoryDataDetailID,InventoryDataID,CompanyName,CompanyCode,NERCID,NERCID_AliasID,...,CreationDT,ExtractionDT,UpdateDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,Slicer,AliasID,IsCurrent
28729,60847,Aetna,Lake George,2024,113936,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,...,59:52.2,05:07.9,00:01.0,,01:21.7,636757,1,9259 | 113936 | 2024,0x10ED02D26825C003EE3B8BB374B3D856,1
28739,22650,Aetna,Miller,2024,113983,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,...,59:52.2,05:07.9,00:01.0,,01:21.7,642142,1,9259 | 113983 | 2024,0xBB81DBE68DB618667B9650E57C7267EA,1
184865,2486,Libertyville,Aptakisic,2024,118818,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,...,33:22.9,05:07.9,00:01.0,,01:21.7,650912,1,9400 | 118818 | 2024,0xAF15167B2979EF5C2EDF9A7BA84F1C01,1
118743,55397,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,...,33:22.9,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1
118743,69508,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,...,33:22.9,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,55407,Zion,Northbrook,2014,31789,5803,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,...,14:43.7,10:05.5,08:34.2,,00:00.9,216720,1,5803 | 31789 | 2014,0xD7BAD6B5D071292FD40F898CABBC9677,1
292165,60900,Waukegan,Zion,2024,118840,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,...,33:22.9,05:07.9,00:01.0,,01:21.7,650849,1,9400 | 118840 | 2024,0x128D3A78E37B1B206191C78D9B5D7C4C,1
300914,60900,Zion,Waukegan,2024,119042,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,...,33:22.9,05:07.9,00:01.0,,01:21.7,625328,1,9402 | 119042 | 2024,0xD7BAD6B5D071292FD40F898CABBC9677,1
184890,2431,Libertyville,Zion Energy Center,2024,119100,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,...,33:22.9,05:07.9,00:01.0,,01:21.7,625419,1,9402 | 119100 | 2024,0xC81DBDBE35DD273C67BADAA182719325,1


In [45]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Reorder columns with 'combo' as the first column
    # df_reduced.insert(0, "combo", df_reduced.pop("combo"))
    df_reduced_copy = df_reduced.copy()
    col = df_reduced_copy.pop('combo')
    df_reduced_copy.insert(0, 'combo', col)
    
    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [46]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Reorder columns with 'combo' as the first column
    # df_reduced.insert(0, "combo", df_reduced.pop("combo"))
    # df_reduced_copy = df_reduced.copy()
    col = df_reduced.pop('combo')
    df_reduced.insert(0, 'combo', col)
    
    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [47]:
df_reduced = get_reduced_df(dfMatch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced["combo"] = df_reduced.apply(


In [48]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Reorder columns with 'combo' as the first column
    # df_reduced.insert(0, "combo", df_reduced.pop("combo"))
    # df_reduced_copy = df_reduced.copy()
    col = df_reduced.pop('combo')
    df_reduced.iloc(:, 0) = col
    
    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

SyntaxError: invalid syntax (<ipython-input-48-a9d8df8d1dd5>, line 62)

In [49]:
? padnas.iloc

Object ` padnas.iloc` not found.


In [50]:
help

Type help() for interactive help, or help(object) for help about object.

In [51]:
help(pandas.iloc)

NameError: name 'pandas' is not defined

In [52]:
help(iloc)

NameError: name 'iloc' is not defined

In [53]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Reorder columns with 'combo' as the first column
    # df_reduced.insert(0, "combo", df_reduced.pop("combo"))
    # df_reduced_copy = df_reduced.copy()
    col = df_reduced.pop('combo')
    df_reduced.iloc[:, 0] = col
    
    return df_reduced


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [54]:
df_reduced = get_reduced_df(dfMatch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced["combo"] = df_reduced.apply(


In [55]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    # Create a dynamic combo option string
    df_reduced["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Reorder columns with 'combo' as the first column
    # df_reduced.insert(0, "combo", df_reduced.pop("combo"))
    # df_reduced_copy = df_reduced.copy()
    col = df_reduced.pop('combo')
    df_reduced_copy = df_reduced.copy()
    df_reduced_copy.iloc[:, 0] = col
    
    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [56]:
df_reduced = get_reduced_df(dfMatch)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduced["combo"] = df_reduced.apply(


In [57]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    df_reduced_copy = df_reduced.copy()
    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # col = df_reduced_copy.pop('combo')
    # df_reduced_copy = df_reduced.copy()
    # df_reduced_copy.iloc[:, 0] = col
    
    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [58]:
df_reduced = get_reduced_df(dfMatch)

In [59]:
df_reduced

Unnamed: 0,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,CircuitTypeCode,...,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID,combo
28729,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60847,ACO - AC Overhead - Aetna - Lake George - 138054
28739,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,22650,ACO - AC Overhead - Aetna - Miller - 138102
184865,15410,Commonwealth Edison Company,RFC,Libertyville,Aptakisic,,10.133,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,2486,ACO - AC Overhead - Libertyville - Aptakisic -...
118743,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,55397,ACO - AC Overhead - Electric Junction - Aurora...
118743,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,69508,ACO - AC Overhead - Electric Junction - Aurora...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,ACO - AC Overhead,...,,,,,,,1/1/13 0:00,,55407,ACO - AC Overhead - Zion - Northbrook - 2218
292165,1609,Commonwealth Edison Company,RFC,Waukegan,Zion,,12.275,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60900,ACO - AC Overhead - Waukegan - Zion - 1609
300914,2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,60900,ACO - AC Overhead - Zion - Waukegan - 2218
184890,15423,Commonwealth Edison Company,RFC,Libertyville,Zion Energy Center,,12.300,,2.0,ACO - AC Overhead,...,,,,,,,1/1/15 0:00,,2431,ACO - AC Overhead - Libertyville - Zion Energy...


In [60]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns and a dynamic 'combo' column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName)
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns)
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Create the new DataFrame with desired columns
    df_reduced = dfMatch[
        [
            "ElementIdentifierName",
            "CompanyName",
            "RegionCode",
            "FromBus",
            "ToBus",
            "TertiaryBus",
            "Miles",
            "BESExemptedFlag",
            "NumberOfTerminals",
            "CircuitTypeCode",
            "VoltageClassCodeName",
            "ParentCode",
            "ConductorsPerPhaseCode",
            "OverheadGroundWireCode",
            "InsulatorTypeCode",
            "CableTypeCode",
            "StructureMaterialCode",
            "StructureTypeCode",
            "CircuitsPerStructureCode",
            "TerrainCode",
            "ElevationCode",
            "InServiceDate",
            "RetirementDate",
            "Rec_ID",
        ]
    ]

    df_reduced_copy = df_reduced.copy()
    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    col = df_reduced_copy.pop('combo')
    # df_reduced_copy = df_reduced.copy()
    df_reduced_copy.iloc[:, 0] = col
    
    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [61]:
df_reduced = get_reduced_df(dfMatch)

In [62]:
df_reduced

Unnamed: 0,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,CircuitTypeCode,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28729,ACO - AC Overhead - Aetna - Lake George - 138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,ACO - AC Overhead,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO - AC Overhead - Aetna - Miller - 138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,ACO - AC Overhead,...,,,,,,,,1/1/15 0:00,,22650
184865,ACO - AC Overhead - Libertyville - Aptakisic -...,Commonwealth Edison Company,RFC,Libertyville,Aptakisic,,10.133,,2.0,ACO - AC Overhead,...,,,,,,,,1/1/15 0:00,,2486
118743,ACO - AC Overhead - Electric Junction - Aurora...,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,ACO - AC Overhead,...,,,,,,,,1/1/15 0:00,,55397
118743,ACO - AC Overhead - Electric Junction - Aurora...,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,ACO - AC Overhead,...,,,,,,,,1/1/15 0:00,,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,ACO - AC Overhead - Zion - Northbrook - 2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,ACO - AC Overhead,...,,,,,,,,1/1/13 0:00,,55407
292165,ACO - AC Overhead - Waukegan - Zion - 1609,Commonwealth Edison Company,RFC,Waukegan,Zion,,12.275,,2.0,ACO - AC Overhead,...,,,,,,,,1/1/15 0:00,,60900
300914,ACO - AC Overhead - Zion - Waukegan - 2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,ACO - AC Overhead,...,,,,,,,,1/1/15 0:00,,60900
184890,ACO - AC Overhead - Libertyville - Zion Energy...,Commonwealth Edison Company,RFC,Libertyville,Zion Energy Center,,12.300,,2.0,ACO - AC Overhead,...,,,,,,,,1/1/15 0:00,,2431


In [63]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [64]:
df_reduced = get_reduced_df(dfMatch)

In [65]:
df_reduced

Unnamed: 0,combo,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28729,ACO - AC Overhead - Aetna - Lake George - 138054,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO - AC Overhead - Aetna - Miller - 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22650
184865,ACO - AC Overhead - Libertyville - Aptakisic -...,15410,Commonwealth Edison Company,RFC,Libertyville,Aptakisic,,10.133,,2.0,...,,,,,,,,1/1/15 0:00,,2486
118743,ACO - AC Overhead - Electric Junction - Aurora...,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,...,,,,,,,,1/1/15 0:00,,55397
118743,ACO - AC Overhead - Electric Junction - Aurora...,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,...,,,,,,,,1/1/15 0:00,,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,ACO - AC Overhead - Zion - Northbrook - 2218,2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,...,,,,,,,,1/1/13 0:00,,55407
292165,ACO - AC Overhead - Waukegan - Zion - 1609,1609,Commonwealth Edison Company,RFC,Waukegan,Zion,,12.275,,2.0,...,,,,,,,,1/1/15 0:00,,60900
300914,ACO - AC Overhead - Zion - Waukegan - 2218,2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,...,,,,,,,,1/1/15 0:00,,60900
184890,ACO - AC Overhead - Libertyville - Zion Energy...,15423,Commonwealth Edison Company,RFC,Libertyville,Zion Energy Center,,12.300,,2.0,...,,,,,,,,1/1/15 0:00,,2431


In [66]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

ImportError: cannot import name 'get_reduced_df' from 'src.housekeeping' (c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\src\housekeeping.py)

In [67]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [68]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [69]:
def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

In [70]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

Connected to .conda (Python 3.9.19)

In [1]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

ImportError: cannot import name 'get_matched_entries' from 'src.housekeeping' (c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\src\housekeeping.py)

In [2]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

In [3]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [4]:
def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

In [5]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [6]:
def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

In [7]:
# def get_matched_entries(dfVeloSorted, dfTadsLatest):
#     matched_indices = []

#     # Iterate through both DataFrames
#     for i in range(len(dfVeloSorted)):
#         from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
#             dfVeloSorted.iloc[i]["To Sub"]
#         )
#         for j in range(len(dfTadsLatest)):
#             from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
#                 dfTadsLatest.iloc[j]["ToBus"]
#             )

#             if (from_sub == from_bus and to_sub == to_bus) or (
#                 from_sub == to_bus and to_sub == from_bus
#             ):
#                 matched_indices.append(j)

#     dfTadsMatched = dfTadsLatest.iloc[matched_indices].copy()

#     return dfTadsMatched


def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched

In [8]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

ImportError: cannot import name 'get_matched_entries' from 'src.housekeeping' (c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\src\housekeeping.py)

In [9]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

In [10]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

ImportError: cannot import name 'get_matched_entries' from 'src.housekeeping' (c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\src\housekeeping.py)

In [11]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched

In [12]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

ImportError: cannot import name 'get_matched_entries' from 'src.housekeeping' (c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\src\housekeeping.py)

In [13]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Create a dynamic combo option string
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched

In [14]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

ImportError: cannot import name 'get_matched_entries' from 'src.housekeeping' (c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\src\housekeeping.py)

Connected to .conda (Python 3.9.19)

In [1]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [2]:
try:
    # pylint: disable=undefined-variable line-too-long invalid-name
    fileAddr = __vsc_ipynb_file__
    wd = os.path.dirname(fileAddr)
    print("We seem to be working in a JuPyteR Notebook")
except ImportError:
    wd = os.getcwd()
    print("We seem to be working in a regular .py file")


rawDataFolder = os.path.join(wd, "rawData")
processedDataFolder = os.path.join(wd, "processedData/")

We seem to be working in a JuPyteR Notebook


In [3]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [4]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [5]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [6]:
try:
    # pylint: disable=undefined-variable line-too-long invalid-name
    fileAddr = __vsc_ipynb_file__
    wd = os.path.dirname(fileAddr)
    print("We seem to be working in a JuPyteR Notebook")
except ImportError:
    wd = os.getcwd()
    print("We seem to be working in a regular .py file")


rawDataFolder = os.path.join(wd, "rawData")
processedDataFolder = os.path.join(wd, "processedData/")

We seem to be working in a JuPyteR Notebook


In [7]:
tadsFileAddr = os.path.join(rawDataFolder, "TADS 2024 AC Inventory.csv")
dfTads0 = pd.read_csv(tadsFileAddr)
sizeTads0 = dfTads0.shape
print(f"Size of TADS db before filtering: {sizeTads0[0]}, {sizeTads0[1]}")
companyNamesTads0 = set(dfTads0.CompanyName)
numCompaniesTads0 = len(companyNamesTads0)
print(f"There are {numCompaniesTads0} unique companies owning tlines in the entire TADS database.")
# display(dftads)

  dfTads0 = pd.read_csv(tadsFileAddr)


Size of TADS db before filtering: 301152, 47
There are 304 unique companies owning tlines in the entire TADS database.


In [8]:
location = "chicago-ohare"
veloFileAddr = os.path.join(rawDataFolder, "tlines-near-chicago-ohare-raw.xlsx") # tlines which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\rawData\tlines-near-chicago-ohare-raw.xlsx
Size of velocity suite db before any filtering: 524, 21


In [9]:
# Filter rows with 'Undetermined Company`
# dfVelo = dfVelo0[ dfVelo0['Company Name'] != 'Undetermined Company' ]
# Filter tlines with less than 100kV voltage
dfVelo = dfVelo0.copy()
dfVelo = dfVelo[ dfVelo['Voltage kV'] >= 100 ]
# Filter tlines not currently in service
dfVelo = dfVelo[ dfVelo['Proposed'] == 'In Service']

sizeVelo = dfVelo.shape
print(f"Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': {sizeVelo[0]}, {sizeVelo[1]}")
companyNamesVelo = set(dfVelo['Company Name'])
numCompaniesVelo = len(companyNamesVelo)
print(f"There are {numCompaniesVelo} named companies owning the tlines near {location}")
print(f"Their names are:")
print(companyNamesVelo)
# dfVelo

Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': 459, 21
There are 6 named companies owning the tlines near chicago-ohare
Their names are:
{'AmerenIP', 'Undetermined Company', 'Commonwealth Edison Co', 'Northern Municipal Power Agency', 'Northern Indiana Public Service Co LLC', 'American Transmission Co LLC'}


In [10]:
print(f"Now let's see how many tlines are owned by these {numCompaniesVelo} "       "companies in the entire TADS database:")

print(""f"But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.")

companyNamesVelo2Tads = companyNamesVelo.copy()  # Create a copy to avoid modifying the original

# Replace the element using the 'discard' method (more efficient for sets)
companyNamesVelo2Tads.discard("Commonwealth Edison Co")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
companyNamesVelo2Tads.discard("AmerenIP")
companyNamesVelo2Tads.add("Ameren Services Company")
companyNamesVelo2Tads.discard("American Transmission Co LLC")
companyNamesVelo2Tads.add("American Transmission Company")
companyNamesVelo2Tads.discard("Northern Indiana Public Service Co LLC")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Northern Municipal Power Agency")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Undetermined Company")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
print(companyNamesVelo2Tads)

dfVeloSorted = sort_and_shift_columns_dfVelo(dfVelo)

veloSortedAddr = os.path.join(processedDataFolder, "dfVelo-Chicago-Ohare-Sorted.xlsx")
dfVeloSorted.to_excel(veloSortedAddr)

Now let's see how many tlines are owned by these 6 companies in the entire TADS database:
But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.
{'American Transmission Company', 'Commonwealth Edison Company', 'Northern Indiana Public Service Company [BA', 'Ameren Services Company'}


In [11]:
dfTads = dfTads0.copy()
dfTads = dfTads[dfTads['CompanyName'].isin(companyNamesVelo2Tads)]
voltageClassesTads0 = set(dfTads['VoltageClassCodeName'])
print(voltageClassesTads0)
voltageClassesAllowedTads = voltageClassesTads0.copy()
voltageClassesAllowedTads.discard("0-99 kV")

dfTads = dfTads[dfTads['VoltageClassCodeName'].isin(voltageClassesAllowedTads)]

sizeTads = dfTads.shape
print(f"Size of TADS db after filtering: {sizeTads[0]}, {sizeTads[1]}")

dfTadsSorted = sort_and_shift_columns(dfTads)

tadsSortedAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Sorted.xlsx")

dfTadsSorted.to_excel(tadsSortedAddr, index=False)

# dfTadsLatest = filter_tlines_by_latest_reported_year(dfTadsSorted)
dfTadsLatest = get_latest_entries(dfTadsSorted)

sizeTadsLatest = dfTadsLatest.shape

print(f"Size of TADS db after filtering for only latest reported year: {sizeTadsLatest[0]}, {sizeTadsLatest[1]}")

tadsLatestAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Latest.xlsx")

dfTadsLatest.to_excel(tadsLatestAddr)

{'100-199 kV', '600-799 kV', '200-299 kV', '300-399 kV'}
Size of TADS db after filtering: 16052, 47
Size of TADS db after filtering for only latest reported year: 1705, 47


In [12]:
dfMatch = get_matched_entries(dfVeloSorted, dfTadsLatest)
matchAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Matched.xlsx")
dfMatch.to_excel(matchAddr)

In [13]:
dfMatchReduced = get_reduced_df(dfMatch)

In [14]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchAddr)

In [15]:
dfMatch = get_matched_entries(dfVeloSorted, dfTadsLatest)
matchAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Matched.xlsx")
dfMatch.to_excel(matchAddr)

In [16]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr)

In [17]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

    # Create a dynamic combo option string using the first word
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched

In [18]:
dfMatchReduced = get_reduced_df(dfMatch)

In [19]:
dfMatchReduced['combo']

28729                  ACO - Aetna - Lake George - 138054
28739                       ACO - Aetna - Miller - 138102
184865             ACO - Libertyville - Aptakisic - 15410
118743           ACO - Electric Junction - Aurora - 11119
118743           ACO - Electric Junction - Aurora - 11119
                               ...                       
300894                     ACO - Zion - Northbrook - 2218
292165                       ACO - Waukegan - Zion - 1609
300914                       ACO - Zion - Waukegan - 2218
184890    ACO - Libertyville - Zion Energy Center - 15423
300927             ACO - Zion - Zion Energy Center - 2223
Name: combo, Length: 127, dtype: object

In [20]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr)

In [21]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

PermissionError: [Errno 13] Permission denied: 'c:\\Users\\jhaa\\Documents\\documents_general\\extreme-weather-repo\\processedData/chicago-ohare-lines.xlsx'

In [22]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

In [23]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

    # Create a dynamic combo option string using the first word
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} - {row['FromBus']} - {row['ToBus']} - {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord") # no longer needed

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched

In [24]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

In [25]:
import pandas as pd
import os

def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new pandas DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus, ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

    # Create a dynamic combo option string using the first word
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['FromBus']}-{row['ToBus']}-{row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord") # no longer needed

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")  # Remove 'combo' column and store it
    df_reduced_copy.insert(
        loc=0, column="combo", value=col
    )  # Insert 'combo' as the first column

    return df_reduced_copy

def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched

In [26]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [27]:
try:
    # pylint: disable=undefined-variable line-too-long invalid-name
    fileAddr = __vsc_ipynb_file__
    wd = os.path.dirname(fileAddr)
    print("We seem to be working in a JuPyteR Notebook")
except ImportError:
    wd = os.getcwd()
    print("We seem to be working in a regular .py file")


rawDataFolder = os.path.join(wd, "rawData")
processedDataFolder = os.path.join(wd, "processedData/")

We seem to be working in a JuPyteR Notebook


In [28]:
tadsFileAddr = os.path.join(rawDataFolder, "TADS 2024 AC Inventory.csv")
dfTads0 = pd.read_csv(tadsFileAddr)
sizeTads0 = dfTads0.shape
print(f"Size of TADS db before filtering: {sizeTads0[0]}, {sizeTads0[1]}")
companyNamesTads0 = set(dfTads0.CompanyName)
numCompaniesTads0 = len(companyNamesTads0)
print(f"There are {numCompaniesTads0} unique companies owning tlines in the entire TADS database.")
# display(dftads)

  dfTads0 = pd.read_csv(tadsFileAddr)


Size of TADS db before filtering: 301152, 47
There are 304 unique companies owning tlines in the entire TADS database.


In [29]:
location = "chicago-ohare"
veloFileAddr = os.path.join(rawDataFolder, "tlines-near-chicago-ohare-raw.xlsx") # tlines which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\rawData\tlines-near-chicago-ohare-raw.xlsx
Size of velocity suite db before any filtering: 524, 21


In [30]:
# Filter rows with 'Undetermined Company`
# dfVelo = dfVelo0[ dfVelo0['Company Name'] != 'Undetermined Company' ]
# Filter tlines with less than 100kV voltage
dfVelo = dfVelo0.copy()
dfVelo = dfVelo[ dfVelo['Voltage kV'] >= 100 ]
# Filter tlines not currently in service
dfVelo = dfVelo[ dfVelo['Proposed'] == 'In Service']

sizeVelo = dfVelo.shape
print(f"Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': {sizeVelo[0]}, {sizeVelo[1]}")
companyNamesVelo = set(dfVelo['Company Name'])
numCompaniesVelo = len(companyNamesVelo)
print(f"There are {numCompaniesVelo} named companies owning the tlines near {location}")
print(f"Their names are:")
print(companyNamesVelo)
# dfVelo

Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': 459, 21
There are 6 named companies owning the tlines near chicago-ohare
Their names are:
{'AmerenIP', 'Undetermined Company', 'Commonwealth Edison Co', 'Northern Municipal Power Agency', 'Northern Indiana Public Service Co LLC', 'American Transmission Co LLC'}


In [31]:
print(f"Now let's see how many tlines are owned by these {numCompaniesVelo} "       "companies in the entire TADS database:")

print(""f"But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.")

companyNamesVelo2Tads = companyNamesVelo.copy()  # Create a copy to avoid modifying the original

# Replace the element using the 'discard' method (more efficient for sets)
companyNamesVelo2Tads.discard("Commonwealth Edison Co")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
companyNamesVelo2Tads.discard("AmerenIP")
companyNamesVelo2Tads.add("Ameren Services Company")
companyNamesVelo2Tads.discard("American Transmission Co LLC")
companyNamesVelo2Tads.add("American Transmission Company")
companyNamesVelo2Tads.discard("Northern Indiana Public Service Co LLC")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Northern Municipal Power Agency")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Undetermined Company")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
print(companyNamesVelo2Tads)

dfVeloSorted = sort_and_shift_columns_dfVelo(dfVelo)

veloSortedAddr = os.path.join(processedDataFolder, "dfVelo-Chicago-Ohare-Sorted.xlsx")
dfVeloSorted.to_excel(veloSortedAddr)

Now let's see how many tlines are owned by these 6 companies in the entire TADS database:
But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.
{'American Transmission Company', 'Commonwealth Edison Company', 'Northern Indiana Public Service Company [BA', 'Ameren Services Company'}


In [32]:
dfTads = dfTads0.copy()
dfTads = dfTads[dfTads['CompanyName'].isin(companyNamesVelo2Tads)]
voltageClassesTads0 = set(dfTads['VoltageClassCodeName'])
print(voltageClassesTads0)
voltageClassesAllowedTads = voltageClassesTads0.copy()
voltageClassesAllowedTads.discard("0-99 kV")

dfTads = dfTads[dfTads['VoltageClassCodeName'].isin(voltageClassesAllowedTads)]

sizeTads = dfTads.shape
print(f"Size of TADS db after filtering: {sizeTads[0]}, {sizeTads[1]}")

dfTadsSorted = sort_and_shift_columns(dfTads)

tadsSortedAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Sorted.xlsx")

dfTadsSorted.to_excel(tadsSortedAddr, index=False)

# dfTadsLatest = filter_tlines_by_latest_reported_year(dfTadsSorted)
dfTadsLatest = get_latest_entries(dfTadsSorted)

sizeTadsLatest = dfTadsLatest.shape

print(f"Size of TADS db after filtering for only latest reported year: {sizeTadsLatest[0]}, {sizeTadsLatest[1]}")

tadsLatestAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Latest.xlsx")

dfTadsLatest.to_excel(tadsLatestAddr)

{'100-199 kV', '600-799 kV', '200-299 kV', '300-399 kV'}
Size of TADS db after filtering: 16052, 47
Size of TADS db after filtering for only latest reported year: 1705, 47


In [33]:
dfMatch = get_matched_entries(dfVeloSorted, dfTadsLatest)
matchAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Matched.xlsx")
dfMatch.to_excel(matchAddr)

In [34]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

In [35]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched

In [36]:
dfMatchReduced = get_reduced_df(dfMatch)

In [37]:
dfMatchReduced

Unnamed: 0,combo,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28729,ACO Aetna-Lake George 138054,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22650
184865,ACO Aptakisic-Libertyville 15410,15410,Commonwealth Edison Company,RFC,Libertyville,Aptakisic,,10.133,,2.0,...,,,,,,,,1/1/15 0:00,,2486
118743,ACO Aurora-Electric Junction 11119,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,...,,,,,,,,1/1/15 0:00,,55397
118743,ACO Aurora-Electric Junction 11119,11119,Commonwealth Edison Company,RFC,Electric Junction,Aurora,,1.433,,2.0,...,,,,,,,,1/1/15 0:00,,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,ACO Northbrook-Zion 2218,2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,...,,,,,,,,1/1/13 0:00,,55407
292165,ACO Waukegan-Zion 1609,1609,Commonwealth Edison Company,RFC,Waukegan,Zion,,12.275,,2.0,...,,,,,,,,1/1/15 0:00,,60900
300914,ACO Waukegan-Zion 2218,2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,...,,,,,,,,1/1/15 0:00,,60900
184890,ACO Libertyville-Zion Energy Center 15423,15423,Commonwealth Edison Company,RFC,Libertyville,Zion Energy Center,,12.300,,2.0,...,,,,,,,,1/1/15 0:00,,2431


In [38]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

In [39]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [40]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

In [41]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [42]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

AttributeError: 'NoneType' object has no attribute 'to_excel'

In [43]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)
``

    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

SyntaxError: invalid syntax (<ipython-input-43-9969bc970303>, line 220)

In [44]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [45]:
dfMatchReduced = get_reduced_df(dfMatch)

In [46]:
dfMatchReduced

In [47]:
dfMatchReduced

In [48]:
size(dfMatchReduced)

NameError: name 'size' is not defined

In [49]:
type(dfMatchReduced)

NoneType

Connected to .conda (Python 3.9.19)

In [1]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [2]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus

In [3]:
y = get_reduced_df(dfMatch)

NameError: name 'dfMatch' is not defined

In [4]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [5]:
try:
    # pylint: disable=undefined-variable line-too-long invalid-name
    fileAddr = __vsc_ipynb_file__
    wd = os.path.dirname(fileAddr)
    print("We seem to be working in a JuPyteR Notebook")
except ImportError:
    wd = os.getcwd()
    print("We seem to be working in a regular .py file")


rawDataFolder = os.path.join(wd, "rawData")
processedDataFolder = os.path.join(wd, "processedData/")

We seem to be working in a JuPyteR Notebook


In [6]:
tadsFileAddr = os.path.join(rawDataFolder, "TADS 2024 AC Inventory.csv")
dfTads0 = pd.read_csv(tadsFileAddr)
sizeTads0 = dfTads0.shape
print(f"Size of TADS db before filtering: {sizeTads0[0]}, {sizeTads0[1]}")
companyNamesTads0 = set(dfTads0.CompanyName)
numCompaniesTads0 = len(companyNamesTads0)
print(f"There are {numCompaniesTads0} unique companies owning tlines in the entire TADS database.")
# display(dftads)

Size of TADS db before filtering: 301152, 47
There are 304 unique companies owning tlines in the entire TADS database.


  dfTads0 = pd.read_csv(tadsFileAddr)


In [7]:
location = "chicago-ohare"
veloFileAddr = os.path.join(rawDataFolder, "tlines-near-chicago-ohare-raw.xlsx") # tlines which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\rawData\tlines-near-chicago-ohare-raw.xlsx
Size of velocity suite db before any filtering: 524, 21


In [8]:
# Filter rows with 'Undetermined Company`
# dfVelo = dfVelo0[ dfVelo0['Company Name'] != 'Undetermined Company' ]
# Filter tlines with less than 100kV voltage
dfVelo = dfVelo0.copy()
dfVelo = dfVelo[ dfVelo['Voltage kV'] >= 100 ]
# Filter tlines not currently in service
dfVelo = dfVelo[ dfVelo['Proposed'] == 'In Service']

sizeVelo = dfVelo.shape
print(f"Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': {sizeVelo[0]}, {sizeVelo[1]}")
companyNamesVelo = set(dfVelo['Company Name'])
numCompaniesVelo = len(companyNamesVelo)
print(f"There are {numCompaniesVelo} named companies owning the tlines near {location}")
print(f"Their names are:")
print(companyNamesVelo)
# dfVelo

Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': 459, 21
There are 6 named companies owning the tlines near chicago-ohare
Their names are:
{'Undetermined Company', 'American Transmission Co LLC', 'AmerenIP', 'Commonwealth Edison Co', 'Northern Indiana Public Service Co LLC', 'Northern Municipal Power Agency'}


In [9]:
print(f"Now let's see how many tlines are owned by these {numCompaniesVelo} "       "companies in the entire TADS database:")

print(""f"But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.")

companyNamesVelo2Tads = companyNamesVelo.copy()  # Create a copy to avoid modifying the original

# Replace the element using the 'discard' method (more efficient for sets)
companyNamesVelo2Tads.discard("Commonwealth Edison Co")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
companyNamesVelo2Tads.discard("AmerenIP")
companyNamesVelo2Tads.add("Ameren Services Company")
companyNamesVelo2Tads.discard("American Transmission Co LLC")
companyNamesVelo2Tads.add("American Transmission Company")
companyNamesVelo2Tads.discard("Northern Indiana Public Service Co LLC")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Northern Municipal Power Agency")
companyNamesVelo2Tads.add("Northern Indiana Public Service Company [BA")
companyNamesVelo2Tads.discard("Undetermined Company")
companyNamesVelo2Tads.add("Commonwealth Edison Company")
print(companyNamesVelo2Tads)

dfVeloSorted = sort_and_shift_columns_dfVelo(dfVelo)

veloSortedAddr = os.path.join(processedDataFolder, "dfVelo-Chicago-Ohare-Sorted.xlsx")
dfVeloSorted.to_excel(veloSortedAddr)

Now let's see how many tlines are owned by these 6 companies in the entire TADS database:
But first I'll need to rename some companies in vs db to match with the exact strings of the TADS db.
{'Northern Indiana Public Service Company [BA', 'American Transmission Company', 'Ameren Services Company', 'Commonwealth Edison Company'}


In [10]:
dfTads = dfTads0.copy()
dfTads = dfTads[dfTads['CompanyName'].isin(companyNamesVelo2Tads)]
voltageClassesTads0 = set(dfTads['VoltageClassCodeName'])
print(voltageClassesTads0)
voltageClassesAllowedTads = voltageClassesTads0.copy()
voltageClassesAllowedTads.discard("0-99 kV")

dfTads = dfTads[dfTads['VoltageClassCodeName'].isin(voltageClassesAllowedTads)]

sizeTads = dfTads.shape
print(f"Size of TADS db after filtering: {sizeTads[0]}, {sizeTads[1]}")

dfTadsSorted = sort_and_shift_columns(dfTads)

tadsSortedAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Sorted.xlsx")

dfTadsSorted.to_excel(tadsSortedAddr, index=False)

# dfTadsLatest = filter_tlines_by_latest_reported_year(dfTadsSorted)
dfTadsLatest = get_latest_entries(dfTadsSorted)

sizeTadsLatest = dfTadsLatest.shape

print(f"Size of TADS db after filtering for only latest reported year: {sizeTadsLatest[0]}, {sizeTadsLatest[1]}")

tadsLatestAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Latest.xlsx")

dfTadsLatest.to_excel(tadsLatestAddr)

{'600-799 kV', '300-399 kV', '200-299 kV', '100-199 kV'}
Size of TADS db after filtering: 16052, 47
Size of TADS db after filtering for only latest reported year: 1705, 47


In [11]:
dfMatch = get_matched_entries(dfVeloSorted, dfTadsLatest)
matchAddr = os.path.join(processedDataFolder, "dfTads-Chicago-Ohare-Matched.xlsx")
dfMatch.to_excel(matchAddr)

In [12]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

In [13]:
dfMatchReduced

Unnamed: 0,combo,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28715,ACO Aetna-Dune Acres 138006,138006,Northern Indiana Public Service Company [BA,RFC,Aetna,Dune Acres,,11.700,0.0,2.0,...,,,,,,,,1/1/15 0:00,,50026
28729,ACO Aetna-Lake George 138054,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22650
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,70313
11121,ACO Arcadian-Zion 2222,2222,American Transmission Company,MRO,Arcadian,Zion,,53.100,,2.0,...,,,,,,,,1/1/15 0:00,12/30/22 0:00,52054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300876,ACO Lakeview-Zion 28201,28201,Commonwealth Edison Company,RFC,Zion,Lakeview,,5.070,0.0,2.0,...,,,,,,,,1/1/13 0:00,,60901
300888,ACO Libertyville-Zion 2224,2224,Commonwealth Edison Company,RFC,Zion,Libertyville,,18.502,,2.0,...,,,,,,,,1/1/15 0:00,,62352
300894,ACO Northbrook-Zion 2218,2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,...,,,,,,,,1/1/13 0:00,,55407
300914,ACO Waukegan-Zion 2218,2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,...,,,,,,,,1/1/15 0:00,,60900


In [14]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
  """
  This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
  with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

  Args:
      dfMatch: The input pandas DataFrame.

  Returns:
      A new DataFrame containing the following columns:
          - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
          - 'ElementIdentifierName'
          - 'CompanyName'
          - ... (other desired columns) - Include any other columns you want in the output DataFrame.
          - 'RetirementDate' (added)
          - 'Rec_ID' (added)
  """

  # Select desired columns from the input DataFrame
  desired_cols = [
      "ElementIdentifierName",
      "CompanyName",
      "RegionCode",
      "FromBus",
      "ToBus",
      "TertiaryBus",
      "Miles",
      "BESExemptedFlag",
      "NumberOfTerminals",
      "CircuitTypeCode",
      "VoltageClassCodeName",
      "ParentCode",
      "ConductorsPerPhaseCode",
      "OverheadGroundWireCode",
      "InsulatorTypeCode",
      "CableTypeCode",
      "StructureMaterialCode",
      "StructureTypeCode",
      "CircuitsPerStructureCode",
      "TerrainCode",
      "ElevationCode",
      "InServiceDate",
      "RetirementDate",
      "Rec_ID",
  ]

  df_reduced = dfMatch[desired_cols]

  # Create a copy of the DataFrame to avoid modifying the original
  df_reduced_copy = df_reduced.copy()

  # Check for Empty Strings (Optional)
  if df_reduced_copy['FromBus'].notna().all() and df_reduced_copy['ToBus'].notna().all():
      pass  # No empty strings, proceed
  else:
      # Replace empty strings with a consistent value (e.g., 'NA')
      df_reduced_copy['FromBus'] = df_reduced_copy['FromBus'].fillna('NA')
      df_reduced_copy['ToBus'] = df_reduced_copy['ToBus'].fillna('NA')

  # Extract the first word from CircuitTypeCode
  df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

  # Print intermediate sorted values (for debugging)
  print(df_reduced_copy[['FromBus', 'ToBus']].apply(lambda x: "-".join(sorted(x)), axis=1).head())

  # Temporary column to store the sorted Bus combination
  df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
      lambda x: "-".join(sorted(x)), axis=1
  )

  # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
  df_reduced_copy["combo"] = df_reduced_copy.apply(
      lambda row: f"{row['Circuit
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

SyntaxError: EOL while scanning string literal (<ipython-input-14-1da1f8e398d6>, line 78)

In [15]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Check for Empty Strings (Optional)
    if df_reduced_copy['FromBus'].notna().all() and df_reduced_copy['ToBus'].notna().all():
        pass  # No empty strings, proceed
    else:
        # Replace empty strings with a consistent value (e.g., 'NA')
        df_reduced_copy['FromBus'] = df_reduced_copy['FromBus'].fillna('NA')
        df_reduced_copy['ToBus'] = df_reduced_copy['ToBus'].fillna('NA')

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

    # Print intermediate sorted values (for debugging)
    print(df_reduced_copy[['FromBus', 'ToBus']].apply(lambda x: "-".join(sorted(x)), axis=1).head())

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy

    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [16]:
y = get_reduced_df(dfMatch)

28729            Aetna-Lake George
28739                 Aetna-Miller
184865      Aptakisic-Libertyville
118743    Aurora-Electric Junction
118743    Aurora-Electric Junction
dtype: object


In [17]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Check for Empty Strings (Optional)
    if df_reduced_copy['FromBus'].notna().all() and df_reduced_copy['ToBus'].notna().all():
        pass  # No empty strings, proceed
    else:
        # Replace empty strings with a consistent value (e.g., 'NA')
        df_reduced_copy['FromBus'] = df_reduced_copy['FromBus'].fillna('NA')
        df_reduced_copy['ToBus'] = df_reduced_copy['ToBus'].fillna('NA')

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

    # Print intermediate sorted values (for debugging)
    print(df_reduced_copy[['FromBus', 'ToBus']].apply(lambda x: "-".join(sorted(x)), axis=1).head())

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [18]:
dfMatch

Unnamed: 0,FromBus,ToBus,ReportingYearNbr,InventoryDataDetailID,InventoryDataID,CompanyName,CompanyCode,NERCID,NERCID_AliasID,RegionCode,...,ExtractionDT,UpdateDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,Slicer,AliasID,IsCurrent,Rec_ID
28729,Aetna,Lake George,2024,113936,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,636757,1,9259 | 113936 | 2024,0x10ED02D26825C003EE3B8BB374B3D856,1,60847
28739,Aetna,Miller,2024,113983,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,642142,1,9259 | 113983 | 2024,0xBB81DBE68DB618667B9650E57C7267EA,1,22650
184865,Libertyville,Aptakisic,2024,118818,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650912,1,9400 | 118818 | 2024,0xAF15167B2979EF5C2EDF9A7BA84F1C01,1,2486
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,55397
118743,Electric Junction,Aurora,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,Zion,Northbrook,2014,31789,5803,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,10:05.5,08:34.2,,00:00.9,216720,1,5803 | 31789 | 2014,0xD7BAD6B5D071292FD40F898CABBC9677,1,55407
292165,Waukegan,Zion,2024,118840,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650849,1,9400 | 118840 | 2024,0x128D3A78E37B1B206191C78D9B5D7C4C,1,60900
300914,Zion,Waukegan,2024,119042,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625328,1,9402 | 119042 | 2024,0xD7BAD6B5D071292FD40F898CABBC9677,1,60900
184890,Libertyville,Zion Energy Center,2024,119100,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625419,1,9402 | 119100 | 2024,0xC81DBDBE35DD273C67BADAA182719325,1,2431


In [19]:
y = get_reduced_df(dfMatch)

28729            Aetna-Lake George
28739                 Aetna-Miller
184865      Aptakisic-Libertyville
118743    Aurora-Electric Junction
118743    Aurora-Electric Junction
dtype: object


In [20]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Check for Empty Strings (Optional)
    if df_reduced_copy['FromBus'].notna().all() and df_reduced_copy['ToBus'].notna().all():
        pass  # No empty strings, proceed
    else:
        # Replace empty strings with a consistent value (e.g., 'NA')
        df_reduced_copy['FromBus'] = df_reduced_copy['FromBus'].fillna('NA')
        df_reduced_copy['ToBus'] = df_reduced_copy['ToBus'].fillna('NA')

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

    # Print intermediate sorted values (for debugging)
    print(df_reduced_copy[['FromBus', 'ToBus']].apply(lambda x: "-".join(sorted(x)), axis=1).head())

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [21]:
y = get_reduced_df(dfMatch)

28729            Aetna-Lake George
28739                 Aetna-Miller
184865      Aptakisic-Libertyville
118743    Aurora-Electric Junction
118743    Aurora-Electric Junction
dtype: object


In [22]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Sort the DataFrame by FromBus and ToBus (ensures FromBus < ToBus)
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Create a dynamic combo option string (no need for separate sorting)
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode']} {row['FromBus']}-{row['ToBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [23]:
y = get_reduced_df(dfMatch)

In [24]:
y

Unnamed: 0,combo,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28715,ACO - AC Overhead Aetna-Dune Acres 138006,138006,Northern Indiana Public Service Company [BA,RFC,Aetna,Dune Acres,,11.700,0.0,2.0,...,,,,,,,,1/1/15 0:00,,50026
28729,ACO - AC Overhead Aetna-Lake George 138054,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO - AC Overhead Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22650
28739,ACO - AC Overhead Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,70313
11121,ACO - AC Overhead Arcadian-Zion 2222,2222,American Transmission Company,MRO,Arcadian,Zion,,53.100,,2.0,...,,,,,,,,1/1/15 0:00,12/30/22 0:00,52054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300876,ACO - AC Overhead Zion-Lakeview 28201,28201,Commonwealth Edison Company,RFC,Zion,Lakeview,,5.070,0.0,2.0,...,,,,,,,,1/1/13 0:00,,60901
300888,ACO - AC Overhead Zion-Libertyville 2224,2224,Commonwealth Edison Company,RFC,Zion,Libertyville,,18.502,,2.0,...,,,,,,,,1/1/15 0:00,,62352
300894,ACO - AC Overhead Zion-Northbrook 2218,2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,...,,,,,,,,1/1/13 0:00,,55407
300914,ACO - AC Overhead Zion-Waukegan 2218,2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,...,,,,,,,,1/1/15 0:00,,60900


In [25]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Check for Empty Strings (Optional)
    if df_reduced_copy['FromBus'].notna().all() and df_reduced_copy['ToBus'].notna().all():
        pass  # No empty strings, proceed
    else:
        # Replace empty strings with a consistent value (e.g., 'NA')
        df_reduced_copy['FromBus'] = df_reduced_copy['FromBus'].fillna('NA')
        df_reduced_copy['ToBus'] = df_reduced_copy['ToBus'].fillna('NA')

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

    # Print intermediate sorted values (for debugging)
    print(df_reduced_copy[['FromBus', 'ToBus']].apply(lambda x: "-".join(sorted(x)), axis=1).head())

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy



def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [26]:
y = get_reduced_df(dfMatch)

28729            Aetna-Lake George
28739                 Aetna-Miller
184865      Aptakisic-Libertyville
118743    Aurora-Electric Junction
118743    Aurora-Electric Junction
dtype: object


In [27]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Check for Empty Strings (Optional)
    if df_reduced_copy['FromBus'].notna().all() and df_reduced_copy['ToBus'].notna().all():
        pass  # No empty strings, proceed
    else:
        # Replace empty strings with a consistent value (e.g., 'NA')
        df_reduced_copy['FromBus'] = df_reduced_copy['FromBus'].fillna('NA')
        df_reduced_copy['ToBus'] = df_reduced_copy['ToBus'].fillna('NA')

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = df_reduced_copy["CircuitTypeCode"].str.split().str[0]

    # Print intermediate sorted values (for debugging)
    print(df_reduced_copy[['FromBus', 'ToBus']].apply(lambda x: "-".join(sorted(x)), axis=1).head())

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy

    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [28]:
y = get_reduced_df(dfMatch)

28729            Aetna-Lake George
28739                 Aetna-Miller
184865      Aptakisic-Libertyville
118743    Aurora-Electric Junction
118743    Aurora-Electric Junction
dtype: object


In [29]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)


    return dfTadsMatched

def rearrange_buses(df):
    """
    This function takes a DataFrame (df) and returns a new DataFrame with 'FromBus'
    lexicographically smaller than 'ToBus'.

    Args:
        df: The input DataFrame.

    Returns:
        A new DataFrame with 'FromBus' always preceding 'ToBus'.
    """

    # Sort the DataFrame by FromBus and ToBus
    df_rearranged = df.sort_values(by=["FromBus", "ToBus"])
    return df_rearranged

In [30]:
y = get_reduced_df(dfMatch)

In [31]:
y

Unnamed: 0,combo,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28715,ACO Aetna-Dune Acres 138006,138006,Northern Indiana Public Service Company [BA,RFC,Aetna,Dune Acres,,11.700,0.0,2.0,...,,,,,,,,1/1/15 0:00,,50026
28729,ACO Aetna-Lake George 138054,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22650
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,70313
11121,ACO Arcadian-Zion 2222,2222,American Transmission Company,MRO,Arcadian,Zion,,53.100,,2.0,...,,,,,,,,1/1/15 0:00,12/30/22 0:00,52054
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300876,ACO Lakeview-Zion 28201,28201,Commonwealth Edison Company,RFC,Zion,Lakeview,,5.070,0.0,2.0,...,,,,,,,,1/1/13 0:00,,60901
300888,ACO Libertyville-Zion 2224,2224,Commonwealth Edison Company,RFC,Zion,Libertyville,,18.502,,2.0,...,,,,,,,,1/1/15 0:00,,62352
300894,ACO Northbrook-Zion 2218,2218,Commonwealth Edison Company,RFC,Zion,Northbrook,,26.210,0.0,2.0,...,,,,,,,,1/1/13 0:00,,55407
300914,ACO Waukegan-Zion 2218,2218,Commonwealth Edison Company,RFC,Zion,Waukegan,,5.283,,2.0,...,,,,,,,,1/1/15 0:00,,60900


In [32]:
y[1:20]

Unnamed: 0,combo,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28729,ACO Aetna-Lake George 138054,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.9,0.0,2.0,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.5,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22650
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.5,0.0,2.0,...,,,,,,,,1/1/15 0:00,,70313
11121,ACO Arcadian-Zion 2222,2222,American Transmission Company,MRO,Arcadian,Zion,,53.1,,2.0,...,,,,,,,,1/1/15 0:00,12/30/22 0:00,52054
41495,ACO Babcock-Lake George 345003,345003,Northern Indiana Public Service Company [BA,RFC,Babcock,Lake George,,12.0,0.0,2.0,...,,,,,,,,1/1/13 0:00,,55461
41979,ACO Albers-Bain 63143,63143,American Transmission Company,MRO,Bain,Albers,,4.79,,2.0,...,,,,,,,,1/1/15 0:00,,71534
41989,ACO Bain-Kenosha 63151,63151,American Transmission Company,MRO,Bain,Kenosha,,1.65,,2.0,...,,,,,,,,1/1/15 0:00,,7127
47634,ACO Bedford Park-Hayford 11521,11521,Commonwealth Edison Company,RFC,Bedford Park,Hayford,,5.65,0.0,2.0,...,,,,,,,,1/1/13 0:00,,2593
55732,ACO Bloom-Burnham 17908,17908,Commonwealth Edison Company,RFC,Bloom,Burnham,,12.42,,2.0,...,,,,,,,,1/1/15 0:00,,2798
55742,ACO Bloom-Davis Creek 17907,17907,Commonwealth Edison Company,RFC,Bloom,Davis Creek,,37.505,,2.0,...,,,,,,,,1/1/15 0:00,,2792


In [33]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched


def rearrangeColumns(df, col_x, col_y):
    """
    This function takes a pandas DataFrame (df), column names (col_x, col_y),
    and exchanges the values in those columns if val_x is bigger than val_y for each row.

    Args:
        df: The input pandas DataFrame.
        col_x: The name of the first column.
        col_y: The name of the second column.

    Returns:
        A new pandas DataFrame with columns x and y potentially exchanged based on the condition.
    """
    # Create a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()

    # Apply a lambda function to compare and potentially swap values
    df_copy[[col_x, col_y]] = df_copy[[col_x, col_y]].where(
        df_copy[col_x] <= df_copy[col_y], [df_copy[col_y], df_copy[col_x]]
    )

    return df_copy

In [34]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched


def rearrangeColumns(df, col_x='FromBus', col_y='ToBus'):
    """
    This function takes a pandas DataFrame (df), column names (col_x, col_y),
    and exchanges the values in those columns if val_x is bigger than val_y for each row.

    Args:
        df: The input pandas DataFrame.
        col_x: The name of the first column.
        col_y: The name of the second column.

    Returns:
        A new pandas DataFrame with columns x and y potentially exchanged based on the condition.
    """
    # Create a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()

    # Apply a lambda function to compare and potentially swap values
    df_copy[[col_x, col_y]] = df_copy[[col_x, col_y]].where(
        df_copy[col_x] <= df_copy[col_y], [df_copy[col_y], df_copy[col_x]]
    )

    return df_copy

In [35]:
rearrangeColumns(dfMatch)

ValueError: operands could not be broadcast together with shapes (127,2) (127,2) (2,127) 

In [36]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched


def rearrangeColumns(df, col_x='FromBus', col_y='ToBus'):
    """
    This function takes a pandas DataFrame (df), column names (col_x, col_y),
    and exchanges the values in those columns if val_x is bigger than val_y for each row,
    while keeping other columns unchanged.

    Args:
        df: The input pandas DataFrame.
        col_x: The name of the first column.
        col_y: The name of the second column.

    Returns:
        A new pandas DataFrame with columns x and y potentially exchanged based on the condition.
    """
    # Create a copy of the DataFrame to avoid modifying the original
    df_copy = df.copy()

    # Select the columns to potentially exchange and other columns
    cols_to_exchange = [col_x, col_y]
    other_cols = list(set(df_copy.columns) - set(cols_to_exchange))

    # Apply a lambda function to compare and potentially swap values
    df_copy[cols_to_exchange] = df_copy[cols_to_exchange].where(
        df_copy[col_x] <= df_copy[col_y], [df_copy[col_y], df_copy[col_x]]
    )

    return df_copy

In [37]:
rearrangeColumns(dfMatch)

ValueError: operands could not be broadcast together with shapes (127,2) (127,2) (2,127) 

In [38]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched


def rearrangeColumns(df, col1="FromBus", col2="ToBus"):
    # Make a copy of the DataFrame to avoid modifying the original
    df = df.copy()

    # Iterate through each row and swap col1 and col2 if necessary
    for index, row in df.iterrows():
        value1 = str(row[col1])
        value2 = str(row[col2])

        if value1 > value2:
            df.at[index, col1] = value2
            df.at[index, col2] = value1

    return df

In [39]:
rearrangeColumns(dfMatch)

Unnamed: 0,FromBus,ToBus,ReportingYearNbr,InventoryDataDetailID,InventoryDataID,CompanyName,CompanyCode,NERCID,NERCID_AliasID,RegionCode,...,ExtractionDT,UpdateDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,Slicer,AliasID,IsCurrent,Rec_ID
28729,Aetna,Lake George,2024,113936,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,636757,1,9259 | 113936 | 2024,0x10ED02D26825C003EE3B8BB374B3D856,1,60847
28739,Aetna,Miller,2024,113983,9259,Northern Indiana Public Service Company [BA,NCR02611 | RFC,NCR02611,0x294791EC91004582F3E1DB12ADA4BB03,RFC,...,05:07.9,00:01.0,,01:21.7,642142,1,9259 | 113983 | 2024,0xBB81DBE68DB618667B9650E57C7267EA,1,22650
184865,Aptakisic,Libertyville,2024,118818,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650912,1,9400 | 118818 | 2024,0xAF15167B2979EF5C2EDF9A7BA84F1C01,1,2486
118743,Aurora,Electric Junction,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,55397
118743,Aurora,Electric Junction,2024,119072,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625367,1,9402 | 119072 | 2024,0x2A3F37E31771BB4D9468566640B78822,1,69508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300894,Northbrook,Zion,2014,31789,5803,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,10:05.5,08:34.2,,00:00.9,216720,1,5803 | 31789 | 2014,0xD7BAD6B5D071292FD40F898CABBC9677,1,55407
292165,Waukegan,Zion,2024,118840,9400,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,650849,1,9400 | 118840 | 2024,0x128D3A78E37B1B206191C78D9B5D7C4C,1,60900
300914,Waukegan,Zion,2024,119042,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625328,1,9402 | 119042 | 2024,0xD7BAD6B5D071292FD40F898CABBC9677,1,60900
184890,Libertyville,Zion Energy Center,2024,119100,9402,Commonwealth Edison Company,NCR08013 | RFC | RFC - PJM,NCR08013,0xCC2BCCB6A749329A905A8A66AA5A99DB,RFC,...,05:07.9,00:01.0,,01:21.7,625419,1,9402 | 119100 | 2024,0xC81DBDBE35DD273C67BADAA182719325,1,2431


In [40]:
import pandas as pd
import os


def get_reduced_df(dfMatch):
    """
    This function takes a pandas DataFrame (dfMatch) and returns a new DataFrame containing specific columns
    with a dynamic 'combo' column as the first column, with FromBus always preceding ToBus and sorted.

    Args:
        dfMatch: The input pandas DataFrame.

    Returns:
        A new DataFrame containing the following columns:
            - 'combo' (filled with a string combining CircuitTypeCode, FromBus-ToBus, and ElementIdentifierName) - This column becomes the first column in the output.
            - 'ElementIdentifierName'
            - 'CompanyName'
            - ... (other desired columns) - Include any other columns you want in the output DataFrame.
            - 'RetirementDate' (added)
            - 'Rec_ID' (added)
    """

    # Select desired columns from the input DataFrame
    desired_cols = [
        "ElementIdentifierName",
        "CompanyName",
        "RegionCode",
        "FromBus",
        "ToBus",
        "TertiaryBus",
        "Miles",
        "BESExemptedFlag",
        "NumberOfTerminals",
        "CircuitTypeCode",
        "VoltageClassCodeName",
        "ParentCode",
        "ConductorsPerPhaseCode",
        "OverheadGroundWireCode",
        "InsulatorTypeCode",
        "CableTypeCode",
        "StructureMaterialCode",
        "StructureTypeCode",
        "CircuitsPerStructureCode",
        "TerrainCode",
        "ElevationCode",
        "InServiceDate",
        "RetirementDate",
        "Rec_ID",
    ]

    df_reduced = dfMatch[desired_cols]

    df_reduced = rearrangeColumns(df_reduced)
    # Create a copy of the DataFrame to avoid modifying the original
    df_reduced_copy = df_reduced.copy()

    # Extract the first word from CircuitTypeCode
    df_reduced_copy["CircuitTypeCode_FirstWord"] = (
        df_reduced_copy["CircuitTypeCode"].str.split().str[0]
    )

    # Temporary column to store the sorted Bus combination
    df_reduced_copy["SortedBus"] = df_reduced_copy[["FromBus", "ToBus"]].apply(
        lambda x: "-".join(sorted(x)), axis=1
    )

    # Create a dynamic combo option string using the first word and sorted FromBus-ToBus
    df_reduced_copy["combo"] = df_reduced_copy.apply(
        lambda row: f"{row['CircuitTypeCode_FirstWord']} {row['SortedBus']} {row['ElementIdentifierName']}",
        axis=1,
    )

    df_reduced_copy.pop("CircuitTypeCode_FirstWord")  # No longer needed
    df_reduced_copy.pop("SortedBus")  # No longer needed

    # Sort the DataFrame by FromBus and ToBus
    df_reduced_copy = df_reduced_copy.sort_values(by=["FromBus", "ToBus"])

    # Make 'combo' the first column
    col = df_reduced_copy.pop("combo")
    df_reduced_copy.insert(loc=0, column="combo", value=col)

    return df_reduced_copy


def filter_tlines_by_latest_reported_year(df):
    """
    Filters a DataFrame to include only the first row for each unique combination of 'FromBus' and 'ToBus' columns,
    assuming 'FromBus', 'ToBus', and 'ReportingYearNbr' are already sorted in descending order by 'ReportingYearNbr'.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr' (sorted by 'ReportingYearNbr' descending).

    Returns:
        A new DataFrame containing the first row for each unique combination of 'FromBus' and 'ToBus' columns.
    """

    # Initialize variables to track current and previous values
    current_frombus = None
    current_tobus = None
    filtered_df = pd.DataFrame(
        columns=df.columns
    )  # Create empty DataFrame to store filtered rows

    # Iterate through each row
    for index, row in df.iterrows():
        frombus, tobus, _ = row["FromBus"], row["ToBus"], row["ReportingYearNbr"]

        # Check if new unique combination of 'FromBus' and 'ToBus' is encountered
        if (current_frombus != frombus) or (current_tobus != tobus):
            # Add previous row (if it exists) to the filtered DataFrame
            if current_frombus is not None and current_tobus is not None:
                try:
                    # Attempt to add the previous row using loc
                    filtered_df = pd.concat(
                        [filtered_df, df.loc[(current_frombus, current_tobus)]],
                        ignore_index=True,
                    )
                except KeyError:
                    # Handle potential KeyError (e.g., missing value in previous combination)
                    # You can choose a strategy like logging the error or skipping the row
                    print(
                        f"KeyError encountered for ({current_frombus}, {current_tobus}). Skipping row."
                    )

            # Update current values
            current_frombus = frombus
            current_tobus = tobus

        # Always append the current row (might be the first or subsequent for the same 'FromBus' and 'ToBus')
        filtered_df = pd.concat([filtered_df, row], ignore_index=True)

    return filtered_df

def get_latest_entries(dfTadsSorted):
    # Drop duplicates, keeping the first occurrence
    dfTadsLatest = dfTadsSorted.drop_duplicates(
        subset=["FromBus", "ToBus"], keep="first"
    )

    return dfTadsLatest

def sort_and_shift_columns(df):
    """
    Sorts a DataFrame by 'FromBus', 'ToBus', 'ReportingYearNbr' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'FromBus', 'ToBus', and 'ReportingYearNbr'.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus', 'ReportingYearNbr'
        with those three columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["FromBus", "ToBus", "ReportingYearNbr"], ascending=[True, True, False]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["FromBus", "ToBus", "ReportingYearNbr"] + [
        col
        for col in sorted_df.columns
        if col not in ["FromBus", "ToBus", "ReportingYearNbr"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]


    return shifted_df

def sort_and_shift_columns_dfVelo(df):
    """
    Sorts a DataFrame by 'From Sub', 'To Sub' and rearranges those columns to be first.

    Args:
        df: A pandas DataFrame containing columns 'From Sub', 'To Sub' apart from any other columns.

    Returns:
        A new pandas DataFrame with all columns sorted by 'FromBus', 'ToBus
        with those two columns positioned at the beginning.
    """

    # Sort by 'FromBus', 'ToBus', 'ReportingYearNbr' (descending order for ReportingYearNbr)
    sorted_df = df.sort_values(
        by=["From Sub", "To Sub"], ascending=[True, True]
    )

    # Define desired column order (efficient approach)
    desired_column_order = ["From Sub", "To Sub"] + [
        col for col in sorted_df.columns if col not in ["From Sub", "To Sub"]
    ]

    # Reorder columns using `.loc` indexing
    shifted_df = sorted_df.loc[:, desired_column_order]

    return shifted_df

def get_matched_entries(dfVeloSorted, dfTadsLatest):
    matched_rows = []

    # Iterate through both DataFrames
    for i in range(len(dfVeloSorted)):
        from_sub, to_sub = str(dfVeloSorted.iloc[i]["From Sub"]), str(
            dfVeloSorted.iloc[i]["To Sub"]
        )
        rec_id = dfVeloSorted.iloc[i]["Rec_ID"]
        for j in range(len(dfTadsLatest)):
            from_bus, to_bus = str(dfTadsLatest.iloc[j]["FromBus"]), str(
                dfTadsLatest.iloc[j]["ToBus"]
            )

            if (from_sub == from_bus and to_sub == to_bus) or (
                from_sub == to_bus and to_sub == from_bus
            ):
                matched_row = dfTadsLatest.iloc[j].copy()
                matched_row["Rec_ID"] = rec_id
                matched_rows.append(matched_row)

    dfTadsMatched = pd.DataFrame(matched_rows)

    return dfTadsMatched


def rearrangeColumns(df, col1="FromBus", col2="ToBus"):
    # Make a copy of the DataFrame to avoid modifying the original
    df = df.copy()

    # Iterate through each row and swap col1 and col2 if necessary
    for index, row in df.iterrows():
        value1 = str(row[col1])
        value2 = str(row[col2])

        if value1 > value2:
            df.at[index, col1] = value2
            df.at[index, col2] = value1

    return df

In [41]:
y = get_reduced_df(dfMatch)

In [42]:
y

Unnamed: 0,combo,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28715,ACO Aetna-Dune Acres 138006,138006,Northern Indiana Public Service Company [BA,RFC,Aetna,Dune Acres,,11.700,0.0,2.0,...,,,,,,,,1/1/15 0:00,,50026
28729,ACO Aetna-Lake George 138054,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.900,0.0,2.0,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22650
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.500,0.0,2.0,...,,,,,,,,1/1/15 0:00,,70313
41979,ACO Albers-Bain 63143,63143,American Transmission Company,MRO,Albers,Bain,,4.790,,2.0,...,,,,,,,,1/1/15 0:00,,71534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292357,ACO Tollway-Wayne 14402,14402,Commonwealth Edison Company,RFC,Tollway,Wayne,,5.598,,2.0,...,,,,,,,,1/1/15 0:00,,27894
286725,ACO University-Washington Park 17404,17404,Commonwealth Edison Company,RFC,University,Washington Park,,2.200,,2.0,...,,,,,,,,1/1/15 0:00,,2559
292165,ACO Waukegan-Zion 1609,1609,Commonwealth Edison Company,RFC,Waukegan,Zion,,12.275,,2.0,...,,,,,,,,1/1/15 0:00,,60900
300914,ACO Waukegan-Zion 2218,2218,Commonwealth Edison Company,RFC,Waukegan,Zion,,5.283,,2.0,...,,,,,,,,1/1/15 0:00,,60900


In [43]:
y[1:20]

Unnamed: 0,combo,ElementIdentifierName,CompanyName,RegionCode,FromBus,ToBus,TertiaryBus,Miles,BESExemptedFlag,NumberOfTerminals,...,InsulatorTypeCode,CableTypeCode,StructureMaterialCode,StructureTypeCode,CircuitsPerStructureCode,TerrainCode,ElevationCode,InServiceDate,RetirementDate,Rec_ID
28729,ACO Aetna-Lake George 138054,138054,Northern Indiana Public Service Company [BA,RFC,Aetna,Lake George,,4.9,0.0,2.0,...,,,,,,,,1/1/15 0:00,,60847
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.5,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22650
28739,ACO Aetna-Miller 138102,138102,Northern Indiana Public Service Company [BA,RFC,Aetna,Miller,,0.5,0.0,2.0,...,,,,,,,,1/1/15 0:00,,70313
41979,ACO Albers-Bain 63143,63143,American Transmission Company,MRO,Albers,Bain,,4.79,,2.0,...,,,,,,,,1/1/15 0:00,,71534
173167,ACO Albers-Kenosha 9352,9352,American Transmission Company,MRO,Albers,Kenosha,,3.97,,2.0,...,,,,,,,,1/1/15 0:00,,61461
184865,ACO Aptakisic-Libertyville 15410,15410,Commonwealth Edison Company,RFC,Aptakisic,Libertyville,,10.133,,2.0,...,,,,,,,,1/1/15 0:00,,2486
11121,ACO Arcadian-Zion 2222,2222,American Transmission Company,MRO,Arcadian,Zion,,53.1,,2.0,...,,,,,,,,1/1/15 0:00,12/30/22 0:00,52054
118743,ACO Aurora-Electric Junction 11119,11119,Commonwealth Edison Company,RFC,Aurora,Electric Junction,,1.433,,2.0,...,,,,,,,,1/1/15 0:00,,55397
118743,ACO Aurora-Electric Junction 11119,11119,Commonwealth Edison Company,RFC,Aurora,Electric Junction,,1.433,,2.0,...,,,,,,,,1/1/15 0:00,,69508
110995,ACO Babcock-Dune Acres 138075,138075,Northern Indiana Public Service Company [BA,RFC,Babcock,Dune Acres,,7.7,0.0,2.0,...,,,,,,,,1/1/15 0:00,,22653


In [44]:
dfMatchReduced = get_reduced_df(dfMatch)
matchReducedAddr = os.path.join(processedDataFolder, "chicago-ohare-lines.xlsx")
dfMatchReduced.to_excel(matchReducedAddr, index=False)

In [45]:
dfVelo

Unnamed: 0,Company Name,Transmission Line Name,Owner2,Voltage kV,Voltage Class kV,Number of Lines,Proposed,Underground,From Sub,To Sub,...,Length mi,Location Code,Source,Numeric Voltages,Holding Company Name,Owner2 ID,Rec_ID,Layer_ID,Type,Ownership Type
1,Commonwealth Edison Co,Belvidere to Marengo Tap 138 kV,,138,100-161,2,In Service,F,Belvidere,Marengo Tap,...,12.579538,1,Aerial Imagery,100-161,Exelon Corp,-99,2732,82,AC,IOU
2,Commonwealth Edison Co,Marengo Tap to Woodstock 138 kV,,138,100-161,1,In Service,F,Marengo Tap,Woodstock,...,11.489827,1,Aerial Imagery,100-161,Exelon Corp,-99,2737,82,AC,IOU
3,Commonwealth Edison Co,Marengo Tap to Marengo 138 kV,,138,100-161,2,In Service,F,Marengo Tap,Marengo,...,1.109239,1,Aerial Imagery,100-161,Exelon Corp,-99,55455,82,AC,IOU
4,Commonwealth Edison Co,McHenry to Crystal Lake 138 kV,,138,100-161,2,In Service,F,McHenry,Crystal Lake,...,5.882206,1,Aerial Imagery,100-161,Exelon Corp,-99,55456,82,AC,IOU
5,Commonwealth Edison Co,Marengo Tap to Pleasant Valley (Indope) 138 kV,,138,100-161,2,In Service,F,Marengo Tap,Pleasant Valley (Indope),...,10.764849,1,Aerial Imagery,100-161,Exelon Corp,-99,59314,82,AC,IOU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514,Northern Indiana Public Service Co LLC,Chicago Avenue to Praxair Inc No 1 138 kV,,138,100-161,1,In Service,F,Chicago Avenue,Praxair Inc No 1,...,2.533518,1,Aerial Imagery,100-161,NiSource Inc,-99,69262,82,AC,IOU
515,Northern Indiana Public Service Co LLC,Dune Acres to Michigan City 345 kV,,345,345,1,In Service,F,Dune Acres,Michigan City,...,11.776760,1,Aerial Imagery,345,NiSource Inc,-99,22656,82,AC,IOU
516,Northern Municipal Power Agency,Kenwood to Highland 138 kV,,138,100-161,1,In Service,F,Kenwood,Highland,...,1.736390,1,Aerial Imagery,100-161,Northern Municipal Power Agency,-99,59294,82,AC,Muni
517,Northern Municipal Power Agency,Highland to Lake George 138 kV,,138,100-161,1,In Service,F,Highland,Lake George,...,11.143082,1,Aerial Imagery,100-161,Northern Municipal Power Agency,-99,22658,82,AC,Muni


In [46]:
dfVelo['Rec_ID']

1       2732
2       2737
3      55455
4      55456
5      59314
       ...  
514    69262
515    22656
516    59294
517    22658
518    50022
Name: Rec_ID, Length: 459, dtype: int64

In [47]:
dfVelo[dfVelo['Rec_ID'] == 22650]

Unnamed: 0,Company Name,Transmission Line Name,Owner2,Voltage kV,Voltage Class kV,Number of Lines,Proposed,Underground,From Sub,To Sub,...,Length mi,Location Code,Source,Numeric Voltages,Holding Company Name,Owner2 ID,Rec_ID,Layer_ID,Type,Ownership Type
474,Northern Indiana Public Service Co LLC,Aetna to Miller 138 kV,,138,100-161,1,In Service,F,Aetna,Miller,...,0.54525,1,Aerial Imagery,100-161,NiSource Inc,-99,22650,82,AC,IOU


In [48]:
dfVelo[dfVelo['Rec_ID'] == 70313]

Unnamed: 0,Company Name,Transmission Line Name,Owner2,Voltage kV,Voltage Class kV,Number of Lines,Proposed,Underground,From Sub,To Sub,...,Length mi,Location Code,Source,Numeric Voltages,Holding Company Name,Owner2 ID,Rec_ID,Layer_ID,Type,Ownership Type
412,Undetermined Company,Miller to Aetna 138 kV,,138,100-161,1,In Service,F,Miller,Aetna,...,0.54393,1,Aerial Imagery,100-161,Unknown,-99,70313,82,AC,Unknown


In [49]:
dfVelo[dfVelo['Rec_ID'] == 55397]

Unnamed: 0,Company Name,Transmission Line Name,Owner2,Voltage kV,Voltage Class kV,Number of Lines,Proposed,Underground,From Sub,To Sub,...,Length mi,Location Code,Source,Numeric Voltages,Holding Company Name,Owner2 ID,Rec_ID,Layer_ID,Type,Ownership Type
213,Commonwealth Edison Co,Aurora to Electric Junction 138 kV,,138,100-161,2,In Service,F,Aurora,Electric Junction,...,1.25273,1,Aerial Imagery,100-161,Exelon Corp,-99,55397,82,AC,IOU


In [50]:
dfVelo[dfVelo['Rec_ID'] == 69508]

Unnamed: 0,Company Name,Transmission Line Name,Owner2,Voltage kV,Voltage Class kV,Number of Lines,Proposed,Underground,From Sub,To Sub,...,Length mi,Location Code,Source,Numeric Voltages,Holding Company Name,Owner2 ID,Rec_ID,Layer_ID,Type,Ownership Type
219,Undetermined Company,Aurora to Electric Junction 345 kV,,345,345,1,In Service,F,Aurora,Electric Junction,...,1.264928,3,Hitachi Energy,345,Unknown,-99,69508,82,AC,Unknown


In [51]:
dfMatchReduced['combo']

28715              ACO Aetna-Dune Acres 138006
28729             ACO Aetna-Lake George 138054
28739                  ACO Aetna-Miller 138102
28739                  ACO Aetna-Miller 138102
41979                    ACO Albers-Bain 63143
                          ...                 
292357                 ACO Tollway-Wayne 14402
286725    ACO University-Washington Park 17404
292165                  ACO Waukegan-Zion 1609
300914                  ACO Waukegan-Zion 2218
300927        ACO Zion-Zion Energy Center 2223
Name: combo, Length: 127, dtype: object

In [52]:
set(dfMatchReduced['combo'])

{'ACC Clybourn-Crosby 8207',
 'ACC Clybourn-Diversey 4013',
 'ACC Congress-Medical Center 6701',
 'ACC Crosby-Diversey 4018',
 'ACC Crosby-Ontario 8211',
 'ACC Damen-Evergreen Park 81414',
 'ACC Damen-Wallace 11801',
 'ACC Dekoven-Madison 3610',
 'ACC Des Plaines-Norridge 19801',
 'ACC Devon-Northwest 11411',
 'ACC Diversey-Northwest 11413',
 'ACC Galewood-Natoma 3701',
 'ACC Grand-Jefferson 4525',
 'ACC Grand-Madison 5810',
 'ACC Higgins-Natoma 3706',
 'ACC Humboldt Park-Rockwell 5001',
 'ACC IC Air Rights-Taylor 15311',
 'ACC Jefferson-Taylor 15302',
 'ACC Lasalle-Taylor 15316',
 'ACC Natoma-Norridge 3707',
 'ACC Natoma-Northwest 11412',
 'ACC Natoma-Oak Park 3709',
 'ACC Northwest-Rosehill 11407',
 'ACC Sears-Taylor 15304',
 'ACC Taylor-West Loop 15323',
 'ACO Aetna-Dune Acres 138006',
 'ACO Aetna-Lake George 138054',
 'ACO Aetna-Miller 138102',
 'ACO Albers-Bain 63143',
 'ACO Albers-Kenosha 9352',
 'ACO Aptakisic-Libertyville 15410',
 'ACO Arcadian-Zion 2222',
 'ACO Aurora-Electric

In [53]:
set(dfMatchReduced['combo']).__len__

<method-wrapper '__len__' of set object at 0x0000022E05E0FAC0>

In [54]:
length(set(dfMatchReduced['combo']))

NameError: name 'length' is not defined

In [55]:
size(set(dfMatchReduced['combo']))

NameError: name 'size' is not defined

In [56]:
set(dfMatchReduced['combo']).size()

AttributeError: 'set' object has no attribute 'size'

In [57]:
set(dfMatchReduced['combo']).length

AttributeError: 'set' object has no attribute 'length'

In [58]:
set(dfMatchReduced['combo']).length()

AttributeError: 'set' object has no attribute 'length'

In [59]:
length(set(dfMatchReduced['combo']))

NameError: name 'length' is not defined

In [60]:
len(set(dfMatchReduced['combo']))

117

Connected to .conda (Python 3.9.19)

In [1]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

In [2]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

from src.housekeeping_gads import (
    # do nothing lol
)

SyntaxError: invalid syntax (<ipython-input-2-91ab4b7cd46f>, line 18)

In [3]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

# from src.housekeeping_gads import (
#     # do nothing lol
# )

In [4]:
# pylint: disable=undefined-variable line-too-long invalid-name missing-function-docstring f-string-without-interpolation

try:
    fileAddr = __vsc_ipynb_file__
    wd = os.path.dirname(fileAddr)
    print("We seem to be working in a JuPyteR Notebook")
except ImportError:
    wd = os.getcwd()
    print("We seem to be working in a regular .py file")


rawDataFolder = os.path.join(wd, "rawData")
processedDataFolder = os.path.join(wd, "processedData/")

We seem to be working in a JuPyteR Notebook


In [5]:
gadsFileAddr = os.path.join(rawDataFolder, "GADS 2024 AC Inventory.csv")
dfGads0 = pd.read_csv(gadsFileAddr)
sizeGads0 = dfGads0.shape
print(f"Size of GADS db before filtering: {sizeGads0[0]}, {sizeGads0[1]}")
companyNamesGads0 = set(dfGads0.CompanyName)
numCompaniesGads0 = len(companyNamesGads0)
print(f"There are {numCompaniesGads0} unique companies owning tlines in the entire GADS database.")
# display(dfgads)

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\jhaa\\Documents\\documents_general\\extreme-weather-repo\\rawData\\GADS 2024 AC Inventory.csv'

In [6]:
gadsFileAddr = os.path.join(rawDataFolder, "GADS inventory 2024.csv")
dfGads0 = pd.read_csv(gadsFileAddr)
sizeGads0 = dfGads0.shape
print(f"Size of GADS db before filtering: {sizeGads0[0]}, {sizeGads0[1]}")
companyNamesGads0 = set(dfGads0.CompanyName)
numCompaniesGads0 = len(companyNamesGads0)
print(f"There are {numCompaniesGads0} unique companies owning tlines in the entire GADS database.")
# display(dfgads)

Size of GADS db before filtering: 11624, 40
There are 895 unique companies owning tlines in the entire GADS database.


In [7]:
gadsFileAddr = os.path.join(rawDataFolder, "GADS inventory 2024.csv")
dfGads0 = pd.read_csv(gadsFileAddr)
sizeGads0 = dfGads0.shape
print(f"Size of GADS db before filtering: {sizeGads0[0]}, {sizeGads0[1]}")
companyNamesGads0 = set(dfGads0.CompanyName)
numCompaniesGads0 = len(companyNamesGads0)
print(f"There are {numCompaniesGads0} unique companies owning tlines in the entire GADS database.")
# display(dfgads)

Size of GADS db before filtering: 11624, 40
There are 895 unique companies owning tlines in the entire GADS database.


In [8]:
location = "chicago-ohare"
components = 
veloFileAddr = os.path.join(rawDataFolder, "tlines-near-chicago-ohare-raw.xlsx") # tlines which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

SyntaxError: invalid syntax (<ipython-input-8-d77c174de64e>, line 3)

In [9]:
location = "chicago-ohare"
components = "genUnits"
filenameVeloGads = location + "-near-" + components
veloFileAddr = os.path.join(rawDataFolder, ) # tlines which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\rawData


PermissionError: [Errno 13] Permission denied: 'c:\\Users\\jhaa\\Documents\\documents_general\\extreme-weather-repo\\rawData'

In [10]:
filenameVeloGads = location + "-near-" + components

In [11]:
filenameVeloGads

'chicago-ohare-near-genUnits'

In [12]:
location = "chicago-ohare"
components = "genUnits"
ext = ".xlsx"
filenameVeloGads = location + "-near-" + components + ext
veloFileAddr = os.path.join(rawDataFolder, filenameVeloGads) # gen units which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\rawData\chicago-ohare-near-genUnits.xlsx


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\jhaa\\Documents\\documents_general\\extreme-weather-repo\\rawData\\chicago-ohare-near-genUnits.xlsx'

In [13]:
location = "chicago-ohare"
components = "genUnits"
ext = ".xlsx"
filenameVeloGads = components + "-near-" + location + ext
veloFileAddr = os.path.join(rawDataFolder, filenameVeloGads) # gen units which are <= 50miles from `Chicago/Ohare` weather station
print(veloFileAddr)
dfVelo0 = pd.read_excel(veloFileAddr, engine='openpyxl')
sizeVelo0 = dfVelo0.shape
print(f"Size of velocity suite db before any filtering: {sizeVelo0[0]}, {sizeVelo0[1]}")
# dfVelo0

c:\Users\jhaa\Documents\documents_general\extreme-weather-repo\rawData\genUnits-near-chicago-ohare.xlsx
Size of velocity suite db before any filtering: 291, 18


In [14]:
# Filter rows with 'Undetermined Company`
# dfVelo = dfVelo0[ dfVelo0['Company Name'] != 'Undetermined Company' ]
# Filter tlines with less than 100kV voltage
dfVelo = dfVelo0.copy()
dfVelo = dfVelo[ dfVelo['Voltage kV'] >= 100 ]
# Filter tlines not currently in service
dfVelo = dfVelo[ dfVelo['Proposed'] == 'In Service']

sizeVelo = dfVelo.shape
print(f"Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': {sizeVelo[0]}, {sizeVelo[1]}")
companyNamesVelo = set(dfVelo['Company Name'])
numCompaniesVelo = len(companyNamesVelo)
print(f"There are {numCompaniesVelo} named companies owning the tlines near {location}")
print(f"Their names are:")
print(companyNamesVelo)
# dfVelo

KeyError: 'Voltage kV'

In [15]:
# Filter rows with 'Undetermined Company`
# dfVelo = dfVelo0[ dfVelo0['Company Name'] != 'Undetermined Company' ]
# Filter tlines with less than 100kV voltage
dfVelo = dfVelo0.copy()
# dfVelo = dfVelo[ dfVelo['Voltage kV'] >= 100 ]
# Filter tlines not currently in service
# dfVelo = dfVelo[ dfVelo['Proposed'] == 'In Service']

sizeVelo = dfVelo.shape
print(f"Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': {sizeVelo[0]}, {sizeVelo[1]}")
companyNamesVelo = set(dfVelo['Company Name'])
numCompaniesVelo = len(companyNamesVelo)
print(f"There are {numCompaniesVelo} named companies owning the tlines near {location}")
print(f"Their names are:")
print(companyNamesVelo)
# dfVelo

Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': 291, 18


KeyError: 'Company Name'

In [16]:
# Filter rows with 'Undetermined Company`
# dfVelo = dfVelo0[ dfVelo0['Company Name'] != 'Undetermined Company' ]
# Filter tlines with less than 100kV voltage
dfVelo = dfVelo0.copy()
# dfVelo = dfVelo[ dfVelo['Voltage kV'] >= 100 ]
# Filter tlines not currently in service
# dfVelo = dfVelo[ dfVelo['Proposed'] == 'In Service']

sizeVelo = dfVelo.shape
# print(f"Size of velocity suite db after filtering for Company Names, Voltage [kV] and 'Proposed': {sizeVelo[0]}, {sizeVelo[1]}")
# companyNamesVelo = set(dfVelo['Company Name'])
# numCompaniesVelo = len(companyNamesVelo)
# print(f"There are {numCompaniesVelo} named companies owning the tlines near {location}")
# print(f"Their names are:")
# print(companyNamesVelo)
# dfVelo

In [17]:
# print(f"Now let's see how many tlines are owned by these {numCompaniesVelo} "       "companies in the entire GADS database:")

# print(""f"But first I'll need to rename some companies in vs db to match with the exact strings of the GADS db.")

# companyNamesVelo2Gads = companyNamesVelo.copy()  # Create a copy to avoid modifying the original

# # Replace the element using the 'discard' method (more efficient for sets)
# companyNamesVelo2Gads.discard("Commonwealth Edison Co")
# companyNamesVelo2Gads.add("Commonwealth Edison Company")
# companyNamesVelo2Gads.discard("AmerenIP")
# companyNamesVelo2Gads.add("Ameren Services Company")
# companyNamesVelo2Gads.discard("American Transmission Co LLC")
# companyNamesVelo2Gads.add("American Transmission Company")
# companyNamesVelo2Gads.discard("Northern Indiana Public Service Co LLC")
# companyNamesVelo2Gads.add("Northern Indiana Public Service Company [BA")
# companyNamesVelo2Gads.discard("Northern Municipal Power Agency")
# companyNamesVelo2Gads.add("Northern Indiana Public Service Company [BA")
# companyNamesVelo2Gads.discard("Undetermined Company")
# companyNamesVelo2Gads.add("Commonwealth Edison Company")
# print(companyNamesVelo2Gads)

# dfVeloSorted = sort_and_shift_columns_dfVelo(dfVelo)
dfVeloSorted = dfVelo
veloSortedAddr = os.path.join(processedDataFolder, "dfVelo-"+components+"-"+location+"-Sorted"+ext)
dfVeloSorted.to_excel(veloSortedAddr)

In [18]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

from src.housekeeping_gads import (
    filter_states # Forward Declaration
)

In [19]:
dfGads = dfGads0.copy()
# dfGads = dfGads[dfGads['CompanyName'].isin(companyNamesVelo2Gads)]
# voltageClassesGads0 = set(dfGads['VoltageClassCodeName'])
# print(voltageClassesGads0)
# voltageClassesAllowedGads = voltageClassesGads0.copy()
# voltageClassesAllowedGads.discard("0-99 kV")

# dfGads = dfGads[dfGads['VoltageClassCodeName'].isin(voltageClassesAllowedGads)]
dfGads = filter_states(dfGads)
sizeGads = dfGads.shape
print(f"Size of GADS db after filtering: {sizeGads[0]}, {sizeGads[1]}")

# dfGadsSorted = sort_and_shift_columns(dfGads)

# gadsSortedAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Sorted.xlsx")

# dfGadsSorted.to_excel(gadsSortedAddr, index=False)

# # dfGadsLatest = filter_tlines_by_latest_reported_year(dfGadsSorted)
# dfGadsLatest = get_latest_entries(dfGadsSorted)

# sizeGadsLatest = dfGadsLatest.shape

# print(f"Size of GADS db after filtering for only latest reported year: {sizeGadsLatest[0]}, {sizeGadsLatest[1]}")

# gadsLatestAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Latest.xlsx")

# dfGadsLatest.to_excel(gadsLatestAddr)

Size of GADS db after filtering: 0, 40


In [20]:
dfGads = dfGads0.copy()
# dfGads = dfGads[dfGads['CompanyName'].isin(companyNamesVelo2Gads)]
# voltageClassesGads0 = set(dfGads['VoltageClassCodeName'])
# print(voltageClassesGads0)
# voltageClassesAllowedGads = voltageClassesGads0.copy()
# voltageClassesAllowedGads.discard("0-99 kV")

# dfGads = dfGads[dfGads['VoltageClassCodeName'].isin(voltageClassesAllowedGads)]
dfGads = filter_states(dfGads)
sizeGads = dfGads.shape
print(f"Size of GADS db after filtering: {sizeGads[0]}, {sizeGads[1]}")

# dfGadsSorted = sort_and_shift_columns(dfGads)

# gadsSortedAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Sorted.xlsx")

# dfGadsSorted.to_excel(gadsSortedAddr, index=False)

# # dfGadsLatest = filter_tlines_by_latest_reported_year(dfGadsSorted)
# dfGadsLatest = get_latest_entries(dfGadsSorted)

# sizeGadsLatest = dfGadsLatest.shape

# print(f"Size of GADS db after filtering for only latest reported year: {sizeGadsLatest[0]}, {sizeGadsLatest[1]}")

# gadsLatestAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Latest.xlsx")

# dfGadsLatest.to_excel(gadsLatestAddr)

Size of GADS db after filtering: 0, 40


In [21]:
import os
from collections import defaultdict
import re
import pandas as pd

from src.housekeeping import (
    # filter_tlines_by_latest_reported_year,  # Forward Declaration
    get_latest_entries, # Forward Declaration
    get_matched_entries, # Forward Declaration
    get_reduced_df, # Forward Declaration
    sort_and_shift_columns, # Forward Declaration
    sort_and_shift_columns_dfVelo, # Forward Declaration
)

from src.housekeeping_gads import (
    filter_states # Forward Declaration
)

In [22]:
dfGads = dfGads0.copy()
# dfGads = dfGads[dfGads['CompanyName'].isin(companyNamesVelo2Gads)]
# voltageClassesGads0 = set(dfGads['VoltageClassCodeName'])
# print(voltageClassesGads0)
# voltageClassesAllowedGads = voltageClassesGads0.copy()
# voltageClassesAllowedGads.discard("0-99 kV")

# dfGads = dfGads[dfGads['VoltageClassCodeName'].isin(voltageClassesAllowedGads)]
dfGads = filter_states(dfGads)
sizeGads = dfGads.shape
print(f"Size of GADS db after filtering: {sizeGads[0]}, {sizeGads[1]}")

# dfGadsSorted = sort_and_shift_columns(dfGads)

# gadsSortedAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Sorted.xlsx")

# dfGadsSorted.to_excel(gadsSortedAddr, index=False)

# # dfGadsLatest = filter_tlines_by_latest_reported_year(dfGadsSorted)
# dfGadsLatest = get_latest_entries(dfGadsSorted)

# sizeGadsLatest = dfGadsLatest.shape

# print(f"Size of GADS db after filtering for only latest reported year: {sizeGadsLatest[0]}, {sizeGadsLatest[1]}")

# gadsLatestAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Latest.xlsx")

# dfGadsLatest.to_excel(gadsLatestAddr)

Size of GADS db after filtering: 0, 40


In [23]:
dfGads

Unnamed: 0,UnitID,UtilityCode,UtilityName,UnitCode,UnitName,UtilityUnitCode,RatingMW,RatingMW_grp,UnitTypeCode,UnitTypeCodeDesc,...,NERCID_AliasID,RegionCode,SubRegionName,ExtractionDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,AliasID,IsCurrent


In [24]:
gadsFileAddr = os.path.join(rawDataFolder, "GADS inventory 2024.csv")
dfGads0 = pd.read_csv(gadsFileAddr)
sizeGads0 = dfGads0.shape
print(f"Size of GADS db before filtering: {sizeGads0[0]}, {sizeGads0[1]}")
companyNamesGads0 = set(dfGads0.CompanyName)
numCompaniesGads0 = len(companyNamesGads0)
print(f"There are {numCompaniesGads0} unique companies owning tlines in the entire GADS database.")
# display(dfgads)

Size of GADS db before filtering: 11624, 40
There are 895 unique companies owning tlines in the entire GADS database.


In [25]:
dfGads = dfGads0.copy()
# dfGads = dfGads[dfGads['CompanyName'].isin(companyNamesVelo2Gads)]
# voltageClassesGads0 = set(dfGads['VoltageClassCodeName'])
# print(voltageClassesGads0)
# voltageClassesAllowedGads = voltageClassesGads0.copy()
# voltageClassesAllowedGads.discard("0-99 kV")

# dfGads = dfGads[dfGads['VoltageClassCodeName'].isin(voltageClassesAllowedGads)]
dfGads = filter_states(dfGads)
sizeGads = dfGads.shape
print(f"Size of GADS db after filtering: {sizeGads[0]}, {sizeGads[1]}")

# dfGadsSorted = sort_and_shift_columns(dfGads)

# gadsSortedAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Sorted.xlsx")

# dfGadsSorted.to_excel(gadsSortedAddr, index=False)

# # dfGadsLatest = filter_tlines_by_latest_reported_year(dfGadsSorted)
# dfGadsLatest = get_latest_entries(dfGadsSorted)

# sizeGadsLatest = dfGadsLatest.shape

# print(f"Size of GADS db after filtering for only latest reported year: {sizeGadsLatest[0]}, {sizeGadsLatest[1]}")

# gadsLatestAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Latest.xlsx")

# dfGadsLatest.to_excel(gadsLatestAddr)

Size of GADS db after filtering: 0, 40


In [26]:
dfGads

Unnamed: 0,UnitID,UtilityCode,UtilityName,UnitCode,UnitName,UtilityUnitCode,RatingMW,RatingMW_grp,UnitTypeCode,UnitTypeCodeDesc,...,NERCID_AliasID,RegionCode,SubRegionName,ExtractionDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,AliasID,IsCurrent


In [27]:
dfGasd0

NameError: name 'dfGasd0' is not defined

In [28]:
dfGads0

Unnamed: 0,UnitID,UtilityCode,UtilityName,UnitCode,UnitName,UtilityUnitCode,RatingMW,RatingMW_grp,UnitTypeCode,UnitTypeCodeDesc,...,NERCID_AliasID,RegionCode,SubRegionName,ExtractionDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,AliasID,IsCurrent
0,5225,934,Mid-Set Cogeneration Company,301,Mid-Set Cogeneration #1,934301,35.0,1.0,860,Co-generator Block\t,...,0x8F556D9551A5DC25589FF741B87A4310,WECC,WECC,2017-03-02 16:20:01.033,,2017-03-22 23:22:38.307,4675,1,0xC283EE6CE291A7CAD48AA411619603E9,1
1,10108,209,VINELAND MUNICIPAL ELECTRIC UTILITIES,391,Vineland CT 11,209391,59.9,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5967F80A0B5CBF7817E63C8F228F5885,RFC,RFC - PJM,2017-02-13 15:55:00.647,,2017-03-22 23:22:38.307,9148,1,0xF65721851DF7CAB86A39D862240A53D9,1
2,12135,630,Minnesota Municipal Power Agency,801,Shakopee Energy Park,630801,46.7,1.0,800,Miscellaneous,...,0xA99FDBEE11D34711CF8BE164BAA2E434,MRO,MRO,2017-02-14 20:20:02.077,,2017-03-22 23:22:38.307,11005,1,0xCB702F1DCCE6057310203FABAC054AB6,1
3,1215,450,Buckeye Power,360,GREENVILLE #1,450360,58.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0xF97C22645CBF0B981BB097D4A5F230F8,RFC,RFC,2017-03-23 15:30:00.513,,2017-03-23 16:00:05.970,11017,1,0x486F65135155CF15856A9AB68D2A559D,1
4,1216,450,Buckeye Power,361,GREENVILLE #2,450361,58.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0xF97C22645CBF0B981BB097D4A5F230F8,RFC,RFC,2017-03-23 15:30:00.513,,2017-03-23 16:00:05.970,11018,1,0x8D9B6D4F15F3ABDCA36A0E55FC141E1B,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11619,10149,917,Idaho Power Company,544,MILNER #2,917544,12.1,1.0,500,Pumped Storage/Hydro,...,0xF074C7196E9E0B1908E2C3D3D5E4FA00,WECC,WECC,2024-05-15 21:15:00.520,,2024-05-15 22:00:10.187,36316,1,0xBCB86FAE353770C036BDE208D5F48B69,1
11620,10150,917,Idaho Power Company,545,MILNER #3,917545,0.8,1.0,500,Pumped Storage/Hydro,...,0xF074C7196E9E0B1908E2C3D3D5E4FA00,WECC,WECC,2024-05-15 21:25:00.570,,2024-05-15 22:00:10.187,36317,1,0x2A498FD1E7D7391C5D0B8527EF7BE55B,1
11621,11706,2F9,Racine Hydro,500,Racine #1,2F9500,24.0,1.0,500,Pumped Storage/Hydro,...,0x93B9959FAF5F568A1E80FD3BBDE20CD1,RFC,RFC - PJM,2024-05-16 07:20:00.233,,2024-05-16 08:00:08.747,36318,1,0x5174E0425DB7E4922B2BF88051B2C653,1
11622,11707,2F9,Racine Hydro,501,Racine #2,2F9501,24.0,1.0,500,Pumped Storage/Hydro,...,0x93B9959FAF5F568A1E80FD3BBDE20CD1,RFC,RFC - PJM,2024-05-16 07:20:00.233,,2024-05-16 08:00:08.747,36319,1,0x60ED78B8304296D01A94C4E947EEA2AD,1


In [29]:
dfGads0['StateName'] == "Wisconsin"

0        False
1        False
2        False
3        False
4        False
         ...  
11619    False
11620    False
11621    False
11622    False
11623    False
Name: StateName, Length: 11624, dtype: bool

In [30]:
dfGads0[dfGads0['StateName'] == "Wisconsin"]

Unnamed: 0,UnitID,UtilityCode,UtilityName,UnitCode,UnitName,UtilityUnitCode,RatingMW,RatingMW_grp,UnitTypeCode,UnitTypeCodeDesc,...,NERCID_AliasID,RegionCode,SubRegionName,ExtractionDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,AliasID,IsCurrent
10,9428,521,Wisconsin Electric Power Co.,117,Port Washington #1,521117,80.0,1.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11053,1,0xAC063A7AFE8B135AC23ADD451A06A71E,1
11,9429,521,Wisconsin Electric Power Co.,118,Oak Creek #1,521118,120.0,2.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11054,1,0xEB8BF986B7DF680841C7F62D662840DB,1
13,9432,521,Wisconsin Electric Power Co.,127,Port Washington #2,521127,80.0,1.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11057,1,0x0ADB5861D864A6D3CF9D86E3CBF5E2EA,1
14,9433,521,Wisconsin Electric Power Co.,128,Oak Creek #2,521128,120.0,2.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11058,1,0x59E17F2C453768E91FFC76A75E2FED8E,1
16,9435,521,Wisconsin Electric Power Co.,137,Port Washington #3,521137,80.0,1.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11060,1,0x1FF49127A6342F6A97162FE4699826AE,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11381,5082,520,Madison Gas and Electric Co.,302,Fitchburg #2,520302,30.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5B627CCFD6ECD814C078E6F95CF56156,MRO,MRO,2024-05-07 18:45:00.450,,2024-05-07 19:00:09.220,36036,1,0x43751F2C67170C9959832FBD2E449F00,1
11382,5083,520,Madison Gas and Electric Co.,311,Sycamore #1,520311,21.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5B627CCFD6ECD814C078E6F95CF56156,MRO,MRO,2024-05-07 19:30:00.483,,2024-05-07 20:00:04.643,36037,1,0xFE77204045354BC6505076B51F6DA00B,1
11383,5084,520,Madison Gas and Electric Co.,312,Sycamore #2,520312,23.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5B627CCFD6ECD814C078E6F95CF56156,MRO,MRO,2024-05-07 19:20:00.893,,2024-05-07 20:00:04.643,36038,1,0x53B0C590CDB68E7F5345B3F263690596,1
11384,5088,520,Madison Gas and Electric Co.,333,West Marinette,520333,106.0,2.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5B627CCFD6ECD814C078E6F95CF56156,MRO,MRO,2024-05-07 20:10:00.487,,2024-05-07 21:00:11.157,36041,1,0x638071A3508EE8EEFF24F603D635D303,1


In [31]:
dfGads0[dfGads0['StateName'] == "Indiana"]

Unnamed: 0,UnitID,UtilityCode,UtilityName,UnitCode,UnitName,UtilityUnitCode,RatingMW,RatingMW_grp,UnitTypeCode,UnitTypeCodeDesc,...,NERCID_AliasID,RegionCode,SubRegionName,ExtractionDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,AliasID,IsCurrent
570,4508,413,Indianapolis Power & Light Company,125,EAGLE Valley #1,413125,46.0,1.0,100,Fossil-Steam,...,0x1521DD6F9E7C099C891D31F85A3D1153,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13639,1,0x1C255D6419A52890CCAE63904564E991,1
571,4509,413,Indianapolis Power & Light Company,126,EAGLE Valley #2,413126,46.0,1.0,100,Fossil-Steam,...,0x1521DD6F9E7C099C891D31F85A3D1153,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13640,1,0xE917D86162CA232656CB3B54A30DCD7B,1
572,4510,413,Indianapolis Power & Light Company,127,EAGLE Valley #3,413127,48.0,1.0,100,Fossil-Steam,...,0x1521DD6F9E7C099C891D31F85A3D1153,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13641,1,0x5271044ACCEA0A0F2CBC9B39FE58B1C7,1
573,4511,413,Indianapolis Power & Light Company,128,EAGLE Valley #4,413128,69.0,1.0,100,Fossil-Steam,...,0x1521DD6F9E7C099C891D31F85A3D1153,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13642,1,0x73B33B1A61D17611441B231AF16C092E,1
574,4512,413,Indianapolis Power & Light Company,129,EAGLE Valley #5,413129,69.0,1.0,100,Fossil-Steam,...,0x1521DD6F9E7C099C891D31F85A3D1153,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13643,1,0xA5CEA9DBACC29DF547C67BB6EBA2A0ED,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11275,2576,430,Cinergy,143,GIBSON #3,430143,618.0,3.0,100,Fossil-Steam,...,0x7E2A1071640F863D8C2F1D5FE99CA3AE,RFC,RFC - MISO,2024-05-01 18:35:00.503,,2024-05-01 19:00:06.147,35904,1,0xDF015FBF99DFCD9D775CE3C7CAE33B9B,1
11276,2577,430,Cinergy,144,GIBSON #4,430144,618.0,3.0,100,Fossil-Steam,...,0x7E2A1071640F863D8C2F1D5FE99CA3AE,RFC,RFC - MISO,2024-05-01 18:35:00.503,,2024-05-01 19:00:06.147,35905,1,0x8F07B6470DCB795D657252BE3CAD0758,1
11342,12352,413,Indianapolis Power & Light Company,348,Eagle Valley GT1,413348,230.0,3.0,851,CC GT units\t,...,0x1521DD6F9E7C099C891D31F85A3D1153,RFC,RFC - MISO,2024-05-03 14:40:00.350,,2024-05-03 15:00:12.127,35975,1,0xA5B1273E1394170B7F45FECFF5F95AAC,1
11343,12353,413,Indianapolis Power & Light Company,349,Eagle Valley GT2,413349,230.0,3.0,851,CC GT units\t,...,0x1521DD6F9E7C099C891D31F85A3D1153,RFC,RFC - MISO,2024-05-03 14:45:00.510,,2024-05-03 15:00:12.127,35976,1,0x6F0B4E33E90D65A8A29F0FF016548E97,1


In [32]:
dfGads0[dfGads0['StateName'] == "Illinois"]

Unnamed: 0,UnitID,UtilityCode,UtilityName,UnitCode,UnitName,UtilityUnitCode,RatingMW,RatingMW_grp,UnitTypeCode,UnitTypeCodeDesc,...,NERCID_AliasID,RegionCode,SubRegionName,ExtractionDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,AliasID,IsCurrent
101,12274,545,"Prairie Power, Inc",306,Alsey Unit 6,545306,45.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x77135E77DE03E2F16CB940C3B3074AB9,SERC,Gateway,2018-01-31 14:15:00.810,,2018-01-31 15:00:01.470,12055,1,0x3328FC6DA5E4C36D6A1666B6E44A7385,1
294,2829,510,Edison Mission Energy,101,COLLINS #1,510101,554.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13084,1,0x3E22C8F14AC30EF4E3F90858F57917A1,1
295,2830,510,Edison Mission Energy,102,COLLINS #2,510102,554.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13085,1,0xA742DD440617C11B42B9E193110C83F4,1
296,2831,510,Edison Mission Energy,103,COLLINS #3,510103,530.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13086,1,0x92861ABF4F8815A455AA5518708660CC,1
297,2832,510,Edison Mission Energy,104,COLLINS #4,510104,530.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13087,1,0x38B27BFE548DF9F526A05221EEDDCC94,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11213,2361,528,"Kincaid Generation, LLC",113,KINCAID #2,528113,580.0,3.0,100,Fossil-Steam,...,0x83C96E5B8B7FC1ADED8C380AEEA60908,MRO,MRO,2024-04-29 18:30:00.597,,2024-04-29 19:00:12.270,35785,1,0x588F4A0760D365899662B1D5D00A8320,1
11238,3957,526,GenOn Energy (RELIANT ENERGY),315,SHELBY CT E,526315,44.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x4D7467E7A84C0A46AD8D18586B45E93B,SERC,SERC,2024-04-29 23:50:00.453,,2024-04-30 00:00:07.970,35810,1,0x9CE4931D42518BB2822B58C06E56A3AF,1
11239,3958,526,GenOn Energy (RELIANT ENERGY),316,SHELBY CT F,526316,44.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x4D7467E7A84C0A46AD8D18586B45E93B,SERC,SERC,2024-04-29 23:55:01.163,,2024-04-30 00:00:07.970,35811,1,0x5D8AE84A1767961050DCDB622F4511FB,1
11240,3959,526,GenOn Energy (RELIANT ENERGY),317,SHELBY CT G,526317,44.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x4D7467E7A84C0A46AD8D18586B45E93B,SERC,SERC,2024-04-29 23:55:01.163,,2024-04-30 00:00:07.970,35812,1,0xC1C419578B15102BB692B6FEA54C4588,1


In [33]:
def filter_states(dfGads, states_to_keep=["Illinois", "Indiana", "Wisconsin"]):
    # Define the list of state names to filter
    # states_to_keep = ["IL", "IN", "WI"]

    # Filter the DataFrame
    dfFiltered = dfGads[dfGads["StateName"].isin(states_to_keep)]

    return dfFiltered

In [34]:
filter_states(dfGads0)

Unnamed: 0,UnitID,UtilityCode,UtilityName,UnitCode,UnitName,UtilityUnitCode,RatingMW,RatingMW_grp,UnitTypeCode,UnitTypeCodeDesc,...,NERCID_AliasID,RegionCode,SubRegionName,ExtractionDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,AliasID,IsCurrent
10,9428,521,Wisconsin Electric Power Co.,117,Port Washington #1,521117,80.0,1.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11053,1,0xAC063A7AFE8B135AC23ADD451A06A71E,1
11,9429,521,Wisconsin Electric Power Co.,118,Oak Creek #1,521118,120.0,2.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11054,1,0xEB8BF986B7DF680841C7F62D662840DB,1
13,9432,521,Wisconsin Electric Power Co.,127,Port Washington #2,521127,80.0,1.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11057,1,0x0ADB5861D864A6D3CF9D86E3CBF5E2EA,1
14,9433,521,Wisconsin Electric Power Co.,128,Oak Creek #2,521128,120.0,2.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11058,1,0x59E17F2C453768E91FFC76A75E2FED8E,1
16,9435,521,Wisconsin Electric Power Co.,137,Port Washington #3,521137,80.0,1.0,100,Fossil-Steam,...,0x039386647F44BC49D31ACC98C07983BB,RFC,RFC,2017-03-23 18:45:00.360,,2017-03-23 19:00:00.843,11060,1,0x1FF49127A6342F6A97162FE4699826AE,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11381,5082,520,Madison Gas and Electric Co.,302,Fitchburg #2,520302,30.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5B627CCFD6ECD814C078E6F95CF56156,MRO,MRO,2024-05-07 18:45:00.450,,2024-05-07 19:00:09.220,36036,1,0x43751F2C67170C9959832FBD2E449F00,1
11382,5083,520,Madison Gas and Electric Co.,311,Sycamore #1,520311,21.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5B627CCFD6ECD814C078E6F95CF56156,MRO,MRO,2024-05-07 19:30:00.483,,2024-05-07 20:00:04.643,36037,1,0xFE77204045354BC6505076B51F6DA00B,1
11383,5084,520,Madison Gas and Electric Co.,312,Sycamore #2,520312,23.0,1.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5B627CCFD6ECD814C078E6F95CF56156,MRO,MRO,2024-05-07 19:20:00.893,,2024-05-07 20:00:04.643,36038,1,0x53B0C590CDB68E7F5345B3F263690596,1
11384,5088,520,Madison Gas and Electric Co.,333,West Marinette,520333,106.0,2.0,300,Gas Turbine/Jet Engine (Simple Cycle Operation),...,0x5B627CCFD6ECD814C078E6F95CF56156,MRO,MRO,2024-05-07 20:10:00.487,,2024-05-07 21:00:11.157,36041,1,0x638071A3508EE8EEFF24F603D635D303,1


In [35]:
dfGads = dfGads0.copy()
# dfGads = dfGads[dfGads['CompanyName'].isin(companyNamesVelo2Gads)]
# voltageClassesGads0 = set(dfGads['VoltageClassCodeName'])
# print(voltageClassesGads0)
# voltageClassesAllowedGads = voltageClassesGads0.copy()
# voltageClassesAllowedGads.discard("0-99 kV")

# dfGads = dfGads[dfGads['VoltageClassCodeName'].isin(voltageClassesAllowedGads)]
dfGads = filter_states(dfGads0)
sizeGads = dfGads.shape
print(f"Size of GADS db after filtering: {sizeGads[0]}, {sizeGads[1]}")

# dfGadsSorted = sort_and_shift_columns(dfGads)

# gadsSortedAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Sorted.xlsx")

# dfGadsSorted.to_excel(gadsSortedAddr, index=False)

# # dfGadsLatest = filter_tlines_by_latest_reported_year(dfGadsSorted)
# dfGadsLatest = get_latest_entries(dfGadsSorted)

# sizeGadsLatest = dfGadsLatest.shape

# print(f"Size of GADS db after filtering for only latest reported year: {sizeGadsLatest[0]}, {sizeGadsLatest[1]}")

# gadsLatestAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Latest.xlsx")

# dfGadsLatest.to_excel(gadsLatestAddr)

Size of GADS db after filtering: 818, 40


In [36]:
dfGads['EIACode']

10       4040
11       4039
13       4040
14       4039
16       4040
         ... 
11381    3991
11382    3993
11383    3993
11384    7799
11385    9674
Name: EIACode, Length: 818, dtype: object

In [37]:
def filter_states(dfGads, states_to_keep=["Illinois", "Indiana", "Wisconsin"]):
    # Define the list of state names to filter
    # states_to_keep = ["IL", "IN", "WI"]

    # Filter the DataFrame
    dfFiltered = dfGads[dfGads["StateName"].isin(states_to_keep)]

    return dfFiltered

def filter_by_eia_code(dfVelo, dfGads):
    # Get the unique 'EIA ID' values from dfVelo
    eia_ids = dfVelo["EIA ID"].unique()

    # Filter dfGads to include only rows where 'EIACode' is in the list of 'EIA ID' values
    dfFiltered = dfGads[dfGads["EIACode"].isin(eia_ids)]

    return dfFiltered

In [38]:
dfVelo

Unnamed: 0,Plant Name,Plant Operator Name,Operating Cap MW,Planned Cap MW,Retired Cap MW,Canceled Cap MW,Mothballed Cap MW,Description,City,State,County,ZIP Code,Proposed,Location Code,Source,EIA ID,Layer_ID,Rec_ID
0,9521 US 14 Solar 1 LLC,9521 US 14 Solar 1 LLC,0.0,2.0,0.0,0.00,0,1 PL SOL PV(s),,IL,Mchenry,,T,2,Lat and Long,,69,32460
1,Lincoln Solar,Allco Finance Group Ltd,1.5,0.0,0.0,0.00,0,1 OP SOL PV(s),Merrillville,IN,Lake,46410.0,F,3,FERC eLibrary,58496,69,11844
2,Portage Solar,Allco Finance Group Ltd,1.5,0.0,0.0,0.00,0,1 OP SOL PV(s),Valparaiso,IN,Porter,46368.0,F,3,FERC eLibrary,60046,69,11824
3,Argo Power Project,Alliant Energy Corp,0.0,0.0,0.0,687.00,0,1 CN NG CC(s),Bedford Park,IL,Cook,60501.0,T,3,Hitachi Energy,906,69,8180
4,Alsip Paper Condominium Associates,Alsip Paper Condominium Association,0.0,0.0,6.9,0.00,0,1 RE NG GT(s),Alsip,IL,Cook,60658.0,F,3,Hitachi Energy,10406,69,8234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286,Wolfcastle Project,Wolfcastle Solar LLC,2.0,0.0,0.0,0.00,0,1 OP SOL PV(s),,IL,Dekalb,,F,2,Latitude and Longitude,64736,69,28249
287,Woodlawn II Project,Woodlawn Solar II LLC,2.0,0.0,0.0,0.00,0,1 OP SOL PV(s),Crete,IL,Will,60417.0,F,2,Latitude and Longitude,64733,69,28250
288,Woodlawn Project,Woodlawn Solar LLC,2.0,0.0,0.0,0.00,0,1 OP SOL PV(s),Crete,IL,Will,60417.0,F,2,Latitude and Longitude,64785,69,28251
289,Zion Municipal Landfill Solar,Zion (City of),0.0,0.0,0.0,2.43,0,1 CN SOL PV(s),,IL,Lake,,T,2,Street Address,,69,22809


In [39]:
dfVelo['EIA ID']

0        NaN
1      58496
2      60046
3        906
4      10406
       ...  
286    64736
287    64733
288    64785
289      NaN
290    55392
Name: EIA ID, Length: 291, dtype: object

In [40]:
dfGads['EIACode']

10       4040
11       4039
13       4040
14       4039
16       4040
         ... 
11381    3991
11382    3993
11383    3993
11384    7799
11385    9674
Name: EIACode, Length: 818, dtype: object

In [41]:
dfGadsFiltered = filter_by_eia_code(dfVelo, dfGads)

In [42]:
dfGadsFiltered

Unnamed: 0,UnitID,UtilityCode,UtilityName,UnitCode,UnitName,UtilityUnitCode,RatingMW,RatingMW_grp,UnitTypeCode,UnitTypeCodeDesc,...,NERCID_AliasID,RegionCode,SubRegionName,ExtractionDT,DeletionDT,NERC_DataPullDT,ID_SK,Rnk,AliasID,IsCurrent
294,2829,510,Edison Mission Energy,101,COLLINS #1,510101,554.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13084,1,0x3E22C8F14AC30EF4E3F90858F57917A1,1
295,2830,510,Edison Mission Energy,102,COLLINS #2,510102,554.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13085,1,0xA742DD440617C11B42B9E193110C83F4,1
296,2831,510,Edison Mission Energy,103,COLLINS #3,510103,530.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13086,1,0x92861ABF4F8815A455AA5518708660CC,1
297,2832,510,Edison Mission Energy,104,COLLINS #4,510104,530.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13087,1,0x38B27BFE548DF9F526A05221EEDDCC94,1
298,2833,510,Edison Mission Energy,105,COLLINS #5,510105,530.0,3.0,100,Fossil-Steam,...,0x3EB7A81A4A916415991F1222F16BEB36,RFC,RFC,2018-05-17 19:05:01.080,,2018-05-17 20:00:05.287,13088,1,0x0802695A0C9BFFE68BC4B9761EB43EA0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9897,3392,506,Exelon (ComEd),125,RIDGELAND #1,506125,160.0,2.0,100,Fossil-Steam,...,0x7A934D0BCD299CBB52A3966013A53769,RFC,RFC,2023-07-25 20:20:00.200,,2023-07-25 21:00:08.007,33507,1,0xACD96BACE0F36E67E10D88053F324B9F,1
9898,3393,506,Exelon (ComEd),126,RIDGELAND #2,506126,160.0,2.0,100,Fossil-Steam,...,0x7A934D0BCD299CBB52A3966013A53769,RFC,RFC,2023-07-25 20:20:00.200,,2023-07-25 21:00:08.007,33508,1,0x141AC8E2E957E7F4B6BE018EB00C70EE,1
9899,3394,506,Exelon (ComEd),127,RIDGELAND #3,506127,160.0,2.0,100,Fossil-Steam,...,0x7A934D0BCD299CBB52A3966013A53769,RFC,RFC,2023-07-25 20:20:00.200,,2023-07-25 21:00:08.007,33509,1,0x83788DA62C59B57BD6DD616927661A63,1
9900,3395,506,Exelon (ComEd),128,RIDGELAND #4,506128,160.0,2.0,100,Fossil-Steam,...,0x7A934D0BCD299CBB52A3966013A53769,RFC,RFC,2023-07-25 20:20:00.200,,2023-07-25 21:00:08.007,33510,1,0x110C8A705ABC8068E95FDDB246019BC9,1


In [43]:
dfGads = dfGads0.copy()
# dfGads = dfGads[dfGads['CompanyName'].isin(companyNamesVelo2Gads)]
# voltageClassesGads0 = set(dfGads['VoltageClassCodeName'])
# print(voltageClassesGads0)
# voltageClassesAllowedGads = voltageClassesGads0.copy()
# voltageClassesAllowedGads.discard("0-99 kV")

# dfGads = dfGads[dfGads['VoltageClassCodeName'].isin(voltageClassesAllowedGads)]
dfGads = filter_states(dfGads0)
sizeGads = dfGads.shape
print(f"Size of GADS db after filtering: {sizeGads[0]}, {sizeGads[1]}")

# dfGadsSorted = sort_and_shift_columns(dfGads)

# gadsSortedAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Sorted.xlsx")

# dfGadsSorted.to_excel(gadsSortedAddr, index=False)

# # dfGadsLatest = filter_tlines_by_latest_reported_year(dfGadsSorted)
# dfGadsLatest = get_latest_entries(dfGadsSorted)

# sizeGadsLatest = dfGadsLatest.shape

# print(f"Size of GADS db after filtering for only latest reported year: {sizeGadsLatest[0]}, {sizeGadsLatest[1]}")

# gadsLatestAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Latest.xlsx")

# dfGadsLatest.to_excel(gadsLatestAddr)

Size of GADS db after filtering: 818, 40


In [44]:
dfMatch = filter_by_eia_code(dfVelo, dfGads)
# dfMatch = get_matched_entries(dfVeloSorted, dfGadsLatest)
# matchAddr = os.path.join(processedDataFolder, "dfGads-Chicago-Ohare-Matched.xlsx")
matchAddr = os.path.join(
    processedDataFolder, "dfGads-" + components + "-" + location + "-Matched" + ext
)
dfMatch.to_excel(matchAddr)

In [45]:
dfVelo['Rec_ID']

0      32460
1      11844
2      11824
3       8180
4       8234
       ...  
286    28249
287    28250
288    28251
289    22809
290    10811
Name: Rec_ID, Length: 291, dtype: int64

In [46]:
def filter_states(dfGads, states_to_keep=["Illinois", "Indiana", "Wisconsin"]):
    # Define the list of state names to filter
    # states_to_keep = ["IL", "IN", "WI"]

    # Filter the DataFrame
    dfFiltered = dfGads[dfGads["StateName"].isin(states_to_keep)]

    return dfFiltered

def filter_by_eia_code0(dfVelo, dfGads):
    # Get the unique 'EIA ID' values from dfVelo
    eia_ids = dfVelo["EIA ID"].unique()

    # Filter dfGads to include only rows where 'EIACode' is in the list of 'EIA ID' values
    dfFiltered = dfGads[dfGads["EIACode"].isin(eia_ids)]

    return dfFiltered


def filter_by_eia_code(dfVelo, dfGads):
    # Merge the two DataFrames on 'EIA ID' and 'EIACode' columns
    dfMerged = pd.merge(
        dfGads,
        dfVelo[["EIA ID", "Rec_ID"]],
        left_on="EIACode",
        right_on="EIA ID",
        how="inner",
    )

    # Drop the duplicate 'EIA ID' column from the merge
    dfFiltered = dfMerged.drop(columns=["EIA ID"])

    return dfFiltered