In [1]:
import pandas as pd
import ast
import pytz
import os
import json
import yaml
from dateutil import tz
from datetime import datetime
from datetime import timedelta
from IPython.display import display

pd.set_option('display.max_columns', None)

class BasePipeline_old:
    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        self.map = {}
        self._data = pd.DataFrame({})
    
    def construct_file_path(self):
        #TODO: add type to prefix mapping
        file_prefix = 'per_outage' if self.config['type'] == 'o' else 'per_county'
        file_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/{file_prefix}_{self.config['name']}.csv"
        return file_path.replace('//', '/')

    def load_data(self):
        # TODO: use us zipcode database
        try:
            file_path = self.construct_file_path()
            print(file_path)
            self._data = pd.read_csv(file_path)
            with open(f"{self.config['state']}_mapping.json", 'r') as json_file:
                self.map = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")
            
    def transform(self):
        # Base transformation method
        raise NotImplementedError

    def standardize(self):
        # Base transformation method
        # where most of the functionality is
        self.load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')
        
        self._data['utility_provider'] = self.config['name']
        self._data['state'] = self.config['state']
        self._data['county'] = self._data['zipcode'].map(self.map)
        
        self._data = self._data[[
            'utility_provider', 'state', 'county', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng', 
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean', 'total_customer_outage_time', 'total_customer_outage_time_max', 'total_customer_outage_time_mean'
        ]]
        
        return self._data
    
    def output_data(self, standard_data):
        # TODO: Output unified data
        pass
    
    def get_dataframe(self):
        return self._data
    
    def _compute_metrics(self, group):
        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        duration_max = duration + 15
        duration_mean = (duration + duration_max) / 2
        customer_affected_mean = group['customer_affected'].mean()
        
        total_customer_outage_time = 15 * (group['customer_affected'].sum() - group['customer_affected'].iloc[0]) + (group['timestamp'].iloc[0] - group['start_time'].iloc[0]).total_seconds() / 60 * group['customer_affected'].iloc[0]
        total_customer_outage_time_max = total_customer_outage_time + 15 * group['customer_affected'].iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })
        
    def _check_other_vars(self):
        # TODO: Check other useful variables
        pass
    
class GA11TX12(BasePipeline_old):    
    def __init__(self, config, base_file_path):
        super().__init__(config, base_file_path)
        with open('us_mapping.json', 'r') as json_file:
            self._usmap = json.load(json_file)
    
    def transform(self): # if edited, must recreate pipeline to reset transformed flag
        #### HELPER METHOD
        def _reformat_start_date(row): # taking the row and reformating its 'start_date' based on timestamp
            # Split the date string into components
            # start_date format: 03/15 05:28 pm
            month_day, time, ampm = row['start_date'].split(' ') # taking the row's start_date and parsing it
            # Split the month and day and determine year
            s_month, s_day = month_day.split('/')
            year = None
            # Determining year using timestamp as start_date does not include year
            if pd.notna(row['timestamp']): # if the timestamp value for the given row is not null (hopefully it works)
            # timestamp format: 01-18-2024 15:25:06
                timestamp_components = row['timestamp'].split(' ')
                ts_date_comp = timestamp_components[0].split('-')
                t_month, t_day = ts_date_comp[0], ts_date_comp[1]
                t_year = pd.to_numeric(ts_date_comp[2])
                if t_month == '01' and s_month == '12':
                    # if the timestamp is in january but the start time is reported to be in december, make the update time have the previous year (year of Jan - 1)
                    year = str(int(t_year) - 1)
                else: # use timestamp year
                    year = t_year 
            else: # if no timestamp
            # for Walton, Tri-State, Oconee, and Mitchell, the na timestamps are in march 2023
                year = '2023'

            # Extract the hour and minute from the time
            hour, minute = time.split(':')

            # Convert hour to 24-hour format
            if 'am' in ampm.lower() and hour == '12': # if 12 am, then set to 00
                hour = '00' 
            if 'pm' in ampm.lower():
                hour = str(int(hour) + 12) if int(hour) < 12 else hour

            # Add leading zeros if necessary
            hour = hour.zfill(2)
            minute = minute.zfill(2)

            # Combine the components into the desired format
            reformatted_date = f'{s_month}-{s_day}-{year} {hour}:{minute}:00'

            return reformatted_date


        # Helper method
        def _reformat_update(row):
            # Format: Mar 15, 5 09, pm
            ## Splitting updateTime into components month, day, hour, min, am/pm
            month_day, time, ampm = row['updateTime'].split(',') # split into date; time; and pm/am 
            # Splitting into month, day
            u_month, u_day = month_day.split(' ')
            month_dict = { 'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 
                        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12' }
            u_month = month_dict[u_month] # turning the 3 char month name to its two digit form
            
            # Determining year from timestamp as updateTime does not have year
            year = None
            if pd.notna(row['timestamp']): # if the timestamp value for the given row is not null (hopefully it works)
                # timestamp format: 01-18-2024 15:25:06
                timestamp_components = row['timestamp'].split(' ')
                ts_date_comp = timestamp_components[0].split('-')
                t_month, t_day = ts_date_comp[0], ts_date_comp[1]
                t_year = pd.to_numeric(ts_date_comp[2])

                if t_month == '01' and u_month == '12':
                    # if the timestamp is in january but the start time is reported to be in december, make the update time have the previous year (year of Jan - 1)
                    year = str(int(t_year) - 1)
                else: # use timestamp year
                    year = t_year 
            else: # if no timestamp
            # for Walton, Tri-State, Oconee, and Mitchell, the na timestamps are in march 2023
                year = '2023'

            # Extract the hour and minute from the update time
            hour, minute = time.split() # splits based on whtie space which is space in this case and removed leading space
            # Convert hour to 24-hour format
            if 'am' in ampm.lower() and hour == '12': # if 12 am, then set to 00
                hour = '00' 
            if 'pm' in ampm.lower():
                hour = str(int(hour) + 12) if int(hour) < 12 else hour # if pm and less than 12, add 12 hours  --> othertest it is 12 pm

            # Add leading zeros if necessary
            hour = hour.zfill(2)
            minute = minute.zfill(2)

            # Combine the components into the desired format
            reformatted_date = f'{u_month}-{u_day}-{year} {hour}:{minute}:00'

            return reformatted_date

        ## HELPER METHOD ####
        

        try:
            self._data['start_date'] = self._data.apply(_reformat_start_date, axis=1) # reformattin the string to be datetime string form
            self._data['start_date'] = pd.to_datetime(self._data['start_date']) # change to timestamp data format
            self._data['updateTime'] = self._data.apply(_reformat_update, axis=1)
            self._data['updateTime'] = pd.to_datetime(self._data['updateTime'])
            self._data['duration'] = pd.to_timedelta(self._data['duration'])
            self._data['timestamp'] = pd.to_datetime(self._data['timestamp']) # THIS HAS TO BE BEFORE applying reformat functions as splitting only works on timestamp in string form, not datetime

            # Renaming column names to match with superclass standardize
            self._data = self._data.rename(columns={
                'incident_id':'outage_id',
                'zip_code':'zipcode'
            })
            
            # # Eliminating outage_id's with multiple start dates since there are not that many of them
            # df = self._data
            # grouped = df.groupby('outage_id')['start_date'].nunique()
            # multi_start_outages = grouped[grouped > 1].index.tolist()
            # self._data = df[~df['outage_id'].isin(multi_start_outages)]


        except Exception as e:
            print(f"An error occurred during transformation: {e}")

    def standardize(self):
        # Specific transformation for GA11TX12
        # print(self.config)
        self.load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = grouped

    def _compute_metrics(self, group): # overwriting super class because the most accurate duration seems to be calculated from longest duration
        # group = groupby groupy of a unique outage_id
                
        duration_diff = group['duration'].max()
        duration_max = duration_diff + timedelta(minutes=15) # because 15 minute update intervals
        duration_mean = (duration_diff + duration_max) / 2
        start_time = group['start_date'].iloc[0]
        end_time = start_time + duration_diff
        customer_affected_mean = group['consumers_affected'].mean()
        total_customer_outage_time = customer_affected_mean * duration_diff
        zipcode = group['zipcode'].iloc[-1]
        zipcode_values = None
        
        null_zipcode = [None, None, None] 
        try:
            zipcode_values = self.map[zipcode] # the tuple of values from zipcode map (county name, fip, state) 
        except KeyError:
            try:
                zipcode_values = self._usmap[zipcode]
            except KeyError:
                # print(f"Nonexistent zipcode in {self.config['name']}: {zipcode}")        
                zipcode_values = null_zipcode

        return pd.Series({
            'start_time': start_time,
            'end_time': end_time,
            'lat': group['lat'].iloc[-1],
            'long': group['lon'].iloc[-1],
            'zipcode': zipcode,
            'county_name': zipcode_values[0], 
            'county_fips': zipcode_values[1],
            'utility_provider': self.config['name'],
            'state': zipcode_values[2],
            'duration_diff': duration_diff,
            'duration_max': duration_max,
            'duration_mean': duration_mean,
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time
        })


One Time Code to Generate the mappings from US_zips


In [2]:
# us_zip_path = '/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/US Zipcode Data/uszips.csv'
# with open(us_zip_path, 'r') as us_zip:
#     us_zip_df = pd.read_csv(us_zip)
#     us_zip_df.to_csv('filename.csv', index=False)

      # # Creating a US zip file  
#     us_zip_dict = dict(zip(us_zip_df['zip'], zip(us_zip_df['county_name'], us_zip_df['county_fips'], us_zip_df['state_name'])))
#     with open("us_mapping.json", 'w') as json_file:
#         json.dump(us_zip_dict, json_file)


# # Creating GA mapping
# states = ['Georgia', 'Texas'] # states to generate mappings for (update as needed)
# state_abbr_map = {
#     'Georgia': 'ga',
#     'Texas': 'tx'
# }
# for state in states:
#     state_df = us_zip_df[us_zip_df['state_name'] == state]
#     state_zip_dict = dict(zip(state_df['zip'], zip(state_df['county_name'], state_df['county_fips'], state_df['state_name'])))
#     with open(f"{state_abbr_map[state]}_mapping.json", 'w') as json_file:
#         json.dump(state_zip_dict, json_file)

Establishing config files

In [3]:
local_config_path = '/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/app/pipeline/config.yaml'
with open(local_config_path, 'r') as file:
    config = yaml.safe_load(file)
    base_file_path = config['globals']['local_base_file_path']

# Test Code to Feel out and Validate Dataframe

Loading configs manually and separately

In [4]:
config_list = [
    (walton_conf := {'name': 'Walton EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}),
    (tris_conf := {'name': 'Tri-State EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}),
    (oconee_conf := {'name': 'Oconee EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}),
    (mitchell_conf := {'name': 'Mitchell EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}),
    (wise_conf := {'name': 'Wise Electric Coop, Inc.', 'state': 'tx', 'layout': 12, 'type': 'o'}),
    (houston_conf := {'name': 'Houston County Electric Coop, Inc.', 'state': 'tx', 'layout': 12, 'type': 'o'}),
    (cherokee_conf := {'name': 'Cherokee County Electric Coop Association', 'state': 'tx', 'layout': 12, 'type': 'o'})
]

pipelines = [
    (walton_pipe := GA11TX12(walton_conf, base_file_path)),
    (tris_pipe := GA11TX12(tris_conf, base_file_path)),
    (oconee_pipe := GA11TX12(oconee_conf, base_file_path)),
    (mitchell_pipe := GA11TX12(mitchell_conf, base_file_path)),
    (wise_pipe := GA11TX12(wise_conf, base_file_path)),
    (houston_pipe := GA11TX12(houston_conf, base_file_path)),
    (cherokee_pipe := GA11TX12(cherokee_conf, base_file_path))
]

for pipeline in pipelines:
    pipeline.load_data()
    
# for pipeline in pipelines:
#     pipeline.transform()

dataframes = [
    (walton_df := walton_pipe._data),
    (tris_df := tris_pipe._data),
    (oconee_df := oconee_pipe._data),
    (mitchell_df := mitchell_pipe._data),
    (wise_df := wise_pipe._data),
    (houston_df := houston_pipe._data),
    (cherokee_df := cherokee_pipe._data)
]

display(walton_df)

/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Walton EMC.csv


  self._data = pd.read_csv(file_path)


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Tri-State EMC.csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Oconee EMC.csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Mitchell EMC.csv


  self._data = pd.read_csv(file_path)


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Wise Electric Coop, Inc..csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Houston County Electric Coop, Inc..csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Cherokee County Electric Coop Association.csv


  self._data = pd.read_csv(file_path)


Unnamed: 0,substation,feeder,incident_id,estimated_restore_time,formatted_ert,start_date,duration,consumers_affected,lon,lat,service_index_name,outages,NumConsumers,zip_code,isHighTraffic,updateTime,EMC,alias,outage_comment,timestamp,opt_code,poly
0,14.0,3.0,C950581,,,03/15 04:28 pm,00 hr 11 min,1,-83.669285,33.911106,Electric,1,137416,30656,False,"Mar 15, 3 39, pm",Walton EMC,,,,,
1,1.0,3.0,C950582,,,03/15 04:48 pm,00 hr 05 min,1,-83.703931,33.760364,Electric,1,137416,30655,False,"Mar 15, 3 54, pm",Walton EMC,,,,,
2,1.0,3.0,C950582,,,03/15 04:48 pm,00 hr 21 min,1,-83.703931,33.760364,Electric,1,137416,30655,False,"Mar 15, 4 09, pm",Walton EMC,,,,,
3,1.0,3.0,C950582,,,03/15 04:48 pm,00 hr 35 min,1,-83.703931,33.760364,Electric,2,137416,30655,False,"Mar 15, 4 24, pm",Walton EMC,,,,,
4,14.0,1.0,C950583,,,03/15 05:14 pm,00 hr 10 min,1,-83.768580,33.900673,Electric,2,137416,30620,False,"Mar 15, 4 24, pm",Walton EMC,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71389,11.0,5.0,D1009642,,,01/18 08:15 am,01 hr 54 min,3,-83.440692,33.872791,Electric,15,139225,30677,False,"Jan 18, 9 10, am",Walton EMC,EXPERIMENT STAT/ 15 KVA,,01-18-2024 15:25:06,,"[{'lon': '-83.4402683290629', 'lat': '33.87192..."
71390,6.0,1.0,C1009637,,,01/18 07:37 am,02 hr 32 min,1,-84.043121,33.810690,Electric,15,139225,30039,False,"Jan 18, 9 10, am",Walton EMC,,,01-18-2024 15:25:06,,
71391,34.0,3.0,C1009640,,,01/18 08:08 am,02 hr 00 min,1,-83.435911,33.739100,Electric,15,139225,30638,False,"Jan 18, 9 10, am",Walton EMC,,,01-18-2024 15:25:06,,
71392,4.0,2.0,C1009654,,,01/18 09:28 am,00 hr 41 min,1,-83.512373,33.571093,Electric,15,139225,30650,False,"Jan 18, 9 10, am",Walton EMC,,,01-18-2024 15:25:06,,


Seeing what columns have mostly null values

In [5]:
for df in dataframes:
    print(df['timestamp'].iloc[0])
    print(df['timestamp'].iloc[-1])
    print()

print(walton_df.info())

nan
01-18-2024 15:25:06

nan
01-18-2024 14:39:51

nan
01-18-2024 13:40:27

nan
01-18-2024 15:25:40

04-15-2023 18:17:23
01-17-2024 19:32:27

04-14-2023 12:32:11
01-17-2024 15:32:20

04-14-2023 15:01:56
01-18-2024 15:31:56

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71394 entries, 0 to 71393
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   substation              71275 non-null  float64
 1   feeder                  71274 non-null  float64
 2   incident_id             71394 non-null  object 
 3   estimated_restore_time  0 non-null      float64
 4   formatted_ert           0 non-null      float64
 5   start_date              71394 non-null  object 
 6   duration                71394 non-null  object 
 7   consumers_affected      71394 non-null  int64  
 8   lon                     71394 non-null  float64
 9   lat                     71394 non-null  float64
 10  service_index_name      713

In [6]:
null_rate = 0
print("Non-null proportion, null count, and number unique per column across all providers")
for df in dataframes:
    print(df['EMC'].iloc[0])
    print(f"# of columns: {len(df.columns)}")
    null_rate = null_rate + df.isnull().sum() / len(df)
    
    non_null_count = df.count()
    null_count = len(df) - non_null_count
    non_null_prop = non_null_count / len(df)
    unique_count = df.nunique()

    # Create DataFrame with non-null proportions and null counts
    result_df = pd.DataFrame({
        'non_null_proportion': non_null_prop,
        'null_count': null_count,
        'num_unique': unique_count
    })

    display(result_df)

print("Summed Null rate across all df's")
print(null_rate / len(dataframes))







Non-null proportion, null count, and number unique per column across all providers
Walton EMC
# of columns: 22


Unnamed: 0,non_null_proportion,null_count,num_unique
substation,0.998333,119,30
feeder,0.998319,120,8
incident_id,1.0,0,14690
estimated_restore_time,0.0,71394,0
formatted_ert,0.0,71394,0
start_date,1.0,0,9046
duration,1.0,0,1531
consumers_affected,1.0,0,333
lon,1.0,0,12541
lat,1.0,0,12540


Tri-State EMC
# of columns: 21


Unnamed: 0,non_null_proportion,null_count,num_unique
substation,1.0,0,7
feeder,0.999878,5,14
incident_id,1.0,0,1591
alias,0.898727,4160,955
outage_comment,0.061153,38565,75
estimated_restore_time,0.006987,40790,46
formatted_ert,0.006987,40790,46
start_date,1.0,0,1418
duration,1.0,0,8244
consumers_affected,1.0,0,214


Oconee EMC
# of columns: 22


Unnamed: 0,non_null_proportion,null_count,num_unique
substation,0.998026,118,14
feeder,0.997457,152,6
incident_id,1.0,0,7935
estimated_restore_time,0.0,59770,0
formatted_ert,0.0,59770,0
start_date,1.0,0,4054
duration,1.0,0,1010
consumers_affected,1.0,0,138
lon,1.0,0,5408
lat,1.0,0,5407


Mitchell EMC
# of columns: 22


Unnamed: 0,non_null_proportion,null_count,num_unique
substation,0.885988,14116,20
feeder,0.885988,14116,8
incident_id,1.0,0,7344
alias,0.0,123812,0
outage_comment,0.367275,78339,423
estimated_restore_time,8e-06,123811,1
formatted_ert,8e-06,123811,1
start_date,1.0,0,5755
duration,1.0,0,10172
consumers_affected,1.0,0,261


Wise Electric Coop, Inc.
# of columns: 20


Unnamed: 0,non_null_proportion,null_count,num_unique
substation,1.0,0,23
feeder,0.998956,11,18
incident_id,1.0,0,2188
estimated_restore_time,0.001423,10525,1
formatted_ert,0.001423,10525,1
start_date,1.0,0,1611
duration,1.0,0,587
consumers_affected,1.0,0,133
lon,1.0,0,1835
lat,1.0,0,1835


Houston County Electric Coop, Inc.
# of columns: 22


Unnamed: 0,non_null_proportion,null_count,num_unique
substation,1.0,0,11
feeder,0.996663,47,9
incident_id,1.0,0,1660
alias,0.907477,1303,780
outage_comment,0.102677,12637,145
estimated_restore_time,0.370802,8861,618
formatted_ert,0.370802,8861,618
start_date,1.0,0,1460
duration,1.0,0,1314
consumers_affected,1.0,0,257


Cherokee County Electric Coop Association
# of columns: 22


Unnamed: 0,non_null_proportion,null_count,num_unique
substation,0.999825,9,13
feeder,0.999709,15,6
incident_id,1.0,0,4780
alias,0.0,51507,0
outage_comment,0.012678,50854,17
estimated_restore_time,1.9e-05,51506,1
formatted_ert,1.9e-05,51506,1
start_date,1.0,0,3212
duration,1.0,0,2263
consumers_affected,1.0,0,219


Summed Null rate across all df's
EMC                       0.000000
NumConsumers              0.000000
alias                     0.710222
consumers_affected        0.000000
duration                  0.000000
estimated_restore_time    0.945823
feeder                    0.017576
formatted_ert             0.945823
incident_id               0.000000
isHighTraffic             0.000000
lat                       0.019902
lon                       0.019902
opt_code                       NaN
outage_comment            0.911656
outages                   0.000000
poly                           NaN
service_index_name        0.000000
start_date                0.000000
substation                0.016832
timestamp                 0.009554
updateTime                0.000000
zip_code                  0.000000
dtype: float64


Seeing the rate of null values, we see that across all the providers:
- outage_comment
- estimated_restore_time
- formatted ert
- alias

have significantly high % of nulls. Thus we will not use them.
Other unused variables:

- feeder
- isHighTraffic
- poly
- opt_code
- substation




### Sample Views of Transformation, Validation, and Compute Metrics

In [7]:
# Control which df to test on
test_conf = mitchell_conf 
test_p = GA11TX12(test_conf, base_file_path)
test_p.load_data()


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Mitchell EMC.csv


  self._data = pd.read_csv(file_path)


#### Issues with Raw Data and Validating Transform:
- The updateTime and start_date do not have the year included, so we need to utilize the timestamp to calculate the years in order to convert to a datetime format to calculate metrics
- However, some rows have null timestamps.

In [8]:
# Display the rows of each dataframe where timestamp is null
for dataframe in dataframes:
    display(dataframe[dataframe['timestamp'].isna()][['incident_id', 'start_date', 'updateTime', 'timestamp']])

Unnamed: 0,incident_id,start_date,updateTime,timestamp
0,C950581,03/15 04:28 pm,"Mar 15, 3 39, pm",
1,C950582,03/15 04:48 pm,"Mar 15, 3 54, pm",
2,C950582,03/15 04:48 pm,"Mar 15, 4 09, pm",
3,C950582,03/15 04:48 pm,"Mar 15, 4 24, pm",
4,C950583,03/15 05:14 pm,"Mar 15, 4 24, pm",
...,...,...,...,...
1735,C951589,03/29 03:09 pm,"Mar 29, 4 09, pm",
1736,C951590,03/29 04:46 pm,"Mar 29, 4 09, pm",
1737,C951589,03/29 03:09 pm,"Mar 29, 4 24, pm",
1738,C951589,03/29 03:09 pm,"Mar 29, 4 39, pm",


Unnamed: 0,incident_id,start_date,updateTime,timestamp
0,D760313,03/16 09:34 pm,"Mar 16, 8 39, pm",
1,D760313,03/16 09:34 pm,"Mar 16, 8 54, pm",
2,D760313,03/16 09:34 pm,"Mar 16, 9 09, pm",
3,D760353,03/19 09:16 am,"Mar 19, 8 24, am",
4,D760353,03/19 09:16 am,"Mar 19, 8 39, am",
...,...,...,...,...
202,D760657,03/28 11:19 am,"Mar 28, 12 09, pm",
203,D760660,03/28 01:59 pm,"Mar 28, 1 09, pm",
204,D760660,03/28 01:59 pm,"Mar 28, 1 24, pm",
205,D760660,03/28 01:59 pm,"Mar 28, 1 39, pm",


Unnamed: 0,incident_id,start_date,updateTime,timestamp
0,C434216,03/15 04:31 pm,"Mar 15, 3 40, pm",
1,C434216,03/15 04:31 pm,"Mar 15, 3 55, pm",
2,C434216,03/15 04:31 pm,"Mar 15, 4 10, pm",
3,C434216,03/15 04:31 pm,"Mar 15, 4 25, pm",
4,C434219,03/16 07:42 pm,"Mar 16, 6 55, pm",
...,...,...,...,...
1140,D434618,03/29 04:52 pm,"Mar 29, 4 10, pm",
1141,D434617,03/29 04:35 pm,"Mar 29, 4 25, pm",
1142,D434618,03/29 04:52 pm,"Mar 29, 4 25, pm",
1143,D434617,03/29 04:35 pm,"Mar 29, 4 40, pm",


Unnamed: 0,incident_id,start_date,updateTime,timestamp
0,G437006,03/15 07:14 am,"Mar 15, 3 25, pm",
1,C483513,03/15 02:24 pm,"Mar 15, 3 25, pm",
2,G437006,03/15 07:14 am,"Mar 15, 3 40, pm",
3,G437006,03/15 07:14 am,"Mar 15, 3 55, pm",
4,G437006,03/15 07:14 am,"Mar 15, 4 10, pm",
...,...,...,...,...
2262,G437006,03/29 07:31 am,"Mar 29, 3 40, pm",
2263,G437006,03/29 07:31 am,"Mar 29, 3 55, pm",
2264,G437006,03/29 07:31 am,"Mar 29, 4 10, pm",
2265,G437006,03/29 07:31 am,"Mar 29, 4 25, pm",


Unnamed: 0,incident_id,start_date,updateTime,timestamp


Unnamed: 0,incident_id,start_date,updateTime,timestamp


Unnamed: 0,incident_id,start_date,updateTime,timestamp


As you can see, only the first 4 of the 7 dataframes (which coincide with the GA providers) have null timestamps and they are all in March 2023 (this initial dataset created from 3/2023 to 1/2024).

Hence, from now on for future updates of the dataset, we will assume there are NO MORE null timestamps and the only null ones will be 3/2023

#### Transformation Before and After

Creating a transform_test function to separately try out transformations outside of the final transform function above

In [9]:
def transform_test(pipeline): # if edited, must recreate pipeline to reset transformed flag
    #### HELPER METHOD
    def _reformat_start_date(row): # taking the row and reformating its 'start_date' based on timestamp
        # Split the date string into components
        # start_date format: 03/15 05:28 pm
        month_day, time, ampm = row['start_date'].split(' ') # taking the row's start_date and parsing it
        # Split the month and day and determine year
        s_month, s_day = month_day.split('/')
        year = None
        # Determining year using timestamp as start_date does not include year
        if pd.notna(row['timestamp']): # if the timestamp value for the given row is not null (hopefully it works)
        # timestamp format: 01-18-2024 15:25:06
            timestamp_components = row['timestamp'].split(' ')
            ts_date_comp = timestamp_components[0].split('-')
            t_month, t_day = ts_date_comp[0], ts_date_comp[1]
            t_year = pd.to_numeric(ts_date_comp[2])
            if t_month == '01' and s_month == '12':
                # if the timestamp is in january but the start time is reported to be in december, make the update time have the previous year (year of Jan - 1)
                year = str(int(t_year) - 1)
            else: # use timestamp year
                year = t_year 
        else: # if no timestamp
        # for Walton, Tri-State, Oconee, and Mitchell, the na timestamps are in march 2023
            year = '2023'

        # Extract the hour and minute from the time
        hour, minute = time.split(':')

        # Convert hour to 24-hour format
        if 'am' in ampm.lower() and hour == '12': # if 12 am, then set to 00
            hour = '00' 
        if 'pm' in ampm.lower():
            hour = str(int(hour) + 12) if int(hour) < 12 else hour

        # Add leading zeros if necessary
        hour = hour.zfill(2)
        minute = minute.zfill(2)

        # Combine the components into the desired format
        reformatted_date = f'{s_month}-{s_day}-{year} {hour}:{minute}:00'

        return reformatted_date


    # Helper method
    def _reformat_update(row):
        # Format: Mar 15, 5 09, pm
        ## Splitting updateTime into components month, day, hour, min, am/pm
        month_day, time, ampm = row['updateTime'].split(',') # split into date; time; and pm/am 
        # Splitting into month, day
        u_month, u_day = month_day.split(' ')
        month_dict = { 'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 
                    'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12' }
        u_month = month_dict[u_month] # turning the 3 char month name to its two digit form
        
        # Determining year from timestamp as updateTime does not have year
        year = None
        if pd.notna(row['timestamp']): # if the timestamp value for the given row is not null (hopefully it works)
            # timestamp format: 01-18-2024 15:25:06
            timestamp_components = row['timestamp'].split(' ')
            ts_date_comp = timestamp_components[0].split('-')
            t_month, t_day = ts_date_comp[0], ts_date_comp[1]
            t_year = pd.to_numeric(ts_date_comp[2])

            if t_month == '01' and u_month == '12':
                # if the timestamp is in january but the start time is reported to be in december, make the update time have the previous year (year of Jan - 1)
                year = str(int(t_year) - 1)
            else: # use timestamp year
                year = t_year 
        else: # if no timestamp
        # for Walton, Tri-State, Oconee, and Mitchell, the na timestamps are in march 2023
            year = '2023'

        # Extract the hour and minute from the update time
        hour, minute = time.split() # splits based on whtie space which is space in this case and removed leading space
        # Convert hour to 24-hour format
        if 'am' in ampm.lower() and hour == '12': # if 12 am, then set to 00
            hour = '00' 
        if 'pm' in ampm.lower():
            hour = str(int(hour) + 12) if int(hour) < 12 else hour # if pm and less than 12, add 12 hours  --> othertest it is 12 pm

        # Add leading zeros if necessary
        hour = hour.zfill(2)
        minute = minute.zfill(2)

        # Combine the components into the desired format
        reformatted_date = f'{u_month}-{u_day}-{year} {hour}:{minute}:00'

        return reformatted_date

    ## HELPER METHOD ####
    

    try:
        pipeline._data['start_date'] = pipeline._data.apply(_reformat_start_date, axis=1) # reformattin the string to be datetime string form
        pipeline._data['start_date'] = pd.to_datetime(pipeline._data['start_date']) # change to timestamp data format
        pipeline._data['updateTime'] = pipeline._data.apply(_reformat_update, axis=1)
        pipeline._data['updateTime'] = pd.to_datetime(pipeline._data['updateTime'])
        pipeline._data['duration'] = pd.to_timedelta(pipeline._data['duration'])
        pipeline._data['timestamp'] = pd.to_datetime(pipeline._data['timestamp']) # THIS HAS TO BE BEFORE applying reformat functions as splitting only works on timestamp in string form, not datetime

        # Renaming column names to match with superclass standardize
        pipeline._data = pipeline._data.rename(columns={
            'incident_id':'outage_id',
            'zip_code':'zipcode'
        })


    except Exception as e:
        print(f"An error occurred during transformation: {e}")


In [10]:
print("Before transform")
print(test_p._data.info()) # test_p is from mitchell which is one of the null timestamp pipelines
display(test_p._data[['incident_id', 'start_date', 'updateTime']])
transform_test(test_p)
print("after transform")
print(test_p._data.info())
display(test_p._data[['outage_id', 'start_date', 'updateTime']])


Before transform
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123812 entries, 0 to 123811
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   substation              109696 non-null  float64
 1   feeder                  109696 non-null  float64
 2   incident_id             123812 non-null  object 
 3   alias                   0 non-null       float64
 4   outage_comment          45473 non-null   object 
 5   estimated_restore_time  1 non-null       object 
 6   formatted_ert           1 non-null       object 
 7   start_date              123812 non-null  object 
 8   duration                123812 non-null  object 
 9   consumers_affected      123812 non-null  int64  
 10  lon                     106572 non-null  float64
 11  lat                     106572 non-null  float64
 12  opt_code                11893 non-null   object 
 13  service_index_name      123812 non-null  object 
 14  out

Unnamed: 0,incident_id,start_date,updateTime
0,G437006,03/15 07:14 am,"Mar 15, 3 25, pm"
1,C483513,03/15 02:24 pm,"Mar 15, 3 25, pm"
2,G437006,03/15 07:14 am,"Mar 15, 3 40, pm"
3,G437006,03/15 07:14 am,"Mar 15, 3 55, pm"
4,G437006,03/15 07:14 am,"Mar 15, 4 10, pm"
...,...,...,...
123807,G437006,01/17 06:32 am,"Jan 18, 9 25, am"
123808,D500951,01/15 07:10 am,"Jan 18, 9 25, am"
123809,D500998,01/17 07:20 am,"Jan 18, 9 25, am"
123810,C501047,01/18 09:20 am,"Jan 18, 9 25, am"


after transform
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123812 entries, 0 to 123811
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype          
---  ------                  --------------   -----          
 0   substation              109696 non-null  float64        
 1   feeder                  109696 non-null  float64        
 2   outage_id               123812 non-null  object         
 3   alias                   0 non-null       float64        
 4   outage_comment          45473 non-null   object         
 5   estimated_restore_time  1 non-null       object         
 6   formatted_ert           1 non-null       object         
 7   start_date              123812 non-null  datetime64[ns] 
 8   duration                123812 non-null  timedelta64[ns]
 9   consumers_affected      123812 non-null  int64          
 10  lon                     106572 non-null  float64        
 11  lat                     106572 non-null  float64        
 12  

Unnamed: 0,outage_id,start_date,updateTime
0,G437006,2023-03-15 07:14:00,2023-03-15 15:25:00
1,C483513,2023-03-15 14:24:00,2023-03-15 15:25:00
2,G437006,2023-03-15 07:14:00,2023-03-15 15:40:00
3,G437006,2023-03-15 07:14:00,2023-03-15 15:55:00
4,G437006,2023-03-15 07:14:00,2023-03-15 16:10:00
...,...,...,...
123807,G437006,2024-01-17 06:32:00,2024-01-18 09:25:00
123808,D500951,2024-01-15 07:10:00,2024-01-18 09:25:00
123809,D500998,2024-01-17 07:20:00,2024-01-18 09:25:00
123810,C501047,2024-01-18 09:20:00,2024-01-18 09:25:00


## Validating Data via Summary Metrics

Now that we transformed the data, we can do try to do some validtaion and calculations on the various data.

In [11]:
def _validate(group): # custom aggregating function for each groupby unique outage_id to see wehether the latest outage start date for a outage_id is before the earliest update time
    num_rows = len(group)
    num_unique_lon = len(group['lon'].unique())
    num_unique_lat = len(group['lat'].unique())
    num_unique_outages = len(group['outages'].unique())
    num_unique_customers_aff = len(group['consumers_affected'].unique())
    num_unique_customers = len(group['NumConsumers'].unique())
    num_unique_start_times = len(group['start_date'].unique())
    earliest_start_time = group['start_date'].min()
    latest_start_time = group['start_date'].max()
    earliest_update_time = group['updateTime'].min()
    latest_update_time = group['updateTime'].max()
    earliest_timestamp = group['timestamp'].min()
    latest_timestamp = group['timestamp'].max()
    duration_by_lastupt_laststrt = latest_update_time - latest_start_time
    duration_by_lastupt_firststart = latest_update_time - earliest_start_time
    duration_by_update_time = latest_update_time - earliest_update_time
    duration_by_timestamp = latest_timestamp - earliest_timestamp
    timestamp_upt_diff = latest_timestamp - latest_update_time
    is_update_dur_div_15_min = (duration_by_update_time.total_seconds()/60) % 15 == 0
    max_duration = group['duration'].max()
    update_dur_timestamp_dur_err = abs(duration_by_update_time - duration_by_timestamp) if pd.notna(duration_by_update_time) and pd.notna(duration_by_timestamp) else pd.NaT
    update_dur_max_dur_error = abs(max_duration - duration_by_update_time) if pd.notna(max_duration) and pd.notna(duration_by_update_time) else pd.NaT
    is_timestamp_dur_greater_max_dur = duration_by_timestamp > max_duration
    timestamp_dur_max_dur_err = duration_by_timestamp - max_duration if pd.notna(max_duration) and pd.notna(duration_by_timestamp) else pd.NaT
    does_start_update_intersect = latest_start_time > earliest_update_time and earliest_start_time < latest_update_time# does start and update intersect?
    is_start_after_update = earliest_start_time >= latest_update_time # is updatetime range just completely before the startdate range? 
    is_start_before_update = latest_start_time <= earliest_update_time
    is_timestamp_null = group['timestamp'].isnull().any()
    is_start_after_timestamp = earliest_start_time > earliest_timestamp and not is_timestamp_null
    is_start_before_timestamp = earliest_start_time < earliest_timestamp and not is_timestamp_null
    is_timestamp_upt_00_min = (timestamp_upt_diff.total_seconds() // 60) % 60 == 0 if pd.notna(timestamp_upt_diff) else None

    return pd.Series({
        'num_rows': num_rows,
        'num_unique_lon': num_unique_lon,
        'num_unique_lat': num_unique_lat,
        'num_unique_outages': num_unique_outages,
        'num_unique_customers_aff': num_unique_customers_aff,
        'num_unique_customers': num_unique_customers,
        'num_unique_start_times': num_unique_start_times,
        'earliest_start_time': earliest_start_time,
        'latest_start_time': latest_start_time, 
        'earliest_update_time': earliest_update_time,
        'latest_update_time': latest_update_time, 
        'earliest_timestamp': earliest_start_time,
        'latest_timestamp': latest_timestamp,
        'is_start_before_update': is_start_before_update, 
        'does_start_update_intersect': does_start_update_intersect, # is the latest start time before the earliest update time?
        'is_start_after_update': is_start_after_update, # is updatetime range just completely before the startdate range? 
        'max_duration': max_duration, # duration from the given df column
        'duration_by_update': duration_by_update_time,
        'duration_by_timestamp': duration_by_timestamp,
        'timestamp_upt_diff': timestamp_upt_diff,
        'update_dur_max_dur_error': update_dur_max_dur_error,
        'update_dur_timestamp_dur_err': update_dur_timestamp_dur_err,
        'is_timestamp_dur_greater_max_dur': is_timestamp_dur_greater_max_dur,
        'timestamp_dur_max_dur_err': timestamp_dur_max_dur_err,
        'duration_by_lastupt_laststrt': duration_by_lastupt_laststrt, # duration from calculating via update times
        'duration_by_lastupt_firststart': duration_by_lastupt_firststart, 
        'is_update_dur_div_15_min': is_update_dur_div_15_min,
        'is_timestamp_upt_00_min': is_timestamp_upt_00_min,
        'is_timestamp_null': is_timestamp_null,
        'is_start_before_timestamp': is_start_before_timestamp,
        'is_start_after_timestamp': is_start_after_timestamp

    })

In [12]:
# Testing on a dataframe
grouped = test_p._data.groupby("outage_id").apply(_validate)
display(grouped)

  grouped = test_p._data.groupby("outage_id").apply(_validate)


Unnamed: 0_level_0,num_rows,num_unique_lon,num_unique_lat,num_unique_outages,num_unique_customers_aff,num_unique_customers,num_unique_start_times,earliest_start_time,latest_start_time,earliest_update_time,latest_update_time,earliest_timestamp,latest_timestamp,is_start_before_update,does_start_update_intersect,is_start_after_update,max_duration,duration_by_update,duration_by_timestamp,timestamp_upt_diff,update_dur_max_dur_error,update_dur_timestamp_dur_err,is_timestamp_dur_greater_max_dur,timestamp_dur_max_dur_err,duration_by_lastupt_laststrt,duration_by_lastupt_firststart,is_update_dur_div_15_min,is_timestamp_upt_00_min,is_timestamp_null,is_start_before_timestamp,is_start_after_timestamp
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
C483513,1,1,1,1,1,1,1,2023-03-15 14:24:00,2023-03-15 14:24:00,2023-03-15 15:25:00,2023-03-15 15:25:00,2023-03-15 14:24:00,NaT,True,False,False,0 days 02:00:00,0 days 00:00:00,NaT,NaT,0 days 02:00:00,NaT,False,NaT,0 days 01:01:00,0 days 01:01:00,True,,True,False,False
C483521,1,1,1,1,1,1,1,2023-03-16 07:22:00,2023-03-16 07:22:00,2023-03-16 06:25:00,2023-03-16 06:25:00,2023-03-16 07:22:00,NaT,False,False,True,0 days 00:02:00,0 days 00:00:00,NaT,NaT,0 days 00:02:00,NaT,False,NaT,-1 days +23:03:00,-1 days +23:03:00,True,,True,False,False
C483523,6,1,1,2,1,1,1,2023-03-16 13:27:00,2023-03-16 13:27:00,2023-03-16 12:55:00,2023-03-16 14:10:00,2023-03-16 13:27:00,NaT,False,True,False,0 days 01:42:00,0 days 01:15:00,NaT,NaT,0 days 00:27:00,NaT,False,NaT,0 days 00:43:00,0 days 00:43:00,True,,True,False,False
C483525,6,1,1,1,1,2,1,2023-03-17 06:04:00,2023-03-17 06:04:00,2023-03-17 05:25:00,2023-03-17 06:55:00,2023-03-17 06:04:00,NaT,False,True,False,0 days 01:49:00,0 days 01:30:00,NaT,NaT,0 days 00:19:00,NaT,False,NaT,0 days 00:51:00,0 days 00:51:00,True,,True,False,False
C483528,1,1,1,1,1,1,1,2023-03-17 08:34:00,2023-03-17 08:34:00,2023-03-17 07:40:00,2023-03-17 07:40:00,2023-03-17 08:34:00,NaT,False,False,True,0 days 00:05:00,0 days 00:00:00,NaT,NaT,0 days 00:05:00,NaT,False,NaT,-1 days +23:06:00,-1 days +23:06:00,True,,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D501048,4,1,1,3,1,1,1,2024-01-18 09:35:00,2024-01-18 09:35:00,2024-01-18 08:40:00,2024-01-18 09:25:00,2024-01-18 09:35:00,2024-01-18 15:25:40,False,False,True,0 days 00:48:00,0 days 00:45:00,0 days 00:44:53,0 days 06:00:40,0 days 00:03:00,0 days 00:00:07,False,-1 days +23:56:53,-1 days +23:50:00,-1 days +23:50:00,True,True,False,True,False
G437006,27691,785,786,274,59,103,264,2023-03-15 07:14:00,2024-01-17 06:32:00,2023-03-15 15:25:00,2024-01-18 09:25:00,2023-03-15 07:14:00,2024-01-18 15:25:40,False,True,False,7 days 23:27:00,308 days 18:00:00,294 days 17:29:57,0 days 06:00:40,300 days 18:33:00,14 days 00:30:03,True,286 days 18:02:57,1 days 02:53:00,309 days 02:11:00,True,True,True,False,False
G484576,1,1,1,1,1,1,1,2023-04-14 10:25:00,2023-04-14 10:25:00,2023-04-14 09:55:00,2023-04-14 09:55:00,2023-04-14 10:25:00,2023-04-14 14:55:40,False,False,True,0 days 00:30:00,0 days 00:00:00,0 days 00:00:00,0 days 05:00:40,0 days 00:30:00,0 days 00:00:00,False,-1 days +23:30:00,-1 days +23:30:00,-1 days +23:30:00,True,True,False,True,False
G495149,4,2,2,4,2,1,1,2023-08-30 09:40:00,2023-08-30 09:40:00,2023-08-30 08:55:00,2023-08-30 09:40:00,2023-08-30 09:40:00,2023-08-30 14:40:56,False,False,True,0 days 01:00:00,0 days 00:45:00,0 days 00:45:08,0 days 05:00:56,0 days 00:15:00,0 days 00:00:08,False,-1 days +23:45:08,0 days 00:00:00,0 days 00:00:00,True,True,False,True,False


As we notice, while we have start times, update times, and timestamps:
- there could be multiple start times
- the range of start times could intersect or come after the range of update times which doesn't make any sense
- the timestamps are hours after the updateTimes and suspiciously in hour intervals

Also, there is a large outlier in outage_id: G437006.	

#### Let's first check to see what outages have multiple start times and see what patterns there could be for said instances of multiple start times.

In [13]:
df = test_p._data
# Group by outage_id and have a new temp 2nd column which counts the number of unique start_dates
grouped = df.groupby('outage_id')['start_date'].nunique()
display(grouped)

# Within grouped, finds the outage_id's that have ">1" which means more than 1 unique start_date (outage_id are indexes of grouped), and turn it into a list
mult_start_ids = grouped[grouped > 1].index.tolist()

# Subset the original DataFrame based on these incident_ids
subset_df = df[df['outage_id'].isin(mult_start_ids)]
print(f"{len(subset_df['outage_id'].unique())} outage_id's with more than 1 unique start date") # 25 outages with more than 1 unique start_date
# display(subset_df)
cleaned_multi_group = subset_df.groupby('outage_id')['start_date'].apply(lambda x: x.max() - x.min())
display(cleaned_multi_group)

outage_id
C483513      1
C483521      1
C483523      1
C483525      1
C483528      1
          ... 
D501048      1
G437006    264
G484576      1
G495149      1
G495228      1
Name: start_date, Length: 7344, dtype: int64

25 outage_id's with more than 1 unique start date


outage_id
C491856     0 days 00:10:00
C498646     0 days 00:51:00
C498756     0 days 02:00:00
D487369     0 days 01:24:00
D488297     1 days 19:26:00
D488463     0 days 19:21:00
D488474     1 days 13:29:00
D488539     1 days 12:03:00
D488807     0 days 10:02:00
D488816     0 days 19:35:00
D489015     1 days 11:26:00
D489020     0 days 17:52:00
D489095     0 days 17:09:00
D489097     0 days 05:25:00
D489104     0 days 22:17:00
D489142     1 days 10:18:00
D489332     1 days 18:57:00
D489462     0 days 16:43:00
D489484     0 days 18:31:00
D489493     0 days 16:28:00
D489495     0 days 18:39:00
D489497     0 days 13:06:00
D489603     0 days 00:11:00
D493173     0 days 01:00:00
G437006   307 days 23:18:00
Name: start_date, dtype: timedelta64[ns]

Let's find all multi-start_date outageid's dataframes and keep a running list:

In [14]:
# separate code block to 1-time transform and allow the below code block be run independently
temp_pipelines1 = []
for conf in config_list:
    temp_pipelines1.append(GA11TX12(conf, base_file_path))

for pipeline in temp_pipelines1:
    pipeline.load_data()    
    transform_test(pipeline) 

#IMPORTANT: IF I DID temp_pipelines1 = pipelines.copy() 
#           Even though the list of pipelines are copied, since the items are references to pipelines, 
#           any changes made to pipelines in temp (like transforming) will be reflected in the original 
#           pipelines in original pipeline list

/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Walton EMC.csv


  self._data = pd.read_csv(file_path)


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Tri-State EMC.csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Oconee EMC.csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Mitchell EMC.csv


  self._data = pd.read_csv(file_path)


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Wise Electric Coop, Inc..csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Houston County Electric Coop, Inc..csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Cherokee County Electric Coop Association.csv


  self._data = pd.read_csv(file_path)


Finding multi-start_date outage_id's

In [15]:
# if any edits done to this code block, rerun entire file.
multi_start_list = []
df_multi = pd.DataFrame()
total_unique_outages = 0
for pipeline in temp_pipelines1:
    df = pipeline._data
    print()
    print(df['EMC'].iloc[0])
    grouped = df.groupby(['outage_id'])['start_date'].nunique()
    total_unique_outages = total_unique_outages + len(grouped)
    filtered_ids = grouped[grouped > 1].index.tolist()
    print(f"{len(filtered_ids)} outage_id's with multiple start_dates")
    multi_start_list.extend(filtered_ids)
    subset_df = df[df['outage_id'].isin(filtered_ids)]
    
    cleaned_multi_group = subset_df.groupby('outage_id')['start_date'].apply(lambda x: x.max() - x.min()) # creating a grouping for each multi-start outage where i find the diff in their latest start and their latest end
    df_multi = pd.concat([df_multi, cleaned_multi_group])


print()
print(multi_start_list)
print(f"{len(multi_start_list)} incident id's with more than 1 unique start date out of {total_unique_outages} total unique incident_id's. {len(multi_start_list)/total_unique_outages} rate")

# rename columns and convert to datetime
df_multi = df_multi.rename(columns={'start_date': 'start_date_range'})
df_multi['start_date_range'] = pd.to_timedelta(df_multi['start_date_range'])
display(df_multi)

# Removing the G437006 outlier case

print(f"Is 'G437006' one of the multi_start_date outage_id's?: {'G437006' in multi_start_list}")

df_multi_wo_outlier = df_multi.drop('G437006').copy() # outlier of G437006 has start dates from 3/2023 to 1/2024 thus skewing the determinations
print(f"Max start_date_range: {df_multi_wo_outlier['start_date_range'].max()}")
print("Min start_date_range: ", df_multi_wo_outlier['start_date_range'].min())

# Calculating mean of date ranges
# sdr_sec = df_multi_wo_outlier['start_date_range'].dt.total_seconds() 
# avg_sec = sdr_sec.mean()
# avg_sdr = pd.Timedelta(seconds=avg_sec)
# print(f"Average start_date_range: {pd.Timedelta(seconds=(df_multi_wo_outlier['start_date_range'].dt.total_seconds()).mean())}") # change to seconds, take the mean, and convert back to timedelta
print("Average start_date_range: ", df_multi_wo_outlier['start_date_range'].mean())


Walton EMC
29 outage_id's with multiple start_dates

Tri-State EMC
17 outage_id's with multiple start_dates

Oconee EMC
0 outage_id's with multiple start_dates

Mitchell EMC
25 outage_id's with multiple start_dates

Wise Electric Coop, Inc.
0 outage_id's with multiple start_dates

Houston County Electric Coop, Inc.
2 outage_id's with multiple start_dates

Cherokee County Electric Coop Association
3 outage_id's with multiple start_dates

['C1001443', 'C1002889', 'C1003971', 'C1009359', 'C951552', 'C952864', 'C960441', 'C961166', 'C962486', 'C979523', 'C979577', 'C980226', 'C988065', 'C991075', 'C992458', 'C996555', 'C997576', 'D1001543', 'D1003884', 'D1008494', 'D967733', 'D969201', 'D980225', 'D987758', 'D990731', 'D995253', 'D997655', 'D997960', 'G961469', 'D763983', 'D764404', 'D764738', 'D764891', 'D764938', 'D765097', 'D765100', 'D765256', 'D765957', 'D765989', 'D766098', 'D766921', 'D766995', 'D767082', 'D767114', 'D767144', 'D767276', 'C491856', 'C498646', 'C498756', 'D487369', 

Unnamed: 0,start_date_range
C1001443,0 days 00:15:00
C1002889,0 days 14:50:00
C1003971,0 days 12:30:00
C1009359,0 days 16:21:00
C951552,0 days 01:00:00
...,...
G720140,0 days 00:29:00
G728187,0 days 00:11:00
D469964,0 days 01:27:00
D487132,0 days 01:30:00


Is 'G437006' one of the multi_start_date outage_id's?: True
Max start_date_range: 6 days 00:00:00
Min start_date_range:  0 days 00:08:00
Average start_date_range:  0 days 15:13:36.800000


As we see, there are only a very few amount of incident_id's (76 out of 40188) that have more than 1 unique start_date. And among those with multiple unique start_dates, they are on average less than a day apart. 

However, we still want to see if start_date can further subdivide the outage_id's. Thus, we will groupby outage_id and start_date and see if the # of unique start_date's (=1) will have a unique longitude and latitude associated with it.

We will assume a single unique longitude and latitude for each unique grouping will mean that there is a unique outage since maybe outages with the same ID differ by location within an area.

In [16]:
for pipeline in temp_pipelines1:
    df = pipeline._data

    print()
    os_grouped = df.groupby(['outage_id', 'start_date'])
    removed = df[~(df['lat'].isna() | df['lon'].isna())]
    oll_grouped = removed.groupby(['outage_id', 'lon', 'lat'])
    print(f"{((os_grouped['lon'].nunique() == os_grouped['lat'].nunique())).sum()} / {len(os_grouped)} outage_id+start_date combos have same # unique lat+lon")
    print(f"{(oll_grouped['start_date'].nunique() == 1).sum()} / {len(oll_grouped)} outage_id+lat+long combos have 1 unique start date")




14722 / 14722 outage_id+start_date combos have same # unique lat+lon
14661 / 14690 outage_id+lat+long combos have 1 unique start date

1609 / 1609 outage_id+start_date combos have same # unique lat+lon
1579 / 1596 outage_id+lat+long combos have 1 unique start date

7935 / 7935 outage_id+start_date combos have same # unique lat+lon
7936 / 7936 outage_id+lat+long combos have 1 unique start date

7632 / 7633 outage_id+start_date combos have same # unique lat+lon
8100 / 8129 outage_id+lat+long combos have 1 unique start date

2188 / 2188 outage_id+start_date combos have same # unique lat+lon
2188 / 2188 outage_id+lat+long combos have 1 unique start date

1656 / 1662 outage_id+start_date combos have same # unique lat+lon
1711 / 1712 outage_id+lat+long combos have 1 unique start date

4783 / 4783 outage_id+start_date combos have same # unique lat+lon
4777 / 4780 outage_id+lat+long combos have 1 unique start date


In [17]:
os_count = 0
os_count2 = 0
oll_count = 0

for pipeline in temp_pipelines1:
    df = pipeline._data
    print(df['EMC'].iloc[0])
    os_grouped = df.groupby(['outage_id', 'start_date'])
    removed = df[~(df['lat'].isna() | df['lon'].isna())]
    oll_grouped = removed.groupby(['outage_id', 'lon', 'lat'])
    
    os_count = os_count + ((os_grouped['lon'].nunique() != 1) | (os_grouped['lat'].nunique() != 1)).sum()
    os_count2 = os_count2 + (os_grouped['lon'].nunique() != os_grouped['lat'].nunique()).sum()
    oll_count = oll_count + (oll_grouped['start_date'].nunique() != 1).sum()
    
    print(f"{((os_grouped['lon'].nunique() == 1) & (os_grouped['lat'].nunique() == 1)).sum()} / {len(os_grouped)} outage_id+start_date combos have 1 unique lat+lon")
    print(f"{((os_grouped['lon'].nunique() == os_grouped['lat'].nunique())).sum()} / {len(os_grouped)} outage_id+start_date combos have same # unique lat+lon")
    print()
    print(f"{(oll_grouped['start_date'].nunique() == 1).sum()} / {len(oll_grouped)} outage_id+lat+long combos have 1 unique start date")
    print()
    print()

print(f"{os_count} outage_id+start outages where # unique lon or # unique lat != 1")
print(f"{os_count2} outage_id+start outages where # unique lon != # unique lat")
print(f"{oll_count} outage+lat+lon outages where # unique start_dates != 1")

Walton EMC
14722 / 14722 outage_id+start_date combos have 1 unique lat+lon
14722 / 14722 outage_id+start_date combos have same # unique lat+lon

14661 / 14690 outage_id+lat+long combos have 1 unique start date


Tri-State EMC
1604 / 1609 outage_id+start_date combos have 1 unique lat+lon
1609 / 1609 outage_id+start_date combos have same # unique lat+lon

1579 / 1596 outage_id+lat+long combos have 1 unique start date


Oconee EMC
7934 / 7935 outage_id+start_date combos have 1 unique lat+lon
7935 / 7935 outage_id+start_date combos have same # unique lat+lon

7936 / 7936 outage_id+lat+long combos have 1 unique start date


Mitchell EMC
7463 / 7633 outage_id+start_date combos have 1 unique lat+lon
7632 / 7633 outage_id+start_date combos have same # unique lat+lon

8100 / 8129 outage_id+lat+long combos have 1 unique start date


Wise Electric Coop, Inc.
2188 / 2188 outage_id+start_date combos have 1 unique lat+lon
2188 / 2188 outage_id+start_date combos have same # unique lat+lon

2188 / 218

In [18]:
for pipeline in temp_pipelines1:
    df = pipeline._data

    print()
    print(df['EMC'].iloc[0])
    mult_start = df[df['outage_id'].isin(multi_start_list)]
    mult_start2 = mult_start[~(mult_start['lat'].isna() | mult_start['lon'].isna())]

    df2 = df[~(df['lat'].isna() | df['lon'].isna())]
    df_grouped = df2.groupby('outage_id').agg({'start_date': 'nunique', 'lat': 'nunique', 'lon': 'nunique'})
    ms_grouped = mult_start2.groupby('outage_id').agg({'start_date': 'nunique', 'lat': 'nunique', 'lon': 'nunique'})

    df_grouped_os = df2.groupby(['outage_id', 'start_date']).agg({'start_date': 'nunique', 'lat': 'nunique', 'lon': 'nunique'})
    ms_grouped_os = mult_start2.groupby(['outage_id', 'start_date']).agg({'start_date': 'nunique', 'lat': 'nunique', 'lon': 'nunique'})



    
    print(f"For each and every unique outage+start_date combo from general df, # of unique start_dates = # of unique latitudes = # of unique longitudes?: {((df_grouped_os['start_date'] == df_grouped_os['lat']) & (df_grouped_os['lat'] == df_grouped_os['lon'])).all()}")
    print(f"{((df_grouped_os['start_date'] == df_grouped_os['lat']) & (df_grouped_os['lat'] == df_grouped_os['lon'])).sum()} out of {len(df_grouped_os)} outages in general df where # of unique start_date = # of unique lats = # of unique long")
    print(f"For each and every unique outage+start_date combo with multi-startTimes, # of unique start_dates = # of unique latitudes = # of unique longitudes?: {((ms_grouped_os['start_date'] == ms_grouped_os['lat']) & (ms_grouped_os['lat'] == ms_grouped_os['lon'])).all()}")
    print(f"{((ms_grouped_os['start_date'] == ms_grouped_os['lat']) & (ms_grouped_os['lat'] == ms_grouped_os['lon'])).sum()} out of {len(ms_grouped_os)} outages in multi-start outage df where # of unique start_date = # of unique lats = # of unique long")
    print()
    print(f"For each and every unique outage_id with multi-startTimes, # of unique start_dates = # of unique latitudes = # of unique longitudes?: {((ms_grouped['start_date'] == ms_grouped['lat']) & (ms_grouped['lat'] == ms_grouped['lon'])).all()}")
    print(f"{((ms_grouped['start_date'] == ms_grouped['lat']) & (ms_grouped['lat'] == ms_grouped['lon'])).sum()} out of {len(ms_grouped)} outages in multi-start outage df where # of unique start_date = # of unique lats = # of unique long")
    print(f"For each and every unique outage_id from general df, # of unique start_dates = # of unique latitudes = # of unique longitudes?: {((df_grouped['start_date'] == df_grouped['lat']) & (df_grouped['lat'] == df_grouped['lon'])).all()}")
    print(f"{((df_grouped['start_date'] == df_grouped['lat']) & (df_grouped['lat'] == df_grouped['lon'])).sum()} out of {len(df_grouped)} outages in general df where # of unique start_date = # of unique lats = # of unique long")
    print()
    print()



Walton EMC
For each and every unique outage+start_date combo from general df, # of unique start_dates = # of unique latitudes = # of unique longitudes?: True
14722 out of 14722 outages in general df where # of unique start_date = # of unique lats = # of unique long
For each and every unique outage+start_date combo with multi-startTimes, # of unique start_dates = # of unique latitudes = # of unique longitudes?: True
61 out of 61 outages in multi-start outage df where # of unique start_date = # of unique lats = # of unique long

For each and every unique outage_id with multi-startTimes, # of unique start_dates = # of unique latitudes = # of unique longitudes?: False
0 out of 29 outages in multi-start outage df where # of unique start_date = # of unique lats = # of unique long
For each and every unique outage_id from general df, # of unique start_dates = # of unique latitudes = # of unique longitudes?: False
14661 out of 14690 outages in general df where # of unique start_date = # of uni


We see that across all 7 dataframes, nearly (only <~30 outliers) all of the outages (except for mitchell with its couple hundred) have the number of unique start times equal the # of unique lats+longs

## Summary of Validation Metrics across all Providers

In [19]:
# Transforming that actual pipelines
for pipeline in pipelines:
    transform_test(pipeline)

In [20]:
# Removing the multi-starttime outage-id's and testing on temp_pipelines1
for pipeline in temp_pipelines1:
    # remove outliers
    df = pipeline._data
    pipeline._data = df[~df['outage_id'].isin(multi_start_list)]

In [21]:
# Helper Function for returning a summary row for each provider as a dictionary
def summary_stats(pipeline, validated):
    subdf = {
        "Provider": pipeline.config['name'],
        "Avg num rows per outage_id": validated['num_rows'].mean(),
        "Num outages": len(validated),
        "Average unique outages per outage": validated['num_unique_outages'].mean(),
        "Average unique num cust aff": validated['num_unique_customers_aff'].mean(),
        "Average unique num cust": validated['num_unique_customers'].mean(),
        "Average unique start times": validated['num_unique_start_times'].mean(),
        "Unique Longitude Avg": validated['num_unique_lon'].mean(),
        "Unique Long Median": validated['num_unique_lon'].median(),
        "Unique Long Max": validated['num_unique_lon'].max(),
        "Unique Long Std": validated['num_unique_lon'].std(),
        "Unique Latitude Avg": validated['num_unique_lat'].mean(),
        "Unique Lat Median": validated['num_unique_lat'].median(),
        "Unique Lat Max": validated['num_unique_lat'].max(),
        "Unique Lat Std": validated['num_unique_lat'].std(),
        "Unique startDate Avg": validated['num_unique_start_times'].mean(),
        "Unique startDate Median": validated['num_unique_start_times'].median(),
        "Unique startDate Max": validated['num_unique_start_times'].max(),
        "Unique startDate Std": validated['num_unique_start_times'].std(),
        "Average update_dur_max_dur error": validated['update_dur_max_dur_error'].mean(),
        "Average update_dur_timestamp_dur error": validated['update_dur_timestamp_dur_err'].mean(),
        "Average Max Duration": validated['max_duration'].mean(),
        "Average Duration by Timestamp": validated['duration_by_timestamp'].mean(),
        "Timestamp_dur_greater_than_max_dur_rate": len(validated[validated['is_timestamp_dur_greater_max_dur'] == True]) / len(validated),
        "Average timestamp_dur_max_dur error": validated['timestamp_dur_max_dur_err'].mean(),
        "start intersects update or start > update freq": (len(validated[validated['is_start_before_update'] == False]) / len(validated)),
        "is_update_dur_div_15_min freq": len(validated[validated['is_update_dur_div_15_min']]) / len(validated),
        "is_timestamp_upt_00_min freq": len(validated[validated['is_timestamp_upt_00_min'] == True]) / len(validated),
        "is_timestamp_null": len(validated[validated['is_timestamp_null'] == True]) / len(validated),
        "is_start_before_timestamp": len(validated[validated['is_start_before_timestamp'] == True]) / len(validated),
        "is_start_after_timestamp": len(validated[validated['is_start_after_timestamp'] == True]) / len(validated)
    }
    return subdf

In [22]:
# For only single-starttime outages
summary_data = []
for pipeline in temp_pipelines1:
    validated = pipeline._data.groupby('outage_id').apply(_validate, include_groups=False).reset_index()
        # validated is the validated rows for a provider where each row is a unique outage_id
    summary_data.append(summary_stats(pipeline, validated)) # row-wise appending
display(pd.DataFrame(summary_data))

Unnamed: 0,Provider,Avg num rows per outage_id,Num outages,Average unique outages per outage,Average unique num cust aff,Average unique num cust,Average unique start times,Unique Longitude Avg,Unique Long Median,Unique Long Max,Unique Long Std,Unique Latitude Avg,Unique Lat Median,Unique Lat Max,Unique Lat Std,Unique startDate Avg,Unique startDate Median,Unique startDate Max,Unique startDate Std,Average update_dur_max_dur error,Average update_dur_timestamp_dur error,Average Max Duration,Average Duration by Timestamp,Timestamp_dur_greater_than_max_dur_rate,Average timestamp_dur_max_dur error,start intersects update or start > update freq,is_update_dur_div_15_min freq,is_timestamp_upt_00_min freq,is_timestamp_null,is_start_before_timestamp,is_start_after_timestamp
0,Walton EMC,4.844895,14661,3.423914,1.00341,1.016029,1.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,0 days 00:32:15.375485983,0 days 00:06:51.199496362,0 days 01:44:22.283609576,0 days 01:15:03.875909345,0.101221,-1 days +23:30:06.880945720,0.902599,0.872383,0.225564,0.025169,0.974831,0.0
1,Tri-State EMC,25.023507,1574,12.614358,1.066074,1.174714,1.0,1.003177,1.0,2,0.05629,1.003177,1.0,2,0.05629,1.0,1.0,1,0.0,0 days 01:20:55.921219822,0 days 00:00:03.552887735,0 days 07:39:26.607369758,0 days 06:24:50.932511356,0.005083,-1 days +22:37:49.024659313,0.857687,0.965693,0.723634,0.020966,0.979034,0.0
2,Oconee EMC,7.532451,7935,6.288847,1.001512,1.012728,1.0,1.000126,1.0,2,0.011226,1.000126,1.0,2,0.011226,1.0,1.0,1,0.0,0 days 00:30:19.720226843,0 days 00:00:26.727929584,0 days 01:46:50.102079395,0 days 01:17:59.834340514,0.043352,-1 days +23:29:07.968864951,0.889225,0.798362,0.521613,0.040706,0.959294,0.0
3,Mitchell EMC,12.916781,7318,4.723558,1.006559,1.080896,1.0,1.000137,1.0,2,0.01169,1.000137,1.0,2,0.01169,1.0,1.0,1,0.0,0 days 00:25:40.912817709,0 days 00:00:56.050167224,0 days 03:45:26.056299535,0 days 03:22:41.093088071,0.034162,-1 days +23:35:11.720178373,0.954769,0.839027,0.690216,0.019404,0.980596,0.0
4,"Wise Electric Coop, Inc.",4.817185,2188,2.347349,1.002285,1.053016,1.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,0 days 00:10:17.221206581,0 days 00:00:03.721206581,0 days 01:09:22.650822669,0 days 00:59:07.128884826,0.018282,-1 days +23:49:44.478062158,0.0,0.987203,0.901737,0.0,1.0,0.0
5,"Houston County Electric Coop, Inc.",8.471049,1658,5.460193,1.034982,1.056092,1.0,1.029554,1.0,6,0.270844,1.028951,1.0,6,0.269793,1.0,1.0,1,0.0,0 days 01:29:38.395657418,0 days 00:02:09.931242460,0 days 03:23:46.284680337,0 days 01:55:51.940892641,0.045235,-1 days +22:32:05.656212304,0.0,0.796743,0.752714,0.0,1.0,0.0
6,Cherokee County Electric Coop Association,10.778313,4777,8.093783,1.000419,1.1166,1.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,0 days 01:44:50.517060916,0 days 00:00:25.033912497,0 days 05:13:16.717605191,0 days 03:28:22.998743981,0.016747,-1 days +22:15:06.281138791,0.0,0.999372,0.300816,0.0,1.0,0.0


In [23]:
# For all outages including multi-starttime outages
summary_data = []
for pipeline in pipelines:
    validated = pipeline._data.groupby('outage_id').apply(_validate, include_groups=False).reset_index()
        # validated is the validated rows for a provider where each row is a unique outage_id
    summary_data.append(summary_stats(pipeline, validated)) # row-wise appending
display(pd.DataFrame(summary_data))

Unnamed: 0,Provider,Avg num rows per outage_id,Num outages,Average unique outages per outage,Average unique num cust aff,Average unique num cust,Average unique start times,Unique Longitude Avg,Unique Long Median,Unique Long Max,Unique Long Std,Unique Latitude Avg,Unique Lat Median,Unique Lat Max,Unique Lat Std,Unique startDate Avg,Unique startDate Median,Unique startDate Max,Unique startDate Std,Average update_dur_max_dur error,Average update_dur_timestamp_dur error,Average Max Duration,Average Duration by Timestamp,Timestamp_dur_greater_than_max_dur_rate,Average timestamp_dur_max_dur error,start intersects update or start > update freq,is_update_dur_div_15_min freq,is_timestamp_upt_00_min freq,is_timestamp_null,is_start_before_timestamp,is_start_after_timestamp
0,Walton EMC,4.860041,14690,3.426004,1.00354,1.016201,1.002178,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.002178,1.0,3,0.050816,0 days 00:32:28.019060585,0 days 00:06:51.464185981,0 days 01:45:02.258679373,0 days 01:15:49.418667969,0.101906,-1 days +23:30:11.459997208,0.902723,0.87209,0.225459,0.025187,0.974813,0.0
1,Tri-State EMC,25.818353,1591,13.199874,1.069767,1.179133,1.011314,1.003143,1.0,2,0.055989,1.003143,1.0,2,0.055989,1.011314,1.0,3,0.111582,0 days 01:28:30.999371464,0 days 00:00:03.621309370,0 days 07:57:45.707102451,0 days 06:41:20.581514762,0.013828,-1 days +22:35:41.261874198,0.85858,0.964173,0.715902,0.020742,0.979258,0.0
2,Oconee EMC,7.532451,7935,6.288847,1.001512,1.012728,1.0,1.000126,1.0,2,0.011226,1.000126,1.0,2,0.011226,1.0,1.0,1,0.0,0 days 00:30:19.720226843,0 days 00:00:26.727929584,0 days 01:46:50.102079395,0 days 01:17:59.834340514,0.043352,-1 days +23:29:07.968864951,0.889225,0.798362,0.521613,0.040706,0.959294,0.0
3,Mitchell EMC,16.858932,7344,4.888617,1.014434,1.097086,1.039352,1.10689,1.0,785,9.148502,1.107026,1.0,786,9.160171,1.039352,1.0,264,3.069569,0 days 01:25:27.312091503,0 days 00:03:44.620383226,0 days 03:49:49.477124183,0 days 04:24:08.841432935,0.035812,0 days 00:32:11.299083587,0.954929,0.837554,0.689134,0.019472,0.980528,0.0
4,"Wise Electric Coop, Inc.",4.817185,2188,2.347349,1.002285,1.053016,1.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,0 days 00:10:17.221206581,0 days 00:00:03.721206581,0 days 01:09:22.650822669,0 days 00:59:07.128884826,0.018282,-1 days +23:49:44.478062158,0.0,0.987203,0.901737,0.0,1.0,0.0
5,"Houston County Electric Coop, Inc.",8.483735,1660,5.476506,1.036747,1.056024,1.001205,1.03012,1.0,6,0.271728,1.029518,1.0,6,0.270683,1.001205,1.0,2,0.0347,0 days 01:30:09.831325301,0 days 00:02:09.906024096,0 days 03:24:29.096385542,0 days 01:56:03.060240963,0.045181,-1 days +22:31:33.963855422,0.0,0.796386,0.753012,0.0,1.0,0.0
6,Cherokee County Electric Coop Association,10.775523,4780,8.092259,1.000418,1.116527,1.000628,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.000628,1.0,2,0.025047,0 days 01:44:48.288702928,0 days 00:00:25.038075313,0 days 05:13:07.594142259,0 days 03:28:18.183263598,0.017155,-1 days +22:15:10.589121339,0.000628,0.999372,0.300837,0.0,1.0,0.0


Start date and End date?

From the summary table above, we notice how the 4 GA providers have a majority of their start_date ranges intersecting with their updateTime ranges (far right column) making both updateTime and startDate dubious metrics to use for start/end.
A startDate after an updateTime could mean the startDate was determined to start at a later date after the updateTime.  
For now, we'll use startDate as the startTime metric. 

How about the duration?
- Most of the update time differences are in 15 min intervals meaning it is not really precise
- Some providers have their latest timestamp strictly some hours (ex: 6:00:00) after the latest updatetime (a non trivial %) meaning the timestamp may be dependent on the updatetime (maybe timestamp is some timezone diff away from updatetime)

Comparing max duration vs diff in update vs diff in timestamp:
- The max duration vs the difference in update yields non-trivial differences (several hours to over a day long), so that means that the difference update times could not be the entirety of the outage (ex: the first updatetime is associated with a duration meaning an outage is ongoing at this point) (ex2: outages with only 1 line meaning the diff in updatetime is 0 but either the updatetime should be the start/end of the outage).
- The max duration vs the diff in timestamp yields non-trivial differences as well.
- Curiously, diff in update and diff in timestamp show the smallest difference between them which could allude that update and timestamp are more related with each other than max duration is to either.
- With the clues that update time and timestamp may not be precise and independent, max_duration seems like a more promising candidate.

Therefore, by process of eliminating updateTime diff, timestamp diff, and starttime diff, we will use the duration column for the duration calculation

- Thus, we will use the start_date as the start and add the max_duration to the start_date to get the end date.

# Running the Pipeline

In [24]:
# Instantiate a BasePipeline object for each provider in the configuration
for provider in config['providers']:
    pipeline = GA11TX12(provider, base_file_path)
    pipeline.standardize()
    display(pipeline._data)

/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Walton EMC.csv


  self._data = pd.read_csv(file_path)
  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C1000672,2023-11-18 18:08:00,2023-11-21 16:39:00,33.71,-84.01,30012,Rockdale,13247.0,Walton EMC,Georgia,2 days 22:31:00,2 days 22:46:00,2 days 22:38:30,1.0,2 days 22:31:00
1,C1000694,2023-11-20 09:15:00,2023-11-20 11:23:00,33.78,-83.86,30052,Walton,13297.0,Walton EMC,Georgia,0 days 02:08:00,0 days 02:23:00,0 days 02:15:30,1.0,0 days 02:08:00
2,C1000697,2023-11-20 09:41:00,2023-11-20 10:09:00,33.74,-83.70,30655,Walton,13297.0,Walton EMC,Georgia,0 days 00:28:00,0 days 00:43:00,0 days 00:35:30,1.0,0 days 00:28:00
3,C1000701,2023-11-20 10:19:00,2023-11-20 10:24:00,33.91,-83.41,30606,Clarke,13059.0,Walton EMC,Georgia,0 days 00:05:00,0 days 00:20:00,0 days 00:12:30,1.0,0 days 00:05:00
4,C1000702,2023-11-20 11:06:00,2023-11-20 13:10:00,33.75,-83.99,30012,Rockdale,13247.0,Walton EMC,Georgia,0 days 02:04:00,0 days 02:19:00,0 days 02:11:30,1.0,0 days 02:04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14685,G995287,2023-09-08 21:49:00,2023-09-09 00:24:00,0.00,-85.49,unknown,,,Walton EMC,,0 days 02:35:00,0 days 02:50:00,0 days 02:42:30,83.9,9 days 00:44:30
14686,G996351,2023-09-12 16:20:00,2023-09-12 17:54:00,0.00,-85.49,unknown,,,Walton EMC,,0 days 01:34:00,0 days 01:49:00,0 days 01:41:30,3.0,0 days 04:42:00
14687,G996585,2023-09-15 05:24:00,2023-09-15 06:09:00,0.00,-85.49,unknown,,,Walton EMC,,0 days 00:45:00,0 days 01:00:00,0 days 00:52:30,2.0,0 days 01:30:00
14688,G997542,2023-09-28 21:01:00,2023-09-28 22:09:00,0.00,-85.49,unknown,,,Walton EMC,,0 days 01:08:00,0 days 01:23:00,0 days 01:15:30,12.0,0 days 13:36:00


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Tri-State EMC.csv


  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,D760313,2023-03-16 21:34:00,2023-03-16 22:10:00,35.12,-84.33,37391,Polk,47139.0,Tri-State EMC,Tennessee,0 days 00:36:00,0 days 00:51:00,0 days 00:43:30,3.0,0 days 01:48:00
1,D760353,2023-03-19 09:16:00,2023-03-19 17:09:00,34.93,-84.32,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 07:53:00,0 days 08:08:00,0 days 08:00:30,780.0,256 days 05:00:00
2,D760483,2023-03-20 01:05:00,2023-03-20 02:24:00,34.92,-84.28,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 01:19:00,0 days 01:34:00,0 days 01:26:30,823.0,45 days 03:37:00
3,D760513,2023-03-20 07:37:00,2023-03-20 09:24:00,34.99,-84.27,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 01:47:00,0 days 02:02:00,0 days 01:54:30,1.0,0 days 01:47:00
4,D760514,2023-03-20 11:50:00,2023-03-20 12:39:00,34.84,-84.30,30513,Fannin,13111.0,Tri-State EMC,Georgia,0 days 00:49:00,0 days 01:04:00,0 days 00:56:30,1.0,0 days 00:49:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,D774504,2024-01-17 23:23:00,2024-01-18 01:39:00,34.90,-84.27,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 02:16:00,0 days 02:31:00,0 days 02:23:30,2.0,0 days 04:32:00
1587,D774507,2024-01-18 04:47:00,2024-01-18 07:23:00,35.09,-84.30,unknown,,,Tri-State EMC,,0 days 02:36:00,0 days 02:51:00,0 days 02:43:30,2.0,0 days 05:12:00
1588,D774509,2024-01-18 04:47:00,2024-01-18 07:23:00,35.09,-84.30,28906,Cherokee,37039.0,Tri-State EMC,North Carolina,0 days 02:36:00,0 days 02:51:00,0 days 02:43:30,7.0,0 days 18:12:00
1589,D774512,2024-01-18 09:07:00,2024-01-18 09:39:00,34.94,-84.28,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 00:32:00,0 days 00:47:00,0 days 00:39:30,3.0,0 days 01:36:00


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Oconee EMC.csv


  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C434216,2023-03-15 16:31:00,2023-03-15 17:24:00,32.70,-83.50,31295,,,Oconee EMC,,0 days 00:53:00,0 days 01:08:00,0 days 01:00:30,1.0,0 days 00:53:00
1,C434219,2023-03-16 19:42:00,2023-03-16 20:39:00,32.46,-83.18,unknown,,,Oconee EMC,,0 days 00:57:00,0 days 01:12:00,0 days 01:04:30,1.0,0 days 00:57:00
2,C434221,2023-03-17 15:02:00,2023-03-17 15:39:00,32.62,-83.35,31044,Twiggs,13289.0,Oconee EMC,Georgia,0 days 00:37:00,0 days 00:52:00,0 days 00:44:30,1.0,0 days 00:37:00
3,C434222,2023-03-17 15:15:00,2023-03-17 15:39:00,32.62,-83.35,31044,Twiggs,13289.0,Oconee EMC,Georgia,0 days 00:24:00,0 days 00:39:00,0 days 00:31:30,1.0,0 days 00:24:00
4,C434224,2023-03-17 19:15:00,2023-03-17 19:54:00,32.56,-82.95,31021,Laurens,13175.0,Oconee EMC,Georgia,0 days 00:39:00,0 days 00:54:00,0 days 00:46:30,1.0,0 days 00:39:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7930,D444934,2024-01-10 13:07:00,2024-01-10 14:39:00,32.61,-82.97,31021,Laurens,13175.0,Oconee EMC,Georgia,0 days 01:32:00,0 days 01:47:00,0 days 01:39:30,1.0,0 days 01:32:00
7931,D444941,2024-01-10 16:57:00,2024-01-10 19:23:00,32.65,-82.97,unknown,,,Oconee EMC,,0 days 02:26:00,0 days 02:41:00,0 days 02:33:30,1.0,0 days 02:26:00
7932,D444972,2024-01-13 11:53:00,2024-01-13 14:08:00,32.90,-83.25,31054,Wilkinson,13319.0,Oconee EMC,Georgia,0 days 02:15:00,0 days 02:30:00,0 days 02:22:30,4.0,0 days 09:00:00
7933,D444980,2024-01-16 10:10:00,2024-01-16 11:39:00,32.90,-83.25,31054,Wilkinson,13319.0,Oconee EMC,Georgia,0 days 01:29:00,0 days 01:44:00,0 days 01:36:30,5.0,0 days 07:25:00


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Mitchell EMC.csv


  self._data = pd.read_csv(file_path)
  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C483513,2023-03-15 14:24:00,2023-03-15 16:24:00,31.21,-84.37,31730,Mitchell,13205.0,Mitchell EMC,Georgia,0 days 02:00:00,0 days 02:15:00,0 days 02:07:30,1.00,0 days 02:00:00
1,C483521,2023-03-16 07:22:00,2023-03-16 07:24:00,31.18,-84.10,31779,Mitchell,13205.0,Mitchell EMC,Georgia,0 days 00:02:00,0 days 00:17:00,0 days 00:09:30,1.00,0 days 00:02:00
2,C483523,2023-03-16 13:27:00,2023-03-16 15:09:00,31.25,-84.43,unknown,,,Mitchell EMC,,0 days 01:42:00,0 days 01:57:00,0 days 01:49:30,1.00,0 days 01:42:00
3,C483525,2023-03-17 06:04:00,2023-03-17 07:53:00,31.40,-83.72,31795,Worth,13321.0,Mitchell EMC,Georgia,0 days 01:49:00,0 days 02:04:00,0 days 01:56:30,1.00,0 days 01:49:00
4,C483528,2023-03-17 08:34:00,2023-03-17 08:39:00,31.24,-84.26,31730,Mitchell,13205.0,Mitchell EMC,Georgia,0 days 00:05:00,0 days 00:20:00,0 days 00:12:30,1.00,0 days 00:05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7339,D501048,2024-01-18 09:35:00,2024-01-18 10:23:00,31.39,-84.67,39813,Calhoun,13037.0,Mitchell EMC,Georgia,0 days 00:48:00,0 days 01:03:00,0 days 00:55:30,1.00,0 days 00:48:00
7340,G437006,2023-03-15 07:14:00,2023-03-23 06:41:00,31.43,-84.16,unknown,,,Mitchell EMC,,7 days 23:27:00,7 days 23:42:00,7 days 23:34:30,1.21,9 days 16:17:23.312267523
7341,G484576,2023-04-14 10:25:00,2023-04-14 10:55:00,31.55,-84.31,31721,Dougherty,13095.0,Mitchell EMC,Georgia,0 days 00:30:00,0 days 00:45:00,0 days 00:37:30,45.00,0 days 22:30:00
7342,G495149,2023-08-30 09:40:00,2023-08-30 10:40:00,31.55,-83.96,31791,Worth,13321.0,Mitchell EMC,Georgia,0 days 01:00:00,0 days 01:15:00,0 days 01:07:30,4.00,0 days 04:00:00


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Wise Electric Coop, Inc..csv


  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C399381,2023-04-15 13:07:00,2023-04-15 14:16:00,33.04,-97.75,76083,,,"Wise Electric Coop, Inc.",,0 days 01:09:00,0 days 01:24:00,0 days 01:16:30,1.0,0 days 01:09:00
1,C399382,2023-04-16 07:35:00,2023-04-16 09:16:00,33.06,-97.78,76073,Wise,48497.0,"Wise Electric Coop, Inc.",Texas,0 days 01:41:00,0 days 01:56:00,0 days 01:48:30,1.0,0 days 01:41:00
2,C399386,2023-04-16 16:29:00,2023-04-16 17:46:00,33.17,-97.44,76234,Wise,48497.0,"Wise Electric Coop, Inc.",Texas,0 days 01:17:00,0 days 01:32:00,0 days 01:24:30,1.0,0 days 01:17:00
3,C399401,2023-04-17 16:07:00,2023-04-17 18:00:00,33.28,-97.56,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:53:00,0 days 02:08:00,0 days 02:00:30,1.0,0 days 01:53:00
4,C399402,2023-04-18 08:55:00,2023-04-18 09:31:00,33.34,-97.63,unknown,,,"Wise Electric Coop, Inc.",,0 days 00:36:00,0 days 00:51:00,0 days 00:43:30,1.0,0 days 00:36:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2183,G406396,2023-12-30 07:23:00,2023-12-30 08:46:00,0.00,-103.49,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:23:00,0 days 01:38:00,0 days 01:30:30,14.0,0 days 19:22:00
2184,G406415,2023-12-30 12:41:00,2023-12-30 13:46:00,0.00,-103.49,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:05:00,0 days 01:20:00,0 days 01:12:30,12.0,0 days 13:00:00
2185,G406964,2024-01-12 01:31:00,2024-01-12 02:31:00,0.00,-103.49,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:00:00,0 days 01:15:00,0 days 01:07:30,6.0,0 days 06:00:00
2186,G406967,2024-01-12 04:23:00,2024-01-12 05:47:00,0.00,-103.49,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:24:00,0 days 01:39:00,0 days 01:31:30,2.0,0 days 02:48:00


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Houston County Electric Coop, Inc..csv


  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,D712398,2023-04-14 07:21:00,2023-04-14 07:46:00,31.28,-95.39,75835,Houston,48225.0,"Houston County Electric Coop, Inc.",Texas,0 days 00:25:00,0 days 00:40:00,0 days 00:32:30,2.00,0 days 00:50:00
1,D712405,2023-04-15 06:13:00,2023-04-15 07:16:00,30.93,-95.67,75852,Madison,48313.0,"Houston County Electric Coop, Inc.",Texas,0 days 01:03:00,0 days 01:18:00,0 days 01:10:30,31.00,1 days 08:33:00
2,D712420,2023-04-15 09:58:00,2023-04-15 11:16:00,31.72,-95.56,75801,Anderson,48001.0,"Houston County Electric Coop, Inc.",Texas,0 days 01:18:00,0 days 01:33:00,0 days 01:25:30,44.00,2 days 09:12:00
3,D712431,2023-04-15 12:18:00,2023-04-15 13:01:00,31.13,-95.74,75835,Houston,48225.0,"Houston County Electric Coop, Inc.",Texas,0 days 00:43:00,0 days 00:58:00,0 days 00:50:30,8.00,0 days 05:44:00
4,D712434,2023-04-15 17:44:00,2023-04-15 19:16:00,31.53,-95.89,unknown,,,"Houston County Electric Coop, Inc.",,0 days 01:32:00,0 days 01:47:00,0 days 01:39:30,7.00,0 days 10:44:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,G740838,2024-01-12 06:55:00,2024-01-12 07:32:00,31.22,-95.39,unknown,,,"Houston County Electric Coop, Inc.",,0 days 00:37:00,0 days 00:52:00,0 days 00:44:30,48.00,1 days 05:36:00
1656,G741154,2024-01-12 07:48:00,2024-01-12 08:58:00,31.26,-94.89,unknown,,,"Houston County Electric Coop, Inc.",,0 days 01:10:00,0 days 01:25:00,0 days 01:17:30,77.00,3 days 17:50:00
1657,G741203,2024-01-12 08:08:00,2024-01-12 09:55:00,31.21,-95.04,unknown,,,"Houston County Electric Coop, Inc.",,0 days 01:47:00,0 days 02:02:00,0 days 01:54:30,8.00,0 days 14:16:00
1658,G741890,2024-01-15 05:40:00,2024-01-15 09:46:00,31.02,-95.43,unknown,,,"Houston County Electric Coop, Inc.",,0 days 04:06:00,0 days 04:21:00,0 days 04:13:30,68.14,11 days 15:23:08.571428571


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Cherokee County Electric Coop Association.csv


  self._data = pd.read_csv(file_path)
  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C480035,2023-09-24 22:28:00,2023-09-25 17:31:00,32.16,-95.45,75762,Smith,48423.0,Cherokee County Electric Coop Association,Texas,0 days 19:03:00,0 days 19:18:00,0 days 19:10:30,1.0,0 days 19:03:00
1,C480066,2023-09-24 22:43:00,2023-09-25 17:31:00,32.15,-95.45,75762,Smith,48423.0,Cherokee County Electric Coop Association,Texas,0 days 18:48:00,0 days 19:03:00,0 days 18:55:30,1.0,0 days 18:48:00
2,C480094,2023-09-24 22:45:00,2023-09-25 12:16:00,32.15,-95.39,75757,Cherokee,48073.0,Cherokee County Electric Coop Association,Texas,0 days 13:31:00,0 days 13:46:00,0 days 13:38:30,1.0,0 days 13:31:00
3,C480111,2023-09-24 22:46:00,2023-09-25 17:31:00,32.16,-95.45,75762,Smith,48423.0,Cherokee County Electric Coop Association,Texas,0 days 18:45:00,0 days 19:00:00,0 days 18:52:30,1.0,0 days 18:45:00
4,C480121,2023-09-24 22:47:00,2023-09-26 05:46:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 06:59:00,1 days 07:14:00,1 days 07:06:30,1.0,1 days 06:59:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4775,G483387,2023-09-24 22:55:00,2023-09-26 06:01:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 07:06:00,1 days 07:21:00,1 days 07:13:30,14.0,18 days 03:24:00
4776,G483389,2023-09-24 23:05:00,2023-09-26 06:01:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 06:56:00,1 days 07:11:00,1 days 07:03:30,18.0,23 days 04:48:00
4777,G483390,2023-09-24 22:47:00,2023-09-26 06:01:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 07:14:00,1 days 07:29:00,1 days 07:21:30,9.0,11 days 17:06:00
4778,G483391,2023-09-24 23:04:00,2023-09-26 06:01:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 06:57:00,1 days 07:12:00,1 days 07:04:30,12.0,15 days 11:24:00


# Testing the New Pipeline

In [44]:
class BasePipeline_new:
    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        self.type_to_prefix = {'o': 'per_outage', 'c': 'per_county', 'z': 'per_zipcode'} 
        self._data = pd.DataFrame({})
        self.geomap = {}
        self._load_data()
        self._load_geo_mapping()
        
    def _load_geo_mapping(self):
        try:
            with open('zip_to_county_name.json', 'r') as json_file:
                self.geomap['zip_to_county_name'] = json.load(json_file)
            with open('zip_to_county_fips.json', 'r') as json_file:
                self.geomap['zip_to_county_fips'] = json.load(json_file)
            with open('zip_to_state_name.json', 'r') as json_file:
                self.geomap['zip_to_state_name'] = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during geo map loading: {e}") 

    def _construct_file_path(self):
        file_prefix = self.type_to_prefix[self.config['type']]
        file_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/{file_prefix}_{self.config['name']}.csv"
        return file_path.replace('//', '/')

    def _load_data(self):
        try:
            file_path = self._construct_file_path()
            self._data = pd.read_csv(file_path)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")
            
    def transform(self, **kwargs):
        raise NotImplementedError
    
    # TODO: remove 
    def standardize(self):
        """
        obslete
        """
        self._load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')
        
        self._data['state'] = self.config['state']
        if self.config['state'] != 'ca':
            self._data['utility_provider'] = self.config['name'] 
            self._data['county'] = self._data['zipcode'].map(self.geomap) 
        
        self._data = self._data[[
            'utility_provider', 'state', 'county', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng', 
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean', 'total_customer_outage_time', 'total_customer_outage_time_max', 'total_customer_outage_time_mean'
        ]]
        
        return self._data
    
    def to_incident_level(self, identifers=['outage_id'], method='id_grouping'):
        """
        identifer: default identifier name "outage_id", or list like ['IncidentId', 'lat', 'lng', 'subgroup']
        method: "id_grouping" or "timegap_seperation" 

        This method will replace the standardize basically
        """
        df = self.transform(identifiers=identifers, method=method)
        grouped = df.groupby('id').apply(self._agg_vars).reset_index().round(2)
        
        return grouped
    
    def to_geoarea_level(self, geo_level='zipcode', time_interval='hourly'):
        """
        geo_level: 'zipcode', 'county', 'state'
        time_interval: 'hourly', 'daily', 'monthly'
        """    
        # TODO: complet geo_level and time_interval support
        eastern = tz.gettz('US/Eastern')
        self._data['timestamp'] = pd.to_datetime(self._data['timestamp'], utc=True).dt.tz_convert(eastern)
        self._data['year'] = self._data['timestamp'].dt.year
        self._data['month'] = self._data['timestamp'].dt.month
        self._data['day'] = self._data['timestamp'].dt.day
        self._data['hour'] = self._data['timestamp'].dt.hour
        
        df = self.transform(geo_level=geo_level, time_interval=time_interval)
        
        # element wise metrics computation
        df['duration_weight'] = 15
        df['outage_freq_x_cust_a'] = df['customer_affected'] * df['outage_count']
        df['cust_a_x_duration'] = df['customer_affected'] * df['duration_weight']
        
        # TODO: complet geo_level and time_interval support 
        keys = ['EMC', 'year', 'month', 'day', 'hour', geo_level]
        grouped = df.groupby(keys).agg({
            'customer_affected': 'mean',
            'customer_served': 'mean',
            'percent_customer_affected': 'mean',
            'outage_count': 'max',
            'duration_weight': 'sum',
            'outage_freq_x_cust_a': 'sum',
            'cust_a_x_duration': 'sum'
            }).reset_index()
        
        #TODO: fill non outage hours with 0
        
        return grouped
    
    def _agg_vars(self, group):
        first_timestamp = group['timestamp'].iloc[0]
        last_timestamp = group['timestamp'].iloc[-1]
        duration_diff = (last_timestamp - first_timestamp).total_seconds() / 60
        duration_15 = 15 * len(group)
        group['duration_weight'] = (group['timestamp'].diff().dt.total_seconds() / 60).round(0).fillna(15)
        cust_affected_x_duration = (group['customer_affected'] * group['duration_weight']).sum()
        cust_a_mean = cust_affected_x_duration / group['duration_weight'].sum()
        
        return pd.Series({
            'first_timestamp': first_timestamp,
            'last_timestamp': last_timestamp,
            'duration_diff': duration_diff,
            'duration_15': duration_15,
            'customer_affected_mean': cust_a_mean,
            'cust_affected_x_duration': cust_affected_x_duration
        })
    
    # TODO: remove 
    def _compute_metrics(self, group):
        """
        Generic method to compute standardized metrics, used for being apply in DataFrame.groupby method, 
        given dataframe being transformed with standardized column names
        """
        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        duration_max = duration + 15
        duration_mean = (duration + duration_max) / 2
        customer_affected_mean = group['customer_affected'].mean()
        
        total_customer_outage_time = 15 * (group['customer_affected'].sum() - group['customer_affected'].iloc[0]) + (group['timestamp'].iloc[0] - group['start_time'].iloc[0]).total_seconds() / 60 * group['customer_affected'].iloc[0]
        total_customer_outage_time_max = total_customer_outage_time + 15 * group['customer_affected'].iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })
    
    def save(self, path=None):
        raise NotImplementedError
    
    def get_dataframe(self):
        return self._data
    
    def _add_metadata(self):
        """
        #TODO: add state, provider variables
        """
        raise NotImplementedError
        
    def check_vars(self):
        # TODO: Check other useful variables
        pass


class GA11TX12_new(BasePipeline_new):
    def transform(self, **kwargs):
        """
        Formatting issues with start_date and updateTime
        start_date format: 03/15 05:28 pm
        timestamp format: 01-18-2024 15:25:06 (For Walton, Tri-State, Oconee, and Mitchell, there are null timestamps in March 2023)
        updateTime format: Mar 15, 5 09, pm

        Remove outages with multiple start dates due to not there being many
        """

        identifiers = kwargs.get('identifiers')
        method = kwargs.get('method')
        geo_level = kwargs.get('geo_level')
        time_interval = kwargs.get('time_interval')
        df = kwargs.get('dataframe', self._data.copy())

        df = df.rename(columns={
            'incident_id':'outage_id',
            'start_date': 'start_time',
            'zip_code':'zipcode',
            'consumers_affected': 'customer_affected'
        })
        
        eastern = tz.gettz('US/Eastern')
        utc = tz.gettz('UTC')

        def _reformat_start_date(row):
            month_day, time, ampm = row['start_time'].split(' ')
            s_month, s_day = month_day.split('/')
            year = None
            # Determining year using timestamp as start_date does not include year
            if pd.notna(row['timestamp']): 
                timestamp_components = row['timestamp'].split(' ')
                ts_date_comp = timestamp_components[0].split('-')
                t_month, t_day = ts_date_comp[0], ts_date_comp[1]
                t_year = pd.to_numeric(ts_date_comp[2])
                if t_month == '01' and s_month == '12':
                    year = str(int(t_year) - 1)
                else:
                    year = t_year 
            else:
            # for Walton, Tri-State, Oconee, and Mitchell, the na timestamps are in march 2023
                year = '2023'

            hour, minute = time.split(':')

            if 'am' in ampm.lower() and hour == '12':
                hour = '00' 
            if 'pm' in ampm.lower():
                hour = str(int(hour) + 12) if int(hour) < 12 else hour

            # Add leading zeros if necessary
            hour = hour.zfill(2)
            minute = minute.zfill(2)

            reformatted_date = f'{s_month}-{s_day}-{year} {hour}:{minute}:00'
            return reformatted_date

        def _reformat_update(row):
            month_day, time, ampm = row['updateTime'].split(',') 
            u_month, u_day = month_day.split(' ')
            month_dict = { 'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 
                        'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12' }
            u_month = month_dict[u_month]
            
            year = None
            if pd.notna(row['timestamp']):
                timestamp_components = row['timestamp'].split(' ')
                ts_date_comp = timestamp_components[0].split('-')
                t_month, t_day = ts_date_comp[0], ts_date_comp[1]
                t_year = pd.to_numeric(ts_date_comp[2])
                if t_month == '01' and u_month == '12':
                    year = str(int(t_year) - 1)
                else:
                    year = t_year 
            else:
                year = '2023'

            hour, minute = time.split()
            if 'am' in ampm.lower() and hour == '12':
                hour = '00' 
            if 'pm' in ampm.lower():
                hour = str(int(hour) + 12) if int(hour) < 12 else hour
            hour = hour.zfill(2)
            minute = minute.zfill(2)

            reformatted_date = f'{u_month}-{u_day}-{year} {hour}:{minute}:00'
            return reformatted_date

        eastern = tz.gettz('US/Eastern')
        utc = tz.gettz('UTC')
        try:
            df['start_time'] = df.apply(_reformat_start_date, axis=1) 
            df['start_time'] = pd.to_datetime(df['start_time'], utc=True).dt.tz_convert(eastern)
            df['updateTime'] = df.apply(_reformat_update, axis=1)
            df['updateTime'] = pd.to_datetime(df['updateTime'], utc=True).dt.tz_convert(eastern)
            df['duration'] = pd.to_timedelta(df['duration'])
            df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True).dt.tz_convert(eastern)


        except Exception as e:
            print(f"An error occurred during transformation: {e}")

        return df

    def to_incident_level(self, identifers=['outage_id', 'start_time'], method='id_grouping'):
        """
        identifer: default identifier name "outage_id", or list like ['IncidentId', 'lat', 'lng', 'subgroup']
        method: "id_grouping" or "timegap_seperation" 
        """
        df = self.transform(identifiers=identifers, method=method)
        grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)
        
        return grouped
    
    def _agg_vars(self, group):
        """
        Overwriting superclass _compute_metrics as duration is given in this layout is more accurate
        """
        first_timestamp = group['timestamp'].min()
        last_timestamp = group['timestamp'].max()

        duration_dur_max = group['duration'].max()
        duration_ts_diff = last_timestamp - first_timestamp
        
        duration_diff = duration_dur_max
        duration_15 = 15 * len(group)
        group['duration_weight'] = (group['timestamp'].diff().dt.total_seconds() / 60).round(0).fillna(15)
        cust_affected_x_duration = (group['customer_affected'] * group['duration_weight']).sum()
        cust_a_mean = cust_affected_x_duration / group['duration_weight'].sum()

        duration_max = duration_diff + timedelta(minutes=15) # because 15 minute update intervals
        duration_mean = (duration_diff + duration_max) / 2
        start_time = group['start_time'].min()
        end_time = start_time + duration_diff

        zipcode = group['zipcode'].iloc[-1]
        zipcode_map = self.geomap['zip_to_county_name']        
        county_name = self.geomap['zip_to_county_name'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        county_fips = self.geomap['zip_to_county_fips'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        state = self.geomap['zip_to_state_name'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA

        utility_provider = group['EMC'].iloc[-1]

        customer_affected_mean = group['customer_affected'].mean()
        total_customer_outage_time = customer_affected_mean * duration_diff

        return pd.Series({
            # 'start_time': start_time,
            'end_time': end_time,
            'first_timestamp': first_timestamp,
            'last_timestamp': last_timestamp,
            'lat': group['lat'].unique(),
            'long': group['lon'].unique(),
            'zipcode': group['zipcode'].unique(),
            'county_name': county_name, 
            'county_fips': county_fips,
            'state': state,
            'utility_provider': utility_provider,
            'duration_diff': duration_dur_max,
            'duration_max': duration_max,
            'duration_mean': duration_mean,
            'duration_15': duration_15,
            'customer_affected_mean': cust_a_mean,
            'cust_affected_x_duration': cust_affected_x_duration,
            'total_customer_outage_time': total_customer_outage_time
        })


In [45]:
for provider in config['providers']:
    new_pipeline = GA11TX12_new(provider, base_file_path)
    new_pipeline._data = new_pipeline.to_incident_level(identifers=['outage_id', 'start_time'], method='id_grouping')
    display(new_pipeline._data)

  self._data = pd.read_csv(file_path)


Unnamed: 0,substation,feeder,outage_id,estimated_restore_time,formatted_ert,start_time,duration,customer_affected,lon,lat,service_index_name,outages,NumConsumers,zipcode,isHighTraffic,updateTime,EMC,alias,outage_comment,timestamp,opt_code,poly
0,14.0,3.0,C950581,,,2023-03-15 12:28:00-04:00,0 days 00:11:00,1,-83.669285,33.911106,Electric,1,137416,30656,False,2023-03-15 11:39:00-04:00,Walton EMC,,,NaT,,
1,1.0,3.0,C950582,,,2023-03-15 12:48:00-04:00,0 days 00:05:00,1,-83.703931,33.760364,Electric,1,137416,30655,False,2023-03-15 11:54:00-04:00,Walton EMC,,,NaT,,
2,1.0,3.0,C950582,,,2023-03-15 12:48:00-04:00,0 days 00:21:00,1,-83.703931,33.760364,Electric,1,137416,30655,False,2023-03-15 12:09:00-04:00,Walton EMC,,,NaT,,
3,1.0,3.0,C950582,,,2023-03-15 12:48:00-04:00,0 days 00:35:00,1,-83.703931,33.760364,Electric,2,137416,30655,False,2023-03-15 12:24:00-04:00,Walton EMC,,,NaT,,
4,14.0,1.0,C950583,,,2023-03-15 13:14:00-04:00,0 days 00:10:00,1,-83.768580,33.900673,Electric,2,137416,30620,False,2023-03-15 12:24:00-04:00,Walton EMC,,,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71389,11.0,5.0,D1009642,,,2024-01-18 03:15:00-05:00,0 days 01:54:00,3,-83.440692,33.872791,Electric,15,139225,30677,False,2024-01-18 04:10:00-05:00,Walton EMC,EXPERIMENT STAT/ 15 KVA,,2024-01-18 10:25:06-05:00,,"[{'lon': '-83.4402683290629', 'lat': '33.87192..."
71390,6.0,1.0,C1009637,,,2024-01-18 02:37:00-05:00,0 days 02:32:00,1,-84.043121,33.810690,Electric,15,139225,30039,False,2024-01-18 04:10:00-05:00,Walton EMC,,,2024-01-18 10:25:06-05:00,,
71391,34.0,3.0,C1009640,,,2024-01-18 03:08:00-05:00,0 days 02:00:00,1,-83.435911,33.739100,Electric,15,139225,30638,False,2024-01-18 04:10:00-05:00,Walton EMC,,,2024-01-18 10:25:06-05:00,,
71392,4.0,2.0,C1009654,,,2024-01-18 04:28:00-05:00,0 days 00:41:00,1,-83.512373,33.571093,Electric,15,139225,30650,False,2024-01-18 04:10:00-05:00,Walton EMC,,,2024-01-18 10:25:06-05:00,,


  grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,first_timestamp,last_timestamp,lat,long,zipcode,county_name,county_fips,state,utility_provider,duration_diff,duration_max,duration_mean,duration_15,customer_affected_mean,cust_affected_x_duration,total_customer_outage_time
0,C1000672,2023-11-18 13:08:00-05:00,2023-11-21 11:39:00-05:00,2023-11-21 15:25:08-05:00,2023-11-21 16:40:10-05:00,[33.71487350614651],[-84.00784826417568],[30012],Rockdale,13247,Georgia,Walton EMC,2 days 22:31:00,2 days 22:46:00,2 days 22:38:30,90,1.0,90.0,2 days 22:31:00
1,C1000694,2023-11-20 04:15:00-05:00,2023-11-20 06:23:00-05:00,2023-11-20 09:25:06-05:00,2023-11-20 11:55:06-05:00,[33.77704743985282],[-83.85792774928218],[30052],Walton,13297,Georgia,Walton EMC,0 days 02:08:00,0 days 02:23:00,0 days 02:15:30,150,1.0,165.0,0 days 02:08:00
2,C1000697,2023-11-20 04:41:00-05:00,2023-11-20 05:09:00-05:00,2023-11-20 09:55:05-05:00,2023-11-20 10:10:12-05:00,[33.73784886972853],[-83.70276616988113],[30655],Walton,13297,Georgia,Walton EMC,0 days 00:28:00,0 days 00:43:00,0 days 00:35:30,30,1.0,30.0,0 days 00:28:00
3,C1000701,2023-11-20 05:19:00-05:00,2023-11-20 05:24:00-05:00,2023-11-20 10:25:10-05:00,2023-11-20 10:25:10-05:00,[33.91113849481128],[-83.4083882731394],[30606],Clarke,13059,Georgia,Walton EMC,0 days 00:05:00,0 days 00:20:00,0 days 00:12:30,15,1.0,15.0,0 days 00:05:00
4,C1000702,2023-11-20 06:06:00-05:00,2023-11-20 08:10:00-05:00,2023-11-20 11:25:06-05:00,2023-11-20 14:10:08-05:00,[33.751105070831585],[-83.98963662598489],[30012],Rockdale,13247,Georgia,Walton EMC,0 days 02:04:00,0 days 02:19:00,0 days 02:11:30,165,1.0,180.0,0 days 02:04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14717,G995287,2023-09-08 17:49:00-04:00,2023-09-08 20:24:00-04:00,2023-09-08 22:10:09-04:00,2023-09-09 00:25:07-04:00,[0.0],[-85.4887438847069],[unknown],,,,Walton EMC,0 days 02:35:00,0 days 02:50:00,0 days 02:42:30,150,83.9,12585.0,9 days 00:44:30
14718,G996351,2023-09-12 12:20:00-04:00,2023-09-12 13:54:00-04:00,2023-09-12 17:10:13-04:00,2023-09-12 17:55:07-04:00,[0.0],[-85.4887438847069],[unknown],,,,Walton EMC,0 days 01:34:00,0 days 01:49:00,0 days 01:41:30,60,3.0,180.0,0 days 04:42:00
14719,G996585,2023-09-15 01:24:00-04:00,2023-09-15 02:09:00-04:00,2023-09-15 05:40:07-04:00,2023-09-15 06:10:04-04:00,[0.0],[-85.4887438847069],[unknown],,,,Walton EMC,0 days 00:45:00,0 days 01:00:00,0 days 00:52:30,45,2.0,90.0,0 days 01:30:00
14720,G997542,2023-09-28 17:01:00-04:00,2023-09-28 18:09:00-04:00,2023-09-28 21:55:06-04:00,2023-09-28 22:10:07-04:00,[0.0],[-85.4887438847069],[unknown],,,,Walton EMC,0 days 01:08:00,0 days 01:23:00,0 days 01:15:30,30,12.0,360.0,0 days 13:36:00


Unnamed: 0,substation,feeder,outage_id,alias,outage_comment,estimated_restore_time,formatted_ert,start_time,duration,customer_affected,lon,lat,opt_code,service_index_name,outages,NumConsumers,zipcode,isHighTraffic,updateTime,EMC,timestamp
0,5,334.0,D760313,Loy Williamson,,,,2023-03-16 17:34:00-04:00,0 days 00:06:00,3,-84.326558,35.121572,Cloudy,Electric,0,20906,37391,False,2023-03-16 16:39:00-04:00,Tri-State EMC,NaT
1,5,334.0,D760313,Loy Williamson,,,,2023-03-16 17:34:00-04:00,0 days 00:21:00,3,-84.326558,35.121572,Cloudy,Electric,0,20906,37391,False,2023-03-16 16:54:00-04:00,Tri-State EMC,NaT
2,5,334.0,D760313,Loy Williamson,,,,2023-03-16 17:34:00-04:00,0 days 00:36:00,3,-84.326558,35.121572,Cloudy,Electric,0,20906,37391,False,2023-03-16 17:09:00-04:00,Tri-State EMC,NaT
3,2,314.0,D760353,,,,,2023-03-19 05:16:00-04:00,0 days 00:08:00,780,-84.317560,34.934301,Clear,Electric,57,20903,30559,False,2023-03-19 04:24:00-04:00,Tri-State EMC,NaT
4,2,314.0,D760353,,,,,2023-03-19 05:16:00-04:00,0 days 00:23:00,780,-84.317560,34.934301,Clear,Electric,86,20903,30559,False,2023-03-19 04:39:00-04:00,Tri-State EMC,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41072,2,344.0,D774512,,,,,2024-01-18 04:07:00-05:00,0 days 00:02:00,3,-84.275412,34.935686,,Electric,2,21287,30559,False,2024-01-18 03:09:00-05:00,Tri-State EMC,2024-01-18 09:09:51-05:00
41073,2,344.0,D774513,,,,,2024-01-18 03:34:00-05:00,0 days 00:50:00,2,-84.275424,34.935768,,Electric,2,21287,30559,False,2024-01-18 03:24:00-05:00,Tri-State EMC,2024-01-18 09:24:49-05:00
41074,2,344.0,D774512,,,,,2024-01-18 04:07:00-05:00,0 days 00:17:00,3,-84.275412,34.935686,,Electric,2,21287,30559,False,2024-01-18 03:24:00-05:00,Tri-State EMC,2024-01-18 09:24:49-05:00
41075,2,344.0,D774513,,,,,2024-01-18 03:34:00-05:00,0 days 01:05:00,2,-84.275424,34.935768,,Electric,2,21287,30559,False,2024-01-18 03:39:00-05:00,Tri-State EMC,2024-01-18 09:39:51-05:00


  grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,first_timestamp,last_timestamp,lat,long,zipcode,county_name,county_fips,state,utility_provider,duration_diff,duration_max,duration_mean,duration_15,customer_affected_mean,cust_affected_x_duration,total_customer_outage_time
0,D760313,2023-03-16 17:34:00-04:00,2023-03-16 18:10:00-04:00,NaT,NaT,[35.121572377824094],[-84.32655842905368],[37391],Polk,47139,Tennessee,Tri-State EMC,0 days 00:36:00,0 days 00:51:00,0 days 00:43:30,45,3.0,135.0,0 days 01:48:00
1,D760353,2023-03-19 05:16:00-04:00,2023-03-19 13:09:00-04:00,NaT,NaT,[34.934301059308666],[-84.31756035270524],[30559],Fannin,13111,Georgia,Tri-State EMC,0 days 07:53:00,0 days 08:08:00,0 days 08:00:30,480,780.0,374400.0,256 days 05:00:00
2,D760483,2023-03-19 21:05:00-04:00,2023-03-19 22:24:00-04:00,NaT,NaT,[34.9247317073667],[-84.28289194885062],[30559],Fannin,13111,Georgia,Tri-State EMC,0 days 01:19:00,0 days 01:34:00,0 days 01:26:30,75,823.0,61725.0,45 days 03:37:00
3,D760513,2023-03-20 03:37:00-04:00,2023-03-20 05:24:00-04:00,NaT,NaT,[34.988284773021064],[-84.26526552447059],[30559],Fannin,13111,Georgia,Tri-State EMC,0 days 01:47:00,0 days 02:02:00,0 days 01:54:30,60,1.0,60.0,0 days 01:47:00
4,D760514,2023-03-20 07:50:00-04:00,2023-03-20 08:39:00-04:00,NaT,NaT,[34.843191355996304],[-84.2994741858854],[30513],Fannin,13111,Georgia,Tri-State EMC,0 days 00:49:00,0 days 01:04:00,0 days 00:56:30,60,1.0,60.0,0 days 00:49:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1604,D774504,2024-01-17 18:23:00-05:00,2024-01-17 20:39:00-05:00,2024-01-17 23:54:50-05:00,2024-01-18 01:39:50-05:00,[34.89774997804474],[-84.27340448844093],[30559],Fannin,13111,Georgia,Tri-State EMC,0 days 02:16:00,0 days 02:31:00,0 days 02:23:30,120,2.0,240.0,0 days 04:32:00
1605,D774507,2024-01-17 23:47:00-05:00,2024-01-18 02:23:00-05:00,2024-01-18 04:54:50-05:00,2024-01-18 07:24:51-05:00,[35.087756602920685],[-84.29921901635525],[unknown],,,,Tri-State EMC,0 days 02:36:00,0 days 02:51:00,0 days 02:43:30,165,2.0,330.0,0 days 05:12:00
1606,D774509,2024-01-17 23:47:00-05:00,2024-01-18 02:23:00-05:00,2024-01-18 06:09:50-05:00,2024-01-18 07:24:51-05:00,[35.09370898864956],[-84.29900499951849],[28906],Cherokee,37039,North Carolina,Tri-State EMC,0 days 02:36:00,0 days 02:51:00,0 days 02:43:30,90,7.0,630.0,0 days 18:12:00
1607,D774512,2024-01-18 04:07:00-05:00,2024-01-18 04:39:00-05:00,2024-01-18 09:09:51-05:00,2024-01-18 09:39:51-05:00,[34.93568623901852],[-84.27541169172594],[30559],Fannin,13111,Georgia,Tri-State EMC,0 days 00:32:00,0 days 00:47:00,0 days 00:39:30,45,3.0,135.0,0 days 01:36:00


Unnamed: 0,substation,feeder,outage_id,estimated_restore_time,formatted_ert,start_time,duration,customer_affected,lon,lat,service_index_name,outages,NumConsumers,zipcode,isHighTraffic,updateTime,EMC,alias,outage_comment,timestamp,opt_code,poly
0,7.0,2.0,C434216,,,2023-03-15 12:31:00-04:00,0 days 00:08:00,1,-83.495825,32.696321,Electric,1,12852,31295,False,2023-03-15 11:40:00-04:00,Oconee EMC,,,NaT,,
1,7.0,2.0,C434216,,,2023-03-15 12:31:00-04:00,0 days 00:24:00,1,-83.495825,32.696321,Electric,1,12852,31295,False,2023-03-15 11:55:00-04:00,Oconee EMC,,,NaT,,
2,7.0,2.0,C434216,,,2023-03-15 12:31:00-04:00,0 days 00:38:00,1,-83.495825,32.696321,Electric,1,12852,31295,False,2023-03-15 12:10:00-04:00,Oconee EMC,,,NaT,,
3,7.0,2.0,C434216,,,2023-03-15 12:31:00-04:00,0 days 00:53:00,1,-83.495825,32.696321,Electric,1,12852,31295,False,2023-03-15 12:25:00-04:00,Oconee EMC,,,NaT,,
4,15.0,1.0,C434219,,,2023-03-16 15:42:00-04:00,0 days 00:12:00,1,-83.175852,32.462475,Electric,1,12850,unknown,False,2023-03-16 14:55:00-04:00,Oconee EMC,,,NaT,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59765,2.0,1.0,C445005,,,2024-01-18 02:32:00-05:00,0 days 00:51:00,1,-83.298711,32.901316,Electric,3,12926,31031,False,2024-01-18 02:25:00-05:00,Oconee EMC,,,2024-01-18 08:25:25-05:00,,
59766,2.0,1.0,C445007,,,2024-01-18 03:14:00-05:00,0 days 00:09:00,1,-83.298984,32.901143,Electric,3,12926,31031,False,2024-01-18 02:25:00-05:00,Oconee EMC,,,2024-01-18 08:25:25-05:00,,
59767,2.0,1.0,C445006,,,2024-01-18 02:48:00-05:00,0 days 00:50:00,1,-83.298968,32.901069,Electric,3,12926,31031,False,2024-01-18 02:40:00-05:00,Oconee EMC,,,2024-01-18 08:40:27-05:00,,
59768,2.0,1.0,C445005,,,2024-01-18 02:32:00-05:00,0 days 01:06:00,1,-83.298711,32.901316,Electric,3,12926,31031,False,2024-01-18 02:40:00-05:00,Oconee EMC,,,2024-01-18 08:40:27-05:00,,


  grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,first_timestamp,last_timestamp,lat,long,zipcode,county_name,county_fips,state,utility_provider,duration_diff,duration_max,duration_mean,duration_15,customer_affected_mean,cust_affected_x_duration,total_customer_outage_time
0,C434216,2023-03-15 12:31:00-04:00,2023-03-15 13:24:00-04:00,NaT,NaT,[32.6963206535166],[-83.4958250438261],[31295],,,,Oconee EMC,0 days 00:53:00,0 days 01:08:00,0 days 01:00:30,60,1.0,60.0,0 days 00:53:00
1,C434219,2023-03-16 15:42:00-04:00,2023-03-16 16:39:00-04:00,NaT,NaT,[32.4624747950688],[-83.1758520493372],[unknown],,,,Oconee EMC,0 days 00:57:00,0 days 01:12:00,0 days 01:04:30,60,1.0,60.0,0 days 00:57:00
2,C434221,2023-03-17 11:02:00-04:00,2023-03-17 11:39:00-04:00,NaT,NaT,[32.616212986581],[-83.3452972574774],[31044],Twiggs,13289,Georgia,Oconee EMC,0 days 00:37:00,0 days 00:52:00,0 days 00:44:30,45,1.0,45.0,0 days 00:37:00
3,C434222,2023-03-17 11:15:00-04:00,2023-03-17 11:39:00-04:00,NaT,NaT,[32.6166001690322],[-83.3450804998653],[31044],Twiggs,13289,Georgia,Oconee EMC,0 days 00:24:00,0 days 00:39:00,0 days 00:31:30,30,1.0,30.0,0 days 00:24:00
4,C434224,2023-03-17 15:15:00-04:00,2023-03-17 15:54:00-04:00,NaT,NaT,[32.5570715501155],[-82.9521748715231],[31021],Laurens,13175,Georgia,Oconee EMC,0 days 00:39:00,0 days 00:54:00,0 days 00:46:30,30,1.0,30.0,0 days 00:39:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7930,D444934,2024-01-10 08:07:00-05:00,2024-01-10 09:39:00-05:00,2024-01-10 14:40:23-05:00,2024-01-10 14:40:23-05:00,[32.61491475475819],[-82.97292325486109],[31021],Laurens,13175,Georgia,Oconee EMC,0 days 01:32:00,0 days 01:47:00,0 days 01:39:30,15,1.0,15.0,0 days 01:32:00
7931,D444941,2024-01-10 11:57:00-05:00,2024-01-10 14:23:00-05:00,2024-01-10 19:25:21-05:00,2024-01-10 19:25:21-05:00,[32.64740804385846],[-82.96969756775235],[unknown],,,,Oconee EMC,0 days 02:26:00,0 days 02:41:00,0 days 02:33:30,15,1.0,15.0,0 days 02:26:00
7932,D444972,2024-01-13 06:53:00-05:00,2024-01-13 09:08:00-05:00,2024-01-13 13:40:26-05:00,2024-01-13 14:10:28-05:00,[32.903891769278715],[-83.25488856219343],[31054],Wilkinson,13319,Georgia,Oconee EMC,0 days 02:15:00,0 days 02:30:00,0 days 02:22:30,45,4.0,180.0,0 days 09:00:00
7933,D444980,2024-01-16 05:10:00-05:00,2024-01-16 06:39:00-05:00,2024-01-16 11:25:31-05:00,2024-01-16 11:40:45-05:00,[32.903891769278715],[-83.25488856219343],[31054],Wilkinson,13319,Georgia,Oconee EMC,0 days 01:29:00,0 days 01:44:00,0 days 01:36:30,30,5.0,150.0,0 days 07:25:00


  self._data = pd.read_csv(file_path)


Unnamed: 0,substation,feeder,outage_id,alias,outage_comment,estimated_restore_time,formatted_ert,start_time,duration,customer_affected,lon,lat,opt_code,service_index_name,outages,NumConsumers,zipcode,isHighTraffic,updateTime,EMC,timestamp,poly
0,0.0,0.0,G437006,,Meter Shop,,,2023-03-15 03:14:00-04:00,0 days 09:10:00,0,,,,Electric,1,25259,unknown,False,2023-03-15 11:25:00-04:00,Mitchell EMC,NaT,
1,8.0,4.0,C483513,,,,,2023-03-15 10:24:00-04:00,0 days 02:00:00,1,-84.374671,31.214144,,Electric,1,25259,31730,False,2023-03-15 11:25:00-04:00,Mitchell EMC,NaT,
2,0.0,0.0,G437006,,Meter Shop,,,2023-03-15 03:14:00-04:00,0 days 09:25:00,0,,,,Electric,0,25259,unknown,False,2023-03-15 11:40:00-04:00,Mitchell EMC,NaT,
3,0.0,0.0,G437006,,Meter Shop,,,2023-03-15 03:14:00-04:00,0 days 09:40:00,0,,,,Electric,0,25259,unknown,False,2023-03-15 11:55:00-04:00,Mitchell EMC,NaT,
4,0.0,0.0,G437006,,Meter Shop,,,2023-03-15 03:14:00-04:00,0 days 09:55:00,0,,,,Electric,0,25259,unknown,False,2023-03-15 12:10:00-04:00,Mitchell EMC,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123807,0.0,0.0,G437006,,Meter Shop,,,2024-01-17 01:32:00-05:00,1 days 03:52:00,7,-84.159175,31.425896,,Electric,10,25456,unknown,False,2024-01-18 04:25:00-05:00,Mitchell EMC,2024-01-18 10:25:40-05:00,"[{'lon': '', 'lat': ''}]"
123808,11.0,1.0,D500951,,,,,2024-01-15 02:10:00-05:00,3 days 03:14:00,2,-83.925921,31.451546,,Electric,10,25456,31791,False,2024-01-18 04:25:00-05:00,Mitchell EMC,2024-01-18 10:25:40-05:00,"[{'lon': '-83.9280096071789', 'lat': '31.45150..."
123809,1.0,1.0,D500998,,,,,2024-01-17 02:20:00-05:00,1 days 03:04:00,2,-84.137203,31.256813,,Electric,10,25456,31730,False,2024-01-18 04:25:00-05:00,Mitchell EMC,2024-01-18 10:25:40-05:00,"[{'lon': '-84.1359461267312', 'lat': '31.25706..."
123810,20.0,2.0,C501047,,,,,2024-01-18 04:20:00-05:00,0 days 01:04:00,1,-84.117057,31.332163,,Electric,10,25456,31716,False,2024-01-18 04:25:00-05:00,Mitchell EMC,2024-01-18 10:25:40-05:00,


  grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,first_timestamp,last_timestamp,lat,long,zipcode,county_name,county_fips,state,utility_provider,duration_diff,duration_max,duration_mean,duration_15,customer_affected_mean,cust_affected_x_duration,total_customer_outage_time
0,C483513,2023-03-15 10:24:00-04:00,2023-03-15 12:24:00-04:00,NaT,NaT,[31.2141438821248],[-84.3746707478679],[31730],Mitchell,13205,Georgia,Mitchell EMC,0 days 02:00:00,0 days 02:15:00,0 days 02:07:30,15,1.00,15.0,0 days 02:00:00
1,C483521,2023-03-16 03:22:00-04:00,2023-03-16 03:24:00-04:00,NaT,NaT,[31.1751658640547],[-84.0992626512505],[31779],Mitchell,13205,Georgia,Mitchell EMC,0 days 00:02:00,0 days 00:17:00,0 days 00:09:30,15,1.00,15.0,0 days 00:02:00
2,C483523,2023-03-16 09:27:00-04:00,2023-03-16 11:09:00-04:00,NaT,NaT,[31.2490668335486],[-84.4338589663475],[unknown],,,,Mitchell EMC,0 days 01:42:00,0 days 01:57:00,0 days 01:49:30,90,1.00,90.0,0 days 01:42:00
3,C483525,2023-03-17 02:04:00-04:00,2023-03-17 03:53:00-04:00,NaT,NaT,[31.4014455976227],[-83.7245005486182],[31795],Worth,13321,Georgia,Mitchell EMC,0 days 01:49:00,0 days 02:04:00,0 days 01:56:30,90,1.00,90.0,0 days 01:49:00
4,C483528,2023-03-17 04:34:00-04:00,2023-03-17 04:39:00-04:00,NaT,NaT,[31.2443292237511],[-84.2595276154844],[31730],Mitchell,13205,Georgia,Mitchell EMC,0 days 00:05:00,0 days 00:20:00,0 days 00:12:30,15,1.00,15.0,0 days 00:05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7628,G437006,2024-01-17 01:25:00-05:00,2024-01-18 03:55:00-05:00,2024-01-17 08:25:43-05:00,2024-01-18 08:55:47-05:00,"[31.4760287601336, 31.4680685961645, 31.434712...","[-84.1092331630223, -84.1082751248251, -84.165...","[unknown, 31705, 31721, 31716]",Mitchell,13205,Georgia,Mitchell EMC,1 days 02:30:00,1 days 02:45:00,1 days 02:37:30,1470,6.52,9675.0,7 days 04:47:26.938775510
7629,G437006,2024-01-17 01:32:00-05:00,2024-01-18 05:24:00-05:00,2024-01-18 09:10:54-05:00,2024-01-18 10:25:40-05:00,"[31.3927452857212, 31.3607717733857, 31.393825...","[-84.1714649357984, -84.205170767159, -84.1844...","[31716, unknown]",,,,Mitchell EMC,1 days 03:52:00,1 days 04:07:00,1 days 03:59:30,90,8.67,780.0,10 days 01:30:40
7630,G484576,2023-04-14 06:25:00-04:00,2023-04-14 06:55:00-04:00,2023-04-14 10:55:40-04:00,2023-04-14 10:55:40-04:00,[31.5489276199601],[-84.3065167503742],[31721],Dougherty,13095,Georgia,Mitchell EMC,0 days 00:30:00,0 days 00:45:00,0 days 00:37:30,15,45.00,675.0,0 days 22:30:00
7631,G495149,2023-08-30 05:40:00-04:00,2023-08-30 06:40:00-04:00,2023-08-30 09:55:48-04:00,2023-08-30 10:40:56-04:00,"[31.5513778631095, 31.5512333357696]","[-83.965609459837, -83.9649160646379]",[31791],Worth,13321,Georgia,Mitchell EMC,0 days 01:00:00,0 days 01:15:00,0 days 01:07:30,60,4.00,240.0,0 days 04:00:00


Unnamed: 0,substation,feeder,outage_id,estimated_restore_time,formatted_ert,start_time,duration,customer_affected,lon,lat,service_index_name,outages,NumConsumers,zipcode,isHighTraffic,updateTime,timestamp,EMC,alias,outage_comment
0,8,1202.0,C399381,,,2023-04-15 09:07:00-04:00,0 days 00:09:00,1,-97.747517,33.038220,Electric,1,26204,76083,False,2023-04-15 09:17:00-04:00,2023-04-15 14:17:23-04:00,"Wise Electric Coop, Inc.",,
1,8,1202.0,C399381,,,2023-04-15 09:07:00-04:00,0 days 00:24:00,1,-97.747517,33.038220,Electric,1,26204,76083,False,2023-04-15 09:32:00-04:00,2023-04-15 14:32:25-04:00,"Wise Electric Coop, Inc.",,
2,8,1202.0,C399381,,,2023-04-15 09:07:00-04:00,0 days 00:39:00,1,-97.747517,33.038220,Electric,1,26204,76083,False,2023-04-15 09:47:00-04:00,2023-04-15 14:47:25-04:00,"Wise Electric Coop, Inc.",,
3,8,1202.0,C399381,,,2023-04-15 09:07:00-04:00,0 days 00:54:00,1,-97.747517,33.038220,Electric,1,26204,76083,False,2023-04-15 10:02:00-04:00,2023-04-15 15:02:26-04:00,"Wise Electric Coop, Inc.",,
4,8,1202.0,C399381,,,2023-04-15 09:07:00-04:00,0 days 01:09:00,1,-97.747517,33.038220,Electric,1,26204,76083,False,2023-04-15 10:17:00-04:00,2023-04-15 15:17:25-04:00,"Wise Electric Coop, Inc.",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10535,14,1204.0,C407289,,,2024-01-17 07:13:00-05:00,0 days 00:17:00,1,-97.768295,33.155803,Electric,1,26695,unknown,False,2024-01-17 07:32:00-05:00,2024-01-17 13:32:30-05:00,"Wise Electric Coop, Inc.",,
10536,14,1204.0,C407289,,,2024-01-17 07:13:00-05:00,0 days 00:33:00,1,-97.768295,33.155803,Electric,1,26695,unknown,False,2024-01-17 07:47:00-05:00,2024-01-17 13:47:24-05:00,"Wise Electric Coop, Inc.",,
10537,14,1204.0,C407289,,,2024-01-17 07:13:00-05:00,0 days 00:48:00,1,-97.768295,33.155803,Electric,1,26695,unknown,False,2024-01-17 08:02:00-05:00,2024-01-17 14:02:23-05:00,"Wise Electric Coop, Inc.",,
10538,14,1204.0,C407289,,,2024-01-17 07:13:00-05:00,0 days 01:03:00,1,-97.768295,33.155803,Electric,1,26695,unknown,False,2024-01-17 08:17:00-05:00,2024-01-17 14:17:23-05:00,"Wise Electric Coop, Inc.",,


  grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,first_timestamp,last_timestamp,lat,long,zipcode,county_name,county_fips,state,utility_provider,duration_diff,duration_max,duration_mean,duration_15,customer_affected_mean,cust_affected_x_duration,total_customer_outage_time
0,C399381,2023-04-15 09:07:00-04:00,2023-04-15 10:16:00-04:00,2023-04-15 14:17:23-04:00,2023-04-15 15:17:25-04:00,[33.0382196355098],[-97.7475172092185],[76083],,,,"Wise Electric Coop, Inc.",0 days 01:09:00,0 days 01:24:00,0 days 01:16:30,75,1.0,75.0,0 days 01:09:00
1,C399382,2023-04-16 03:35:00-04:00,2023-04-16 05:16:00-04:00,2023-04-16 08:47:24-04:00,2023-04-16 10:17:26-04:00,[33.055042122318],[-97.7832866144301],[76073],Wise,48497,Texas,"Wise Electric Coop, Inc.",0 days 01:41:00,0 days 01:56:00,0 days 01:48:30,105,1.0,105.0,0 days 01:41:00
2,C399386,2023-04-16 12:29:00-04:00,2023-04-16 13:46:00-04:00,2023-04-16 17:32:23-04:00,2023-04-16 18:47:22-04:00,[33.1704085039402],[-97.4396965805056],[76234],Wise,48497,Texas,"Wise Electric Coop, Inc.",0 days 01:17:00,0 days 01:32:00,0 days 01:24:30,90,1.0,90.0,0 days 01:17:00
3,C399401,2023-04-17 12:07:00-04:00,2023-04-17 14:00:00-04:00,2023-04-17 17:17:27-04:00,2023-04-17 19:02:24-04:00,[33.2802566323912],[-97.5594800003705],[unknown],,,,"Wise Electric Coop, Inc.",0 days 01:53:00,0 days 02:08:00,0 days 02:00:30,120,1.0,120.0,0 days 01:53:00
4,C399402,2023-04-18 04:55:00-04:00,2023-04-18 05:31:00-04:00,2023-04-18 10:02:25-04:00,2023-04-18 10:32:24-04:00,[33.3358485036046],[-97.6337625202981],[unknown],,,,"Wise Electric Coop, Inc.",0 days 00:36:00,0 days 00:51:00,0 days 00:43:30,45,1.0,45.0,0 days 00:36:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2183,G406396,2023-12-30 02:23:00-05:00,2023-12-30 03:46:00-05:00,2023-12-30 08:32:25-05:00,2023-12-30 09:47:27-05:00,[0.0],[-103.488743884707],[unknown],,,,"Wise Electric Coop, Inc.",0 days 01:23:00,0 days 01:38:00,0 days 01:30:30,90,14.0,1260.0,0 days 19:22:00
2184,G406415,2023-12-30 07:41:00-05:00,2023-12-30 08:46:00-05:00,2023-12-30 14:02:31-05:00,2023-12-30 14:47:31-05:00,[0.0],[-103.488743884707],[unknown],,,,"Wise Electric Coop, Inc.",0 days 01:05:00,0 days 01:20:00,0 days 01:12:30,60,12.0,720.0,0 days 13:00:00
2185,G406964,2024-01-11 20:31:00-05:00,2024-01-11 21:31:00-05:00,2024-01-12 02:47:27-05:00,2024-01-12 03:32:33-05:00,[0.0],[-103.488743884707],[unknown],,,,"Wise Electric Coop, Inc.",0 days 01:00:00,0 days 01:15:00,0 days 01:07:30,60,6.0,360.0,0 days 06:00:00
2186,G406967,2024-01-11 23:23:00-05:00,2024-01-12 00:47:00-05:00,2024-01-12 05:33:15-05:00,2024-01-12 06:48:16-05:00,[0.0],[-103.488743884707],[unknown],,,,"Wise Electric Coop, Inc.",0 days 01:24:00,0 days 01:39:00,0 days 01:31:30,90,2.0,180.0,0 days 02:48:00


Unnamed: 0,substation,feeder,outage_id,alias,outage_comment,estimated_restore_time,formatted_ert,start_time,duration,customer_affected,lon,lat,service_index_name,outages,NumConsumers,zipcode,isHighTraffic,updateTime,timestamp,EMC,opt_code,poly
0,27,4.0,D712398,2-25,,Apr 14 11:25 am,"Fri, Apr 14, 11:25 am",2023-04-14 03:21:00-04:00,0 days 00:10:00,2,-95.392524,31.277230,Electric,2,22702,75835,False,2023-04-14 03:32:00-04:00,2023-04-14 08:32:11-04:00,"Houston County Electric Coop, Inc.",,
1,27,4.0,D712398,2-25,,Apr 14 11:25 am,"Fri, Apr 14, 11:25 am",2023-04-14 03:21:00-04:00,0 days 00:25:00,2,-95.392524,31.277230,Electric,2,22702,75835,False,2023-04-14 03:47:00-04:00,2023-04-14 08:47:09-04:00,"Houston County Electric Coop, Inc.",,
2,30,5.0,D712405,24M-91,,Apr 15 10:13 am,"Sat, Apr 15, 10:13 am",2023-04-15 02:13:00-04:00,0 days 00:03:00,31,-95.673220,30.926413,Electric,31,22702,75852,False,2023-04-15 02:17:00-04:00,2023-04-15 07:17:09-04:00,"Houston County Electric Coop, Inc.",,
3,30,5.0,D712405,24M-91,,Apr 15 10:13 am,"Sat, Apr 15, 10:13 am",2023-04-15 02:13:00-04:00,0 days 00:32:00,31,-95.673220,30.926413,Electric,31,22707,75852,False,2023-04-15 02:47:00-04:00,2023-04-15 07:47:09-04:00,"Houston County Electric Coop, Inc.",,
4,30,5.0,D712405,24M-91,,Apr 15 10:13 am,"Sat, Apr 15, 10:13 am",2023-04-15 02:13:00-04:00,0 days 00:48:00,31,-95.673220,30.926413,Electric,31,22707,75852,False,2023-04-15 03:02:00-04:00,2023-04-15 08:02:10-04:00,"Houston County Electric Coop, Inc.",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14078,15,1.0,D742083,6-51,,,,2024-01-17 02:51:00-05:00,0 days 00:40:00,2,-95.578818,31.436583,Electric,2,22859,75844,False,2024-01-17 03:32:00-05:00,2024-01-17 09:32:18-05:00,"Houston County Electric Coop, Inc.",,"[{'lon': '-95.57774545806544', 'lat': '31.4363..."
14079,15,1.0,D742083,6-51,,,,2024-01-17 02:51:00-05:00,0 days 00:54:00,2,-95.578818,31.436583,Electric,2,22859,75844,False,2024-01-17 03:47:00-05:00,2024-01-17 09:47:20-05:00,"Houston County Electric Coop, Inc.",,"[{'lon': '-95.57774545806544', 'lat': '31.4363..."
14080,15,1.0,D742083,6-51,,,,2024-01-17 02:51:00-05:00,0 days 01:10:00,2,-95.578818,31.436583,Electric,2,22859,75844,False,2024-01-17 04:02:00-05:00,2024-01-17 10:02:20-05:00,"Houston County Electric Coop, Inc.",,"[{'lon': '-95.57774545806544', 'lat': '31.4363..."
14081,15,1.0,D742083,6-51,,,,2024-01-17 02:51:00-05:00,0 days 01:25:00,2,-95.578818,31.436583,Electric,2,22859,75844,False,2024-01-17 04:17:00-05:00,2024-01-17 10:17:22-05:00,"Houston County Electric Coop, Inc.",,"[{'lon': '-95.57774545806544', 'lat': '31.4363..."


  grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,first_timestamp,last_timestamp,lat,long,zipcode,county_name,county_fips,state,utility_provider,duration_diff,duration_max,duration_mean,duration_15,customer_affected_mean,cust_affected_x_duration,total_customer_outage_time
0,D712398,2023-04-14 03:21:00-04:00,2023-04-14 03:46:00-04:00,2023-04-14 08:32:11-04:00,2023-04-14 08:47:09-04:00,[31.2772300483194],[-95.392523819382],[75835],Houston,48225,Texas,"Houston County Electric Coop, Inc.",0 days 00:25:00,0 days 00:40:00,0 days 00:32:30,30,2.00,60.0,0 days 00:50:00
1,D712405,2023-04-15 02:13:00-04:00,2023-04-15 03:16:00-04:00,2023-04-15 07:17:09-04:00,2023-04-15 08:17:10-04:00,[30.9264126173052],[-95.6732203114424],[75852],Madison,48313,Texas,"Houston County Electric Coop, Inc.",0 days 01:03:00,0 days 01:18:00,0 days 01:10:30,60,31.00,2325.0,1 days 08:33:00
2,D712420,2023-04-15 05:58:00-04:00,2023-04-15 07:16:00-04:00,2023-04-15 11:17:10-04:00,2023-04-15 12:17:08-04:00,[31.7196559897302],[-95.5593032842649],[75801],Anderson,48001,Texas,"Houston County Electric Coop, Inc.",0 days 01:18:00,0 days 01:33:00,0 days 01:25:30,75,44.00,3300.0,2 days 09:12:00
3,D712431,2023-04-15 08:18:00-04:00,2023-04-15 09:01:00-04:00,2023-04-15 13:32:10-04:00,2023-04-15 14:02:10-04:00,[31.1275609041616],[-95.7372749913443],[75835],Houston,48225,Texas,"Houston County Electric Coop, Inc.",0 days 00:43:00,0 days 00:58:00,0 days 00:50:30,45,8.00,360.0,0 days 05:44:00
4,D712434,2023-04-15 13:44:00-04:00,2023-04-15 15:16:00-04:00,2023-04-15 19:02:10-04:00,2023-04-15 20:17:08-04:00,[31.5310367712332],[-95.8850791940002],[unknown],,,,"Houston County Electric Coop, Inc.",0 days 01:32:00,0 days 01:47:00,0 days 01:39:30,90,7.00,630.0,0 days 10:44:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657,G740838,2024-01-12 01:55:00-05:00,2024-01-12 02:32:00-05:00,2024-01-12 08:32:42-05:00,2024-01-12 08:32:42-05:00,[31.2216734246887],[-95.38843803389712],[unknown],,,,"Houston County Electric Coop, Inc.",0 days 00:37:00,0 days 00:52:00,0 days 00:44:30,15,48.00,720.0,1 days 05:36:00
1658,G741154,2024-01-12 02:48:00-05:00,2024-01-12 03:58:00-05:00,2024-01-12 10:02:29-05:00,2024-01-12 10:02:29-05:00,[31.264191196486955],[-94.88983868659808],[unknown],,,,"Houston County Electric Coop, Inc.",0 days 01:10:00,0 days 01:25:00,0 days 01:17:30,15,77.00,1155.0,3 days 17:50:00
1659,G741203,2024-01-12 03:08:00-05:00,2024-01-12 04:55:00-05:00,2024-01-12 10:32:23-05:00,2024-01-12 11:02:24-05:00,[31.21380027944924],[-95.04333552409382],[unknown],,,,"Houston County Electric Coop, Inc.",0 days 01:47:00,0 days 02:02:00,0 days 01:54:30,45,8.00,360.0,0 days 14:16:00
1660,G741890,2024-01-15 00:40:00-05:00,2024-01-15 04:46:00-05:00,2024-01-15 09:17:16-05:00,2024-01-15 10:47:14-05:00,"[31.02710556906973, 31.01958508134907, 31.0201...","[-95.43165911528766, -95.43165911528769, -95.4...",[unknown],,,,"Houston County Electric Coop, Inc.",0 days 04:06:00,0 days 04:21:00,0 days 04:13:30,105,68.14,7155.0,11 days 15:23:08.571428571


  self._data = pd.read_csv(file_path)


Unnamed: 0,substation,feeder,outage_id,alias,outage_comment,estimated_restore_time,formatted_ert,start_time,duration,customer_affected,lon,lat,opt_code,poly,service_index_name,outages,NumConsumers,zipcode,isHighTraffic,updateTime,timestamp,EMC
0,12.0,3.0,D467687,,,,,2023-04-14 05:51:00-04:00,0 days 00:10:00,26,-95.199661,32.185865,,"[{'lon': '-95.1851486554595', 'lat': '32.17972...",Electric,7,23096,75791,False,2023-04-14 06:01:00-04:00,2023-04-14 11:01:56-04:00,Cherokee County Electric Coop Association
1,12.0,2.0,D467700,,,,,2023-04-15 18:25:00-04:00,0 days 00:21:00,21,-95.272752,32.199545,,"[{'lon': '-95.2773357505746', 'lat': '32.20323...",Electric,2,23096,75791,False,2023-04-15 18:46:00-04:00,2023-04-15 23:46:57-04:00,Cherokee County Electric Coop Association
2,12.0,2.0,D467700,,,,,2023-04-15 18:25:00-04:00,0 days 00:36:00,21,-95.272752,32.199545,,"[{'lon': '-95.2773357505746', 'lat': '32.20323...",Electric,3,23096,75791,False,2023-04-15 19:01:00-04:00,2023-04-16 00:01:57-04:00,Cherokee County Electric Coop Association
3,12.0,2.0,D467700,,,,,2023-04-15 18:25:00-04:00,0 days 00:51:00,21,-95.272752,32.199545,,"[{'lon': '-95.2773357505746', 'lat': '32.20323...",Electric,3,23096,75791,False,2023-04-15 19:16:00-04:00,2023-04-16 00:16:56-04:00,Cherokee County Electric Coop Association
4,12.0,2.0,D467700,,,,,2023-04-15 18:25:00-04:00,0 days 01:06:00,21,-95.272752,32.199545,,"[{'lon': '-95.2773357505746', 'lat': '32.20323...",Electric,3,23096,75791,False,2023-04-15 19:31:00-04:00,2023-04-16 00:31:55-04:00,Cherokee County Electric Coop Association
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51502,7.0,2.0,C488341,,,,,2024-01-18 03:39:00-05:00,0 days 00:07:00,1,-95.457855,32.156326,,,Electric,1,23492,75762,False,2024-01-18 03:46:00-05:00,2024-01-18 09:46:56-05:00,Cherokee County Electric Coop Association
51503,7.0,2.0,C488341,,,,,2024-01-18 03:39:00-05:00,0 days 00:22:00,1,-95.457855,32.156326,,,Electric,1,23492,75762,False,2024-01-18 04:01:00-05:00,2024-01-18 10:01:57-05:00,Cherokee County Electric Coop Association
51504,7.0,2.0,C488341,,,,,2024-01-18 03:39:00-05:00,0 days 00:37:00,1,-95.457855,32.156326,,,Electric,2,23492,75762,False,2024-01-18 04:16:00-05:00,2024-01-18 10:16:57-05:00,Cherokee County Electric Coop Association
51505,5.0,3.0,C488342,,,,,2024-01-18 04:08:00-05:00,0 days 00:08:00,1,-95.287559,31.877448,,,Electric,2,23492,75766,False,2024-01-18 04:16:00-05:00,2024-01-18 10:16:57-05:00,Cherokee County Electric Coop Association


  grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,first_timestamp,last_timestamp,lat,long,zipcode,county_name,county_fips,state,utility_provider,duration_diff,duration_max,duration_mean,duration_15,customer_affected_mean,cust_affected_x_duration,total_customer_outage_time
0,C480035,2023-09-24 18:28:00-04:00,2023-09-25 13:31:00-04:00,2023-09-25 09:51:39-04:00,2023-09-25 18:36:51-04:00,[32.1608321080001],[-95.452705857],[75762],Smith,48423,Texas,Cherokee County Electric Coop Association,0 days 19:03:00,0 days 19:18:00,0 days 19:10:30,120,1.0,540.0,0 days 19:03:00
1,C480066,2023-09-24 18:43:00-04:00,2023-09-25 13:31:00-04:00,2023-09-25 09:51:39-04:00,2023-09-25 18:36:51-04:00,[32.153927793],[-95.4535533929999],[75762],Smith,48423,Texas,Cherokee County Electric Coop Association,0 days 18:48:00,0 days 19:03:00,0 days 18:55:30,120,1.0,540.0,0 days 18:48:00
2,C480094,2023-09-24 18:45:00-04:00,2023-09-25 08:16:00-04:00,2023-09-25 09:51:39-04:00,2023-09-25 13:22:25-04:00,[32.1546312020001],[-95.3935947689999],[75757],Cherokee,48073,Texas,Cherokee County Electric Coop Association,0 days 13:31:00,0 days 13:46:00,0 days 13:38:30,90,1.0,226.0,0 days 13:31:00
3,C480111,2023-09-24 18:46:00-04:00,2023-09-25 13:31:00-04:00,2023-09-25 09:51:39-04:00,2023-09-25 18:36:51-04:00,[32.1606725400001],[-95.452128882],[75762],Smith,48423,Texas,Cherokee County Electric Coop Association,0 days 18:45:00,0 days 19:00:00,0 days 18:52:30,120,1.0,540.0,0 days 18:45:00
4,C480121,2023-09-24 18:47:00-04:00,2023-09-26 01:46:00-04:00,2023-09-25 09:51:39-04:00,2023-09-26 06:51:29-04:00,[0.0],[-97.488743884387],[unknown],,,,Cherokee County Electric Coop Association,1 days 06:59:00,1 days 07:14:00,1 days 07:06:30,735,1.0,1275.0,1 days 06:59:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4778,G483387,2023-09-24 18:55:00-04:00,2023-09-26 02:01:00-04:00,2023-09-26 07:06:25-04:00,2023-09-26 07:06:25-04:00,[0.0],[-97.4887438843871],[unknown],,,,Cherokee County Electric Coop Association,1 days 07:06:00,1 days 07:21:00,1 days 07:13:30,15,14.0,210.0,18 days 03:24:00
4779,G483389,2023-09-24 19:05:00-04:00,2023-09-26 02:01:00-04:00,2023-09-26 07:06:25-04:00,2023-09-26 07:06:25-04:00,[0.0],[-97.4887438843871],[unknown],,,,Cherokee County Electric Coop Association,1 days 06:56:00,1 days 07:11:00,1 days 07:03:30,15,18.0,270.0,23 days 04:48:00
4780,G483390,2023-09-24 18:47:00-04:00,2023-09-26 02:01:00-04:00,2023-09-26 07:06:25-04:00,2023-09-26 07:06:25-04:00,[0.0],[-97.488743884387],[unknown],,,,Cherokee County Electric Coop Association,1 days 07:14:00,1 days 07:29:00,1 days 07:21:30,15,9.0,135.0,11 days 17:06:00
4781,G483391,2023-09-24 19:04:00-04:00,2023-09-26 02:01:00-04:00,2023-09-26 07:06:25-04:00,2023-09-26 07:06:25-04:00,[0.0],[-97.4887438843871],[unknown],,,,Cherokee County Electric Coop Association,1 days 06:57:00,1 days 07:12:00,1 days 07:04:30,15,12.0,180.0,15 days 11:24:00
