In [167]:
import pandas as pd
import ast
import pytz
import os
import json
import yaml
from dateutil import tz
from datetime import datetime
from datetime import timedelta
from IPython.display import display

pd.set_option('display.max_columns', None)

class BasePipeline:
    """Base class for per-provider outage standardization pipelines.

    A pipeline reads one provider's raw CSV (``load_data``), lets a subclass
    reshape it (``transform``) and then collapses the rows of every
    ``outage_id`` into one row of summary metrics (``standardize``).

    ``config`` is a dict that must provide the keys ``'type'``, ``'state'``,
    ``'layout'`` and ``'name'``; ``base_file_path`` is the folder the CSV path
    is built under.
    """

    # Minutes credited per snapshot row when estimating durations and
    # customer-outage time. Presumably the spacing between two consecutive
    # snapshots of the same outage -- override in a subclass if a provider
    # is sampled at a different rate.
    update_interval_minutes = 15

    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        self.map = {}  # zipcode lookup, filled from <state>_mapping.json by load_data()
        self._data = pd.DataFrame({})

    def construct_file_path(self):
        """Return the CSV path for the configured provider.

        The file prefix is ``per_outage`` when ``config['type'] == 'o'`` and
        ``per_county`` for every other type. Doubled slashes (e.g. from a
        trailing slash on ``base_file_path``) are collapsed.
        """
        #TODO: add type to prefix mapping
        file_prefix = 'per_outage' if self.config['type'] == 'o' else 'per_county'
        file_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/{file_prefix}_{self.config['name']}.csv"
        return file_path.replace('//', '/')

    def load_data(self):
        """Load the provider CSV into ``self._data`` and the state mapping into ``self.map``.

        Best effort: any failure is printed and swallowed, so afterwards
        ``self._data`` / ``self.map`` may still hold their previous values.
        """
        # TODO: use us zipcode database
        try:
            file_path = self.construct_file_path()
            print(file_path)
            self._data = pd.read_csv(file_path)
            # JSON text is UTF-8; do not depend on the platform's default encoding.
            with open(f"{self.config['state']}_mapping.json", 'r', encoding='utf-8') as json_file:
                self.map = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")

    def transform(self):
        """Reshape ``self._data`` into the columns ``standardize`` expects.

        Must be implemented by subclasses.
        """
        raise NotImplementedError("Subclasses must implement transform()")

    def standardize(self):
        """Load, transform and aggregate the data; return the standardized frame.

        After ``transform`` the data must contain ``outage_id``, ``timestamp``,
        ``start_time``, ``end_time``, ``customer_affected``, ``zipcode``,
        ``lat`` and ``lng``.
        """
        self.load_data()
        self.transform()
        # One row of metrics per outage ...
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        # ... joined back to the raw row(s) whose timestamp equals the value
        # _compute_metrics reported as 'timestamp' for that outage.
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')

        self._data['utility_provider'] = self.config['name']
        self._data['state'] = self.config['state']
        self._data['county'] = self._data['zipcode'].map(self.map)

        self._data = self._data[[
            'utility_provider', 'state', 'county', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng',
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean', 'total_customer_outage_time', 'total_customer_outage_time_max', 'total_customer_outage_time_mean'
        ]]

        return self._data

    def output_data(self, standard_data):
        # TODO: Output unified data
        pass

    def get_dataframe(self):
        """Return the pipeline's current DataFrame."""
        return self._data

    def _compute_metrics(self, group):
        """Aggregate all rows of one outage into a Series of metrics.

        ``group`` holds the rows of a single ``outage_id``. Durations are in
        minutes; the ``*_max`` values add one ``update_interval_minutes`` to
        the observed value and the ``*_mean`` values are the midpoint of the
        observed value and its ``*_max``.
        """
        interval = self.update_interval_minutes

        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        duration_max = duration + interval
        duration_mean = (duration + duration_max) / 2

        customers = group['customer_affected']
        first_customers = customers.iloc[0]
        customer_affected_mean = customers.mean()

        # The first row is weighted by the minutes between the outage start and
        # that row's timestamp; every later row is weighted by one interval.
        minutes_before_first_row = (group['timestamp'].iloc[0] - group['start_time'].iloc[0]).total_seconds() / 60
        total_customer_outage_time = interval * (customers.sum() - first_customers) + minutes_before_first_row * first_customers
        total_customer_outage_time_max = total_customer_outage_time + interval * customers.iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })

    def _check_other_vars(self):
        # TODO: Check other useful variables
        pass
    
    
class GA11TX12(BasePipeline):
    """Pipeline for per-outage feeds whose raw rows carry ``start_date``,
    ``updateTime``, ``duration``, ``consumers_affected``, ``lon``/``lat`` and
    ``zip_code`` columns (used in this file for the GA layout-11 and TX
    layout-12 providers).

    Besides the state mapping loaded by the base class, a nationwide mapping
    (``us_mapping.json``) is kept as a fallback for zip codes the state file
    does not contain.
    """

    def __init__(self, config, base_file_path):
        super().__init__(config, base_file_path)
        # Flipped to True by transform() once every conversion succeeded.
        self._transformed = False
        with open('us_mapping.json', 'r') as json_file:
            self._usmap = json.load(json_file)

    def transform(self): # if edited, must recreate pipeline to reset transformed flag
        """Convert the raw string columns to datetime/timedelta and rename ids.

        ``start_date`` ("03/15 05:28 pm") and ``updateTime`` ("Mar 15, 5 09, pm")
        carry no year, so the year is taken from the row's ``timestamp``
        ("01-18-2024 15:25:06"). Failures are printed and swallowed; in that
        case ``self._transformed`` stays False.
        """
        month_numbers = { 'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
                          'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12' }

        #### HELPER METHODS
        def _infer_year(row, month):
            """Return the year (as a string) for an event in two-digit ``month``."""
            if pd.notna(row['timestamp']):
                # timestamp format: 01-18-2024 15:25:06
                ts_date_comp = row['timestamp'].split(' ')[0].split('-')
                t_month = ts_date_comp[0]
                t_year = int(pd.to_numeric(ts_date_comp[2]))
                if t_month == '01' and month == '12':
                    # Scraped in January but the event is dated December:
                    # the event belongs to the previous year.
                    return str(t_year - 1)
                return str(t_year)
            # for Walton, Tri-State, Oconee, and Mitchell, the na timestamps are in march 2023
            return '2023'

        def _to_24_hour(hour, ampm):
            """Return ``hour`` (12-hour clock string) as a zero-padded 24-hour string."""
            meridiem = ampm.lower()
            if 'am' in meridiem and hour == '12': # 12 am is midnight
                hour = '00'
            if 'pm' in meridiem and int(hour) < 12: # 12 pm stays 12
                hour = str(int(hour) + 12)
            return hour.zfill(2)

        def _reformat_start_date(row):
            """Rewrite the row's 'start_date' as 'MM-DD-YYYY HH:MM:00'."""
            # start_date format: 03/15 05:28 pm
            month_day, time, ampm = row['start_date'].split(' ')
            s_month, s_day = month_day.split('/')
            year = _infer_year(row, s_month)
            hour, minute = time.split(':')
            hour = _to_24_hour(hour, ampm)
            minute = minute.zfill(2)
            return f'{s_month}-{s_day}-{year} {hour}:{minute}:00'

        def _reformat_update(row):
            """Rewrite the row's 'updateTime' as 'MM-DD-YYYY HH:MM:00'."""
            # Format: Mar 15, 5 09, pm
            month_day, time, ampm = row['updateTime'].split(',')
            u_month, u_day = month_day.split(' ')
            u_month = month_numbers[u_month] # three-letter month name -> two-digit month
            year = _infer_year(row, u_month)
            # split() without arguments also drops the leading space left by the ',' split
            hour, minute = time.split()
            hour = _to_24_hour(hour, ampm)
            minute = minute.zfill(2)
            return f'{u_month}-{u_day}-{year} {hour}:{minute}:00'
        ## HELPER METHODS ####

        try:
            self._data['start_date'] = self._data.apply(_reformat_start_date, axis=1)
            self._data['start_date'] = pd.to_datetime(self._data['start_date'])
            self._data['updateTime'] = self._data.apply(_reformat_update, axis=1)
            self._data['updateTime'] = pd.to_datetime(self._data['updateTime'])
            self._data['duration'] = pd.to_timedelta(self._data['duration'])
            # This conversion has to come AFTER the reformat helpers ran: they
            # split 'timestamp' as a string, which no longer works once it is a datetime.
            self._data['timestamp'] = pd.to_datetime(self._data['timestamp'])

            # Renaming column names to match with superclass standardize
            self._data = self._data.rename(columns={
                'incident_id':'outage_id',
                'zip_code':'zipcode'
            })
            self._transformed = True
        except Exception as e:
            print(f"An error occurred during transformation: {e}")

    def standardize(self):
        """Load, transform and aggregate to one row per outage.

        Returns the aggregated DataFrame (also stored in ``self._data``), in
        line with ``BasePipeline.standardize``.
        """
        # print(self.config)
        self.load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = grouped
        return self._data

    @staticmethod
    def _validate(group):
        """Return diagnostic statistics for the rows of one ``outage_id``.

        Compares the three time sources of an outage (reported start dates,
        update times, scrape timestamps) and the reported ``duration`` column.
        """
        num_rows = len(group)
        num_unique_lon = len(group['lon'].unique())
        num_unique_lat = len(group['lat'].unique())
        num_unique_outages = len(group['outages'].unique())
        num_unique_customers_aff = len(group['consumers_affected'].unique())
        num_unique_customers = len(group['NumConsumers'].unique())
        num_unique_start_times = len(group['start_date'].unique())
        earliest_start_time = group['start_date'].min()
        latest_start_time = group['start_date'].max()
        earliest_update_time = group['updateTime'].min()
        latest_update_time = group['updateTime'].max()
        earliest_timestamp = group['timestamp'].min()
        latest_timestamp = group['timestamp'].max()
        duration_by_lastupt_laststrt = latest_update_time - latest_start_time
        duration_by_lastupt_firststart = latest_update_time - earliest_start_time
        duration_by_update_time = latest_update_time - earliest_update_time
        duration_by_timestamp = latest_timestamp - earliest_timestamp
        timestamp_upt_diff = latest_timestamp - latest_update_time
        is_update_dur_div_15_min = (duration_by_update_time.total_seconds()/60) % 15 == 0
        max_duration = group['duration'].max()
        update_dur_timestamp_dur_err = abs(duration_by_update_time - duration_by_timestamp) if pd.notna(duration_by_update_time) and pd.notna(duration_by_timestamp) else pd.NaT
        update_dur_max_dur_error = abs(max_duration - duration_by_update_time) if pd.notna(max_duration) and pd.notna(duration_by_update_time) else pd.NaT
        timestamp_dur_max_dur_err = abs(duration_by_timestamp - max_duration) if pd.notna(max_duration) and pd.notna(duration_by_timestamp) else pd.NaT
        # Do the start-date range and the update-time range overlap?
        does_start_update_intersect = latest_start_time > earliest_update_time and earliest_start_time < latest_update_time
        # Is the whole update-time range before (or at) the earliest start date?
        is_start_after_update = earliest_start_time >= latest_update_time
        # Is the whole start-date range before (or at) the earliest update time?
        is_start_before_update = latest_start_time <= earliest_update_time
        is_timestamp_upt_00_min = (timestamp_upt_diff.total_seconds() // 60) % 60 == 0 if pd.notna(timestamp_upt_diff) else None

        return pd.Series({
            'num_rows': num_rows,
            'num_unique_lon': num_unique_lon,
            'num_unique_lat': num_unique_lat,
            'num_unique_outages': num_unique_outages,
            'num_unique_customers_aff': num_unique_customers_aff,
            'num_unique_customers': num_unique_customers,
            'num_unique_start_times': num_unique_start_times,
            'earliest_start_time': earliest_start_time,
            'latest_start_time': latest_start_time,
            'earliest_update_time': earliest_update_time,
            'latest_update_time': latest_update_time,
            'earliest_timestamp': earliest_timestamp,
            'latest_timestamp': latest_timestamp,
            'is_start_before_update': is_start_before_update,
            'does_start_update_intersect': does_start_update_intersect,
            'is_start_after_update': is_start_after_update,
            'max_duration': max_duration, # largest value of the provider-reported duration column
            'duration_by_update': duration_by_update_time,
            'duration_by_timestamp': duration_by_timestamp,
            'timestamp_upt_diff': timestamp_upt_diff,
            'update_dur_max_dur_error': update_dur_max_dur_error,
            'update_dur_timestamp_dur_err': update_dur_timestamp_dur_err,
            'timestamp_dur_max_dur_err': timestamp_dur_max_dur_err,
            'duration_by_lastupt_laststrt': duration_by_lastupt_laststrt, # last update - latest start
            'duration_by_lastupt_firststart': duration_by_lastupt_firststart, # last update - earliest start
            'is_update_dur_div_15_min': is_update_dur_div_15_min,
            'is_timestamp_upt_00_min': is_timestamp_upt_00_min
        })

    def _lookup_zipcode(self, zipcode):
        """Return the mapping entry for ``zipcode``, or ``[None, None, None]``.

        The entry is indexed as (county name, county fips, state). The state
        mapping is consulted before the nationwide one. Both come from
        ``json.load``, whose object keys are always strings, while zip codes
        read from CSV are usually numeric -- so the string form of the zip
        code is tried in addition to the raw value.
        """
        try:
            text_key = str(int(zipcode)) # 30656 / 30656.0 -> '30656'
        except (TypeError, ValueError, OverflowError):
            text_key = str(zipcode)

        for mapping in (self.map, self._usmap):
            for key in (zipcode, text_key):
                try:
                    return mapping[key]
                except KeyError:
                    continue
        # print(f"Nonexistent zipcode in {self.config['name']}: {zipcode}")
        return [None, None, None]

    def _compute_metrics(self, group): # overwriting super class because the most accurate duration seems to be calculated from update times
        """Aggregate the rows of one ``outage_id`` into a Series of metrics.

        The outage duration is the largest reported ``duration``; the end time
        is the latest ``updateTime`` and the start time is derived from the two.
        """
        validated_provider_per_outage = self._validate(group)

        duration_diff = validated_provider_per_outage['max_duration']
        duration_max = duration_diff + timedelta(minutes=15) # because 15 minute update intervals
        duration_mean = (duration_diff + duration_max) / 2
        end_time = validated_provider_per_outage['latest_update_time']
        start_time = end_time - duration_diff
        customer_affected_mean = group['consumers_affected'].mean()
        total_customer_outage_time = customer_affected_mean * duration_diff
        zipcode = group['zipcode'].iloc[-1]
        zipcode_values = self._lookup_zipcode(zipcode)

        return pd.Series({
            'start_time': start_time,
            'end_time': end_time,
            'lat': group['lat'].iloc[-1],
            'long': group['lon'].iloc[-1],
            'zipcode': zipcode,
            'county_name': zipcode_values[0],
            'county_fips': zipcode_values[1],
            'utility_provider': self.config['name'],
            'state': zipcode_values[2],
            'duration_diff': duration_diff,
            'duration_max': duration_max,
            'duration_mean': duration_mean,
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time
        })


One-time code to generate the zip code mappings from the US zip code data (uszips.csv)


In [155]:
# us_zip_path = '/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/US Zipcode Data/uszips.csv'
# with open(us_zip_path, 'r') as us_zip:
#     us_zip_df = pd.read_csv(us_zip)
#     us_zip_df.to_csv('filename.csv', index=False)

      # # Creating a US zip file  
#     us_zip_dict = dict(zip(us_zip_df['zip'], zip(us_zip_df['county_name'], us_zip_df['county_fips'], us_zip_df['state_name'])))
#     with open("us_mapping.json", 'w') as json_file:
#         json.dump(us_zip_dict, json_file)


# # Creating GA mapping
# states = ['Georgia', 'Texas'] # states to generate mappings for (update as needed)
# state_abbr_map = {
#     'Georgia': 'ga',
#     'Texas': 'tx'
# }
# for state in states:
#     state_df = us_zip_df[us_zip_df['state_name'] == state]
#     state_zip_dict = dict(zip(state_df['zip'], zip(state_df['county_name'], state_df['county_fips'], state_df['state_name'])))
#     with open(f"{state_abbr_map[state]}_mapping.json", 'w') as json_file:
#         json.dump(state_zip_dict, json_file)

Establishing config files

In [156]:
# Path of the pipeline's YAML configuration on the local machine.
local_config_path = '/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/app/pipeline/config.yaml'

# Parse the YAML first, then pick out the base folder that is handed to
# every pipeline constructor below.
with open(local_config_path, mode='r') as file:
    config = yaml.safe_load(file)
base_file_path = config['globals']['local_base_file_path']

# Test Code to Feel out and Validate Dataframe

Loading configs manually and separately

In [157]:
# One config dict per provider: four GA providers (layout 11) and three TX
# providers (layout 12), all of type 'o'.
walton_conf = {'name': 'Walton EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}
tris_conf = {'name': 'Tri-State EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}
oconee_conf = {'name': 'Oconee EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}
mitchell_conf = {'name': 'Mitchell EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}
wise_conf = {'name': 'Wise Electric Coop, Inc.', 'state': 'tx', 'layout': 12, 'type': 'o'}
houston_conf = {'name': 'Houston County Electric Coop, Inc.', 'state': 'tx', 'layout': 12, 'type': 'o'}
cherokee_conf = {'name': 'Cherokee County Electric Coop Association', 'state': 'tx', 'layout': 12, 'type': 'o'}

config_list = [
    walton_conf,
    tris_conf,
    oconee_conf,
    mitchell_conf,
    wise_conf,
    houston_conf,
    cherokee_conf,
]

# One pipeline per provider, created in the same order as the configs.
walton_pipe = GA11TX12(walton_conf, base_file_path)
tris_pipe = GA11TX12(tris_conf, base_file_path)
oconee_pipe = GA11TX12(oconee_conf, base_file_path)
mitchell_pipe = GA11TX12(mitchell_conf, base_file_path)
wise_pipe = GA11TX12(wise_conf, base_file_path)
houston_pipe = GA11TX12(houston_conf, base_file_path)
cherokee_pipe = GA11TX12(cherokee_conf, base_file_path)

pipelines = [
    walton_pipe,
    tris_pipe,
    oconee_pipe,
    mitchell_pipe,
    wise_pipe,
    houston_pipe,
    cherokee_pipe,
]

# Only load the raw CSVs here; transform() is deliberately not run so the
# frames below still show the untouched source columns.
for pipeline in pipelines:
    pipeline.load_data()

# for pipeline in pipelines:
#     pipeline.transform()

# Raw frames, one per provider, in pipeline order.
walton_df = walton_pipe._data
tris_df = tris_pipe._data
oconee_df = oconee_pipe._data
mitchell_df = mitchell_pipe._data
wise_df = wise_pipe._data
houston_df = houston_pipe._data
cherokee_df = cherokee_pipe._data

dataframes = [
    walton_df,
    tris_df,
    oconee_df,
    mitchell_df,
    wise_df,
    houston_df,
    cherokee_df,
]

display(walton_df)

/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Walton EMC.csv


  self._data = pd.read_csv(file_path)


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Tri-State EMC.csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Oconee EMC.csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Mitchell EMC.csv


  self._data = pd.read_csv(file_path)


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Wise Electric Coop, Inc..csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Houston County Electric Coop, Inc..csv
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Cherokee County Electric Coop Association.csv


  self._data = pd.read_csv(file_path)


Unnamed: 0,substation,feeder,incident_id,estimated_restore_time,formatted_ert,start_date,duration,consumers_affected,lon,lat,service_index_name,outages,NumConsumers,zip_code,isHighTraffic,updateTime,EMC,alias,outage_comment,timestamp,opt_code,poly
0,14.0,3.0,C950581,,,03/15 04:28 pm,00 hr 11 min,1,-83.669285,33.911106,Electric,1,137416,30656,False,"Mar 15, 3 39, pm",Walton EMC,,,,,
1,1.0,3.0,C950582,,,03/15 04:48 pm,00 hr 05 min,1,-83.703931,33.760364,Electric,1,137416,30655,False,"Mar 15, 3 54, pm",Walton EMC,,,,,
2,1.0,3.0,C950582,,,03/15 04:48 pm,00 hr 21 min,1,-83.703931,33.760364,Electric,1,137416,30655,False,"Mar 15, 4 09, pm",Walton EMC,,,,,
3,1.0,3.0,C950582,,,03/15 04:48 pm,00 hr 35 min,1,-83.703931,33.760364,Electric,2,137416,30655,False,"Mar 15, 4 24, pm",Walton EMC,,,,,
4,14.0,1.0,C950583,,,03/15 05:14 pm,00 hr 10 min,1,-83.768580,33.900673,Electric,2,137416,30620,False,"Mar 15, 4 24, pm",Walton EMC,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71389,11.0,5.0,D1009642,,,01/18 08:15 am,01 hr 54 min,3,-83.440692,33.872791,Electric,15,139225,30677,False,"Jan 18, 9 10, am",Walton EMC,EXPERIMENT STAT/ 15 KVA,,01-18-2024 15:25:06,,"[{'lon': '-83.4402683290629', 'lat': '33.87192..."
71390,6.0,1.0,C1009637,,,01/18 07:37 am,02 hr 32 min,1,-84.043121,33.810690,Electric,15,139225,30039,False,"Jan 18, 9 10, am",Walton EMC,,,01-18-2024 15:25:06,,
71391,34.0,3.0,C1009640,,,01/18 08:08 am,02 hr 00 min,1,-83.435911,33.739100,Electric,15,139225,30638,False,"Jan 18, 9 10, am",Walton EMC,,,01-18-2024 15:25:06,,
71392,4.0,2.0,C1009654,,,01/18 09:28 am,00 hr 41 min,1,-83.512373,33.571093,Electric,15,139225,30650,False,"Jan 18, 9 10, am",Walton EMC,,,01-18-2024 15:25:06,,


Seeing what columns have mostly null values

In [158]:
print("Average null value rate per column across all providers")

# Sum the per-column null fraction of every provider, then divide by the
# number of providers. A column missing from any provider's frame comes out
# as NaN, because Series addition aligns on the column labels.
null_rate = 0
for dataframe in dataframes:
    per_column_null_fraction = dataframe.isna().mean()
    null_rate = null_rate + per_column_null_fraction

provider_count = len(dataframes)
print(null_rate / provider_count)


Average null value rate per column across all providers
EMC                       0.000000
NumConsumers              0.000000
alias                     0.710222
consumers_affected        0.000000
duration                  0.000000
estimated_restore_time    0.945823
feeder                    0.017576
formatted_ert             0.945823
incident_id               0.000000
isHighTraffic             0.000000
lat                       0.019902
lon                       0.019902
opt_code                       NaN
outage_comment            0.911656
outages                   0.000000
poly                           NaN
service_index_name        0.000000
start_date                0.000000
substation                0.016832
timestamp                 0.009554
updateTime                0.000000
zip_code                  0.000000
dtype: float64


Looking at the null-value rates above, the following columns have a very high percentage of nulls across all providers (roughly 71–95%):
- outage_comment
- estimated_restore_time
- formatted_ert
- alias

Because they are mostly empty, we will not use them.
Other unused variables:

- feeder
- isHighTraffic
- poly
- opt_code
- substation

### Sample Views of Transformation, Validation, and Compute Metrics

In [159]:
# Pick the provider config the sample views below are run against, and
# build a fresh pipeline for it with only the raw data loaded.
test_conf = mitchell_conf
test_p = GA11TX12(config=test_conf, base_file_path=base_file_path)
test_p.load_data()


/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Mitchell EMC.csv


  self._data = pd.read_csv(file_path)


#### Issues with Raw Data and Validating Transform:
- The updateTime and start_date do not have the year included, so we need to utilize the timestamp to calculate the years in order to convert to a datetime format to calculate metrics
- However, some rows have null timestamps.

In [160]:
# For every provider frame, show the identifying/time columns of the rows
# that have no timestamp.
null_timestamp_columns = ['incident_id', 'start_date', 'updateTime', 'timestamp']
for dataframe in dataframes:
    has_no_timestamp = dataframe['timestamp'].isna()
    display(dataframe.loc[has_no_timestamp, null_timestamp_columns])

Unnamed: 0,incident_id,start_date,updateTime,timestamp
0,C950581,03/15 04:28 pm,"Mar 15, 3 39, pm",
1,C950582,03/15 04:48 pm,"Mar 15, 3 54, pm",
2,C950582,03/15 04:48 pm,"Mar 15, 4 09, pm",
3,C950582,03/15 04:48 pm,"Mar 15, 4 24, pm",
4,C950583,03/15 05:14 pm,"Mar 15, 4 24, pm",
...,...,...,...,...
1735,C951589,03/29 03:09 pm,"Mar 29, 4 09, pm",
1736,C951590,03/29 04:46 pm,"Mar 29, 4 09, pm",
1737,C951589,03/29 03:09 pm,"Mar 29, 4 24, pm",
1738,C951589,03/29 03:09 pm,"Mar 29, 4 39, pm",


Unnamed: 0,incident_id,start_date,updateTime,timestamp
0,D760313,03/16 09:34 pm,"Mar 16, 8 39, pm",
1,D760313,03/16 09:34 pm,"Mar 16, 8 54, pm",
2,D760313,03/16 09:34 pm,"Mar 16, 9 09, pm",
3,D760353,03/19 09:16 am,"Mar 19, 8 24, am",
4,D760353,03/19 09:16 am,"Mar 19, 8 39, am",
...,...,...,...,...
202,D760657,03/28 11:19 am,"Mar 28, 12 09, pm",
203,D760660,03/28 01:59 pm,"Mar 28, 1 09, pm",
204,D760660,03/28 01:59 pm,"Mar 28, 1 24, pm",
205,D760660,03/28 01:59 pm,"Mar 28, 1 39, pm",


Unnamed: 0,incident_id,start_date,updateTime,timestamp
0,C434216,03/15 04:31 pm,"Mar 15, 3 40, pm",
1,C434216,03/15 04:31 pm,"Mar 15, 3 55, pm",
2,C434216,03/15 04:31 pm,"Mar 15, 4 10, pm",
3,C434216,03/15 04:31 pm,"Mar 15, 4 25, pm",
4,C434219,03/16 07:42 pm,"Mar 16, 6 55, pm",
...,...,...,...,...
1140,D434618,03/29 04:52 pm,"Mar 29, 4 10, pm",
1141,D434617,03/29 04:35 pm,"Mar 29, 4 25, pm",
1142,D434618,03/29 04:52 pm,"Mar 29, 4 25, pm",
1143,D434617,03/29 04:35 pm,"Mar 29, 4 40, pm",


Unnamed: 0,incident_id,start_date,updateTime,timestamp
0,G437006,03/15 07:14 am,"Mar 15, 3 25, pm",
1,C483513,03/15 02:24 pm,"Mar 15, 3 25, pm",
2,G437006,03/15 07:14 am,"Mar 15, 3 40, pm",
3,G437006,03/15 07:14 am,"Mar 15, 3 55, pm",
4,G437006,03/15 07:14 am,"Mar 15, 4 10, pm",
...,...,...,...,...
2262,G437006,03/29 07:31 am,"Mar 29, 3 40, pm",
2263,G437006,03/29 07:31 am,"Mar 29, 3 55, pm",
2264,G437006,03/29 07:31 am,"Mar 29, 4 10, pm",
2265,G437006,03/29 07:31 am,"Mar 29, 4 25, pm",


Unnamed: 0,incident_id,start_date,updateTime,timestamp


Unnamed: 0,incident_id,start_date,updateTime,timestamp


Unnamed: 0,incident_id,start_date,updateTime,timestamp


As you can see, only the first 4 of the 7 dataframes (the GA providers) have null timestamps, and all of those rows are from March 2023 (this initial dataset was collected from 3/2023 to 1/2024).

Hence, for future updates of the dataset, we assume there will be NO MORE null timestamps; the only rows with null timestamps are the ones from 3/2023.

#### Transformation Before and After

In [161]:
def _show_transform_state(label, id_column):
    """Print the label and the frame's info(), then display the id/time columns."""
    print(label)
    print(test_p._data.info())
    display(test_p._data[[id_column, 'start_date', 'updateTime']])


# transform() renames 'incident_id' to 'outage_id', hence the different id
# column before and after.
_show_transform_state("Before transform", 'incident_id')
test_p.transform()
_show_transform_state("after transform", 'outage_id')


Before transform
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123812 entries, 0 to 123811
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   substation              109696 non-null  float64
 1   feeder                  109696 non-null  float64
 2   incident_id             123812 non-null  object 
 3   alias                   0 non-null       float64
 4   outage_comment          45473 non-null   object 
 5   estimated_restore_time  1 non-null       object 
 6   formatted_ert           1 non-null       object 
 7   start_date              123812 non-null  object 
 8   duration                123812 non-null  object 
 9   consumers_affected      123812 non-null  int64  
 10  lon                     106572 non-null  float64
 11  lat                     106572 non-null  float64
 12  opt_code                11893 non-null   object 
 13  service_index_name      123812 non-null  object 
 14  out

Unnamed: 0,incident_id,start_date,updateTime
0,G437006,03/15 07:14 am,"Mar 15, 3 25, pm"
1,C483513,03/15 02:24 pm,"Mar 15, 3 25, pm"
2,G437006,03/15 07:14 am,"Mar 15, 3 40, pm"
3,G437006,03/15 07:14 am,"Mar 15, 3 55, pm"
4,G437006,03/15 07:14 am,"Mar 15, 4 10, pm"
...,...,...,...
123807,G437006,01/17 06:32 am,"Jan 18, 9 25, am"
123808,D500951,01/15 07:10 am,"Jan 18, 9 25, am"
123809,D500998,01/17 07:20 am,"Jan 18, 9 25, am"
123810,C501047,01/18 09:20 am,"Jan 18, 9 25, am"


after transform
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123812 entries, 0 to 123811
Data columns (total 22 columns):
 #   Column                  Non-Null Count   Dtype          
---  ------                  --------------   -----          
 0   substation              109696 non-null  float64        
 1   feeder                  109696 non-null  float64        
 2   outage_id               123812 non-null  object         
 3   alias                   0 non-null       float64        
 4   outage_comment          45473 non-null   object         
 5   estimated_restore_time  1 non-null       object         
 6   formatted_ert           1 non-null       object         
 7   start_date              123812 non-null  datetime64[ns] 
 8   duration                123812 non-null  timedelta64[ns]
 9   consumers_affected      123812 non-null  int64          
 10  lon                     106572 non-null  float64        
 11  lat                     106572 non-null  float64        
 12  

Unnamed: 0,outage_id,start_date,updateTime
0,G437006,2023-03-15 07:14:00,2023-03-15 15:25:00
1,C483513,2023-03-15 14:24:00,2023-03-15 15:25:00
2,G437006,2023-03-15 07:14:00,2023-03-15 15:40:00
3,G437006,2023-03-15 07:14:00,2023-03-15 15:55:00
4,G437006,2023-03-15 07:14:00,2023-03-15 16:10:00
...,...,...,...
123807,G437006,2024-01-17 06:32:00,2024-01-18 09:25:00
123808,D500951,2024-01-15 07:10:00,2024-01-18 09:25:00
123809,D500998,2024-01-17 07:20:00,2024-01-18 09:25:00
123810,C501047,2024-01-18 09:20:00,2024-01-18 09:25:00


## Validating Data via Summary Metrics

Now that the data has been transformed into proper datetime and timedelta columns, we can compute per-outage summary metrics to validate it.

In [162]:
def _validate(group): # custom aggregating function for each groupby unique outage_id to see wehether the latest outage start date for a outage_id is before the earliest update time
    num_rows = len(group)
    num_unique_lon = len(group['lon'].unique())
    num_unique_lat = len(group['lat'].unique())
    num_unique_outages = len(group['outages'].unique())
    num_unique_customers_aff = len(group['consumers_affected'].unique())
    num_unique_customers = len(group['NumConsumers'].unique())
    num_unique_start_times = len(group['start_date'].unique())
    earliest_start_time = group['start_date'].min()
    latest_start_time = group['start_date'].max()
    earliest_update_time = group['updateTime'].min()
    latest_update_time = group['updateTime'].max()
    earliest_timestamp = group['timestamp'].min()
    latest_timestamp = group['timestamp'].max()
    duration_by_lastupt_laststrt = latest_update_time - latest_start_time
    duration_by_lastupt_firststart = latest_update_time - earliest_start_time
    duration_by_update_time = latest_update_time - earliest_update_time
    duration_by_timestamp = latest_timestamp - earliest_timestamp
    timestamp_upt_diff = latest_timestamp - latest_update_time
    is_update_dur_div_15_min = (duration_by_update_time.total_seconds()/60) % 15 == 0
    max_duration = group['duration'].max()
    update_dur_timestamp_dur_err = abs(duration_by_update_time - duration_by_timestamp) if pd.notna(duration_by_update_time) and pd.notna(duration_by_timestamp) else pd.NaT
    update_dur_max_dur_error = abs(max_duration - duration_by_update_time) if pd.notna(max_duration) and pd.notna(duration_by_update_time) else pd.NaT
    timestamp_dur_max_dur_err = abs(duration_by_timestamp - max_duration) if pd.notna(max_duration) and pd.notna(duration_by_timestamp) else pd.NaT
    does_start_update_intersect = latest_start_time > earliest_update_time and earliest_start_time < latest_update_time# does start and update intersect?
    is_start_after_update = earliest_start_time >= latest_update_time # is updatetime range just completely before the startdate range? 
    is_start_before_update = latest_start_time <= earliest_update_time
    is_timestamp_upt_00_min = (timestamp_upt_diff.total_seconds() // 60) % 60 == 0 if pd.notna(timestamp_upt_diff) else None

    return pd.Series({
        'num_rows': num_rows,
        'num_unique_lon': num_unique_lon,
        'num_unique_lat': num_unique_lat,
        'num_unique_outages': num_unique_outages,
        'num_unique_customers_aff': num_unique_customers_aff,
        'num_unique_customers': num_unique_customers,
        'num_unique_start_times': num_unique_start_times,
        'earliest_start_time': earliest_start_time,
        'latest_start_time': latest_start_time, 
        'earliest_update_time': earliest_update_time,
        'latest_update_time': latest_update_time, 
        'earliest_timestamp': earliest_start_time,
        'latest_timestamp': latest_timestamp,
        'is_start_before_update': is_start_before_update, 
        'does_start_update_intersect': does_start_update_intersect, # is the latest start time before the earliest update time?
        'is_start_after_update': is_start_after_update, # is updatetime range just completely before the startdate range? 
        'max_duration': max_duration, # duration from the given df column
        'duration_by_update': duration_by_update_time,
        'duration_by_timestamp': duration_by_timestamp,
        'timestamp_upt_diff': timestamp_upt_diff,
        'update_dur_max_dur_error': update_dur_max_dur_error,
        'update_dur_timestamp_dur_err': update_dur_timestamp_dur_err,
        'timestamp_dur_max_dur_err': timestamp_dur_max_dur_err,
        'duration_by_lastupt_laststrt': duration_by_lastupt_laststrt, # duration from calculating via update times
        'duration_by_lastupt_firststart': duration_by_lastupt_firststart, 
        'is_update_dur_div_15_min': is_update_dur_div_15_min,
        'is_timestamp_upt_00_min': is_timestamp_upt_00_min
    })

In [163]:
# Testing on a dataframe
# include_groups=False matches the call used in the summary cell and keeps the
# grouping column out of each group handed to _validate (which never reads
# 'outage_id'), avoiding the warning pandas raises for apply on grouping columns.
grouped = test_p._data.groupby("outage_id").apply(_validate, include_groups=False)
display(grouped)

  grouped = test_p._data.groupby("outage_id").apply(_validate)


Unnamed: 0_level_0,num_rows,num_unique_lon,num_unique_lat,num_unique_outages,num_unique_customers_aff,num_unique_customers,num_unique_start_times,earliest_start_time,latest_start_time,earliest_update_time,latest_update_time,earliest_timestamp,latest_timestamp,is_start_before_update,does_start_update_intersect,is_start_after_update,max_duration,duration_by_update,duration_by_timestamp,timestamp_upt_diff,update_dur_max_dur_error,update_dur_timestamp_dur_err,timestamp_dur_max_dur_err,duration_by_lastupt_laststrt,duration_by_lastupt_firststart,is_update_dur_div_15_min,is_timestamp_upt_00_min
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
C483513,1,1,1,1,1,1,1,2023-03-15 14:24:00,2023-03-15 14:24:00,2023-03-15 15:25:00,2023-03-15 15:25:00,2023-03-15 14:24:00,NaT,True,False,False,0 days 02:00:00,0 days 00:00:00,NaT,NaT,0 days 02:00:00,NaT,NaT,0 days 01:01:00,0 days 01:01:00,True,
C483521,1,1,1,1,1,1,1,2023-03-16 07:22:00,2023-03-16 07:22:00,2023-03-16 06:25:00,2023-03-16 06:25:00,2023-03-16 07:22:00,NaT,False,False,True,0 days 00:02:00,0 days 00:00:00,NaT,NaT,0 days 00:02:00,NaT,NaT,-1 days +23:03:00,-1 days +23:03:00,True,
C483523,6,1,1,2,1,1,1,2023-03-16 13:27:00,2023-03-16 13:27:00,2023-03-16 12:55:00,2023-03-16 14:10:00,2023-03-16 13:27:00,NaT,False,True,False,0 days 01:42:00,0 days 01:15:00,NaT,NaT,0 days 00:27:00,NaT,NaT,0 days 00:43:00,0 days 00:43:00,True,
C483525,6,1,1,1,1,2,1,2023-03-17 06:04:00,2023-03-17 06:04:00,2023-03-17 05:25:00,2023-03-17 06:55:00,2023-03-17 06:04:00,NaT,False,True,False,0 days 01:49:00,0 days 01:30:00,NaT,NaT,0 days 00:19:00,NaT,NaT,0 days 00:51:00,0 days 00:51:00,True,
C483528,1,1,1,1,1,1,1,2023-03-17 08:34:00,2023-03-17 08:34:00,2023-03-17 07:40:00,2023-03-17 07:40:00,2023-03-17 08:34:00,NaT,False,False,True,0 days 00:05:00,0 days 00:00:00,NaT,NaT,0 days 00:05:00,NaT,NaT,-1 days +23:06:00,-1 days +23:06:00,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D501048,4,1,1,3,1,1,1,2024-01-18 09:35:00,2024-01-18 09:35:00,2024-01-18 08:40:00,2024-01-18 09:25:00,2024-01-18 09:35:00,2024-01-18 15:25:40,False,False,True,0 days 00:48:00,0 days 00:45:00,0 days 00:44:53,0 days 06:00:40,0 days 00:03:00,0 days 00:00:07,0 days 00:03:07,-1 days +23:50:00,-1 days +23:50:00,True,True
G437006,27691,785,786,274,59,103,264,2023-03-15 07:14:00,2024-01-17 06:32:00,2023-03-15 15:25:00,2024-01-18 09:25:00,2023-03-15 07:14:00,2024-01-18 15:25:40,False,True,False,7 days 23:27:00,308 days 18:00:00,294 days 17:29:57,0 days 06:00:40,300 days 18:33:00,14 days 00:30:03,286 days 18:02:57,1 days 02:53:00,309 days 02:11:00,True,True
G484576,1,1,1,1,1,1,1,2023-04-14 10:25:00,2023-04-14 10:25:00,2023-04-14 09:55:00,2023-04-14 09:55:00,2023-04-14 10:25:00,2023-04-14 14:55:40,False,False,True,0 days 00:30:00,0 days 00:00:00,0 days 00:00:00,0 days 05:00:40,0 days 00:30:00,0 days 00:00:00,0 days 00:30:00,-1 days +23:30:00,-1 days +23:30:00,True,True
G495149,4,2,2,4,2,1,1,2023-08-30 09:40:00,2023-08-30 09:40:00,2023-08-30 08:55:00,2023-08-30 09:40:00,2023-08-30 09:40:00,2023-08-30 14:40:56,False,False,True,0 days 01:00:00,0 days 00:45:00,0 days 00:45:08,0 days 05:00:56,0 days 00:15:00,0 days 00:00:08,0 days 00:14:52,0 days 00:00:00,0 days 00:00:00,True,True


As we notice, while we have start times, update times, and timestamps:
- there could be multiple start times
- the range of start times can intersect with, or even come entirely after, the range of update times, which should not be possible for a real outage
- the timestamps are hours after the updateTimes, and the gap is suspiciously close to a whole number of hours

## Summary of Validation Metrics across all Providers

In [164]:
# Call transform() on every provider pipeline before its _data is grouped and
# validated in the next cell.
for pipeline in pipelines:
    pipeline.transform()

In [165]:
# Build one summary row per provider from its per-outage validation metrics.
summary_data = []
for pipeline in pipelines:
    # validated holds one row per unique outage_id for this provider.
    validated = (
        pipeline._data
        .groupby('outage_id')
        .apply(_validate, include_groups=False)
        .reset_index()
    )
    total_outages = len(validated)

    # Per-outage counts that are summarised several times below.
    lon_counts = validated['num_unique_lon']
    lat_counts = validated['num_unique_lat']
    start_counts = validated['num_unique_start_times']

    # Subsets of outages satisfying each consistency flag.
    start_not_before_update = validated[validated['is_start_before_update'] == False]
    update_dur_on_15_min = validated[validated['is_update_dur_div_15_min']]
    timestamp_on_whole_hour = validated[validated['is_timestamp_upt_00_min'] == True]

    subdf = {
        "Provider": pipeline.config['name'],
        "Avg num rows per outage_id": validated['num_rows'].mean(),
        "Num outages": total_outages,
        "Average unique outages per outage": validated['num_unique_outages'].mean(),
        "Average unique num cust aff": validated['num_unique_customers_aff'].mean(),
        "Average unique num cust": validated['num_unique_customers'].mean(),
        "Average unique start times": start_counts.mean(),
        "Unique Longitude Avg": lon_counts.mean(),
        "Unique Long Median": lon_counts.median(),
        "Unique Long Max": lon_counts.max(),
        "Unique Long Std": lon_counts.std(),
        "Unique Latitude Avg": lat_counts.mean(),
        "Unique Lat Median": lat_counts.median(),
        "Unique Lat Max": lat_counts.max(),
        "Unique Lat Std": lat_counts.std(),
        "Unique startDate Avg": start_counts.mean(),
        "Unique startDate Median": start_counts.median(),
        "Unique startDate Max": start_counts.max(),
        "Unique startDate Std": start_counts.std(),
        "Average update_dur_max_dur error": validated['update_dur_max_dur_error'].mean(),
        "Average update_dur_timestamp_dur error": validated['update_dur_timestamp_dur_err'].mean(),
        "Average timestamp_dur_max_dur error": validated['timestamp_dur_max_dur_err'].mean(),
        # Share of outages whose flag holds, out of all outages of the provider.
        "start intersects update or start > update freq": len(start_not_before_update) / total_outages,
        "is_update_dur_div_15_min freq": len(update_dur_on_15_min) / total_outages,
        "is_timestamp_upt_00_min freq": len(timestamp_on_whole_hour) / total_outages,
    }
    summary_data.append(subdf)  # one row per provider
display(pd.DataFrame(summary_data))

Unnamed: 0,Provider,Avg num rows per outage_id,Num outages,Average unique outages per outage,Average unique num cust aff,Average unique num cust,Average unique start times,Unique Longitude Avg,Unique Long Median,Unique Long Max,Unique Long Std,Unique Latitude Avg,Unique Lat Median,Unique Lat Max,Unique Lat Std,Unique startDate Avg,Unique startDate Median,Unique startDate Max,Unique startDate Std,Average update_dur_max_dur error,Average update_dur_timestamp_dur error,Average timestamp_dur_max_dur error,start intersects update or start > update freq,is_update_dur_div_15_min freq,is_timestamp_upt_00_min freq
0,Walton EMC,4.860041,14690,3.426004,1.00354,1.016201,1.002178,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.002178,1.0,3,0.050816,0 days 00:32:28.019060585,0 days 00:06:51.464185981,0 days 00:34:51.336847249,0.902723,0.87209,0.225459
1,Tri-State EMC,25.818353,1591,13.199874,1.069767,1.179133,1.011314,1.003143,1.0,2,0.055989,1.003143,1.0,2,0.055989,1.011314,1.0,3,0.111582,0 days 01:28:30.999371464,0 days 00:00:03.621309370,0 days 01:29:55.084724005,0.85858,0.964173,0.715902
2,Oconee EMC,7.532451,7935,6.288847,1.001512,1.012728,1.0,1.000126,1.0,2,0.011226,1.000126,1.0,2,0.011226,1.0,1.0,1,0.0,0 days 00:30:19.720226843,0 days 00:00:26.727929584,0 days 00:30:56.415528113,0.889225,0.798362,0.521613
3,Mitchell EMC,16.858932,7344,4.888617,1.014434,1.097086,1.039352,1.10689,1.0,785,9.148502,1.107026,1.0,786,9.160171,1.039352,1.0,264,3.069569,0 days 01:25:27.312091503,0 days 00:03:44.620383226,0 days 01:24:13.790058317,0.954929,0.837554,0.689134
4,"Wise Electric Coop, Inc.",4.817185,2188,2.347349,1.002285,1.053016,1.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.0,1.0,1,0.0,0 days 00:10:17.221206581,0 days 00:00:03.721206581,0 days 00:10:15.956124314,0.0,0.987203,0.901737
5,"Houston County Electric Coop, Inc.",8.483735,1660,5.476506,1.036747,1.056024,1.001205,1.03012,1.0,6,0.271728,1.029518,1.0,6,0.270683,1.001205,1.0,2,0.0347,0 days 01:30:09.831325301,0 days 00:02:09.906024096,0 days 01:29:49.469879518,0.0,0.796386,0.753012
6,Cherokee County Electric Coop Association,10.775523,4780,8.092259,1.000418,1.116527,1.000628,1.0,1.0,1,0.0,1.0,1.0,1,0.0,1.000628,1.0,2,0.025047,0 days 01:44:48.288702928,0 days 00:00:25.038075313,0 days 01:44:54.156485355,0.000628,0.999372,0.300837


Start date and End date?
From the summary table above, we notice that for the 4 GA providers the large majority of outages (about 86–95%) have a start_date range that intersects with, or comes after, their updateTime range (see the `start intersects update or start > update freq` column), which makes start_date a dubious choice for the start of an outage.

How about the duration?
- Most of the update time differences are in 15 min intervals meaning it is not really precise
- Some providers have their latest timestamp strictly some hours (ex: 6:00:00) after the latest updatetime (a non trivial %) meaning the timestamp may be dependent on the updatetime (maybe timestamp is some timezone diff away from updatetime)

Comparing max duration vs diff in update vs diff in timestamp:
- The max duration and the difference in update times disagree by non-trivial amounts (several hours to over a day long), which means the span of update times cannot cover the entire outage. Example 1: the first updateTime already has a duration associated with it, so the outage was ongoing at that point. Example 2: for outages with only 1 row the difference in updateTime is 0, even though that single updateTime must be either the start or the end of the outage.
- The max duration vs the diff in timestamp yields non-trivial differences as well.
- Curiously, the diff in update and the diff in timestamp show the smallest difference between them, which suggests that updateTime and timestamp are more closely related to each other than max duration is to either.
- With the clues that update time and timestamp may not be precise and independent, duration seems like a more promising candidate.

Therefore, by process of elimination (ruling out the updateTime diff, the timestamp diff, and the start time diff), we will use the duration column for the duration calculation.

- Thus, we will use the last updatetime as the ending of the outage and subtract the max_duration from it to get the start time.

# Running the Pipeline

In [168]:
# Build a GA11TX12 pipeline for each provider in the configuration, run its
# standardize() step, and display the resulting _data.
for provider in config['providers']:
    pipeline = GA11TX12(provider, base_file_path)
    pipeline.standardize()
    display(pipeline._data)

{'name': 'Walton EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Walton EMC.csv


  self._data = pd.read_csv(file_path)
  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C1000672,2023-11-18 17:09:00,2023-11-21 15:40:00,33.71,-84.01,30012,Rockdale,13247.0,Walton EMC,Georgia,2 days 22:31:00,2 days 22:46:00,2 days 22:38:30,1.0,2 days 22:31:00
1,C1000694,2023-11-20 08:17:00,2023-11-20 10:25:00,33.78,-83.86,30052,Walton,13297.0,Walton EMC,Georgia,0 days 02:08:00,0 days 02:23:00,0 days 02:15:30,1.0,0 days 02:08:00
2,C1000697,2023-11-20 08:42:00,2023-11-20 09:10:00,33.74,-83.70,30655,Walton,13297.0,Walton EMC,Georgia,0 days 00:28:00,0 days 00:43:00,0 days 00:35:30,1.0,0 days 00:28:00
3,C1000701,2023-11-20 09:20:00,2023-11-20 09:25:00,33.91,-83.41,30606,Clarke,13059.0,Walton EMC,Georgia,0 days 00:05:00,0 days 00:20:00,0 days 00:12:30,1.0,0 days 00:05:00
4,C1000702,2023-11-20 10:06:00,2023-11-20 12:10:00,33.75,-83.99,30012,Rockdale,13247.0,Walton EMC,Georgia,0 days 02:04:00,0 days 02:19:00,0 days 02:11:30,1.0,0 days 02:04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14685,G995287,2023-09-08 20:49:00,2023-09-08 23:24:00,0.00,-85.49,unknown,,,Walton EMC,,0 days 02:35:00,0 days 02:50:00,0 days 02:42:30,83.9,9 days 00:44:30
14686,G996351,2023-09-12 15:20:00,2023-09-12 16:54:00,0.00,-85.49,unknown,,,Walton EMC,,0 days 01:34:00,0 days 01:49:00,0 days 01:41:30,3.0,0 days 04:42:00
14687,G996585,2023-09-15 04:24:00,2023-09-15 05:09:00,0.00,-85.49,unknown,,,Walton EMC,,0 days 00:45:00,0 days 01:00:00,0 days 00:52:30,2.0,0 days 01:30:00
14688,G997542,2023-09-28 20:01:00,2023-09-28 21:09:00,0.00,-85.49,unknown,,,Walton EMC,,0 days 01:08:00,0 days 01:23:00,0 days 01:15:30,12.0,0 days 13:36:00


{'name': 'Tri-State EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Tri-State EMC.csv


  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,D760313,2023-03-16 20:33:00,2023-03-16 21:09:00,35.12,-84.33,37391,Polk,47139.0,Tri-State EMC,Tennessee,0 days 00:36:00,0 days 00:51:00,0 days 00:43:30,3.0,0 days 01:48:00
1,D760353,2023-03-19 08:16:00,2023-03-19 16:09:00,34.93,-84.32,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 07:53:00,0 days 08:08:00,0 days 08:00:30,780.0,256 days 05:00:00
2,D760483,2023-03-20 00:05:00,2023-03-20 01:24:00,34.92,-84.28,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 01:19:00,0 days 01:34:00,0 days 01:26:30,823.0,45 days 03:37:00
3,D760513,2023-03-20 06:37:00,2023-03-20 08:24:00,34.99,-84.27,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 01:47:00,0 days 02:02:00,0 days 01:54:30,1.0,0 days 01:47:00
4,D760514,2023-03-20 10:50:00,2023-03-20 11:39:00,34.84,-84.30,30513,Fannin,13111.0,Tri-State EMC,Georgia,0 days 00:49:00,0 days 01:04:00,0 days 00:56:30,1.0,0 days 00:49:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586,D774504,2024-01-17 22:23:00,2024-01-18 00:39:00,34.90,-84.27,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 02:16:00,0 days 02:31:00,0 days 02:23:30,2.0,0 days 04:32:00
1587,D774507,2024-01-18 03:48:00,2024-01-18 06:24:00,35.09,-84.30,unknown,,,Tri-State EMC,,0 days 02:36:00,0 days 02:51:00,0 days 02:43:30,2.0,0 days 05:12:00
1588,D774509,2024-01-18 03:48:00,2024-01-18 06:24:00,35.09,-84.30,28906,Cherokee,37039.0,Tri-State EMC,North Carolina,0 days 02:36:00,0 days 02:51:00,0 days 02:43:30,7.0,0 days 18:12:00
1589,D774512,2024-01-18 08:07:00,2024-01-18 08:39:00,34.94,-84.28,30559,Fannin,13111.0,Tri-State EMC,Georgia,0 days 00:32:00,0 days 00:47:00,0 days 00:39:30,3.0,0 days 01:36:00


{'name': 'Oconee EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Oconee EMC.csv


  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C434216,2023-03-15 15:32:00,2023-03-15 16:25:00,32.70,-83.50,31295,,,Oconee EMC,,0 days 00:53:00,0 days 01:08:00,0 days 01:00:30,1.0,0 days 00:53:00
1,C434219,2023-03-16 18:43:00,2023-03-16 19:40:00,32.46,-83.18,unknown,,,Oconee EMC,,0 days 00:57:00,0 days 01:12:00,0 days 01:04:30,1.0,0 days 00:57:00
2,C434221,2023-03-17 14:03:00,2023-03-17 14:40:00,32.62,-83.35,31044,Twiggs,13289.0,Oconee EMC,Georgia,0 days 00:37:00,0 days 00:52:00,0 days 00:44:30,1.0,0 days 00:37:00
3,C434222,2023-03-17 14:16:00,2023-03-17 14:40:00,32.62,-83.35,31044,Twiggs,13289.0,Oconee EMC,Georgia,0 days 00:24:00,0 days 00:39:00,0 days 00:31:30,1.0,0 days 00:24:00
4,C434224,2023-03-17 18:16:00,2023-03-17 18:55:00,32.56,-82.95,31021,Laurens,13175.0,Oconee EMC,Georgia,0 days 00:39:00,0 days 00:54:00,0 days 00:46:30,1.0,0 days 00:39:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7930,D444934,2024-01-10 12:08:00,2024-01-10 13:40:00,32.61,-82.97,31021,Laurens,13175.0,Oconee EMC,Georgia,0 days 01:32:00,0 days 01:47:00,0 days 01:39:30,1.0,0 days 01:32:00
7931,D444941,2024-01-10 15:59:00,2024-01-10 18:25:00,32.65,-82.97,unknown,,,Oconee EMC,,0 days 02:26:00,0 days 02:41:00,0 days 02:33:30,1.0,0 days 02:26:00
7932,D444972,2024-01-13 10:55:00,2024-01-13 13:10:00,32.90,-83.25,31054,Wilkinson,13319.0,Oconee EMC,Georgia,0 days 02:15:00,0 days 02:30:00,0 days 02:22:30,4.0,0 days 09:00:00
7933,D444980,2024-01-16 09:11:00,2024-01-16 10:40:00,32.90,-83.25,31054,Wilkinson,13319.0,Oconee EMC,Georgia,0 days 01:29:00,0 days 01:44:00,0 days 01:36:30,5.0,0 days 07:25:00


{'name': 'Mitchell EMC', 'state': 'ga', 'layout': 11, 'type': 'o'}
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/ga/layout_11/per_outage_Mitchell EMC.csv


  self._data = pd.read_csv(file_path)
  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C483513,2023-03-15 13:25:00,2023-03-15 15:25:00,31.21,-84.37,31730,Mitchell,13205.0,Mitchell EMC,Georgia,0 days 02:00:00,0 days 02:15:00,0 days 02:07:30,1.00,0 days 02:00:00
1,C483521,2023-03-16 06:23:00,2023-03-16 06:25:00,31.18,-84.10,31779,Mitchell,13205.0,Mitchell EMC,Georgia,0 days 00:02:00,0 days 00:17:00,0 days 00:09:30,1.00,0 days 00:02:00
2,C483523,2023-03-16 12:28:00,2023-03-16 14:10:00,31.25,-84.43,unknown,,,Mitchell EMC,,0 days 01:42:00,0 days 01:57:00,0 days 01:49:30,1.00,0 days 01:42:00
3,C483525,2023-03-17 05:06:00,2023-03-17 06:55:00,31.40,-83.72,31795,Worth,13321.0,Mitchell EMC,Georgia,0 days 01:49:00,0 days 02:04:00,0 days 01:56:30,1.00,0 days 01:49:00
4,C483528,2023-03-17 07:35:00,2023-03-17 07:40:00,31.24,-84.26,31730,Mitchell,13205.0,Mitchell EMC,Georgia,0 days 00:05:00,0 days 00:20:00,0 days 00:12:30,1.00,0 days 00:05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7339,D501048,2024-01-18 08:37:00,2024-01-18 09:25:00,31.39,-84.67,39813,Calhoun,13037.0,Mitchell EMC,Georgia,0 days 00:48:00,0 days 01:03:00,0 days 00:55:30,1.00,0 days 00:48:00
7340,G437006,2024-01-10 09:58:00,2024-01-18 09:25:00,31.43,-84.16,unknown,,,Mitchell EMC,,7 days 23:27:00,7 days 23:42:00,7 days 23:34:30,1.21,9 days 16:17:23.312267523
7341,G484576,2023-04-14 09:25:00,2023-04-14 09:55:00,31.55,-84.31,31721,Dougherty,13095.0,Mitchell EMC,Georgia,0 days 00:30:00,0 days 00:45:00,0 days 00:37:30,45.00,0 days 22:30:00
7342,G495149,2023-08-30 08:40:00,2023-08-30 09:40:00,31.55,-83.96,31791,Worth,13321.0,Mitchell EMC,Georgia,0 days 01:00:00,0 days 01:15:00,0 days 01:07:30,4.00,0 days 04:00:00


{'name': 'Wise Electric Coop, Inc.', 'state': 'tx', 'layout': 12, 'type': 'o'}
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Wise Electric Coop, Inc..csv


  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C399381,2023-04-15 13:08:00,2023-04-15 14:17:00,33.04,-97.75,76083,,,"Wise Electric Coop, Inc.",,0 days 01:09:00,0 days 01:24:00,0 days 01:16:30,1.0,0 days 01:09:00
1,C399382,2023-04-16 07:36:00,2023-04-16 09:17:00,33.06,-97.78,76073,Wise,48497.0,"Wise Electric Coop, Inc.",Texas,0 days 01:41:00,0 days 01:56:00,0 days 01:48:30,1.0,0 days 01:41:00
2,C399386,2023-04-16 16:30:00,2023-04-16 17:47:00,33.17,-97.44,76234,Wise,48497.0,"Wise Electric Coop, Inc.",Texas,0 days 01:17:00,0 days 01:32:00,0 days 01:24:30,1.0,0 days 01:17:00
3,C399401,2023-04-17 16:09:00,2023-04-17 18:02:00,33.28,-97.56,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:53:00,0 days 02:08:00,0 days 02:00:30,1.0,0 days 01:53:00
4,C399402,2023-04-18 08:56:00,2023-04-18 09:32:00,33.34,-97.63,unknown,,,"Wise Electric Coop, Inc.",,0 days 00:36:00,0 days 00:51:00,0 days 00:43:30,1.0,0 days 00:36:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2183,G406396,2023-12-30 07:24:00,2023-12-30 08:47:00,0.00,-103.49,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:23:00,0 days 01:38:00,0 days 01:30:30,14.0,0 days 19:22:00
2184,G406415,2023-12-30 12:42:00,2023-12-30 13:47:00,0.00,-103.49,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:05:00,0 days 01:20:00,0 days 01:12:30,12.0,0 days 13:00:00
2185,G406964,2024-01-12 01:32:00,2024-01-12 02:32:00,0.00,-103.49,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:00:00,0 days 01:15:00,0 days 01:07:30,6.0,0 days 06:00:00
2186,G406967,2024-01-12 04:24:00,2024-01-12 05:48:00,0.00,-103.49,unknown,,,"Wise Electric Coop, Inc.",,0 days 01:24:00,0 days 01:39:00,0 days 01:31:30,2.0,0 days 02:48:00


{'name': 'Houston County Electric Coop, Inc.', 'state': 'tx', 'layout': 12, 'type': 'o'}
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Houston County Electric Coop, Inc..csv


  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,D712398,2023-04-14 07:22:00,2023-04-14 07:47:00,31.28,-95.39,75835,Houston,48225.0,"Houston County Electric Coop, Inc.",Texas,0 days 00:25:00,0 days 00:40:00,0 days 00:32:30,2.00,0 days 00:50:00
1,D712405,2023-04-15 06:14:00,2023-04-15 07:17:00,30.93,-95.67,75852,Madison,48313.0,"Houston County Electric Coop, Inc.",Texas,0 days 01:03:00,0 days 01:18:00,0 days 01:10:30,31.00,1 days 08:33:00
2,D712420,2023-04-15 09:59:00,2023-04-15 11:17:00,31.72,-95.56,75801,Anderson,48001.0,"Houston County Electric Coop, Inc.",Texas,0 days 01:18:00,0 days 01:33:00,0 days 01:25:30,44.00,2 days 09:12:00
3,D712431,2023-04-15 12:19:00,2023-04-15 13:02:00,31.13,-95.74,75835,Houston,48225.0,"Houston County Electric Coop, Inc.",Texas,0 days 00:43:00,0 days 00:58:00,0 days 00:50:30,8.00,0 days 05:44:00
4,D712434,2023-04-15 17:45:00,2023-04-15 19:17:00,31.53,-95.89,unknown,,,"Houston County Electric Coop, Inc.",,0 days 01:32:00,0 days 01:47:00,0 days 01:39:30,7.00,0 days 10:44:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,G740838,2024-01-12 06:55:00,2024-01-12 07:32:00,31.22,-95.39,unknown,,,"Houston County Electric Coop, Inc.",,0 days 00:37:00,0 days 00:52:00,0 days 00:44:30,48.00,1 days 05:36:00
1656,G741154,2024-01-12 07:49:00,2024-01-12 08:59:00,31.26,-94.89,unknown,,,"Houston County Electric Coop, Inc.",,0 days 01:10:00,0 days 01:25:00,0 days 01:17:30,77.00,3 days 17:50:00
1657,G741203,2024-01-12 08:09:00,2024-01-12 09:56:00,31.21,-95.04,unknown,,,"Houston County Electric Coop, Inc.",,0 days 01:47:00,0 days 02:02:00,0 days 01:54:30,8.00,0 days 14:16:00
1658,G741890,2024-01-15 05:41:00,2024-01-15 09:47:00,31.02,-95.43,unknown,,,"Houston County Electric Coop, Inc.",,0 days 04:06:00,0 days 04:21:00,0 days 04:13:30,68.14,11 days 15:23:08.571428571


{'name': 'Cherokee County Electric Coop Association', 'state': 'tx', 'layout': 12, 'type': 'o'}
/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/raw_data/tx/layout_12/per_outage_Cherokee County Electric Coop Association.csv


  self._data = pd.read_csv(file_path)
  grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)


Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,utility_provider,state,duration_diff,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,C480035,2023-09-24 22:28:00,2023-09-25 17:31:00,32.16,-95.45,75762,Smith,48423.0,Cherokee County Electric Coop Association,Texas,0 days 19:03:00,0 days 19:18:00,0 days 19:10:30,1.0,0 days 19:03:00
1,C480066,2023-09-24 22:43:00,2023-09-25 17:31:00,32.15,-95.45,75762,Smith,48423.0,Cherokee County Electric Coop Association,Texas,0 days 18:48:00,0 days 19:03:00,0 days 18:55:30,1.0,0 days 18:48:00
2,C480094,2023-09-24 22:45:00,2023-09-25 12:16:00,32.15,-95.39,75757,Cherokee,48073.0,Cherokee County Electric Coop Association,Texas,0 days 13:31:00,0 days 13:46:00,0 days 13:38:30,1.0,0 days 13:31:00
3,C480111,2023-09-24 22:46:00,2023-09-25 17:31:00,32.16,-95.45,75762,Smith,48423.0,Cherokee County Electric Coop Association,Texas,0 days 18:45:00,0 days 19:00:00,0 days 18:52:30,1.0,0 days 18:45:00
4,C480121,2023-09-24 22:47:00,2023-09-26 05:46:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 06:59:00,1 days 07:14:00,1 days 07:06:30,1.0,1 days 06:59:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4775,G483387,2023-09-24 22:55:00,2023-09-26 06:01:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 07:06:00,1 days 07:21:00,1 days 07:13:30,14.0,18 days 03:24:00
4776,G483389,2023-09-24 23:05:00,2023-09-26 06:01:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 06:56:00,1 days 07:11:00,1 days 07:03:30,18.0,23 days 04:48:00
4777,G483390,2023-09-24 22:47:00,2023-09-26 06:01:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 07:14:00,1 days 07:29:00,1 days 07:21:30,9.0,11 days 17:06:00
4778,G483391,2023-09-24 23:04:00,2023-09-26 06:01:00,0.00,-97.49,unknown,,,Cherokee County Electric Coop Association,,1 days 06:57:00,1 days 07:12:00,1 days 07:04:30,12.0,15 days 11:24:00
