In [1]:
import pandas as pd
import numpy as np
import ast
import pytz
import os
import json
import yaml
from arcgis.geocoding import reverse_geocode
from arcgis.geometry import Geometry
from arcgis.gis import GIS
from dateutil import tz
from datetime import datetime
from datetime import timedelta
from IPython.display import display

pd.set_option('display.max_columns', None)

class BasePipeline_old:
    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        self.geomap = {}
        self._data = pd.DataFrame({})
    
    def construct_file_path(self):
        #TODO: add type to prefix 
        file_prefix = 'per_outage' if self.config['type'] == 'o' else 'per_county'
        file_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/{file_prefix}_{self.config['name']}.csv"
        return file_path.replace('//', '/')

    def load_data(self):
        # TODO: use us zipcode database
        try:
            file_path = self.construct_file_path()
            self._data = pd.read_csv(file_path)
            with open('zip_to_county_name.json', 'r') as json_file:
                self.geomap['zip_to_county_name'] = json.load(json_file)
            with open('zip_to_county_fips.json', 'r') as json_file:
                self.geomap['zip_to_county_fips'] = json.load(json_file)
            with open('zip_to_state_name.json', 'r') as json_file:
                self.geomap['zip_to_state_name'] = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")
            
    def transform(self):
        raise NotImplementedError

    def standardize(self):
        # Base transformation method
        self.load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')
        
        self._data['state'] = self.config['state']
        if self.config['state'] != 'ca':
            self._data['utility_provider'] = self.config['name'] 
            self._data['county'] = self._data['zipcode'].map(self.geomap) 
        
        self._data = self._data[[
            'utility_provider', 'state', 'county', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng', 
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean', 'total_customer_outage_time', 'total_customer_outage_time_max', 'total_customer_outage_time_mean'
        ]]
        
        return self._data
    
    def output_data(self, standard_data):
        # TODO: Output unified data
        pass
    
    def get_dataframe(self):
        return self._data
    
    def _compute_metrics(self, group):
        """
        Generic method to compute standardized metrics, used for being apply in DataFrame.groupby method, 
        given dataframe being transformed with standardized column names
        """
        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        duration_max = duration + 15
        duration_mean = (duration + duration_max) / 2
        customer_affected_mean = group['customer_affected'].mean()
        
        total_customer_outage_time = 15 * (group['customer_affected'].sum() - group['customer_affected'].iloc[0]) + (group['timestamp'].iloc[0] - group['start_time'].iloc[0]).total_seconds() / 60 * group['customer_affected'].iloc[0]
        total_customer_outage_time_max = total_customer_outage_time + 15 * group['customer_affected'].iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })
        
    def check_vars(self):
        # TODO: Check other useful variables
        pass

class GA5_old(BasePipeline_old):
    def transform(self):
        eastern = tz.gettz('US/Eastern')
        utc = tz.gettz('UTC')

        def reformat_time(time):
            # format: 2024-01-18 09:04:15 but can also be in milliseconds since the Unix epoch (January 1, 1970, 00:00:00 UTC)
            eastern = tz.gettz('US/Eastern')
            utc = tz.gettz('UTC')
            if isinstance(time, str) and time.isdigit():
                # Convert millisecond timestamp to datetime
                return pd.to_datetime(int(time), unit='ms', utc=True).tz_convert(eastern)
                
            # else the time is string format of datetime
            elif isinstance(time, str) and ":" in time:
                return pd.to_datetime(time, utc=True).tz_convert(eastern)
            
            elif isinstance(time, datetime): # is datetime object already
                return time
            
            else: # is null or extraneous values that should be null (like '-1000')
                return pd.NaT


        try:
            """
            Transforming the dataframe
            - Some of the time columns has millisecond format, error codes, and NaN so we need to separately reformat before pd.to_datetime
            - etrTime has times before 2023-01-01, so we will set them to NaT
            - "county" values look like city names so renaming accordingly
            """
            # Masks for extracting rows with millisecond format, errors, or NA
            minimum_datetime = pd.to_datetime('2023-01-01 23:59:59-05:00', utc=True).tz_convert('US/Eastern')
            pre_time = lambda x: pd.to_datetime(x, utc=True).tz_convert('US/Eastern') < minimum_datetime
            
            start_time_ms = self._data['startTime'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA)
            lastUptTime_ms = self._data['lastUpdatedTime'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA)
            etrTime_ms_null = self._data['etrTime'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA) # or pd.NA
            timeSt_null = self._data['timestamp'].isna()
            extraneous_mask = start_time_ms | lastUptTime_ms | etrTime_ms_null | timeSt_null

            extraneous_rows = self._data[extraneous_mask]

            extraneous_rows['startTime'] = extraneous_rows['startTime'].apply(reformat_time)
            extraneous_rows['lastUpdatedTime'] = extraneous_rows['lastUpdatedTime'].apply(reformat_time)
            extraneous_rows['etrTime'] = extraneous_rows['etrTime'].apply(reformat_time)
            extraneous_rows['timestamp'] = extraneous_rows['timestamp'].apply(reformat_time)

            eastern = tz.gettz('US/Eastern')
            utc = tz.gettz('UTC')

            self._data.loc[extraneous_rows.index, 'startTime'] = extraneous_rows['startTime']
            self._data.loc[extraneous_rows.index, 'lastUpdatedTime'] = extraneous_rows['lastUpdatedTime']
            self._data.loc[extraneous_rows.index, 'etrTime'] = extraneous_rows['etrTime']
            self._data.loc[extraneous_rows.index, 'timestamp'] = extraneous_rows['timestamp']
        
            self._data['startTime'] = pd.to_datetime(self._data['startTime'], utc=True).dt.tz_convert(eastern)
            self._data['lastUpdatedTime'] = pd.to_datetime(self._data['lastUpdatedTime'], utc=True).dt.tz_convert(eastern)
            self._data['etrTime'] =pd.to_datetime(self._data['etrTime'], utc=True).dt.tz_convert(eastern)
            self._data['timestamp'] = pd.to_datetime(self._data['timestamp'], utc=True).dt.tz_convert(eastern)

            # Replace times where computed datetime < 01-01-2023
            time_col = ['startTime', 'lastUpdatedTime', 'etrTime', 'timestamp']
            for col in time_col:
                self._data.loc[self._data[col] < minimum_datetime, col] = pd.NaT

            self._data.rename(columns={
                'id':'outage_id',
                'startTime': 'start_time',
                'numPeople':'customer_affected',
                'EMC': 'utility_provider',
                'zip_code': 'zipcode',
                'latitude': 'lat',
                'longitude': 'long',
                'county': 'city'
            }, inplace=True)
        except Exception as e:
            print(f"An error occurred during transformation: {e}")
        

    def standardize(self):
        self.load_data()
        self.transform()
        grouped = self._data.groupby(['outage_id', 'start_time']).apply(self._compute_metrics).reset_index().round(2)
        self._data = grouped

    # gis = GIS("http://www.arcgis.com", "JK9035", "60129@GR0W3R5") # signing in to get access to arcGIS api
    def _compute_metrics(self, group):
        """
        Generic method to compute standardized metrics, used for being apply in DataFrame.groupby method, 
        given dataframe being transformed with standardized column names
        """
        def get_zipcode(long, lat):
            location = reverse_geocode((Geometry({"x":float(long), "y":float(lat), "spatialReference":{"wkid": 4326}})))
            return location['address']['Postal']

        start_time = group['start_time'].min()
        duration_diff = group['etrTime'].max() - start_time if group['etrTime'].notna().all() else group['timestamp'].max() - group['timestamp'].min()
        end_time = group['etrTime'].max() if group['etrTime'].notna().all() else start_time + duration_diff
        
        lat = group['lat'].iloc[-1]
        long = group['long'].iloc[-1]

        zipcode_map = self.geomap['zip_to_county_name']
        zipcode = group['zipcode'].iloc[-1] if group['zipcode'].iloc[-1] != 'unknown' else '00000' # get_zipcode(long, lat)
        county_name = self.geomap['zip_to_county_name'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        county_fips = self.geomap['zip_to_county_fips'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        state = self.geomap['zip_to_state_name'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        
        utility_provider = group['utility_provider'].iloc[-1]
        duration_max = duration_diff + timedelta(minutes=15)
        duration_mean = (duration_diff + duration_max) / 2
        customer_affected_mean = group['customer_affected'].mean()
        total_customer_outage_time = customer_affected_mean * duration_diff

        return pd.Series({
            # 'start_time': start_time,
            'end_time': end_time,
            'lat': lat,
            'long': long,
            'zipcode': zipcode,
            'county_name': county_name,
            'county_fips': county_fips,
            'state': state,
            'utility_provider': utility_provider,
            'duration_max': duration_max,
            'duration_mean': duration_mean,
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time
        })


Establishing config files

In [2]:
local_config_path = '/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/app/pipeline/config.yaml'
with open(local_config_path, 'r') as file:
    config = yaml.safe_load(file)
    base_file_path = config['globals']['local_base_file_path']

# Testcode to feel out the dataset


loading config manually

In [3]:
col_config = {'name': 'Colquitt EMC', 'state': 'ga', 'layout': 5, 'type': 'o'}
col_pipe = GA5_old(col_config, base_file_path)
col_pipe.load_data()
col_df = col_pipe._data

display(col_df)


Unnamed: 0,id,type,startTime,lastUpdatedTime,etrTime,title,numPeople,status,cause,identifier,latitude,longitude,description,county,state,EMC,zip_code,timestamp
0,1387175,OUTAGE,1678646254000,1678741805000,-1000,Outage,1,,Manual,32611,31.17911,-83.60590,Device Operation (suebolin),Moultrie,UAS,Colquitt EMC,31771,
1,1387940,OUTAGE,1678876419000,1678877104000,1678883619000,Outage,1,,TROUBLE_CALL,32783,30.98839,-83.20358,New Prediction,Valdosta,UAS,Colquitt EMC,31605,
2,1388250,OUTAGE,1678900434000,1678900805000,1678907634000,Outage,1,,TROUBLE_CALL,32809,31.42243,-83.60985,New Prediction,Tifton,SNT,Colquitt EMC,31793,
3,1388449,OUTAGE,1678921517000,1678921805000,1678928717000,Outage,1,,TROUBLE_CALL,32822,30.93008,-83.41995,New Prediction,Valdosta,SNT,Colquitt EMC,31632,
4,1388470,OUTAGE,1678930132000,1678930204000,1678937332000,Outage,1,,TROUBLE_CALL,32823,31.32806,-83.58069,New Prediction,Tifton,NEW,Colquitt EMC,31775,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423741,1543589,OUTAGE,2024-01-18 09:04:15,2024-01-18 09:15:04,,Outage,1,,TROUBLE_CALL,70575,31.44082,-83.57342,New Prediction,Tifton,UAS,Colquitt EMC,31793,01-18-2024 15:33:24
423742,1543608,OUTAGE,2024-01-18 13:01:22,2024-01-18 13:10:04,,Outage,1,,TROUBLE_CALL,70582,30.77363,-83.19981,New Prediction,Valdosta,SNT,Colquitt EMC,31606,01-18-2024 15:33:24
423743,1543612,OUTAGE,2024-01-18 13:24:36,2024-01-18 13:35:04,,Outage,1,,TROUBLE_CALL,70584,31.00051,-83.22879,New Prediction,Valdosta,SNT,Colquitt EMC,31645,01-18-2024 15:33:24
423744,1543632,OUTAGE,2024-01-18 14:39:54,2024-01-18 14:50:04,,Outage,1,,TROUBLE_CALL,70589,31.52155,-83.63948,New Prediction,Tifton,SNT,Colquitt EMC,unknown,01-18-2024 15:33:24


### Checking columns, num unique, and null values

In [4]:
df = col_df.copy()
df.info()

non_null_count = df.count()
null_count = len(df) - non_null_count
non_null_prop = non_null_count / len(df)
unique_count = df.nunique()

# Create DataFrame with non-null proportions and null counts
result_df = pd.DataFrame({
    'non_null_proportion': non_null_prop,
    'null_count': null_count,
    'num_unique': unique_count
})

display(result_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423746 entries, 0 to 423745
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               423746 non-null  int64  
 1   type             423746 non-null  object 
 2   startTime        423746 non-null  object 
 3   lastUpdatedTime  423746 non-null  object 
 4   etrTime          404131 non-null  object 
 5   title            423746 non-null  object 
 6   numPeople        423746 non-null  int64  
 7   status           0 non-null       float64
 8   cause            402693 non-null  object 
 9   identifier       423746 non-null  int64  
 10  latitude         423746 non-null  float64
 11  longitude        423746 non-null  float64
 12  description      423746 non-null  object 
 13  county           407846 non-null  object 
 14  state            423746 non-null  object 
 15  EMC              423746 non-null  object 
 16  zip_code         423746 non-null  obje

Unnamed: 0,non_null_proportion,null_count,num_unique
id,1.0,0,17138
type,1.0,0,1
startTime,1.0,0,12765
lastUpdatedTime,1.0,0,12622
etrTime,0.95371,19615,12650
title,1.0,0,1
numPeople,1.0,0,667
status,0.0,423746,0
cause,0.950317,21053,3
identifier,1.0,0,17138


In [5]:
print(f"Unique 'county' names: {df['county'].unique()}")
print(f"Unique 'state' values: {df['state'].unique()}")
print(f"Unique 'zip_code' vlaues: {df['zip_code'].unique()}")

Unique 'county' names: ['Moultrie' 'Valdosta' 'Tifton' nan]
Unique 'state' values: ['UAS' 'SNT' 'NEW' 'S-SNT' 'INC' 'S-RST' 'RST' 'M-UAS' 'S-UAS' 'S-INC'
 'S-NEW' 'M-NEW' 'M-SNT' 'ENR' 'ONS']
Unique 'zip_code' vlaues: ['31771' '31605' '31793' '31632' '31775' '31636' '31768' '31788' '31638'
 'unknown' '31601' '31639' '31794' '31795' '31744' '31637' '31602' '31778'
 '31641' '31603' '31789' '31620' '31643' '31625' '31606' '31756' '31749'
 '31727' '31622' '31647' '31720' '31722' '31791' '31626' '31627' '31645'
 '31629' '31747' '31784' '31698' '31699' '31765' '31775:31789:31791'
 '31773' '31753' '31776' '31733' '31604' '31744:31791' '31738'
 '31756:31757' '31635' '31739' '31625:31632' '31705']


Note:
- 'id' and 'identifier have the same # of unique values so we will use 'id' as a single outage instance
- "county" is actually city names?
- "state" is not geography state but state of the outage

Columns with null values:
- etrTime (assumably "estimatedTimeRestored Time"?)
- status
- cause
- county (but is city names so not really relevant)
- timestamp

Columns to be used for final cleaned dataframe:
- 'id'
- 'startTime' (check if rigorous)
- 'lastUpdatedTime' (check if rigorous)
- 'etrTime' (check if rigorous)
- 'numPeople'
- 'latitude' (both lat and long will be used to calculate the zipcodes that are missing)
- 'longitude'
- 'county' (missing values will be computed by reverse_geocode from the coordinates)
- 'state' (maybe to give more context to outages)
- 'EMC'
- 'zip_code' 
- 'timestamp' (check if rigorous)

# Transforming:

## Finding Errors
We should check 'etrTime' and 'timestamp' to see if there are certain patterns in the null values

In [6]:
display(df[df['etrTime'].isna()][['startTime', 'lastUpdatedTime', 'etrTime', 'timestamp']])

Unnamed: 0,startTime,lastUpdatedTime,etrTime,timestamp
404131,2024-01-06 16:02:54,2024-01-06 16:10:06,,01-06-2024 16:18:24
404132,2024-01-06 16:02:44,2024-01-06 16:10:06,,01-06-2024 16:18:24
404133,2024-01-06 16:03:14,2024-01-06 16:10:06,,01-06-2024 16:18:24
404134,2024-01-06 16:02:44,2024-01-06 16:10:06,,01-06-2024 16:18:24
404135,2024-01-06 16:02:54,2024-01-06 16:10:06,,01-06-2024 16:18:24
...,...,...,...,...
423741,2024-01-18 09:04:15,2024-01-18 09:15:04,,01-18-2024 15:33:24
423742,2024-01-18 13:01:22,2024-01-18 13:10:04,,01-18-2024 15:33:24
423743,2024-01-18 13:24:36,2024-01-18 13:35:04,,01-18-2024 15:33:24
423744,2024-01-18 14:39:54,2024-01-18 14:50:04,,01-18-2024 15:33:24


All null etrTime are within 01-2024 but there are over 19000 rows with null etrTimes.
We will check to see whether etrTime or timestamp are better metrics to use by comparing the durations between them.

In [7]:
display(df[df['timestamp'].isna()][['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])

Unnamed: 0,startTime,lastUpdatedTime,etrTime,timestamp
0,1678646254000,1678741805000,-1000,
1,1678876419000,1678877104000,1678883619000,
2,1678900434000,1678900805000,1678907634000,
3,1678921517000,1678921805000,1678928717000,
4,1678930132000,1678930204000,1678937332000,
...,...,...,...,...
730,1680119060000,1680119104000,1680126260000,
731,1680120767000,1680120904000,1680127967000,
732,1680120988000,1680121804000,1680128188000,
733,1680121702000,1680121804000,1680128902000,


The first 735 rows have null timestamps and when converting for those rows with null timestamps the # of milliseconds since the Unix epoch (January 1, 1970, 00:00:00 UTC) to time, we see that these are all outages within 03/2023 (like ga_11_tx_12)
- 1678646254000 = 2023-03-11 11:57:34
- 1680122104000 = 2023-03-30 14:48:22

Since there are only 735 null timestamps, we will seriously consider using timestamps for the duration

In [8]:
display(df[df['timestamp'].isna() & df['etrTime'].isna()][['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])

Unnamed: 0,startTime,lastUpdatedTime,etrTime,timestamp


No rows in the df where both 'timestamp' and 'etrTime' is null. 
Thus, we will see whether etrTime is a better endtime to use than timestamp or if it will just be the alternative endtime whenever we have no timestamp to use. 


### Checking for which rows have millisecond time formats

In [9]:
# Finding which rows have ms time formats (including 1678741805000 and error numbers like "-1000")
time_col = ['startTime', 'lastUpdatedTime','etrTime', 'timestamp']
for col in time_col:
    print(f"Rows in df where '{col}' in millisecond format")
    mill = df[df[col].apply(lambda x: isinstance(x, str) and (x.isdigit() or ":" not in x))].reset_index(drop=False)
    display(mill[['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])

mill_st = df['startTime'].apply(lambda x: isinstance(x, str) and (x.isdigit() or ":" not in x))
# display(df[mill_st][['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])

mill_lut = df['lastUpdatedTime'].apply(lambda x: isinstance(x, str) and (x.isdigit() or ":" not in x))
# display(df[mill_lut][['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])

mill_etr = df['etrTime'].apply(lambda x: isinstance(x, str) and (x.isdigit() or ":" not in x))
# display(df[mill_etr][['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])
null_etr = df['etrTime'].isna()

mill_ts = df['timestamp'].apply(lambda x: isinstance(x, str) and (x.isdigit() or ":" not in x))
# display(df[mill_ts][['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])
null_ts = df['timestamp'].isna()

# st_not_in_etr = mill_st[~mill_st.isin(mill_etr)]
# print("Rows where 'startTime' in ms that are not in 'etrTime':")
# display(st_not_in_etr[['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])

print("Rows in df with startTime, lastUpdatedTime, and etrTime in ms format/error code and null timestamps ")
mill = df[mill_st & mill_lut & mill_etr & null_ts]
display(mill[['startTime', 'lastUpdatedTime', 'etrTime','timestamp']])

Rows in df where 'startTime' in millisecond format


Unnamed: 0,startTime,lastUpdatedTime,etrTime,timestamp
0,1678646254000,1678741805000,-1000,
1,1678876419000,1678877104000,1678883619000,
2,1678900434000,1678900805000,1678907634000,
3,1678921517000,1678921805000,1678928717000,
4,1678930132000,1678930204000,1678937332000,
...,...,...,...,...
730,1680119060000,1680119104000,1680126260000,
731,1680120767000,1680120904000,1680127967000,
732,1680120988000,1680121804000,1680128188000,
733,1680121702000,1680121804000,1680128902000,


Rows in df where 'lastUpdatedTime' in millisecond format


Unnamed: 0,startTime,lastUpdatedTime,etrTime,timestamp
0,1678646254000,1678741805000,-1000,
1,1678876419000,1678877104000,1678883619000,
2,1678900434000,1678900805000,1678907634000,
3,1678921517000,1678921805000,1678928717000,
4,1678930132000,1678930204000,1678937332000,
...,...,...,...,...
730,1680119060000,1680119104000,1680126260000,
731,1680120767000,1680120904000,1680127967000,
732,1680120988000,1680121804000,1680128188000,
733,1680121702000,1680121804000,1680128902000,


Rows in df where 'etrTime' in millisecond format


Unnamed: 0,startTime,lastUpdatedTime,etrTime,timestamp
0,1678646254000,1678741805000,-1000,
1,1678876419000,1678877104000,1678883619000,
2,1678900434000,1678900805000,1678907634000,
3,1678921517000,1678921805000,1678928717000,
4,1678930132000,1678930204000,1678937332000,
...,...,...,...,...
730,1680119060000,1680119104000,1680126260000,
731,1680120767000,1680120904000,1680127967000,
732,1680120988000,1680121804000,1680128188000,
733,1680121702000,1680121804000,1680128902000,


Rows in df where 'timestamp' in millisecond format


Unnamed: 0,startTime,lastUpdatedTime,etrTime,timestamp


Rows in df with startTime, lastUpdatedTime, and etrTime in ms format/error code and null timestamps 


Unnamed: 0,startTime,lastUpdatedTime,etrTime,timestamp
0,1678646254000,1678741805000,-1000,
1,1678876419000,1678877104000,1678883619000,
2,1678900434000,1678900805000,1678907634000,
3,1678921517000,1678921805000,1678928717000,
4,1678930132000,1678930204000,1678937332000,
...,...,...,...,...
730,1680119060000,1680119104000,1680126260000,
731,1680120767000,1680120904000,1680127967000,
732,1680120988000,1680121804000,1680128188000,
733,1680121702000,1680121804000,1680128902000,


Note how the startTime's, lastUpdatedTime's, and etrTime's with millisecond time formats are all within the first 735 rows.
This also corresponds with the rows of the df where timestamp is null.

Thus, we will manually extract the rows that have millisecond time format, errors, or NA's and reformat them in parallel to pd.to_datetime() every other row to cut down on runtime.

## Checking whether startTime, timestamp, or etrTime is a better metric to use for startTime's / endTime

#### Let's first transform the dataframe so we can compare the different time metrics under datetime format.

In [10]:
start_time_error_ms = df['startTime'].apply(lambda x: (isinstance(x, str) and ":" not in x and int(x) < 0 ))
lastUptTime_error_ms = df['lastUpdatedTime'].apply(lambda x: (isinstance(x, str) and ":" not in x and int(x) < 0))
etrTime_error_ms = df['etrTime'].apply(lambda x: (isinstance(x, str) and ":" not in x and int(x) < 0)) # or pd.NA
timeSt_error_ms = df['timestamp'].apply(lambda x: (isinstance(x, str) and ":" not in x and int(x) < 0))
mask = start_time_error_ms | lastUptTime_error_ms | etrTime_error_ms | timeSt_error_ms

error_1000 = df[mask]
display(error_1000)

Unnamed: 0,id,type,startTime,lastUpdatedTime,etrTime,title,numPeople,status,cause,identifier,latitude,longitude,description,county,state,EMC,zip_code,timestamp
0,1387175,OUTAGE,1678646254000,1678741805000,-1000,Outage,1,,Manual,32611,31.17911,-83.6059,Device Operation (suebolin),Moultrie,UAS,Colquitt EMC,31771,
189,1389689,OUTAGE,1679335607000,1679347504000,-1000,Outage,206,,Manual,33056,31.18155,-83.83748,Device Operation (angelarodney),Moultrie,SNT,Colquitt EMC,31768,
193,1389787,OUTAGE,1679359020000,1679359504000,-1000,Outage,1,,,33084,29.80126,-91.39814,Fuzzy outage,,UAS,Colquitt EMC,unknown,
198,1389795,OUTAGE,1679359800000,1679360104000,-1000,Outage,1,,,33087,29.80126,-91.39814,Fuzzy outage,,NEW,Colquitt EMC,unknown,
201,1389795,OUTAGE,1679359800000,1679361004000,-1000,Outage,1,,,33087,29.80126,-91.39814,Fuzzy outage,,UAS,Colquitt EMC,unknown,
266,1390223,OUTAGE,1679486346000,1679516104000,-1000,Outage,41,,TROUBLE_CALL,33135,30.8912,-83.48722,Revised Prediction,Valdosta,SNT,Colquitt EMC,31643,
269,1390823,OUTAGE,1679530668000,1679531404000,-1000,Outage,722,,Manual,33170,31.06802,-83.35209,Device Operation (nms1),Tifton,M-UAS,Colquitt EMC,31632,
432,1392581,OUTAGE,1679765523000,1679766305000,-1000,Outage,24,,Manual,33420,31.21651,-83.77639,Device Operation (nms1),Moultrie,M-UAS,Colquitt EMC,31768,
643,1393530,OUTAGE,1679957282000,1679962204000,-1000,Outage,27,,TROUBLE_CALL,33736,30.84861,-83.36535,Revised Prediction,Valdosta,SNT,Colquitt EMC,unknown,


In [11]:
## Test block to explore datatype of different values and test conditionals in test_Transform

# print(df['etrTime'])

# if df is transformed, rerun entire file
temp2 = df['etrTime'].iloc[3]
temp1 = df['etrTime'].iloc[0] # "-1000"
temp3 = df['etrTime'].iloc[404126]
print(f"temp1={temp1}")
print(f"{temp1} type: {type(temp1)}")
# print(int(temp))
# print(int(temp) < 0)
# print(type(int(temp)))
nan = np.nan
print(f"Is {temp2} is digit?: {temp2.isdigit()}")
print(f" is {temp1} a string and : not in variable?: {isinstance(temp1, str) and (temp1.isdigit() or ':' not in temp1) or pd.NA}")
print(f" is {temp2} a string and : not in variable?: {isinstance(temp2, str) and (temp2.isdigit() or ':' not in temp2) or pd.NA}")
print(f" is {temp3} a string and contains ':': {isinstance(temp3, str) and ':' in temp3}")
print(f" is {temp3} an instance of datetime?: {isinstance(temp3, datetime)}")
print(f" is NaN a string {isinstance(nan, str)}")
print(f" is {temp3} > '2023-01-01 23:59:59-05:00'?: {temp3 > '2023-01-01 23:59:59-05:00'}")


temp1=-1000
-1000 type: <class 'str'>
Is 1678928717000 is digit?: True
 is -1000 a string and : not in variable?: True
 is 1678928717000 a string and : not in variable?: True
 is 2023-12-16 20:58:17 a string and contains ':': True
 is 2023-12-16 20:58:17 an instance of datetime?: False
 is NaN a string False
 is 2023-12-16 20:58:17 > '2023-01-01 23:59:59-05:00'?: True


In [12]:
def test_Transform(dataframe):
    eastern = tz.gettz('US/Eastern')
    utc = tz.gettz('UTC')

    def reformat_time(time):
        # format: 2024-01-18 09:04:15 but can also be in milliseconds since the Unix epoch (January 1, 1970, 00:00:00 UTC)
        eastern = tz.gettz('US/Eastern')
        utc = tz.gettz('UTC')
        if isinstance(time, str) and time.isdigit():
            # Convert millisecond timestamp to datetime
            return pd.to_datetime(int(time), unit='ms', utc=True).tz_convert(eastern)
            
        # else the time is string format of datetime
        elif isinstance(time, str) and ":" in time:
            return pd.to_datetime(time, utc=True).tz_convert(eastern)
        
        elif isinstance(time, datetime): # is datetime object already
            return time
        
        else: # is null or extraneous values that should be null (like '-1000')
            return pd.NaT


    try:
        """
        Transforming the dataframe
        - Some of the time columns has millisecond format, error codes, and NaN so we need to separately reformat before pd.to_datetime
        - etrTime has times before 2023-01-01, so we will set them to NaT
        - "county" values look like city names so renaming accordingly
        """
        # Subsetting the dataframe based on time format

        # Masks for extracting rows with millisecond format, errors, or NA
        minimum_datetime = pd.to_datetime('2023-01-01 23:59:59-05:00', utc=True).tz_convert('US/Eastern')
        pre_time = lambda x: pd.to_datetime(x, utc=True).tz_convert('US/Eastern') < minimum_datetime

        start_time_ms = dataframe['startTime'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA)
        lastUptTime_ms = dataframe['lastUpdatedTime'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA)
        etrTime_ms_null = dataframe['etrTime'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA) # or pd.NA
        timeSt_null = dataframe['timestamp'].isna()
        extraneous_mask = start_time_ms | lastUptTime_ms | etrTime_ms_null | timeSt_null

        extraneous_rows = dataframe[extraneous_mask]

        extraneous_rows['startTime'] = extraneous_rows['startTime'].apply(reformat_time)
        extraneous_rows['lastUpdatedTime'] = extraneous_rows['lastUpdatedTime'].apply(reformat_time)
        extraneous_rows['etrTime'] = extraneous_rows['etrTime'].apply(reformat_time)
        extraneous_rows['timestamp'] = extraneous_rows['timestamp'].apply(reformat_time)

        ## not_extr_rows = dataframe[~extraneous_mask]

        eastern = tz.gettz('US/Eastern')
        utc = tz.gettz('UTC')

        dataframe.loc[extraneous_rows.index, 'startTime'] = extraneous_rows['startTime']
        dataframe.loc[extraneous_rows.index, 'lastUpdatedTime'] = extraneous_rows['lastUpdatedTime']
        dataframe.loc[extraneous_rows.index, 'etrTime'] = extraneous_rows['etrTime']
        dataframe.loc[extraneous_rows.index, 'timestamp'] = extraneous_rows['timestamp']
    
        dataframe['startTime'] = pd.to_datetime(dataframe['startTime'], utc=True).dt.tz_convert(eastern)
        dataframe['lastUpdatedTime'] = pd.to_datetime(dataframe['lastUpdatedTime'], utc=True).dt.tz_convert(eastern)
        dataframe['etrTime'] =pd.to_datetime(dataframe['etrTime'], utc=True).dt.tz_convert(eastern)
        dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'], utc=True).dt.tz_convert(eastern)

        # Replace times where computed datetime < 01-01-2023
        time_col = ['startTime', 'lastUpdatedTime', 'etrTime', 'timestamp']
        for col in time_col:
            dataframe.loc[dataframe[col] < minimum_datetime, col] = pd.NaT

        dataframe.rename(columns={
            'id':'outage_id',
            'startTime': 'start_time',
            'numPeople':'customer_affected',
            'EMC': 'utility_provider',
            'zip_code': 'zipcode',
            'latitude': 'lat',
            'longitude': 'long',
            'county': 'city'
        }, inplace=True)
    except Exception as e:
        print(f"An error occurred during transformation: {e}")
        

In [13]:
test_config = {'name': 'Colquitt EMC', 'state': 'ga', 'layout': 5, 'type': 'o'}
test = GA5_old(test_config, base_file_path)
test.load_data()
test_df = test._data
print("Before transformation:")
display(test_df)
print(test_df.info())
test_Transform(test_df)

# Code to debug and see whether transformation changes carried through to final transformed dataset
# time_col = ['start_time', 'lastUpdatedTime','etrTime', 'timestamp']
# for col in time_col:
#     unique_types = set(test_df[col].apply(type))
#     print(f"Unique datatypes of {col}: {unique_types}") 
#     dtype = str # pd._libs.tslibs.timestamps.Timestamp
#     filtered_values = test_df[test_df[col].apply(lambda x: isinstance(x, dtype))][col]
#     print(f"Printing all rows in {col} where dtype is {dtype}")
#     display(filtered_values)

Before transformation:


Unnamed: 0,id,type,startTime,lastUpdatedTime,etrTime,title,numPeople,status,cause,identifier,latitude,longitude,description,county,state,EMC,zip_code,timestamp
0,1387175,OUTAGE,1678646254000,1678741805000,-1000,Outage,1,,Manual,32611,31.17911,-83.60590,Device Operation (suebolin),Moultrie,UAS,Colquitt EMC,31771,
1,1387940,OUTAGE,1678876419000,1678877104000,1678883619000,Outage,1,,TROUBLE_CALL,32783,30.98839,-83.20358,New Prediction,Valdosta,UAS,Colquitt EMC,31605,
2,1388250,OUTAGE,1678900434000,1678900805000,1678907634000,Outage,1,,TROUBLE_CALL,32809,31.42243,-83.60985,New Prediction,Tifton,SNT,Colquitt EMC,31793,
3,1388449,OUTAGE,1678921517000,1678921805000,1678928717000,Outage,1,,TROUBLE_CALL,32822,30.93008,-83.41995,New Prediction,Valdosta,SNT,Colquitt EMC,31632,
4,1388470,OUTAGE,1678930132000,1678930204000,1678937332000,Outage,1,,TROUBLE_CALL,32823,31.32806,-83.58069,New Prediction,Tifton,NEW,Colquitt EMC,31775,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423741,1543589,OUTAGE,2024-01-18 09:04:15,2024-01-18 09:15:04,,Outage,1,,TROUBLE_CALL,70575,31.44082,-83.57342,New Prediction,Tifton,UAS,Colquitt EMC,31793,01-18-2024 15:33:24
423742,1543608,OUTAGE,2024-01-18 13:01:22,2024-01-18 13:10:04,,Outage,1,,TROUBLE_CALL,70582,30.77363,-83.19981,New Prediction,Valdosta,SNT,Colquitt EMC,31606,01-18-2024 15:33:24
423743,1543612,OUTAGE,2024-01-18 13:24:36,2024-01-18 13:35:04,,Outage,1,,TROUBLE_CALL,70584,31.00051,-83.22879,New Prediction,Valdosta,SNT,Colquitt EMC,31645,01-18-2024 15:33:24
423744,1543632,OUTAGE,2024-01-18 14:39:54,2024-01-18 14:50:04,,Outage,1,,TROUBLE_CALL,70589,31.52155,-83.63948,New Prediction,Tifton,SNT,Colquitt EMC,unknown,01-18-2024 15:33:24


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423746 entries, 0 to 423745
Data columns (total 18 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               423746 non-null  int64  
 1   type             423746 non-null  object 
 2   startTime        423746 non-null  object 
 3   lastUpdatedTime  423746 non-null  object 
 4   etrTime          404131 non-null  object 
 5   title            423746 non-null  object 
 6   numPeople        423746 non-null  int64  
 7   status           0 non-null       float64
 8   cause            402693 non-null  object 
 9   identifier       423746 non-null  int64  
 10  latitude         423746 non-null  float64
 11  longitude        423746 non-null  float64
 12  description      423746 non-null  object 
 13  county           407846 non-null  object 
 14  state            423746 non-null  object 
 15  EMC              423746 non-null  object 
 16  zip_code         423746 non-null  obje

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['startTime'] = extraneous_rows['startTime'].apply(reformat_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['lastUpdatedTime'] = extraneous_rows['lastUpdatedTime'].apply(reformat_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['etrTime'] = ex

In [14]:
print("Transformed dataframe:")
display(test_df)
print(test_df.info())

Transformed dataframe:


Unnamed: 0,outage_id,type,start_time,lastUpdatedTime,etrTime,title,customer_affected,status,cause,identifier,lat,long,description,city,state,utility_provider,zipcode,timestamp
0,1387175,OUTAGE,2023-03-12 14:37:34-04:00,2023-03-13 17:10:05-04:00,NaT,Outage,1,,Manual,32611,31.17911,-83.60590,Device Operation (suebolin),Moultrie,UAS,Colquitt EMC,31771,NaT
1,1387940,OUTAGE,2023-03-15 06:33:39-04:00,2023-03-15 06:45:04-04:00,2023-03-15 08:33:39-04:00,Outage,1,,TROUBLE_CALL,32783,30.98839,-83.20358,New Prediction,Valdosta,UAS,Colquitt EMC,31605,NaT
2,1388250,OUTAGE,2023-03-15 13:13:54-04:00,2023-03-15 13:20:05-04:00,2023-03-15 15:13:54-04:00,Outage,1,,TROUBLE_CALL,32809,31.42243,-83.60985,New Prediction,Tifton,SNT,Colquitt EMC,31793,NaT
3,1388449,OUTAGE,2023-03-15 19:05:17-04:00,2023-03-15 19:10:05-04:00,2023-03-15 21:05:17-04:00,Outage,1,,TROUBLE_CALL,32822,30.93008,-83.41995,New Prediction,Valdosta,SNT,Colquitt EMC,31632,NaT
4,1388470,OUTAGE,2023-03-15 21:28:52-04:00,2023-03-15 21:30:04-04:00,2023-03-15 23:28:52-04:00,Outage,1,,TROUBLE_CALL,32823,31.32806,-83.58069,New Prediction,Tifton,NEW,Colquitt EMC,31775,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423741,1543589,OUTAGE,2024-01-18 04:04:15-05:00,2024-01-18 04:15:04-05:00,NaT,Outage,1,,TROUBLE_CALL,70575,31.44082,-83.57342,New Prediction,Tifton,UAS,Colquitt EMC,31793,2024-01-18 10:33:24-05:00
423742,1543608,OUTAGE,2024-01-18 08:01:22-05:00,2024-01-18 08:10:04-05:00,NaT,Outage,1,,TROUBLE_CALL,70582,30.77363,-83.19981,New Prediction,Valdosta,SNT,Colquitt EMC,31606,2024-01-18 10:33:24-05:00
423743,1543612,OUTAGE,2024-01-18 08:24:36-05:00,2024-01-18 08:35:04-05:00,NaT,Outage,1,,TROUBLE_CALL,70584,31.00051,-83.22879,New Prediction,Valdosta,SNT,Colquitt EMC,31645,2024-01-18 10:33:24-05:00
423744,1543632,OUTAGE,2024-01-18 09:39:54-05:00,2024-01-18 09:50:04-05:00,NaT,Outage,1,,TROUBLE_CALL,70589,31.52155,-83.63948,New Prediction,Tifton,SNT,Colquitt EMC,unknown,2024-01-18 10:33:24-05:00


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 423746 entries, 0 to 423745
Data columns (total 18 columns):
 #   Column             Non-Null Count   Dtype                                     
---  ------             --------------   -----                                     
 0   outage_id          423746 non-null  int64                                     
 1   type               423746 non-null  object                                    
 2   start_time         423746 non-null  datetime64[ns, tzfile('America/New_York')]
 3   lastUpdatedTime    423746 non-null  datetime64[ns, tzfile('America/New_York')]
 4   etrTime            306237 non-null  datetime64[ns, tzfile('America/New_York')]
 5   title              423746 non-null  object                                    
 6   customer_affected  423746 non-null  int64                                     
 7   status             0 non-null       float64                                   
 8   cause              402693 non-null  object  

## Comparing and validating times

In [15]:
test_Transform(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['startTime'] = extraneous_rows['startTime'].apply(reformat_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['lastUpdatedTime'] = extraneous_rows['lastUpdatedTime'].apply(reformat_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['etrTime'] = ex

In [16]:
def _validate(group):
    # Check the number of levels in the index
    num_levels = group.index.nlevels
    utility_provider = group['utility_provider'].iloc[-1]
    num_rows = len(group)
    num_unique_lon = len(group['long'].unique())
    num_unique_lat = len(group['lat'].unique())

    num_unique_start_times = len(group['start_time'].unique())
    earliest_start_time = group['start_time'].min()
    latest_start_time = group['start_time'].max()
    diff_in_start = latest_start_time - earliest_start_time

    num_unique_lut = group['lastUpdatedTime'].nunique()
    earliest_lut = group['lastUpdatedTime'].min()
    latest_lut = group['lastUpdatedTime'].max()
    diff_in_lut = latest_lut - earliest_lut

    num_unique_etrTime = group['etrTime'].nunique()
    earliest_etrTime = group['etrTime'].min()
    latest_etrTime = group['etrTime'].max()

    earliest_timestamp = group['timestamp'].min()
    latest_timestamp = group['timestamp'].max()

    duration_by_timestamp = latest_timestamp - earliest_timestamp
    duration_by_etrstart = latest_etrTime - earliest_start_time if pd.notna(latest_etrTime) else pd.NaT
    duration_by_lutstart = latest_lut - earliest_start_time
    duration_by_timestampstart = latest_timestamp - earliest_start_time

    endtime_by_timestamp_start = earliest_start_time + duration_by_timestamp
    
    seconds_rem = duration_by_timestamp.total_seconds() % 900 # 900 secs = 15 minutes
    is_timestamp_dur_div_15_min = seconds_rem <= 5 or seconds_rem >= 895 # if time is 5 seconds within being divisible by 15 minutes
    timestampstart_timestamp_dur_error = abs(duration_by_timestampstart - duration_by_timestamp) if pd.notna(duration_by_timestampstart) and pd.notna(duration_by_timestamp) else pd.NaT
    etrstart_timestamp_dur_error = abs(duration_by_etrstart - duration_by_timestamp) if pd.notna(duration_by_etrstart) and pd.notna(duration_by_timestamp) else pd.NaT
    etrstart_timestampstart_dur_error = abs(duration_by_etrstart - duration_by_timestampstart) if pd.notna(duration_by_etrstart) and pd.notna(duration_by_timestampstart) else pd.NaT
    lutstart_etrstart_dur_error = abs(duration_by_lutstart - duration_by_etrstart) if pd.notna(duration_by_etrstart) and pd.notna(duration_by_lutstart) else pd.NaT
    lutstart_timestamp_dur_error = abs(duration_by_lutstart - duration_by_timestamp) if pd.notna(duration_by_lutstart) and pd.notna(duration_by_timestamp) else pd.NaT


    return pd.Series({
        'utility_provider': utility_provider,
        'num_rows': num_rows,
        'num_unique_long': num_unique_lon,
        'num_unique_lat': num_unique_lat,
        'num_unique_start_times': num_unique_start_times,
        'earliest_start_time': earliest_start_time,
        'latest_start_time': latest_start_time,
        'diff_in_start': diff_in_start,
        'earliest_timestamp': earliest_timestamp,
        'latest_timestamp': latest_timestamp, 
        'num_unique_latestUpdatedTime': num_unique_lut,
        'num_unique_etrTime': num_unique_etrTime,
        'earliest_etrTime': earliest_etrTime,
        'latest_etrTime': latest_etrTime,
        'duration_by_timestamp': duration_by_timestamp,
        'endtime_by_timestamp_start': endtime_by_timestamp_start,
        'duration_by_etrstart': duration_by_etrstart,
        'duration_by_timestampstart': duration_by_timestampstart,
        'duration_by_lutstart': duration_by_lutstart,
        'is_timestamp_dur_div_15_min': is_timestamp_dur_div_15_min,
        'timestampstart_timestamp_dur_error': timestampstart_timestamp_dur_error,
        'etrstart_timestamp_dur_error': etrstart_timestamp_dur_error,
        'etrstart_timestampstart_dur_error': etrstart_timestampstart_dur_error,
        'lutstart_etrstart_dur_error': lutstart_etrstart_dur_error,
        'lutstart_timestamp_dur_error': lutstart_timestamp_dur_error
    })


In [17]:
validated = df.groupby('outage_id').apply(_validate)
display(validated)

  validated = df.groupby('outage_id').apply(_validate)


Unnamed: 0_level_0,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1387175,Colquitt EMC,1,1,1,1,2023-03-12 14:37:34-04:00,2023-03-12 14:37:34-04:00,0 days,NaT,NaT,1,0,NaT,NaT,NaT,NaT,NaT,NaT,1 days 02:32:31,False,NaT,NaT,NaT,NaT,NaT
1387940,Colquitt EMC,1,1,1,1,2023-03-15 06:33:39-04:00,2023-03-15 06:33:39-04:00,0 days,NaT,NaT,1,1,2023-03-15 08:33:39-04:00,2023-03-15 08:33:39-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:11:25,False,NaT,NaT,NaT,0 days 01:48:35,NaT
1388250,Colquitt EMC,1,1,1,1,2023-03-15 13:13:54-04:00,2023-03-15 13:13:54-04:00,0 days,NaT,NaT,1,1,2023-03-15 15:13:54-04:00,2023-03-15 15:13:54-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:06:11,False,NaT,NaT,NaT,0 days 01:53:49,NaT
1388449,Colquitt EMC,1,1,1,1,2023-03-15 19:05:17-04:00,2023-03-15 19:05:17-04:00,0 days,NaT,NaT,1,1,2023-03-15 21:05:17-04:00,2023-03-15 21:05:17-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:04:48,False,NaT,NaT,NaT,0 days 01:55:12,NaT
1388470,Colquitt EMC,2,1,1,1,2023-03-15 21:28:52-04:00,2023-03-15 21:28:52-04:00,0 days,NaT,NaT,2,1,2023-03-15 23:28:52-04:00,2023-03-15 23:28:52-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:06:12,False,NaT,NaT,NaT,0 days 01:53:48,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543621,Colquitt EMC,2,1,1,1,2024-01-18 09:03:45-05:00,2024-01-18 09:03:45-05:00,0 days,2024-01-18 09:18:24-05:00,2024-01-18 09:33:23-05:00,1,0,NaT,NaT,0 days 00:14:59,2024-01-18 09:18:44-05:00,NaT,0 days 00:29:38,0 days 00:06:20,True,0 days 00:14:39,NaT,NaT,NaT,0 days 00:08:39
1543632,Colquitt EMC,4,1,1,1,2024-01-18 09:39:54-05:00,2024-01-18 09:39:54-05:00,0 days,2024-01-18 09:48:23-05:00,2024-01-18 10:33:24-05:00,2,0,NaT,NaT,0 days 00:45:01,2024-01-18 10:24:55-05:00,NaT,0 days 00:53:30,0 days 00:10:10,True,0 days 00:08:29,NaT,NaT,NaT,0 days 00:34:51
1543634,Colquitt EMC,3,1,1,1,2024-01-18 09:45:56-05:00,2024-01-18 09:45:56-05:00,0 days,2024-01-18 10:03:24-05:00,2024-01-18 10:33:24-05:00,1,0,NaT,NaT,0 days 00:30:00,2024-01-18 10:15:56-05:00,NaT,0 days 00:47:28,0 days 00:09:09,True,0 days 00:17:28,NaT,NaT,NaT,0 days 00:20:51
1543666,Colquitt EMC,1,1,1,1,2024-01-18 10:16:45-05:00,2024-01-18 10:16:45-05:00,0 days,2024-01-18 10:33:24-05:00,2024-01-18 10:33:24-05:00,1,0,NaT,NaT,0 days 00:00:00,2024-01-18 10:16:45-05:00,NaT,0 days 00:16:39,0 days 00:13:19,True,0 days 00:16:39,NaT,NaT,NaT,0 days 00:13:19


In [18]:
print("# of nulls for each validated column:")
print(len(validated) - validated.count())

# of nulls for each validated column:
utility_provider                         0
num_rows                                 0
num_unique_long                          0
num_unique_lat                           0
num_unique_start_times                   0
earliest_start_time                      0
latest_start_time                        0
diff_in_start                            0
earliest_timestamp                     470
latest_timestamp                       470
num_unique_latestUpdatedTime             0
num_unique_etrTime                       0
earliest_etrTime                      2418
latest_etrTime                        2418
duration_by_timestamp                  470
endtime_by_timestamp_start             470
duration_by_etrstart                  2418
duration_by_timestampstart             470
duration_by_lutstart                     0
is_timestamp_dur_div_15_min              0
timestampstart_timestamp_dur_error     470
etrstart_timestamp_dur_error          2883
etrstart_timesta

In [19]:
# Distribution of the # of unique times
print("Unique start times distribution:")
print(validated['num_unique_start_times'].describe())
print("Unique latestUpdatedTime distribution:")
print(validated['num_unique_latestUpdatedTime'].describe())
print("Unique start etrTime distribution:")
print(validated['num_unique_etrTime'].describe())
print("Unique lat distribution:")
print(validated['num_unique_lat'].describe())
print("Unique long distribution:")
print(validated['num_unique_long'].describe())

Unique start times distribution:
count    17138.000000
mean         1.011962
std          0.109252
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          3.000000
Name: num_unique_start_times, dtype: float64
Unique latestUpdatedTime distribution:
count    17138.000000
mean         1.609464
std          0.961084
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         11.000000
Name: num_unique_latestUpdatedTime, dtype: float64
Unique start etrTime distribution:
count    17138.000000
mean         0.902614
std          0.425624
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          4.000000
Name: num_unique_etrTime, dtype: float64
Unique lat distribution:
count    17138.000000
mean         1.091668
std          0.328473
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          4.000000
Name: num_unique_lat, dtype

We notice that there are not all outages have a single unique start_time / latestUpdateTime / etrTime.



#### Checking ID's with multiple start_times/etrTime/lat/long

In [20]:
mult_start_val = validated[validated['num_unique_start_times'] > 1]
mult_start_id = mult_start_val.index.tolist()
mult_lut_val = validated[validated['num_unique_latestUpdatedTime'] > 1]
mult_lut_id = mult_lut_val.index.tolist()
mult_etr_val = validated[validated['num_unique_etrTime'] > 1]
mult_etr_id = mult_etr_val.index.tolist()
mult_lat_val = validated[validated['num_unique_lat'] > 1]
mult_lat_id = mult_lat_val.index.tolist()
mult_long_val = validated[validated['num_unique_long'] > 1]
mult_long_id = mult_lat_val.index.tolist()

# display(mult_start_id)
# display(mult_etr_id)
print(f"# of unique outages with multiple start_time's: {len(mult_start_id)}")
print(f"# of unique outages with multiple latestUpdatedTime: {len(mult_lut_id)}")
print(f"# of unique outages with multiple etr_id's: {len(mult_etr_id)}")

# of unique outages with multiple start_time's: 204
# of unique outages with multiple latestUpdatedTime: 6897
# of unique outages with multiple etr_id's: 700


Since there are only 204 outages out of +17000 outages with multiple start_id's, we will seriously consider using them.
We'll try to see what patterns there could be for the multiple start_id's.

In [21]:
mult_start = df[df['outage_id'].isin(mult_start_id)]
display(mult_start)

# filtering out rows where etrTime is NA
df2 = df[~df['etrTime'].isna()]
mult_start2 = mult_start[~mult_start['etrTime'].isna()]

print("For outages with multiple start dates + excluding NA etrTimes")
grouped = mult_start2.groupby('outage_id').agg({'start_time': 'nunique', 'etrTime': 'nunique', 'lat': 'nunique', 'long': 'nunique'})
display(grouped)

print()
print("For all outages (from original df) + excluding NA etrTimes")
grouped2 = df2.groupby('outage_id').agg({'start_time': 'nunique', 'etrTime': 'nunique', 'lat': 'nunique', 'long': 'nunique'})
display(grouped2)

print()
print(f"For each and every unique outage with multi-startTimes, # of unique starttimes = # of unique latitudes = # of unique longitudes?: {((grouped['start_time'] == grouped['lat']) & (grouped['lat'] == grouped['long'])).all()}")
print(f"For each and every unique outage for the entire original df, # of unique starttimes = # of unique latitudes = # of unique longitudes?: {((grouped2['start_time'] == grouped2['lat']) & (grouped2['lat'] == grouped2['long'])).all()}")
print(f"{((grouped['start_time'] == grouped['lat']) & (grouped['lat'] == grouped['long'])).sum()} out of {len(grouped)} outages in multi-start outage df where # of unique startTimes = # of unique lats = # of unique long")
print(f"{((grouped2['start_time'] == grouped2['lat']) & (grouped2['lat'] == grouped2['long'])).sum()} out of {len(grouped2)} outages in general df where # of unique startTimes = # of unique lats = # of unique longs")

print()
print(f"For each and every unique outage with multi-startTimes, # of unique etrTimes = # of unique latitudes = # of unique longitudes?: {((grouped['etrTime'] == grouped['lat']) & (grouped['lat'] == grouped['long'])).all()}")
print(f"For each unique outage for the entire original df, # of unique etrTimes = # of unique latitudes = # of unique longitudes?: {((grouped2['etrTime'] == grouped2['lat']) & (grouped2['lat'] == grouped2['long'])).all()}")
print(f"{((grouped['etrTime'] == grouped['lat']) & (grouped['lat'] == grouped['long'])).sum()} out of {len(grouped)} outages in multi-start outage df where # of unique etrTimes = # of unique lats = # of unique long")
print(f"{((grouped2['etrTime'] == grouped2['lat']) & (grouped2['lat'] == grouped2['long'])).sum()} out of {len(grouped2)} outages in general df where # of unique etrTimes = # of unique lats = # of unique longs")

print()
print(f"For each and every unique outage with multi-startTimes, # of unique starttimes = # of unique etrTimes?: {((grouped['start_time'] == grouped['etrTime'])).all()}")
print(f"For each and eveyr unique outage for the entire original df, # of unique starttimes = # of unique etrTimes?: {((grouped2['start_time'] == grouped2['etrTime'])).all()}")
print(f"{(grouped['start_time'] == grouped['etrTime']).sum()} out of {len(grouped)} outages in multi-start outage df where # of unique starttimes = # of unique etrTimes")
print(f"{(grouped2['start_time'] == grouped2['etrTime']).sum()} out of {len(grouped2)} outages in general df where # of unique starttimes = # of unique etrTimes")




Unnamed: 0,outage_id,type,start_time,lastUpdatedTime,etrTime,title,customer_affected,status,cause,identifier,lat,long,description,city,state,utility_provider,zipcode,timestamp
176,1389689,OUTAGE,2023-03-20 14:07:17-04:00,2023-03-20 14:10:04-04:00,2023-03-20 17:07:17-04:00,Outage,193,,TROUBLE_CALL,33056,31.18155,-83.83748,Revised Prediction,Moultrie,NEW,Colquitt EMC,31768,NaT
180,1389689,OUTAGE,2023-03-20 14:07:17-04:00,2023-03-20 14:20:04-04:00,2023-03-20 17:07:17-04:00,Outage,193,,Manual,33056,31.18155,-83.83748,Device Operation (angelarodney),Moultrie,SNT,Colquitt EMC,31768,NaT
189,1389689,OUTAGE,2023-03-20 14:06:47-04:00,2023-03-20 17:25:04-04:00,NaT,Outage,206,,Manual,33056,31.18155,-83.83748,Device Operation (angelarodney),Moultrie,SNT,Colquitt EMC,31768,NaT
236,1390223,OUTAGE,2023-03-22 08:03:17-04:00,2023-03-22 08:10:04-04:00,2023-03-22 09:03:17-04:00,Outage,35,,TROUBLE_CALL,33135,30.89120,-83.48722,Revised Prediction,Valdosta,SNT,Colquitt EMC,31643,NaT
266,1390223,OUTAGE,2023-03-22 07:59:06-04:00,2023-03-22 16:15:04-04:00,NaT,Outage,41,,TROUBLE_CALL,33135,30.89120,-83.48722,Revised Prediction,Valdosta,SNT,Colquitt EMC,31643,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418428,1537717,OUTAGE,2024-01-09 09:06:38-05:00,2024-01-10 04:30:05-05:00,NaT,Outage,1,,Manual,69977,30.92138,-83.67514,Device Operation (hunterreagan),Moultrie,SNT,Colquitt EMC,31778,2024-01-10 14:18:23-05:00
418447,1537717,OUTAGE,2024-01-09 09:06:38-05:00,2024-01-10 04:30:05-05:00,NaT,Outage,1,,Manual,69977,30.92138,-83.67514,Device Operation (hunterreagan),Moultrie,SNT,Colquitt EMC,31778,2024-01-10 14:33:24-05:00
418466,1537717,OUTAGE,2024-01-09 09:06:38-05:00,2024-01-10 04:30:05-05:00,NaT,Outage,1,,Manual,69977,30.92138,-83.67514,Device Operation (hunterreagan),Moultrie,SNT,Colquitt EMC,31778,2024-01-10 14:48:24-05:00
418485,1537717,OUTAGE,2024-01-09 09:06:38-05:00,2024-01-10 04:30:05-05:00,NaT,Outage,1,,Manual,69977,30.92138,-83.67514,Device Operation (hunterreagan),Moultrie,SNT,Colquitt EMC,31778,2024-01-10 15:03:23-05:00


For outages with multiple start dates + excluding NA etrTimes


Unnamed: 0_level_0,start_time,etrTime,lat,long
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1389689,1,1,1,1
1390223,1,1,1,1
1392517,2,2,2,2
1393523,2,2,2,2
1399998,2,2,2,2
...,...,...,...,...
1516011,2,2,2,2
1516377,2,2,2,2
1524247,2,2,2,2
1527302,2,2,2,2



For all outages (from original df) + excluding NA etrTimes


Unnamed: 0_level_0,start_time,etrTime,lat,long
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1387940,1,1,1,1
1388250,1,1,1,1
1388449,1,1,1,1
1388470,1,1,1,1
1388477,1,1,1,1
...,...,...,...,...
1527371,1,1,1,1
1527372,1,1,1,1
1527373,1,1,1,1
1527374,1,1,1,1



For each and every unique outage with multi-startTimes, # of unique starttimes = # of unique latitudes = # of unique longitudes?: False
For each and every unique outage for the entire original df, # of unique starttimes = # of unique latitudes = # of unique longitudes?: False
148 out of 188 outages in multi-start outage df where # of unique startTimes = # of unique lats = # of unique long
13844 out of 14720 outages in general df where # of unique startTimes = # of unique lats = # of unique longs

For each and every unique outage with multi-startTimes, # of unique etrTimes = # of unique latitudes = # of unique longitudes?: False
For each unique outage for the entire original df, # of unique etrTimes = # of unique latitudes = # of unique longitudes?: False
177 out of 188 outages in multi-start outage df where # of unique etrTimes = # of unique lats = # of unique long
14380 out of 14720 outages in general df where # of unique etrTimes = # of unique lats = # of unique longs

For each and 

It does seem like most of the outages have unique startDates that are correlated with different locations and different unique etrTimes.

However, I am cautious to assume whether this subdivides each outageID into more granular outages.

In [22]:
# Finding outages with multiple locations and location outliers 
mult_loc = df[df['outage_id'].isin(mult_lat_id) & df['outage_id'].isin(mult_long_id)]
display(mult_loc)

grouped = mult_loc.groupby('outage_id')
lat_range = grouped['lat'].max() - grouped['lat'].min() 
long_range = grouped['long'].max() - grouped['long'].min() 
print(f"Outage ID with multiple locations where the lat_range is max: outage_id #{lat_range.idxmax()} with {lat_range.max()} diff in lat")
print(f"Outage ID with multiple locations where the long_range is max: outage_id #{long_range.idxmax()} with {long_range.max()} diff in long")

haha = df[(df['outage_id'] == lat_range.idxmax()) | (df['outage_id'] == long_range.idxmax())]
display(haha)
groupie = haha.groupby('outage_id').agg({'lat': ['max', 'min'], 'long': ['max', 'min']})
display(groupie)

Unnamed: 0,outage_id,type,start_time,lastUpdatedTime,etrTime,title,customer_affected,status,cause,identifier,lat,long,description,city,state,utility_provider,zipcode,timestamp
15,1388529,OUTAGE,2023-03-16 09:29:02-04:00,2023-03-16 09:30:04-04:00,2023-03-16 11:29:02-04:00,Outage,1,,TROUBLE_CALL,32829,31.06903,-83.45990,New Prediction,Tifton,NEW,Colquitt EMC,unknown,NaT
16,1388529,OUTAGE,2023-03-16 09:29:02-04:00,2023-03-16 09:35:05-04:00,2023-03-16 11:29:02-04:00,Outage,2,,TROUBLE_CALL,32829,31.06905,-83.45926,Revised Prediction,Tifton,SNT,Colquitt EMC,unknown,NaT
21,1388554,OUTAGE,2023-03-16 10:24:27-04:00,2023-03-16 10:30:04-04:00,2023-03-16 12:24:27-04:00,Outage,1,,TROUBLE_CALL,32835,31.18621,-83.87458,New Prediction,Moultrie,SNT,Colquitt EMC,31768,NaT
22,1388554,OUTAGE,2023-03-16 10:24:27-04:00,2023-03-16 10:40:04-04:00,2023-03-16 13:24:27-04:00,Outage,206,,TROUBLE_CALL,32835,31.18155,-83.83748,Revised Prediction,Moultrie,SNT,Colquitt EMC,31768,NaT
81,1388930,OUTAGE,2023-03-17 14:48:56-04:00,2023-03-17 14:55:04-04:00,2023-03-17 15:48:56-04:00,Outage,10,,TROUBLE_CALL,32915,31.12517,-83.23273,Revised Prediction,Tifton,SNT,Colquitt EMC,unknown,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423045,1543057,OUTAGE,2024-01-17 06:31:23-05:00,2024-01-17 07:45:05-05:00,NaT,Outage,7,,Manual,70541,30.67864,-83.49780,Device Operation (suebolin),Moultrie,SNT,Colquitt EMC,31643,2024-01-17 07:48:24-05:00
423055,1543057,OUTAGE,2024-01-17 06:31:23-05:00,2024-01-17 07:45:05-05:00,NaT,Outage,7,,Manual,70541,30.67864,-83.49780,Device Operation (suebolin),Moultrie,SNT,Colquitt EMC,31643,2024-01-17 08:03:24-05:00
423065,1543057,OUTAGE,2024-01-17 06:31:23-05:00,2024-01-17 07:45:05-05:00,NaT,Outage,7,,Manual,70541,30.67864,-83.49780,Device Operation (suebolin),Moultrie,SNT,Colquitt EMC,31643,2024-01-17 08:18:23-05:00
423195,1543425,OUTAGE,2024-01-17 13:40:01-05:00,2024-01-17 13:45:05-05:00,NaT,Outage,1,,TROUBLE_CALL,70563,31.43404,-83.53061,New Prediction,Tifton,UAS,Colquitt EMC,31793,2024-01-17 13:48:23-05:00


Outage ID with multiple locations where the lat_range is max: outage_id #1525427 with 535535.2171 diff in lat
Outage ID with multiple locations where the long_range is max: outage_id #1525172 with 2640537.1954 diff in long


Unnamed: 0,outage_id,type,start_time,lastUpdatedTime,etrTime,title,customer_affected,status,cause,identifier,lat,long,description,city,state,utility_provider,zipcode,timestamp
402066,1525172,OUTAGE,2023-12-11 17:08:38-05:00,2023-12-11 17:15:05-05:00,2023-12-11 19:08:38-05:00,Outage,2,,TROUBLE_CALL,65999,31.21301,-8.306668e+01,Revised Prediction,Tifton,UAS,Colquitt EMC,unknown,2023-12-11 17:18:24-05:00
402070,1525172,OUTAGE,2023-12-11 17:08:38-05:00,2023-12-11 17:15:05-05:00,2023-12-11 19:08:38-05:00,Outage,2,,TROUBLE_CALL,65999,31.21301,-8.306668e+01,Revised Prediction,Tifton,UAS,Colquitt EMC,unknown,2023-12-11 17:33:24-05:00
402072,1525172,OUTAGE,2023-12-11 17:08:38-05:00,2023-12-11 17:15:05-05:00,2023-12-11 19:08:38-05:00,Outage,2,,TROUBLE_CALL,65999,31.21301,-8.306668e+01,Revised Prediction,Tifton,UAS,Colquitt EMC,unknown,2023-12-11 17:48:23-05:00
402073,1525172,OUTAGE,2023-12-11 17:08:38-05:00,2023-12-11 17:15:05-05:00,2023-12-11 19:08:38-05:00,Outage,2,,TROUBLE_CALL,65999,31.21301,-8.306668e+01,Revised Prediction,Tifton,UAS,Colquitt EMC,unknown,2023-12-11 18:03:24-05:00
402074,1525172,OUTAGE,2023-12-11 17:08:38-05:00,2023-12-11 17:15:05-05:00,2023-12-11 19:08:38-05:00,Outage,2,,TROUBLE_CALL,65999,31.21301,-8.306668e+01,Revised Prediction,Tifton,UAS,Colquitt EMC,unknown,2023-12-11 18:18:24-05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
403093,1525427,OUTAGE,2023-12-12 08:21:25-05:00,2023-12-13 12:05:06-05:00,2023-12-12 10:21:25-05:00,Outage,2,,TROUBLE_CALL,66009,535566.68707,2.551576e+06,Revised Prediction,Tifton,SNT,Colquitt EMC,unknown,2023-12-13 23:03:23-05:00
403097,1525427,OUTAGE,2023-12-12 08:21:25-05:00,2023-12-13 12:05:06-05:00,2023-12-12 10:21:25-05:00,Outage,2,,TROUBLE_CALL,66009,535566.68707,2.551576e+06,Revised Prediction,Tifton,SNT,Colquitt EMC,unknown,2023-12-13 23:18:24-05:00
403101,1525427,OUTAGE,2023-12-12 08:21:25-05:00,2023-12-13 12:05:06-05:00,2023-12-12 10:21:25-05:00,Outage,2,,TROUBLE_CALL,66009,535566.68707,2.551576e+06,Revised Prediction,Tifton,SNT,Colquitt EMC,unknown,2023-12-13 23:33:24-05:00
403105,1525427,OUTAGE,2023-12-12 08:21:25-05:00,2023-12-13 12:05:06-05:00,2023-12-12 10:21:25-05:00,Outage,2,,TROUBLE_CALL,66009,535566.68707,2.551576e+06,Revised Prediction,Tifton,SNT,Colquitt EMC,unknown,2023-12-13 23:48:23-05:00


Unnamed: 0_level_0,lat,lat,long,long
Unnamed: 0_level_1,max,min,max,min
outage_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1525172,442866.24188,31.21301,2640454.0,-83.06668
1525427,535566.68707,31.46997,2551576.0,-83.34875


In [23]:
threshold = 0.01
print(f"Outages with multiple locations where the range in different latitudes is >{threshold}: {(lat_range > threshold).sum()} out of {len(lat_range)} outages")
print(f"Outages with multiple locations where the range in different longitutdes is >{threshold}: {(long_range > threshold).sum()} out of {len(long_range)} outages")

Outages with multiple locations where the range in different latitudes is >0.01: 470 out of 1380 outages
Outages with multiple locations where the range in different longitutdes is >0.01: 506 out of 1380 outages


Since a considerable number of multi-location outages have only a difference of at least 0.01 (around 1 km difference), there could be multiple different outages based on location for each outageID.

However, with the previous observations, the different locations are not 1-to-1 to the different start-times. 

Thus, we will not consider multiple outages per outageID based on location (as that is too much work lol).

In [24]:
df[df['outage_id'] == validated['num_unique_etrTime'].idxmax()]
validated[validated.index == validated['num_unique_etrTime'].idxmax()]

Unnamed: 0_level_0,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1485746,Colquitt EMC,17,4,4,2,2023-09-01 09:16:00-04:00,2023-09-01 12:43:38-04:00,0 days 03:27:38,2023-09-01 09:33:24-04:00,2023-09-01 14:18:24-04:00,4,4,2023-09-01 10:16:00-04:00,2023-09-01 15:43:38-04:00,0 days 04:45:00,2023-09-01 14:01:00-04:00,0 days 06:27:38,0 days 05:02:24,0 days 03:39:10,True,0 days 00:17:24,0 days 01:42:38,0 days 01:25:14,0 days 02:48:28,0 days 01:05:50


Let's see if we can group by both outageID and startTime and see if that makes a difference.

In [25]:
outage_start = df.groupby(['outage_id', 'start_time'])
agg = outage_start.agg({'etrTime': 'nunique', 'lat': 'nunique', 'long': 'nunique'})
display(agg)
has_etr = agg[agg['etrTime'] != 0]
display(has_etr)

print(f"For each and every unique outage-startTime combination, # of unique etrTimes = 1: {((has_etr['etrTime'] == 1)).all()}")
print(f"For each and every unique outage-startTime combination, # of unique etrTimes = # of unique lats = # of unique longs?: {((has_etr['etrTime'] == has_etr['lat']) & (has_etr['lat'] == has_etr['long'])).all()}")
print(f"{(has_etr['etrTime'] == 1).sum()} out of {len(has_etr)} outages in outageId-startTime df where # of unique starttimes = # of unique etrTimes")
print(f"{((has_etr['etrTime'] == has_etr['lat']) & (has_etr['lat'] == has_etr['long'])).sum()} out of {len(has_etr)} outages in outageId-startTime df where # of unique etrTimes = # of unique lats = # of unique longs")


display(agg[(agg['lat'] != 1) | (agg['long'] != 1)])

Unnamed: 0_level_0,Unnamed: 1_level_0,etrTime,lat,long
outage_id,start_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1387175,2023-03-12 14:37:34-04:00,0,1,1
1387940,2023-03-15 06:33:39-04:00,1,1,1
1388250,2023-03-15 13:13:54-04:00,1,1,1
1388449,2023-03-15 19:05:17-04:00,1,1,1
1388470,2023-03-15 21:28:52-04:00,1,1,1
...,...,...,...,...
1543621,2024-01-18 09:03:45-05:00,0,1,1
1543632,2024-01-18 09:39:54-05:00,0,1,1
1543634,2024-01-18 09:45:56-05:00,0,1,1
1543666,2024-01-18 10:16:45-05:00,0,1,1


Unnamed: 0_level_0,Unnamed: 1_level_0,etrTime,lat,long
outage_id,start_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1387940,2023-03-15 06:33:39-04:00,1,1,1
1388250,2023-03-15 13:13:54-04:00,1,1,1
1388449,2023-03-15 19:05:17-04:00,1,1,1
1388470,2023-03-15 21:28:52-04:00,1,1,1
1388477,2023-03-16 03:02:10-04:00,1,1,1
...,...,...,...,...
1527371,2023-12-16 13:58:17-05:00,1,1,1
1527372,2023-12-16 13:58:07-05:00,1,1,1
1527373,2023-12-16 13:58:17-05:00,1,1,1
1527374,2023-12-16 13:58:17-05:00,1,1,1


For each and every unique outage-startTime combination, # of unique etrTimes = 1: False
For each and every unique outage-startTime combination, # of unique etrTimes = # of unique lats = # of unique longs?: False
14294 out of 14871 outages in outageId-startTime df where # of unique starttimes = # of unique etrTimes
14209 out of 14871 outages in outageId-startTime df where # of unique etrTimes = # of unique lats = # of unique longs


Unnamed: 0_level_0,Unnamed: 1_level_0,etrTime,lat,long
outage_id,start_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1388529,2023-03-16 09:29:02-04:00,1,2,2
1388554,2023-03-16 10:24:27-04:00,2,2,2
1388930,2023-03-17 14:48:56-04:00,2,2,2
1390026,2023-03-21 13:39:54-04:00,1,2,2
1391042,2023-03-24 08:44:28-04:00,1,2,2
...,...,...,...,...
1541701,2024-01-15 15:26:02-05:00,0,2,2
1541823,2024-01-15 18:54:46-05:00,0,2,2
1543053,2024-01-17 06:19:35-05:00,0,2,2
1543057,2024-01-17 06:31:23-05:00,0,3,3


We see that even if we group-by outage_ID and startTime, we don't get every outage to have a 1 unique etrTime. 

However, we do see that there are serveral outage+start_time combinations where there are more than 1 unique latitudes/longitudes.

We will define a summary_statistics function to summarize the metrics given by validate and see how the summary_stats compare between grouping by outage_ID and grouping by outage_ID + startTime.

In [26]:
os_validated = outage_start.apply(_validate)
display(os_validated)


  os_validated = outage_start.apply(_validate)


Unnamed: 0_level_0,Unnamed: 1_level_0,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
outage_id,start_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1387175,2023-03-12 14:37:34-04:00,Colquitt EMC,1,1,1,1,2023-03-12 14:37:34-04:00,2023-03-12 14:37:34-04:00,0 days,NaT,NaT,1,0,NaT,NaT,NaT,NaT,NaT,NaT,1 days 02:32:31,False,NaT,NaT,NaT,NaT,NaT
1387940,2023-03-15 06:33:39-04:00,Colquitt EMC,1,1,1,1,2023-03-15 06:33:39-04:00,2023-03-15 06:33:39-04:00,0 days,NaT,NaT,1,1,2023-03-15 08:33:39-04:00,2023-03-15 08:33:39-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:11:25,False,NaT,NaT,NaT,0 days 01:48:35,NaT
1388250,2023-03-15 13:13:54-04:00,Colquitt EMC,1,1,1,1,2023-03-15 13:13:54-04:00,2023-03-15 13:13:54-04:00,0 days,NaT,NaT,1,1,2023-03-15 15:13:54-04:00,2023-03-15 15:13:54-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:06:11,False,NaT,NaT,NaT,0 days 01:53:49,NaT
1388449,2023-03-15 19:05:17-04:00,Colquitt EMC,1,1,1,1,2023-03-15 19:05:17-04:00,2023-03-15 19:05:17-04:00,0 days,NaT,NaT,1,1,2023-03-15 21:05:17-04:00,2023-03-15 21:05:17-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:04:48,False,NaT,NaT,NaT,0 days 01:55:12,NaT
1388470,2023-03-15 21:28:52-04:00,Colquitt EMC,2,1,1,1,2023-03-15 21:28:52-04:00,2023-03-15 21:28:52-04:00,0 days,NaT,NaT,2,1,2023-03-15 23:28:52-04:00,2023-03-15 23:28:52-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:06:12,False,NaT,NaT,NaT,0 days 01:53:48,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543621,2024-01-18 09:03:45-05:00,Colquitt EMC,2,1,1,1,2024-01-18 09:03:45-05:00,2024-01-18 09:03:45-05:00,0 days,2024-01-18 09:18:24-05:00,2024-01-18 09:33:23-05:00,1,0,NaT,NaT,0 days 00:14:59,2024-01-18 09:18:44-05:00,NaT,0 days 00:29:38,0 days 00:06:20,True,0 days 00:14:39,NaT,NaT,NaT,0 days 00:08:39
1543632,2024-01-18 09:39:54-05:00,Colquitt EMC,4,1,1,1,2024-01-18 09:39:54-05:00,2024-01-18 09:39:54-05:00,0 days,2024-01-18 09:48:23-05:00,2024-01-18 10:33:24-05:00,2,0,NaT,NaT,0 days 00:45:01,2024-01-18 10:24:55-05:00,NaT,0 days 00:53:30,0 days 00:10:10,True,0 days 00:08:29,NaT,NaT,NaT,0 days 00:34:51
1543634,2024-01-18 09:45:56-05:00,Colquitt EMC,3,1,1,1,2024-01-18 09:45:56-05:00,2024-01-18 09:45:56-05:00,0 days,2024-01-18 10:03:24-05:00,2024-01-18 10:33:24-05:00,1,0,NaT,NaT,0 days 00:30:00,2024-01-18 10:15:56-05:00,NaT,0 days 00:47:28,0 days 00:09:09,True,0 days 00:17:28,NaT,NaT,NaT,0 days 00:20:51
1543666,2024-01-18 10:16:45-05:00,Colquitt EMC,1,1,1,1,2024-01-18 10:16:45-05:00,2024-01-18 10:16:45-05:00,0 days,2024-01-18 10:33:24-05:00,2024-01-18 10:33:24-05:00,1,0,NaT,NaT,0 days 00:00:00,2024-01-18 10:16:45-05:00,NaT,0 days 00:16:39,0 days 00:13:19,True,0 days 00:16:39,NaT,NaT,NaT,0 days 00:13:19


## Summary stats for Colquitt to see distributions in metrics across all outages


In [27]:
def summary_stats(validated):
    # non_nat_values = validated['timestampstart_endstart_dur_error'][~validated['timestampstart_endstart_dur_error'].isna()]

    subdf = {
        "Provider": validated['utility_provider'].iloc[0],
        "Num Unique Outages": len(validated),
        "Avg Num Rows": validated['num_rows'].mean(),
        "Avg Num Unique Longitude": validated['num_unique_long'].mean(),
        "Avg Num Unique Latitude": validated['num_unique_lat'].mean(),
        "Avg Num Unique start_time": validated['num_unique_start_times'].mean(),
        "Median Num Unique start_time": validated['num_unique_start_times'].median(),
        "Max Num Unique start_time": validated['num_unique_start_times'].max(),
        "Proportion of Outages with Multiple start_time": f"{len(validated[validated['num_unique_start_times']>1])/len(validated)} ({len(validated[validated['num_unique_start_times']>1])} outages)",
        "Avg Diff in start_time": validated['diff_in_start'].mean(),
        "Median Diff in start_time": validated['diff_in_start'].median(),
        "Max Diff in start_time": validated['diff_in_start'].max(),
        "Avg Num Unique latestUpdatedTime": validated['num_unique_latestUpdatedTime'].mean(),
        "Avg Num Unique etrTime": validated['num_unique_etrTime'].mean(), 
        "Max Num Unique etrTime": validated['num_unique_etrTime'].max(),
        "Num of Outages with an non-NA etrTime": f"{(len(validated) - len(validated[validated['num_unique_etrTime'] == 0]))} ({(len(validated) - len(validated[validated['num_unique_etrTime'] == 0]))/len(validated)})",
        "is_timestamp_dur_div_15_min freq (with 5 sec of error)": len(validated[validated['is_timestamp_dur_div_15_min']]) / len(validated),
        "Avg duration_by_timestamp": validated['duration_by_timestamp'].mean(),
        "Max duration_by_timestamp": validated['duration_by_timestamp'].max(),
        "Min duration_by_timestamp": validated['duration_by_timestamp'].min(),
        "Avg duration_by_timestampstart": validated['duration_by_timestampstart'].mean(),
        "Avg duration_by_lutstart": validated['duration_by_lutstart'].mean(),
        "Avg duration_by_etrstart": validated['duration_by_etrstart'].mean(),
        "Avg timestampstart_timestamp_dur_error": validated['timestampstart_timestamp_dur_error'].mean(),  
        "Median timestampstart_timestamp_dur_error": validated['timestampstart_timestamp_dur_error'].median(),
        "Max timestampstart_timestamp_dur_error": validated['timestampstart_timestamp_dur_error'].max(),
        "Proportion of timestampstart dur within 30 min of timestamp dur": f"{len(validated[abs(validated['timestampstart_timestamp_dur_error']) < timedelta(minutes=30)])/len(validated)} ({len(validated[abs(validated['timestampstart_timestamp_dur_error']) < timedelta(minutes=30)])})",
        "Avg lutstart_etrstart_dur_error": validated['lutstart_etrstart_dur_error'].mean(),
        "Max lutstart_etrstart_dur_error": validated['lutstart_etrstart_dur_error'].max(),
        "Avg etrstart_timestamp_dur_error": validated['etrstart_timestamp_dur_error'].mean(), 
        "Max etrstart_timestamp_dur_error": validated['etrstart_timestamp_dur_error'].max()
    }
    return subdf

In [28]:
# for outages grouped by only outage_id
summary = []
summary.append(summary_stats(validated))
summary = pd.DataFrame(summary) 

# for outages group by outage_id and start_date
os_sum = []
os_sum.append(summary_stats(os_validated))
os_sum = pd.DataFrame(os_sum)

print("Grouped by Outage-ID only Summary")
display(summary)
print("Grouped by Outage_ID and startTime summary")
display(os_sum)

Grouped by Outage-ID only Summary


Unnamed: 0,Provider,Num Unique Outages,Avg Num Rows,Avg Num Unique Longitude,Avg Num Unique Latitude,Avg Num Unique start_time,Median Num Unique start_time,Max Num Unique start_time,Proportion of Outages with Multiple start_time,Avg Diff in start_time,Median Diff in start_time,Max Diff in start_time,Avg Num Unique latestUpdatedTime,Avg Num Unique etrTime,Max Num Unique etrTime,Num of Outages with an non-NA etrTime,is_timestamp_dur_div_15_min freq (with 5 sec of error),Avg duration_by_timestamp,Max duration_by_timestamp,Min duration_by_timestamp,Avg duration_by_timestampstart,Avg duration_by_lutstart,Avg duration_by_etrstart,Avg timestampstart_timestamp_dur_error,Median timestampstart_timestamp_dur_error,Max timestampstart_timestamp_dur_error,Proportion of timestampstart dur within 30 min of timestamp dur,Avg lutstart_etrstart_dur_error,Max lutstart_etrstart_dur_error,Avg etrstart_timestamp_dur_error,Max etrstart_timestamp_dur_error
0,Colquitt EMC,17138,24.725522,1.091609,1.091668,1.011962,1.0,3,0.01190337262224297 (204 outages),0 days 00:06:50.648383708,0 days,11 days 01:59:42,1.609464,0.902614,4,14720 (0.8589100245069436),0.947193,0 days 06:23:37.434545236,24 days 17:00:01,0 days,0 days 08:33:38.558255339,0 days 04:58:47.046271443,0 days 02:05:27.898369565,0 days 02:10:02.185025197,0 days 00:12:35,8 days 06:37:17,0.8829501692146108 (15132),0 days 03:43:52.913247282,12 days 20:02:41,0 days 05:35:43.446018940,24 days 15:00:01


Grouped by Outage_ID and startTime summary


Unnamed: 0,Provider,Num Unique Outages,Avg Num Rows,Avg Num Unique Longitude,Avg Num Unique Latitude,Avg Num Unique start_time,Median Num Unique start_time,Max Num Unique start_time,Proportion of Outages with Multiple start_time,Avg Diff in start_time,Median Diff in start_time,Max Diff in start_time,Avg Num Unique latestUpdatedTime,Avg Num Unique etrTime,Max Num Unique etrTime,Num of Outages with an non-NA etrTime,is_timestamp_dur_div_15_min freq (with 5 sec of error),Avg duration_by_timestamp,Max duration_by_timestamp,Min duration_by_timestamp,Avg duration_by_timestampstart,Avg duration_by_lutstart,Avg duration_by_etrstart,Avg timestampstart_timestamp_dur_error,Median timestampstart_timestamp_dur_error,Max timestampstart_timestamp_dur_error,Proportion of timestampstart dur within 30 min of timestamp dur,Avg lutstart_etrstart_dur_error,Max lutstart_etrstart_dur_error,Avg etrstart_timestamp_dur_error,Max etrstart_timestamp_dur_error
0,Colquitt EMC,17343,24.433258,1.080955,1.081013,1.0,1.0,1,0.0 (0 outages),0 days,0 days,0 days,1.59044,0.891945,3,14871 (0.8574641065559592),0.947472,0 days 06:18:52.623629142,24 days 17:00:01,0 days,0 days 08:29:57.999110794,0 days 04:54:49.146341463,0 days 01:58:52.421357003,0 days 02:11:06.762404410,0 days 00:12:36,8 days 06:37:17,0.8827769128755117 (15310),0 days 03:42:10.460628068,12 days 20:02:41,0 days 05:32:37.284296028,24 days 15:00:01


Looking at the average durations, the average durations seem to be reasonable across both group_by_outage_id and group_by_outageid&startdate with the latter having slightly shorter durations.


However, given that we are able to identify outages with a unique start-time and the duration metrics given earlier seem proper, we will identify outages by a outageID&start_date basis.

Let's check out the distributions of each of the durations.

In [29]:
# for the group_by_outage_id df
print("For the group_by_outage_id df")
dur_by_timestamp_dist = validated['duration_by_timestamp'].describe()
dur_by_lutstart_dist = validated['duration_by_lutstart'].describe()
dur_by_etrstart_dist = validated['duration_by_etrstart'].describe()

print("Duration by timestamp distributions:")
print(dur_by_timestamp_dist)
print()

print("Duration by lastUpdatedTime - startTime distribution:")
print(dur_by_lutstart_dist)
print()

print("Duration by etrTime - startTime distribution:")
print(dur_by_etrstart_dist)


print()
print()
print("For the group by outage_id and startTime df")
dur_by_timestamp_dist = os_validated['duration_by_timestamp'].describe()
dur_by_lutstart_dist = os_validated['duration_by_lutstart'].describe()
dur_by_etrstart_dist = os_validated['duration_by_etrstart'].describe()

print("Duration by timestamp distributions:")
print(dur_by_timestamp_dist)
print()

print("Duration by lastUpdatedTime - startTime distribution:")
print(dur_by_lutstart_dist)
print()

print("Duration by etrTime - startTime distribution:")
print(dur_by_etrstart_dist)



For the group_by_outage_id df
Duration by timestamp distributions:
count                        16668
mean     0 days 06:23:37.434545236
std      0 days 17:34:00.495489119
min                0 days 00:00:00
25%                0 days 00:15:00
50%                0 days 01:14:59
75%                0 days 04:15:01
max               24 days 17:00:01
Name: duration_by_timestamp, dtype: object

Duration by lastUpdatedTime - startTime distribution:
count                        17138
mean     0 days 04:58:47.046271443
std      0 days 15:59:21.545195059
min              -1 days +23:52:44
25%                0 days 00:04:31
50%                0 days 00:08:24
75%                0 days 01:07:38
max               12 days 22:02:41
Name: duration_by_lutstart, dtype: object

Duration by etrTime - startTime distribution:
count                        14720
mean     0 days 02:05:27.898369565
std      0 days 02:23:56.619620105
min                0 days 01:00:00
25%                0 days 02:00:00
50%        

In [30]:
# Extraneous duration_by_timestamp error
max_dur_by_ts_id = validated['duration_by_timestamp'].idxmax()
display(validated[validated.index == max_dur_by_ts_id])

# Extraneous duration by lutstart
max_dur_by_ls_id = validated['duration_by_lutstart'].idxmax()
min_dur_by_ls_id = validated['duration_by_lutstart'].idxmin()
display(validated[validated.index == max_dur_by_ls_id])

Unnamed: 0_level_0,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1454100,Colquitt EMC,2189,1,1,1,2023-07-14 20:37:23-04:00,2023-07-14 20:37:23-04:00,0 days,2023-07-14 20:48:23-04:00,2023-08-08 13:48:24-04:00,5,1,2023-07-14 22:37:23-04:00,2023-07-14 22:37:23-04:00,24 days 17:00:01,2023-08-08 13:37:24-04:00,0 days 02:00:00,24 days 17:11:01,12 days 22:02:41,True,0 days 00:11:00,24 days 15:00:01,24 days 15:11:01,12 days 20:02:41,11 days 18:57:20


Unnamed: 0_level_0,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1454100,Colquitt EMC,2189,1,1,1,2023-07-14 20:37:23-04:00,2023-07-14 20:37:23-04:00,0 days,2023-07-14 20:48:23-04:00,2023-08-08 13:48:24-04:00,5,1,2023-07-14 22:37:23-04:00,2023-07-14 22:37:23-04:00,24 days 17:00:01,2023-08-08 13:37:24-04:00,0 days 02:00:00,24 days 17:11:01,12 days 22:02:41,True,0 days 00:11:00,24 days 15:00:01,24 days 15:11:01,12 days 20:02:41,11 days 18:57:20


This outlier outage in duration_timestamp and duration_by_lastUpdatedTime-start are both the same outage_id.
It has one unique start, one unique location, one unique etrTime. However, only its duration by etrTime - startTime makes sense of 2 hours while every other duration metric is several weeks - including duration by timestamp.

Let's check out the negative duration_by_etrStart.

In [31]:
# outage ID where duration by lastUpdatedTime - startTime is negative
display(validated[validated.index == min_dur_by_ls_id])
min_dur_by_ls_temp = df[df['outage_id'] == min_dur_by_ls_id]
display(min_dur_by_ls_temp) # id = 1425483

Unnamed: 0_level_0,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1425483,Colquitt EMC,13,1,1,1,2023-06-13 18:02:22-04:00,2023-06-13 18:02:22-04:00,0 days,2023-06-13 17:24:56-04:00,2023-06-13 18:03:23-04:00,3,1,2023-06-13 21:02:22-04:00,2023-06-13 21:02:22-04:00,0 days 00:38:27,2023-06-13 18:40:49-04:00,0 days 03:00:00,0 days 00:01:01,-1 days +23:52:44,False,0 days 00:37:26,0 days 02:21:33,0 days 02:58:59,0 days 03:07:16,0 days 00:45:43


Unnamed: 0,outage_id,type,start_time,lastUpdatedTime,etrTime,title,customer_affected,status,cause,identifier,lat,long,description,city,state,utility_provider,zipcode,timestamp
29116,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:24:56-04:00
29214,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:26:00-04:00
29318,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:28:02-04:00
29422,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:30:41-04:00
29573,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:32:01-04:00
29724,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:34:40-04:00
29875,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:36:22-04:00
30096,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:39:21-04:00
30318,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:05:05-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-NEW,Colquitt EMC,31768,2023-06-13 17:41:28-04:00
30539,1425483,OUTAGE,2023-06-13 18:02:22-04:00,2023-06-13 17:45:06-04:00,2023-06-13 21:02:22-04:00,Outage,960,,Manual,41124,31.21651,-83.77633,Device Operation (nms1),Moultrie,S-INC,Colquitt EMC,31768,2023-06-13 17:48:22-04:00


We see that even if there is one edge case, there are instances where the reported lastUpdatedTime is before the first startTime thus invalidating using lastUpdatedTime as a endTime metric.
For this edge case, the duration by etrTime - startTime is suspiciously exactly 3 hours while the duration by timestamp seems the most likely.

Let's check out extraneous duration_by_etrStart

In [32]:
# Extraneous duration by etrstart
# Max
max_dur_by_etrs_id = validated['duration_by_etrstart'].idxmax()
print("Outage ID where duration_by_etrstart is the longest")
display(validated[validated.index == max_dur_by_etrs_id])

display(df[df['outage_id'] == max_dur_by_etrs_id])

# Min
min_dur_by_etrs_id = validated['duration_by_etrstart'].idxmin()
print("Outage ID where duration_by_etrstart is the shortest")
display(validated[validated.index == min_dur_by_etrs_id])

display(df[df['outage_id'] == min_dur_by_etrs_id])

Outage ID where duration_by_etrstart is the longest


Unnamed: 0_level_0,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1502604,Colquitt EMC,489,2,2,2,2023-10-18 09:36:45-04:00,2023-10-23 16:45:18-04:00,5 days 07:08:33,2023-10-18 09:48:24-04:00,2023-10-23 19:18:24-04:00,4,2,2023-10-18 11:36:45-04:00,2023-10-23 19:45:18-04:00,5 days 09:30:00,2023-10-23 19:06:45-04:00,5 days 10:08:33,5 days 09:41:39,5 days 07:28:19,True,0 days 00:11:39,0 days 00:38:33,0 days 00:26:54,0 days 02:40:14,0 days 02:01:41


Unnamed: 0,outage_id,type,start_time,lastUpdatedTime,etrTime,title,customer_affected,status,cause,identifier,lat,long,description,city,state,utility_provider,zipcode,timestamp
374245,1502604,OUTAGE,2023-10-18 09:36:45-04:00,2023-10-18 09:40:05-04:00,2023-10-18 11:36:45-04:00,Outage,1,,TROUBLE_CALL,62493,31.26506,-83.29296,New Prediction,Tifton,NEW,Colquitt EMC,unknown,2023-10-18 09:48:24-04:00
374256,1502604,OUTAGE,2023-10-18 09:36:45-04:00,2023-10-18 09:50:04-04:00,2023-10-18 11:36:45-04:00,Outage,1,,TROUBLE_CALL,62493,31.26506,-83.29296,New Prediction,Tifton,SNT,Colquitt EMC,unknown,2023-10-18 10:03:24-04:00
374267,1502604,OUTAGE,2023-10-18 09:36:45-04:00,2023-10-18 09:50:04-04:00,2023-10-18 11:36:45-04:00,Outage,1,,TROUBLE_CALL,62493,31.26506,-83.29296,New Prediction,Tifton,SNT,Colquitt EMC,unknown,2023-10-18 10:18:24-04:00
374278,1502604,OUTAGE,2023-10-18 09:36:45-04:00,2023-10-18 09:50:04-04:00,2023-10-18 11:36:45-04:00,Outage,1,,TROUBLE_CALL,62493,31.26506,-83.29296,New Prediction,Tifton,SNT,Colquitt EMC,unknown,2023-10-18 10:33:24-04:00
374287,1502604,OUTAGE,2023-10-18 09:36:45-04:00,2023-10-18 09:50:04-04:00,2023-10-18 11:36:45-04:00,Outage,1,,TROUBLE_CALL,62493,31.26506,-83.29296,New Prediction,Tifton,SNT,Colquitt EMC,unknown,2023-10-18 10:48:24-04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376816,1502604,OUTAGE,2023-10-23 16:45:18-04:00,2023-10-23 17:05:04-04:00,2023-10-23 19:45:18-04:00,Outage,867,,Manual,62493,31.20015,-83.30167,Device Operation (nms1),Tifton,S-RST,Colquitt EMC,31639,2023-10-23 18:18:24-04:00
376830,1502604,OUTAGE,2023-10-23 16:45:18-04:00,2023-10-23 17:05:04-04:00,2023-10-23 19:45:18-04:00,Outage,867,,Manual,62493,31.20015,-83.30167,Device Operation (nms1),Tifton,S-RST,Colquitt EMC,31639,2023-10-23 18:33:23-04:00
376836,1502604,OUTAGE,2023-10-23 16:45:18-04:00,2023-10-23 17:05:04-04:00,2023-10-23 19:45:18-04:00,Outage,867,,Manual,62493,31.20015,-83.30167,Device Operation (nms1),Tifton,S-RST,Colquitt EMC,31639,2023-10-23 18:48:23-04:00
376842,1502604,OUTAGE,2023-10-23 16:45:18-04:00,2023-10-23 17:05:04-04:00,2023-10-23 19:45:18-04:00,Outage,867,,Manual,62493,31.20015,-83.30167,Device Operation (nms1),Tifton,S-RST,Colquitt EMC,31639,2023-10-23 19:03:24-04:00


Outage ID where duration_by_etrstart is the shortest


Unnamed: 0_level_0,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
outage_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1388519,Colquitt EMC,2,1,1,1,2023-03-16 08:36:58-04:00,2023-03-16 08:36:58-04:00,0 days,NaT,NaT,2,1,2023-03-16 09:36:58-04:00,2023-03-16 09:36:58-04:00,NaT,NaT,0 days 01:00:00,NaT,0 days 00:13:05,False,NaT,NaT,NaT,0 days 00:46:55,NaT


Unnamed: 0,outage_id,type,start_time,lastUpdatedTime,etrTime,title,customer_affected,status,cause,identifier,lat,long,description,city,state,utility_provider,zipcode,timestamp
8,1388519,OUTAGE,2023-03-16 08:36:58-04:00,2023-03-16 08:40:03-04:00,2023-03-16 09:36:58-04:00,Outage,2,,TROUBLE_CALL,32825,31.16408,-83.8673,Revised Prediction,Moultrie,NEW,Colquitt EMC,31768,NaT
9,1388519,OUTAGE,2023-03-16 08:36:58-04:00,2023-03-16 08:50:03-04:00,2023-03-16 09:36:58-04:00,Outage,2,,TROUBLE_CALL,32825,31.16408,-83.8673,Revised Prediction,Moultrie,SNT,Colquitt EMC,31768,NaT


ID 1502604 has a max duration_by_etrTime of 5 days, and this 5 day duration seems to be consistent across all duration metrics.


#### Checking time columns where the time is before 2023-01-01 which could cause extraneous durations.

The following codeblock was to validate which columns had times before 2023-01-01 (before when the scraper begins).
The logic to find those rows with those extraneous times and replace them with NaT is implemented.

In [33]:
time_col = ['start_time', 'lastUpdatedTime','etrTime', 'timestamp']
for col in time_col:
    rem = df[df[col] <= '2023-01-01 23:59:59-05:00']
    print(f"# of outages where the {col} is before the scraping begins: {rem['outage_id'].nunique()}")

pre_time_outage_list = df[df['etrTime'] <= '2023-01-01 23:59:59-05:00']['outage_id'].unique().tolist() # 1618 outages
removed_pre = df[~df['outage_id'].isin(pre_time_outage_list)]
extraneous_etr_outage_list = df[(df['etrTime'] <= '2023-01-01 23:59:59-05:00') | (df['etrTime'].isna())]['outage_id'].unique().tolist()
removed_ext_etr = df[~df['outage_id'].isin(extraneous_etr_outage_list)]

# Revalidating and redoing summary of filtered df with pre_time etrTimes removed
rem_pre_val = removed_pre.groupby('outage_id').apply(_validate, include_groups=False).reset_index()
print("Validated outages where rows with etrTimes before 2023-01-01 are removed")
display(rem_pre_val)
summary2 = []
summary2.append(summary_stats(rem_pre_val))
summary2 = pd.DataFrame(summary2) 

# Revalidating and redoing summary for excluding all extraneous etrTime
rem_ext_etr_val = removed_ext_etr.groupby('outage_id').apply(_validate, include_groups=False).reset_index()
# display(validated3)
summary3 = []
summary3.append(summary_stats(rem_ext_etr_val))
summary3 = pd.DataFrame(summary3) 

# revalidating and redoing summary for drop.na all columns?

# Comparing all 3 summaries
print("Original summary of df with all rows:")
display(summary)
print("New summary of df with etrTime < 03-2023 removed:")
display(summary2)
print("Summary of df with etrTime < 03-2023 and etrTime == NA removed")
display(summary3)

#

# of outages where the start_time is before the scraping begins: 0
# of outages where the lastUpdatedTime is before the scraping begins: 0
# of outages where the etrTime is before the scraping begins: 0
# of outages where the timestamp is before the scraping begins: 0


Validated outages where rows with etrTimes before 2023-01-01 are removed


Unnamed: 0,outage_id,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
0,1387175,Colquitt EMC,1,1,1,1,2023-03-12 14:37:34-04:00,2023-03-12 14:37:34-04:00,0 days,NaT,NaT,1,0,NaT,NaT,NaT,NaT,NaT,NaT,1 days 02:32:31,False,NaT,NaT,NaT,NaT,NaT
1,1387940,Colquitt EMC,1,1,1,1,2023-03-15 06:33:39-04:00,2023-03-15 06:33:39-04:00,0 days,NaT,NaT,1,1,2023-03-15 08:33:39-04:00,2023-03-15 08:33:39-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:11:25,False,NaT,NaT,NaT,0 days 01:48:35,NaT
2,1388250,Colquitt EMC,1,1,1,1,2023-03-15 13:13:54-04:00,2023-03-15 13:13:54-04:00,0 days,NaT,NaT,1,1,2023-03-15 15:13:54-04:00,2023-03-15 15:13:54-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:06:11,False,NaT,NaT,NaT,0 days 01:53:49,NaT
3,1388449,Colquitt EMC,1,1,1,1,2023-03-15 19:05:17-04:00,2023-03-15 19:05:17-04:00,0 days,NaT,NaT,1,1,2023-03-15 21:05:17-04:00,2023-03-15 21:05:17-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:04:48,False,NaT,NaT,NaT,0 days 01:55:12,NaT
4,1388470,Colquitt EMC,2,1,1,1,2023-03-15 21:28:52-04:00,2023-03-15 21:28:52-04:00,0 days,NaT,NaT,2,1,2023-03-15 23:28:52-04:00,2023-03-15 23:28:52-04:00,NaT,NaT,0 days 02:00:00,NaT,0 days 00:06:12,False,NaT,NaT,NaT,0 days 01:53:48,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17133,1543621,Colquitt EMC,2,1,1,1,2024-01-18 09:03:45-05:00,2024-01-18 09:03:45-05:00,0 days,2024-01-18 09:18:24-05:00,2024-01-18 09:33:23-05:00,1,0,NaT,NaT,0 days 00:14:59,2024-01-18 09:18:44-05:00,NaT,0 days 00:29:38,0 days 00:06:20,True,0 days 00:14:39,NaT,NaT,NaT,0 days 00:08:39
17134,1543632,Colquitt EMC,4,1,1,1,2024-01-18 09:39:54-05:00,2024-01-18 09:39:54-05:00,0 days,2024-01-18 09:48:23-05:00,2024-01-18 10:33:24-05:00,2,0,NaT,NaT,0 days 00:45:01,2024-01-18 10:24:55-05:00,NaT,0 days 00:53:30,0 days 00:10:10,True,0 days 00:08:29,NaT,NaT,NaT,0 days 00:34:51
17135,1543634,Colquitt EMC,3,1,1,1,2024-01-18 09:45:56-05:00,2024-01-18 09:45:56-05:00,0 days,2024-01-18 10:03:24-05:00,2024-01-18 10:33:24-05:00,1,0,NaT,NaT,0 days 00:30:00,2024-01-18 10:15:56-05:00,NaT,0 days 00:47:28,0 days 00:09:09,True,0 days 00:17:28,NaT,NaT,NaT,0 days 00:20:51
17136,1543666,Colquitt EMC,1,1,1,1,2024-01-18 10:16:45-05:00,2024-01-18 10:16:45-05:00,0 days,2024-01-18 10:33:24-05:00,2024-01-18 10:33:24-05:00,1,0,NaT,NaT,0 days 00:00:00,2024-01-18 10:16:45-05:00,NaT,0 days 00:16:39,0 days 00:13:19,True,0 days 00:16:39,NaT,NaT,NaT,0 days 00:13:19


Original summary of df with all rows:


Unnamed: 0,Provider,Num Unique Outages,Avg Num Rows,Avg Num Unique Longitude,Avg Num Unique Latitude,Avg Num Unique start_time,Median Num Unique start_time,Max Num Unique start_time,Proportion of Outages with Multiple start_time,Avg Diff in start_time,Median Diff in start_time,Max Diff in start_time,Avg Num Unique latestUpdatedTime,Avg Num Unique etrTime,Max Num Unique etrTime,Num of Outages with an non-NA etrTime,is_timestamp_dur_div_15_min freq (with 5 sec of error),Avg duration_by_timestamp,Max duration_by_timestamp,Min duration_by_timestamp,Avg duration_by_timestampstart,Avg duration_by_lutstart,Avg duration_by_etrstart,Avg timestampstart_timestamp_dur_error,Median timestampstart_timestamp_dur_error,Max timestampstart_timestamp_dur_error,Proportion of timestampstart dur within 30 min of timestamp dur,Avg lutstart_etrstart_dur_error,Max lutstart_etrstart_dur_error,Avg etrstart_timestamp_dur_error,Max etrstart_timestamp_dur_error
0,Colquitt EMC,17138,24.725522,1.091609,1.091668,1.011962,1.0,3,0.01190337262224297 (204 outages),0 days 00:06:50.648383708,0 days,11 days 01:59:42,1.609464,0.902614,4,14720 (0.8589100245069436),0.947193,0 days 06:23:37.434545236,24 days 17:00:01,0 days,0 days 08:33:38.558255339,0 days 04:58:47.046271443,0 days 02:05:27.898369565,0 days 02:10:02.185025197,0 days 00:12:35,8 days 06:37:17,0.8829501692146108 (15132),0 days 03:43:52.913247282,12 days 20:02:41,0 days 05:35:43.446018940,24 days 15:00:01


New summary of df with etrTime < 03-2023 removed:


Unnamed: 0,Provider,Num Unique Outages,Avg Num Rows,Avg Num Unique Longitude,Avg Num Unique Latitude,Avg Num Unique start_time,Median Num Unique start_time,Max Num Unique start_time,Proportion of Outages with Multiple start_time,Avg Diff in start_time,Median Diff in start_time,Max Diff in start_time,Avg Num Unique latestUpdatedTime,Avg Num Unique etrTime,Max Num Unique etrTime,Num of Outages with an non-NA etrTime,is_timestamp_dur_div_15_min freq (with 5 sec of error),Avg duration_by_timestamp,Max duration_by_timestamp,Min duration_by_timestamp,Avg duration_by_timestampstart,Avg duration_by_lutstart,Avg duration_by_etrstart,Avg timestampstart_timestamp_dur_error,Median timestampstart_timestamp_dur_error,Max timestampstart_timestamp_dur_error,Proportion of timestampstart dur within 30 min of timestamp dur,Avg lutstart_etrstart_dur_error,Max lutstart_etrstart_dur_error,Avg etrstart_timestamp_dur_error,Max etrstart_timestamp_dur_error
0,Colquitt EMC,17138,24.725522,1.091609,1.091668,1.011962,1.0,3,0.01190337262224297 (204 outages),0 days 00:06:50.648383708,0 days,11 days 01:59:42,1.609464,0.902614,4,14720 (0.8589100245069436),0.947193,0 days 06:23:37.434545236,24 days 17:00:01,0 days,0 days 08:33:38.558255339,0 days 04:58:47.046271443,0 days 02:05:27.898369565,0 days 02:10:02.185025197,0 days 00:12:35,8 days 06:37:17,0.8829501692146108 (15132),0 days 03:43:52.913247282,12 days 20:02:41,0 days 05:35:43.446018940,24 days 15:00:01


Summary of df with etrTime < 03-2023 and etrTime == NA removed


Unnamed: 0,Provider,Num Unique Outages,Avg Num Rows,Avg Num Unique Longitude,Avg Num Unique Latitude,Avg Num Unique start_time,Median Num Unique start_time,Max Num Unique start_time,Proportion of Outages with Multiple start_time,Avg Diff in start_time,Median Diff in start_time,Max Diff in start_time,Avg Num Unique latestUpdatedTime,Avg Num Unique etrTime,Max Num Unique etrTime,Num of Outages with an non-NA etrTime,is_timestamp_dur_div_15_min freq (with 5 sec of error),Avg duration_by_timestamp,Max duration_by_timestamp,Min duration_by_timestamp,Avg duration_by_timestampstart,Avg duration_by_lutstart,Avg duration_by_etrstart,Avg timestampstart_timestamp_dur_error,Median timestampstart_timestamp_dur_error,Max timestampstart_timestamp_dur_error,Proportion of timestampstart dur within 30 min of timestamp dur,Avg lutstart_etrstart_dur_error,Max lutstart_etrstart_dur_error,Avg etrstart_timestamp_dur_error,Max etrstart_timestamp_dur_error
0,Colquitt EMC,14338,20.564793,1.070372,1.070582,1.009973,1.0,2,0.009973497000976427 (143 outages),0 days 00:03:09.505300599,0 days,5 days 07:08:33,1.550844,1.048473,4,14338 (1.0),0.938206,0 days 05:18:00.092029403,24 days 17:00:01,0 days,0 days 05:33:43.670005765,0 days 02:29:40.989329055,0 days 02:01:13.054679871,0 days 00:15:44.529115018,0 days 00:11:39,8 days 06:37:17,0.9383456549030548 (13454),0 days 03:27:36.457037243,12 days 20:02:41,0 days 05:11:12.252306140,24 days 15:00:01


Comparing summary with the original df and summary2 with the pre_2023 outages removed, the average duration by etrTime-startTime looks much more accurate now (2 hours instead of -1521 days).

The duration errors have also gone down considerably.

In [34]:

dur_by_etrstart_dist = rem_pre_val['duration_by_etrstart'].describe()
print("Duration by etrTime - startTime distribution for outages without pre_2023 errors:")
print(dur_by_etrstart_dist)

dur_by_etrstart_dist2 = rem_ext_etr_val['duration_by_etrstart'].describe()
print("Duration by etrTime - startTime distribution for outages without pre_2023 errors and without etrNulls:")
print(dur_by_etrstart_dist2)


new_max_id = rem_pre_val['duration_by_etrstart'].idxmax()
new_max_id2 = rem_ext_etr_val['duration_by_etrstart'].idxmax()
temp = df[df['outage_id'] == new_max_id]
display(rem_pre_val[rem_pre_val.index == new_max_id])
display(rem_ext_etr_val[rem_ext_etr_val.index == new_max_id2])

Duration by etrTime - startTime distribution for outages without pre_2023 errors:
count                        14720
mean     0 days 02:05:27.898369565
std      0 days 02:23:56.619620105
min                0 days 01:00:00
25%                0 days 02:00:00
50%                0 days 02:00:00
75%                0 days 02:00:00
max                5 days 10:08:33
Name: duration_by_etrstart, dtype: object
Duration by etrTime - startTime distribution for outages without pre_2023 errors and without etrNulls:
count                        14338
mean     0 days 02:01:13.054679871
std      0 days 01:31:10.114548561
min                0 days 01:00:00
25%                0 days 02:00:00
50%                0 days 02:00:00
75%                0 days 02:00:00
max                5 days 10:08:33
Name: duration_by_etrstart, dtype: object


Unnamed: 0,outage_id,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
14111,1502604,Colquitt EMC,489,2,2,2,2023-10-18 09:36:45-04:00,2023-10-23 16:45:18-04:00,5 days 07:08:33,2023-10-18 09:48:24-04:00,2023-10-23 19:18:24-04:00,4,2,2023-10-18 11:36:45-04:00,2023-10-23 19:45:18-04:00,5 days 09:30:00,2023-10-23 19:06:45-04:00,5 days 10:08:33,5 days 09:41:39,5 days 07:28:19,True,0 days 00:11:39,0 days 00:38:33,0 days 00:26:54,0 days 02:40:14,0 days 02:01:41


Unnamed: 0,outage_id,utility_provider,num_rows,num_unique_long,num_unique_lat,num_unique_start_times,earliest_start_time,latest_start_time,diff_in_start,earliest_timestamp,latest_timestamp,num_unique_latestUpdatedTime,num_unique_etrTime,earliest_etrTime,latest_etrTime,duration_by_timestamp,endtime_by_timestamp_start,duration_by_etrstart,duration_by_timestampstart,duration_by_lutstart,is_timestamp_dur_div_15_min,timestampstart_timestamp_dur_error,etrstart_timestamp_dur_error,etrstart_timestampstart_dur_error,lutstart_etrstart_dur_error,lutstart_timestamp_dur_error
12508,1502604,Colquitt EMC,489,2,2,2,2023-10-18 09:36:45-04:00,2023-10-23 16:45:18-04:00,5 days 07:08:33,2023-10-18 09:48:24-04:00,2023-10-23 19:18:24-04:00,4,2,2023-10-18 11:36:45-04:00,2023-10-23 19:45:18-04:00,5 days 09:30:00,2023-10-23 19:06:45-04:00,5 days 10:08:33,5 days 09:41:39,5 days 07:28:19,True,0 days 00:11:39,0 days 00:38:33,0 days 00:26:54,0 days 02:40:14,0 days 02:01:41


For the outage with this new max, the duration is consistent across different metrics.

#### Looking across the different durations and their extremes.
- We will use etrTime and timestamp as potential time metrics.
- We will use etrTime as the primary candidate and timestamp if etrTime is null.

# Conclusion:
- Unique outages denoted by "id" + "start_date"
- earliest start_time will be start_date of each unique outage.
- latest etr will be the end_time
- duration will be calculated by latestEtr - earliest_start_time
- if etr_time is null, we will calculate duration by latest timestamp - earliest timestmap. The endtime will be calculated by adding the timestampduration to start_date. 

# Running the Pipeline


Rerunning the configs

In [37]:
local_config_path = '/Users/uirja/OneDrive/Personal Files/CS Projects/outage-data-scraper/app/pipeline/config.yaml'
with open(local_config_path, 'r') as file:
    config = yaml.safe_load(file)
    base_file_path = config['globals']['local_base_file_path']

In [38]:
# ~2 min 30 sec run time
for provider in config['providers']:
    pipeline = GA5_old(provider, base_file_path)
    pipeline.standardize()
    display(pipeline._data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['startTime'] = extraneous_rows['startTime'].apply(reformat_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['lastUpdatedTime'] = extraneous_rows['lastUpdatedTime'].apply(reformat_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['etrTime'] = ex

Unnamed: 0,outage_id,start_time,end_time,lat,long,zipcode,county_name,county_fips,state,utility_provider,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time
0,1387175,2023-03-12 14:37:34-04:00,NaT,31.18,-83.61,31771,Colquitt,13071,Georgia,Colquitt EMC,NaT,NaT,1.0,NaT
1,1387940,2023-03-15 06:33:39-04:00,2023-03-15 08:33:39-04:00,30.99,-83.20,31605,Lowndes,13185,Georgia,Colquitt EMC,0 days 02:15:00,0 days 02:07:30,1.0,0 days 02:00:00
2,1388250,2023-03-15 13:13:54-04:00,2023-03-15 15:13:54-04:00,31.42,-83.61,31793,Tift,13277,Georgia,Colquitt EMC,0 days 02:15:00,0 days 02:07:30,1.0,0 days 02:00:00
3,1388449,2023-03-15 19:05:17-04:00,2023-03-15 21:05:17-04:00,30.93,-83.42,31632,Lowndes,13185,Georgia,Colquitt EMC,0 days 02:15:00,0 days 02:07:30,1.0,0 days 02:00:00
4,1388470,2023-03-15 21:28:52-04:00,2023-03-15 23:28:52-04:00,31.33,-83.58,31775,Colquitt,13071,Georgia,Colquitt EMC,0 days 02:15:00,0 days 02:07:30,1.0,0 days 02:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17338,1543621,2024-01-18 09:03:45-05:00,2024-01-18 09:18:44-05:00,31.37,-83.41,00000,,,,Colquitt EMC,0 days 00:29:59,0 days 00:22:29,1.0,0 days 00:14:59
17339,1543632,2024-01-18 09:39:54-05:00,2024-01-18 10:24:55-05:00,31.52,-83.64,00000,,,,Colquitt EMC,0 days 01:00:01,0 days 00:52:31,1.0,0 days 00:45:01
17340,1543634,2024-01-18 09:45:56-05:00,2024-01-18 10:15:56-05:00,31.27,-83.39,00000,,,,Colquitt EMC,0 days 00:45:00,0 days 00:37:30,63.0,1 days 07:30:00
17341,1543666,2024-01-18 10:16:45-05:00,2024-01-18 10:16:45-05:00,31.12,-83.78,00000,,,,Colquitt EMC,0 days 00:15:00,0 days 00:07:30,1.0,0 days 00:00:00


In [39]:
# gis = GIS("http://www.arcgis.com", "JK9035", "60129@GR0W3R5") # signing in to get access to arcGIS api
# def test_compute_metrics(group):
#     """
#     Generic method to compute standardized metrics, used for being apply in DataFrame.groupby method, 
#     given dataframe being transformed with standardized column names
#     """
#     def get_zipcode(long, lat):
#         location = reverse_geocode((Geometry({"x":float(long), "y":float(lat), "spatialReference":{"wkid": 4326}})))
#         return location['address']['Postal']
    
#     start_time = group['start_time'].min()
#     duration_diff = (group['etrTime'].max() - start_time).dt.total_seconds() / 60 if group['etrTime'].notna().all() else group['timestamp'].max() - group['timestamp'].min()
#     end_time = group['etrTime'].max() if group['etrTime'].notna().all() else start_time + duration_diff
#     zipcode = get_zipcode(long, lat)
#     county_name = self.geomap['zip_to_county_name'][zipcode] if (pd.notna(zipcode) and zipcode != '') else pd.NA
#     county_fips = self.geomap['zip_to_county_fips'][zipcode] if (pd.notna(zipcode) and zipcode != '') else pd.NA
#     state = self.geomap['zip_to_state_name'][zipcode] if (pd.notna(zipcode) and zipcode != '') else pd.NA
#     utility_provider = group['utility_provider'].iloc[-1]
#     duration_max = duration_diff + 15
#     duration_mean = (duration_diff + duration_max) / 2
#     customer_affected_mean = group['customer_affected'].mean()
#     total_customer_outage_time = customer_affected_mean * duration_diff
    
#     lat = group['lat'].iloc[-1]
#     long = group['long'].iloc[-1]
    
#     return pd.Series({
#         'start_time': start_time,
#         'end_time': end_time,
#         'lat': lat,
#         'long': long,
#         'zipcode': zipcode,
#         'county_name': county_name,
#         'county_fips': county_fips,
#         'state': state,
#         'utility_provider': utility_provider,
#         'duration_max': duration_max,
#         'duration_mean': duration_mean,
#         'customer_affected_mean': customer_affected_mean,
#         'total_customer_outage_time': total_customer_outage_time
#     })

# Testing the New Base_Pipeline

In [40]:
class BasePipeline_new:
    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        self.type_to_prefix = {'o': 'per_outage', 'c': 'per_county', 'z': 'per_zipcode'} 
        self._data = pd.DataFrame({})
        self.geomap = {}
        self._load_data()
        self._load_geo_mapping()
        
    def _load_geo_mapping(self):
        try:
            with open('zip_to_county_name.json', 'r') as json_file:
                self.geomap['zip_to_county_name'] = json.load(json_file)
            with open('zip_to_county_fips.json', 'r') as json_file:
                self.geomap['zip_to_county_fips'] = json.load(json_file)
            with open('zip_to_state_name.json', 'r') as json_file:
                self.geomap['zip_to_state_name'] = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during geo map loading: {e}") 

    def _construct_file_path(self):
        file_prefix = self.type_to_prefix[self.config['type']]
        file_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/{file_prefix}_{self.config['name']}.csv"
        return file_path.replace('//', '/')

    def _load_data(self):
        try:
            file_path = self._construct_file_path()
            self._data = pd.read_csv(file_path)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")
            
    def transform(self, **kwargs):
        raise NotImplementedError
    
    # TODO: remove 
    def standardize(self):
        """
        obslete
        """
        self._load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')
        
        self._data['state'] = self.config['state']
        if self.config['state'] != 'ca':
            self._data['utility_provider'] = self.config['name'] 
            self._data['county'] = self._data['zipcode'].map(self.geomap) 
        
        self._data = self._data[[
            'utility_provider', 'state', 'county', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng', 
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean', 'total_customer_outage_time', 'total_customer_outage_time_max', 'total_customer_outage_time_mean'
        ]]
        
        return self._data
    
    def to_incident_level(self, identifers=['outage_id'], method='id_grouping'):
        """
        identifer: default identifier name "outage_id", or list like ['IncidentId', 'lat', 'lng', 'subgroup']
        method: "id_grouping" or "timegap_seperation" 

        This method will replace the standardize basically
        """
        df = self.transform(identifiers=identifers, method=method)
        grouped = df.groupby('id').apply(self._agg_vars).reset_index().round(2)
        
        return grouped
    
    def to_geoarea_level(self, geo_level='zipcode', time_interval='hourly'):
        """
        geo_level: 'zipcode', 'county', 'state'
        time_interval: 'hourly', 'daily', 'monthly'
        """    
        # TODO: complet geo_level and time_interval support
        eastern = tz.gettz('US/Eastern')
        self._data['timestamp'] = pd.to_datetime(self._data['timestamp'], utc=True).dt.tz_convert(eastern)
        self._data['year'] = self._data['timestamp'].dt.year
        self._data['month'] = self._data['timestamp'].dt.month
        self._data['day'] = self._data['timestamp'].dt.day
        self._data['hour'] = self._data['timestamp'].dt.hour
        
        df = self.transform(geo_level=geo_level, time_interval=time_interval)
        
        # element wise metrics computation
        df['duration_weight'] = 15
        df['outage_freq_x_cust_a'] = df['customer_affected'] * df['outage_count']
        df['cust_a_x_duration'] = df['customer_affected'] * df['duration_weight']
        
        # TODO: complet geo_level and time_interval support 
        keys = ['EMC', 'year', 'month', 'day', 'hour', geo_level]
        grouped = df.groupby(keys).agg({
            'customer_affected': 'mean',
            'customer_served': 'mean',
            'percent_customer_affected': 'mean',
            'outage_count': 'max',
            'duration_weight': 'sum',
            'outage_freq_x_cust_a': 'sum',
            'cust_a_x_duration': 'sum'
            }).reset_index()
        
        #TODO: fill non outage hours with 0
        
        return grouped
    
    def _agg_vars(self, group):
        first_timestamp = group['timestamp'].iloc[0]
        last_timestamp = group['timestamp'].iloc[-1]
        duration_diff = (last_timestamp - first_timestamp).total_seconds() / 60
        duration_15 = 15 * len(group)
        group['duration_weight'] = (group['timestamp'].diff().dt.total_seconds() / 60).round(0).fillna(15)
        cust_affected_x_duration = (group['customer_affected'] * group['duration_weight']).sum()
        cust_a_mean = cust_affected_x_duration / group['duration_weight'].sum()
        
        return pd.Series({
            'first_timestamp': first_timestamp,
            'last_timestamp': last_timestamp,
            'duration_diff': duration_diff,
            'duration_15': duration_15,
            'customer_affected_mean': cust_a_mean,
            'cust_affected_x_duration': cust_affected_x_duration
        })
    
    # TODO: remove 
    def _compute_metrics(self, group):
        """
        Generic method to compute standardized metrics, used for being apply in DataFrame.groupby method, 
        given dataframe being transformed with standardized column names
        """
        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        duration_max = duration + 15
        duration_mean = (duration + duration_max) / 2
        customer_affected_mean = group['customer_affected'].mean()
        
        total_customer_outage_time = 15 * (group['customer_affected'].sum() - group['customer_affected'].iloc[0]) + (group['timestamp'].iloc[0] - group['start_time'].iloc[0]).total_seconds() / 60 * group['customer_affected'].iloc[0]
        total_customer_outage_time_max = total_customer_outage_time + 15 * group['customer_affected'].iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })
    
    def save(self, path=None):
        raise NotImplementedError
    
    def get_dataframe(self):
        return self._data
    
    def _add_metadata(self):
        """
        #TODO: add state, provider variables
        """
        raise NotImplementedError
        
    def check_vars(self):
        # TODO: Check other useful variables
        pass


class GA5_new(BasePipeline_new):
    def transform(self, **kwargs):
        identifiers = kwargs.get('identifiers')
        method = kwargs.get('method')
        geo_level = kwargs.get('geo_level')
        time_interval = kwargs.get('time_interval')
        df = kwargs.get('dataframe', self._data.copy())

        df = df.rename(columns={
            'id':'outage_id',
            'startTime': 'start_time',
            'numPeople':'customer_affected',
            'zip_code':'zipcode',
            'EMC': 'utility_provider',
            'zip_code': 'zipcode',
            'latitude': 'lat',
            'longitude': 'long',
            'county': 'city'
        })
        
        eastern = tz.gettz('US/Eastern')
        utc = tz.gettz('UTC')

        def reformat_time(time):
            # format: 2024-01-18 09:04:15 but can also be in milliseconds since the Unix epoch (January 1, 1970, 00:00:00 UTC)
            eastern = tz.gettz('US/Eastern')
            utc = tz.gettz('UTC')
            if isinstance(time, str) and time.isdigit():
                # Convert millisecond timestamp to datetime
                return pd.to_datetime(int(time), unit='ms', utc=True).tz_convert(eastern)
                
            # else the time is string format of datetime
            elif isinstance(time, str) and ":" in time:
                return pd.to_datetime(time, utc=True).tz_convert(eastern)
            
            elif isinstance(time, datetime): # is datetime object already
                return time
            
            else: # is null or extraneous values that should be null (like '-1000')
                return pd.NaT

        try:
            """
            Transforming the dataframe
            - Some of the time columns has millisecond format, error codes, and NaN so we need to separately reformat before pd.to_datetime
            - etrTime has times before 2023-01-01, so we will set them to NaT
            - "county" values look like city names so renaming accordingly
            """
            # Masks for extracting rows with millisecond format, errors, or NA
            start_time_ms = df['start_time'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA)
            lastUptTime_ms = df['lastUpdatedTime'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA)
            etrTime_ms_null = df['etrTime'].apply(lambda x: (isinstance(x, str) and (x.isdigit() or ":" not in x)) or pd.NA) # or pd.NA
            timeSt_null = df['timestamp'].isna()
            extraneous_mask = start_time_ms | lastUptTime_ms | etrTime_ms_null | timeSt_null

            extraneous_rows = df[extraneous_mask]

            extraneous_rows['start_time'] = extraneous_rows['start_time'].apply(reformat_time)
            extraneous_rows['lastUpdatedTime'] = extraneous_rows['lastUpdatedTime'].apply(reformat_time)
            extraneous_rows['etrTime'] = extraneous_rows['etrTime'].apply(reformat_time)
            extraneous_rows['timestamp'] = extraneous_rows['timestamp'].apply(reformat_time)

            eastern = tz.gettz('US/Eastern')
            utc = tz.gettz('UTC')

            df.loc[extraneous_rows.index, 'start_time'] = extraneous_rows['start_time']
            df.loc[extraneous_rows.index, 'lastUpdatedTime'] = extraneous_rows['lastUpdatedTime']
            df.loc[extraneous_rows.index, 'etrTime'] = extraneous_rows['etrTime']
            df.loc[extraneous_rows.index, 'timestamp'] = extraneous_rows['timestamp']
        
            df['start_time'] = pd.to_datetime(df['start_time'], utc=True).dt.tz_convert(eastern)
            df['lastUpdatedTime'] = pd.to_datetime(df['lastUpdatedTime'], utc=True).dt.tz_convert(eastern)
            df['etrTime'] =pd.to_datetime(df['etrTime'], utc=True).dt.tz_convert(eastern)
            df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True).dt.tz_convert(eastern)

            time_col = ['start_time', 'lastUpdatedTime', 'etrTime', 'timestamp']
            minimum_datetime = pd.to_datetime('2023-01-01 23:59:59-05:00', utc=True).tz_convert('US/Eastern')
            for col in time_col:
                df.loc[self._data[col] < minimum_datetime, col] = pd.NaT

        except Exception as e:
            print(f"An error occurred during transformation: {e}")

        return df

    def to_incident_level(self, identifers=['outage_id', 'start_time'], method='id_grouping'):
        """
        identifer: default identifier name "outage_id", or list like ['IncidentId', 'lat', 'lng', 'subgroup']
        method: "id_grouping" or "timegap_seperation" 
        """
        df = self.transform(identifiers=identifers, method=method)
        grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)
        
        return grouped
    
    # gis = GIS("http://www.arcgis.com", "JK9035", "60129@GR0W3R5") # signing in to get access to arcGIS api; have to find 
    def _agg_vars(self, group):
        def get_zipcode(long, lat):
            location = reverse_geocode((Geometry({"x":float(long), "y":float(lat), "spatialReference":{"wkid": 4326}})))
            return location['address']['Postal']

        first_timestamp = group['timestamp'].iloc[0]
        last_timestamp = group['timestamp'].iloc[-1]
        duration_diff = (last_timestamp - first_timestamp).total_seconds() / 60
        duration_15 = 15 * len(group)
        group['duration_weight'] = (group['timestamp'].diff().dt.total_seconds() / 60).round(0).fillna(15)
        cust_affected_x_duration = (group['customer_affected'] * group['duration_weight']).sum()
        cust_a_mean = cust_affected_x_duration / group['duration_weight'].sum()

        
        start_time = group['start_time'].min() # pd.NaT # group.index.get_level_values('start_time')
        end_time = group['timestamp'].max()        
        duration_diff_e1 = (end_time - start_time).total_seconds() / 60 if pd.notna(end_time) and pd.notna(start_time) else group['timestamp'].max() - group['timestamp'].min()

        lat = group['lat'].iloc[-1]
        long = group['long'].iloc[-1]

        zipcode_map = self.geomap['zip_to_county_name']        
        zipcode = group['zipcode'].iloc[-1] if group['zipcode'].iloc[-1] != 'unknown' else '00000' # '00000' dummy zipcode to replace get_zipcode(long, lat) 
        county_name = self.geomap['zip_to_county_name'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        county_fips = self.geomap['zip_to_county_fips'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        state = self.geomap['zip_to_state_name'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        
        # # multiple zipcode version
        # zipcode = group['zipcode'].unique()
        # for i, zp in enumerate(zipcode):
        #     if zp == 'unknown':
        #         zipcode[i] = pd.NA
        # county_name = [self.geomap['zip_to_county_name'][zp] if (pd.notna(zp) and zp != '' and zp in zipcode_map) else pd.NA for zp in zipcode] #self.geomap['zip_to_county_name'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        # county_fips = self.geomap['zip_to_county_fips'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA
        # state = self.geomap['zip_to_state_name'][zipcode] if (pd.notna(zipcode) and zipcode != '' and zipcode in zipcode_map) else pd.NA

        utility_provider = group['utility_provider'].iloc[-1]
        duration_max = duration_diff + 15 # in float format where 1 = 1 minute
        duration_mean = (duration_diff + duration_max) / 2
        duration_15 = 15 * len(group)
        group['duration_weight'] = (group['timestamp'].diff().dt.total_seconds() / 60).round(0).fillna(15)
        cust_affected_x_duration = (group['customer_affected'] * group['duration_weight']).sum()
        cust_a_mean = cust_affected_x_duration / group['duration_weight'].sum()

        return pd.Series({
            'first_timestamp': first_timestamp,
            'last_timestamp': last_timestamp,
            # To-Do: Figure out a way to dynamically add a "start_time" column depending on whether it is already in the group or not
            'end_time': end_time, # only if there is an endtime column like etrTime
            'zipcode': group['zipcode'].unique(),
            'lat': group['lat'].unique(),
            'long': group['long'].unique(),
            'county_name': county_name,
            'county_fips': county_fips,
            'state': state,
            'utility_provider': utility_provider,
            'zipcode': group['zipcode'].unique(),
            'duration_max': duration_max,
            'duration_mean': duration_mean,
            'duration_diff': duration_diff,
            'duration_15': duration_15,
            'customer_affected_mean': cust_a_mean,
            'cust_affected_x_duration': cust_affected_x_duration
        })

In [41]:
col_new = GA5_new(col_config, base_file_path)
col_new._load_data()
col_new._data = col_new.to_incident_level(identifers=['outage_id', 'start_time'], method='id_grouping')
display(col_new._data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['start_time'] = extraneous_rows['start_time'].apply(reformat_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['lastUpdatedTime'] = extraneous_rows['lastUpdatedTime'].apply(reformat_time)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extraneous_rows['etrTime'] = 

An error occurred during transformation: 'start_time'


  grouped = df.groupby(identifers).apply(self._agg_vars).reset_index().round(2)


Unnamed: 0,outage_id,start_time,first_timestamp,last_timestamp,end_time,zipcode,lat,long,county_name,county_fips,state,utility_provider,duration_max,duration_mean,duration_diff,duration_15,customer_affected_mean,cust_affected_x_duration
0,1387175,2023-03-12 14:37:34-04:00,NaT,NaT,NaT,[31771],[31.17911],[-83.6059],Colquitt,13071,Georgia,Colquitt EMC,,,,15,1.0,15.0
1,1387940,2023-03-15 06:33:39-04:00,NaT,NaT,NaT,[31605],[30.98839],[-83.20358],Lowndes,13185,Georgia,Colquitt EMC,,,,15,1.0,15.0
2,1388250,2023-03-15 13:13:54-04:00,NaT,NaT,NaT,[31793],[31.42243],[-83.60985],Tift,13277,Georgia,Colquitt EMC,,,,15,1.0,15.0
3,1388449,2023-03-15 19:05:17-04:00,NaT,NaT,NaT,[31632],[30.93008],[-83.41995],Lowndes,13185,Georgia,Colquitt EMC,,,,15,1.0,15.0
4,1388470,2023-03-15 21:28:52-04:00,NaT,NaT,NaT,[31775],[31.32806],[-83.58069],Colquitt,13071,Georgia,Colquitt EMC,,,,30,1.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17338,1543621,2024-01-18 09:03:45-05:00,2024-01-18 09:18:24-05:00,2024-01-18 09:33:23-05:00,2024-01-18 09:33:23-05:00,[unknown],[31.36839],[-83.40814],,,,Colquitt EMC,29.98,22.48,14.98,30,1.0,30.0
17339,1543632,2024-01-18 09:39:54-05:00,2024-01-18 09:48:23-05:00,2024-01-18 10:33:24-05:00,2024-01-18 10:33:24-05:00,[unknown],[31.52155],[-83.63948],,,,Colquitt EMC,60.02,52.52,45.02,60,1.0,60.0
17340,1543634,2024-01-18 09:45:56-05:00,2024-01-18 10:03:24-05:00,2024-01-18 10:33:24-05:00,2024-01-18 10:33:24-05:00,[unknown],[31.2686],[-83.38964],,,,Colquitt EMC,45.00,37.50,30.00,45,63.0,2835.0
17341,1543666,2024-01-18 10:16:45-05:00,2024-01-18 10:33:24-05:00,2024-01-18 10:33:24-05:00,2024-01-18 10:33:24-05:00,[unknown],[31.11652],[-83.77714],,,,Colquitt EMC,15.00,7.50,0.00,15,1.0,15.0


In [44]:
df = col_new._data
print(df.iloc[17133]['zipcode'])
display(col_df[col_df['id'] == 1387175])

['unknown']


Unnamed: 0,id,type,startTime,lastUpdatedTime,etrTime,title,numPeople,status,cause,identifier,latitude,longitude,description,county,state,EMC,zip_code,timestamp
0,1387175,OUTAGE,1678646254000,1678741805000,-1000,Outage,1,,Manual,32611,31.17911,-83.6059,Device Operation (suebolin),Moultrie,UAS,Colquitt EMC,31771,
