In [2]:
import pandas as pd
import ast
import pytz
import os
import json
import yaml
from dateutil import tz
from datetime import datetime
from IPython.display import display

pd.set_option('display.max_columns', None)

class BasePipeline:
    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        self.geomap = {}
        self._data = pd.DataFrame({})
    
    def construct_file_path(self):
        #TODO: add type to prefix mapping
        file_prefix = 'per_outage' if self.config['type'] == 'o' else 'per_county'
        file_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/{file_prefix}_{self.config['name']}.csv"
        return file_path.replace('//', '/')

    def load_data(self):
        try:
            file_path = self.construct_file_path()
            print(file_path)
            self._data = pd.read_csv(file_path)
            with open('zip_to_county_name.json', 'r') as json_file:
                self.geomap['zip_to_county_name'] = json.load(json_file)
            with open('zip_to_county_fips.json', 'r') as json_file:
                self.geomap['zip_to_county_fips'] = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")
            
    def transform(self):
        raise NotImplementedError

    def standardize(self):
        """
        Generic method to compute and output standardized metrics
        """
        self.load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')
        
        self._data['state'] = self.config['state']
        if self.config['state'] != 'ca':
            self._data['utility_provider'] = self.config['name'] 
            self._data['county'] = self._data['zipcode'].map(self.geomap) 
        
        self._data = self._data[[
            'utility_provider', 'state', 'county', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng', 
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean', 'total_customer_outage_time', 'total_customer_outage_time_max', 'total_customer_outage_time_mean'
        ]]
        
        return self._data
    
    def output_data(self, standard_data):
        # TODO: Output unified data
        pass
    
    def get_dataframe(self):
        return self._data
    
    def _compute_metrics(self, group):
        """
        Generic method to compute standardized metrics, used for being apply in DataFrame.groupby method, 
        given dataframe being transformed with standardized column names
        """
        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        duration_max = duration + 15
        duration_mean = (duration + duration_max) / 2
        customer_affected_mean = group['customer_affected'].mean()
        
        total_customer_outage_time = 15 * (group['customer_affected'].sum() - group['customer_affected'].iloc[0]) + (group['timestamp'].iloc[0] - group['start_time'].iloc[0]).total_seconds() / 60 * group['customer_affected'].iloc[0]
        total_customer_outage_time_max = total_customer_outage_time + 15 * group['customer_affected'].iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })
        
    def check_vars(self):
        # TODO: Check other useful variables
        pass
    
class GA1TX8(BasePipeline):
    def transform(self):
        try:
            # Convert timestamps
            eastern = tz.gettz('US/Eastern')
            utc = tz.gettz('UTC')
            self._data['timestamp'] = pd.to_datetime(self._data['timestamp'], utc=True).dt.tz_convert(eastern)
            self._data['outageStartTime'] = pd.to_datetime(self._data['outageStartTime'], utc=True).dt.tz_convert(eastern)
            self._data['end_time'] = self._data.groupby('outageRecID')['timestamp'].transform('max')
            
            # extract lat and long
            self._data['outagePoint'] = self._data['outagePoint'].apply(lambda x: json.loads(x.replace("'", '"')))
            self._data[['lat', 'lng']] = self._data['outagePoint'].apply(lambda x: pd.Series([x['lat'], x['lng']]))
            # TODO: add zipcode NaN checking
            self._data = self._data.rename(columns={
                'outageRecID':'outage_id',
                'outageStartTime': 'start_time',
                'customersOutNow':'customer_affected',
                'zip':'zipcode'
            })
        except Exception as e:
            print(f"An error occurred during transformation: {e}")

In [9]:
with open('/Users/xuanedx1/github/outage-data-scraper/app/pipeline/config.yaml', 'r') as file:
    config = yaml.safe_load(file)
    base_file_path = config['globals']['LOCAL_FILE_BASE_PATH']

# Instantiate a BasePipeline object for each provider in the configuration
for provider in config['providers']:
    pipeline = GA1TX8(provider, base_file_path)
    pipeline.standardize()
    display(pipeline._data)
    pipeline._data.to_csv(f"{pipeline.config['name']}_o.csv")


/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_1/per_outage_North Georgia EMC.csv


Unnamed: 0,utility_provider,state,county,zipcode,outage_id,start_time,end_time,lat,lng,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean
0,North Georgia EMC,ga,,30736,2023-03-15-1168,2023-03-15 13:55:03-04:00,NaT,34.956816,-85.148426,,,,1.0,,,
1,North Georgia EMC,ga,,30740,2023-03-15-1177,2023-03-15 15:44:42-04:00,NaT,34.825573,-85.029398,,,,3.0,,,
2,North Georgia EMC,ga,,30742,2023-10-09-0373,2023-10-09 09:10:54-04:00,2023-10-09 17:06:40-04:00,34.947518,-85.236160,475.77,490.77,483.27,1.0,475.48,490.48,482.98
3,North Georgia EMC,ga,,30736,2023-10-09-0387,2023-10-09 12:04:31-04:00,2023-10-09 15:21:25-04:00,34.947293,-85.171554,196.90,211.90,204.40,12.0,2362.40,2542.40,2452.40
4,North Georgia EMC,ga,,unknown,2023-10-09-0389,2023-10-09 12:34:30-04:00,2023-10-09 15:06:23-04:00,34.469238,-85.399599,151.88,166.88,159.38,5.0,759.42,834.42,796.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3245,North Georgia EMC,ga,,30739,2024-02-02-0109,2024-02-02 15:38:02-05:00,2024-02-02 16:51:36-05:00,34.800969,-85.258018,73.57,88.57,81.07,1.0,73.25,88.25,80.75
3246,North Georgia EMC,ga,,30710,2024-02-02-0111,2024-02-02 15:53:12-05:00,2024-02-02 17:06:40-05:00,34.915632,-84.948977,73.47,88.47,80.97,28.0,2055.20,2475.20,2265.20
3247,North Georgia EMC,ga,,30710,2024-02-02-0116,2024-02-02 16:17:42-05:00,2024-02-02 16:36:31-05:00,34.913501,-84.949530,18.82,33.82,26.32,1.0,18.93,33.93,26.43
3248,North Georgia EMC,ga,,30710,2024-02-02-0117,2024-02-02 17:45:36-05:00,2024-02-02 18:36:22-05:00,34.915319,-84.948875,50.77,65.77,58.27,3.0,152.45,197.45,174.95


/Users/xuanedx1/github/outage-data-scraper/data/s3/tx/layout_8/per_outage_Guadalupe Valley Electric Coop, Inc..csv


  self._data = pd.read_csv(file_path)


Unnamed: 0,utility_provider,state,county,zipcode,outage_id,start_time,end_time,lat,lng,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean
0,"Guadalupe Valley Electric Coop, Inc.",tx,,77995,2023-04-05-0220,2023-04-05 20:03:53-04:00,2023-04-05 21:27:35-04:00,29.208372,-97.117103,83.70,98.70,91.20,17.0,1422.90,1677.90,1550.40
1,"Guadalupe Valley Electric Coop, Inc.",tx,,77995,2023-04-05-0221,2023-04-05 20:05:28-04:00,2023-04-05 21:27:35-04:00,29.228264,-97.116361,82.12,97.12,89.62,3.0,246.35,291.35,268.85
2,"Guadalupe Valley Electric Coop, Inc.",tx,,77995,2023-04-05-0223,2023-04-05 20:07:40-04:00,2023-04-05 21:27:35-04:00,29.191775,-97.068438,79.92,94.92,87.42,2.0,159.83,189.83,174.83
3,"Guadalupe Valley Electric Coop, Inc.",tx,,78154,2023-04-05-0224,2023-04-05 20:20:36-04:00,2023-04-05 21:27:35-04:00,29.591483,-98.270095,66.98,81.98,74.48,1.0,66.98,81.98,74.48
4,"Guadalupe Valley Electric Coop, Inc.",tx,,unknown,2023-04-05-0226,2023-04-05 20:20:54-04:00,2023-04-05 21:27:35-04:00,29.230669,-97.111006,66.68,81.68,74.18,1.0,66.68,81.68,74.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4569,"Guadalupe Valley Electric Coop, Inc.",tx,,unknown,2024-02-01-0052,2024-02-01 15:55:31-05:00,2024-02-01 17:09:56-05:00,29.069146,-97.045621,74.42,89.42,81.92,37.0,2754.65,3309.65,3032.15
4570,"Guadalupe Valley Electric Coop, Inc.",tx,,78164,2024-02-02-0077,2024-02-02 08:37:56-05:00,2024-02-02 09:09:53-05:00,29.045089,-97.485442,31.95,46.95,39.45,6.0,191.70,281.70,236.70
4571,"Guadalupe Valley Electric Coop, Inc.",tx,,unknown,2024-02-02-0101,2024-02-02 14:31:27-05:00,2024-02-02 16:27:10-05:00,28.930609,-97.588939,115.72,130.72,123.22,1.0,115.67,130.67,123.17
4572,"Guadalupe Valley Electric Coop, Inc.",tx,,78108,2024-02-02-0106,2024-02-02 16:35:38-05:00,2024-02-02 18:27:08-05:00,29.603157,-98.241892,111.50,126.50,119.00,3.0,334.55,379.55,357.05


------
**dev**

In [3]:
df = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_1/per_outage_Amicalola EMC.csv')

  df = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_1/per_outage_Amicalola EMC.csv')


In [5]:
df

Unnamed: 0,outageRecID,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,timestamp,zip,EMC
0,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,04-13-2023 19:50:53,30705,Amicalola EMC
1,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,04-13-2023 19:54:01,30705,Amicalola EMC
2,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13T15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,04-13-2023 19:54:01,30536,Amicalola EMC
3,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,04-13-2023 19:58:02,30705,Amicalola EMC
4,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13T15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,04-13-2023 19:58:02,30536,Amicalola EMC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247405,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02T18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,02-03-2024 00:36:12,30143,Amicalola EMC
247406,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02T18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,02-03-2024 00:51:11,30143,Amicalola EMC
247407,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02T18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,02-03-2024 01:06:11,30143,Amicalola EMC
247408,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02T18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,02-03-2024 01:21:12,30143,Amicalola EMC


In [9]:
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%m-%d-%Y %H:%M:%S')\
# Extract year, month, day, hour from 'timestamp'
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day
df['hour'] = df['timestamp'].dt.hour
        
# Aggregate count of unique outageRecID
aggregated_df = df.groupby(['zip', 'EMC', 'year', 'month', 'day', 'hour'])['outageRecID'].nunique().reset_index()
aggregated_df.rename(columns={'outageRecID': 'outage_count'}, inplace=True)

# Merge aggregated DataFrame with the original DataFrame
df = pd.merge(df, aggregated_df, on=['zip', 'EMC', 'year', 'month', 'day', 'hour'], how='left')

df

Unnamed: 0,outageRecID,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,timestamp,zip,EMC,year,month,day,hour,outage_count
0,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 19:50:53,30705,Amicalola EMC,2023,4,13,19,1
1,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 19:54:01,30705,Amicalola EMC,2023,4,13,19,1
2,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13T15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 19:54:01,30536,Amicalola EMC,2023,4,13,19,1
3,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 19:58:02,30705,Amicalola EMC,2023,4,13,19,1
4,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13T15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 19:58:02,30536,Amicalola EMC,2023,4,13,19,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247405,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02T18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-03 00:36:12,30143,Amicalola EMC,2024,2,3,0,1
247406,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02T18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-03 00:51:11,30143,Amicalola EMC,2024,2,3,0,1
247407,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02T18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-03 01:06:11,30143,Amicalola EMC,2024,2,3,1,1
247408,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02T18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-03 01:21:12,30143,Amicalola EMC,2024,2,3,1,1


In [175]:
df.isna().sum()

outageRecID                     0
outageName                      0
outagePoint                     0
outageStartTime                 0
estimatedTimeOfRestoral    247410
outageEndTime              244075
verified                        0
cause                      246589
crewAssigned                    0
customersOutInitially           0
customersOutNow                 0
customersRestored               0
streetsAffected                 0
outageModifiedTime              0
outageWorkStatus           247379
timestamp                       0
zip                             0
EMC                             0
dtype: int64

In [176]:
len(df[df['outageRecID'] != df['outageName']])

0

In [177]:
len(df[df['customersOutInitially'] < df['customersOutNow']])

13

In [178]:
len(df[df['customersOutInitially'] == df['customersRestored']])

2673

In [179]:
def transform(df):
    # Convert timestamps
    eastern = tz.gettz('US/Eastern')
    utc = tz.gettz('UTC')
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True).dt.tz_convert(eastern)
    df['outageStartTime'] = pd.to_datetime(df['outageStartTime'], utc=True).dt.tz_convert(eastern)
    df['end_time'] = df.groupby('outageRecID')['timestamp'].transform('max')
    
    # extract lat and long
    df['outagePoint'] = df['outagePoint'].apply(lambda x: json.loads(x.replace("'", '"')))
    df[['lat', 'lng']] = df['outagePoint'].apply(lambda x: pd.Series([x['lat'], x['lng']]))
    
    return df

In [180]:
df = transform(df)

In [181]:
df

Unnamed: 0,outageRecID,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,timestamp,zip,EMC,end_time,lat,lng
0,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:50:53-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
1,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:54:01-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
2,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 15:54:01-04:00,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477
3,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:58:02-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
4,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 15:58:02-04:00,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247405,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02 18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-02 19:36:12-05:00,30143,Amicalola EMC,2024-02-02 20:36:11-05:00,34.374642,-84.443316
247406,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02 18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-02 19:51:11-05:00,30143,Amicalola EMC,2024-02-02 20:36:11-05:00,34.374642,-84.443316
247407,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02 18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-02 20:06:11-05:00,30143,Amicalola EMC,2024-02-02 20:36:11-05:00,34.374642,-84.443316
247408,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02 18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-02 20:21:12-05:00,30143,Amicalola EMC,2024-02-02 20:36:11-05:00,34.374642,-84.443316


In [96]:
# len(t[t['outageStartTime'] > df['timestamp']])

0

In [164]:
def compute_metrics(group):
    duration = (group['end_time'] - group['outageStartTime']).dt.total_seconds() / 60
    # duration_min_est = 15 * (len(group) - 1) + 7.5
    # duration_max_est = 15 * (len(group) + 1)
    # duration_mean_est = (duration_|min_est + duration_max_est) / 2
    duration_max = duration + 15
    duration_mean = (duration + duration_max) / 2
    customer_affected_mean = group['customersOutNow'].mean()
    
    total_customer_outage_time = 15 * (group['customersOutNow'].sum() - group['customersOutNow'].iloc[0]) + (group['timestamp'].iloc[0] - group['outageStartTime'].iloc[0]).total_seconds() / 60 * group['customersOutNow'].iloc[0]
    total_customer_outage_time_max = total_customer_outage_time + 15 * group['customersOutNow'].iloc[-1]
    total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2
    
    # total_customer_outage_time_a1 = 15 * (group['customersOutNow'].sum() - group['customersOutNow'].iloc[0]) + 1 * group['customersOutNow'].iloc[0]
    # total_customer_outage_time_a2 = 15 * (group['customersOutNow'].sum() - group['customersOutNow'].iloc[0]) + 7.5 * group['customersOutNow'].iloc[0]
    # total_customer_outage_time_b = 15 * group['customersOutNow'].sum()
    # total_customer_outage_time_c = 15 * group['customersOutNow'].sum() + 15 * group['customersOutNow'].iloc[0] + 15 * group['customersOutNow'].iloc[-1]
    # total_customer_outage_time_d = (cust_out_time_est_a1 + cust_out_time_est_a2 + cust_out_time_est_b + cust_out_time_est_c) / 4

    return pd.Series({
        'timestamp': group['end_time'].iloc[-1],
        'duration': duration.iloc[-1],
        'duration_max': duration_max.iloc[-1],
        'duration_mean': duration_mean.iloc[-1],
        'customer_affected_mean': customer_affected_mean,
        'total_customer_outage_time': total_customer_outage_time,
        'total_customer_outage_time_max': total_customer_outage_time_max,
        'total_customer_outage_time_mean': total_customer_outage_time_mean
    })

In [165]:
df.head()

Unnamed: 0,outageRecID,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,timestamp,zip,EMC,end_time,lat,lng
0,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:50:53-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
1,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:54:01-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
2,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 15:54:01-04:00,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477
3,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:58:02-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
4,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 15:58:02-04:00,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477


In [166]:
df_grouped = df.groupby('outageRecID').apply(compute_metrics).reset_index().round(2)

In [167]:
df_grouped

Unnamed: 0,outageRecID,timestamp,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean
0,2023-04-13-1040,2023-04-13 19:58:21-04:00,253.65,268.65,261.15,16.50,12357.37,12762.37,12559.87
1,2023-04-13-1041,2023-04-13 17:24:17-04:00,96.48,111.48,103.98,1.00,276.22,291.22,283.72
2,2023-04-13-1042,2023-04-13 22:39:07-04:00,402.12,417.12,409.62,1.00,1178.90,1193.90,1186.40
3,2023-04-13-1043,2023-04-13 18:24:19-04:00,133.30,148.30,140.80,5.81,2252.33,2342.33,2297.33
4,2023-04-13-1044,2023-04-13 16:54:13-04:00,12.58,27.58,20.08,2.00,63.17,93.17,78.17
...,...,...,...,...,...,...,...,...,...
9046,2024-02-02-0018,2024-02-02 14:36:23-05:00,68.38,83.38,75.88,5.00,341.67,416.67,379.17
9047,2024-02-02-0019,2024-02-02 14:51:14-05:00,72.38,87.38,79.88,31.00,2246.47,2711.47,2478.97
9048,2024-02-02-0022,2024-02-02 16:51:15-05:00,51.57,66.57,59.07,1.00,51.53,66.53,59.03
9049,2024-02-02-0024,2024-02-02 17:36:26-05:00,47.37,62.37,54.87,1.00,47.18,62.18,54.68


In [168]:
df_merged = pd.merge(df_grouped, df, on=['outageRecID', 'timestamp'], how='inner')

In [169]:
df_merged

Unnamed: 0,outageRecID,timestamp,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,zip,EMC,end_time,lat,lng
0,2023-04-13-1040,2023-04-13 19:58:21-04:00,253.65,268.65,261.15,16.50,12357.37,12762.37,12559.87,2023-04-13-1040,"{'lat': 34.815493607300034, 'lng': -84.5984642...",2023-04-13 15:44:42-04:00,,,False,400 Decay/age of material/equipment,True,27,27,0,"['CHATSWORTH HWY', 'COHUTTA FOREST 401', 'COHU...",2023-04-13T19:28:09-04:00,,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.815494,-84.598464
1,2023-04-13-1041,2023-04-13 17:24:17-04:00,96.48,111.48,103.98,1.00,276.22,291.22,283.72,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,True,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:57:30.3270000-04:00,,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477
2,2023-04-13-1042,2023-04-13 22:39:07-04:00,402.12,417.12,409.62,1.00,1178.90,1193.90,1186.40,2023-04-13-1042,"{'lat': 34.76022275601547, 'lng': -84.43656079...",2023-04-13 15:57:00-04:00,,,False,,True,1,1,0,['POCASET DR 150'],2023-04-13T20:08:15-04:00,,30540,Amicalola EMC,2023-04-13 22:39:07-04:00,34.760223,-84.436561
3,2023-04-13-1043,2023-04-13 18:24:19-04:00,133.30,148.30,140.80,5.81,2252.33,2342.33,2297.33,2023-04-13-1043,"{'lat': 34.64094744509421, 'lng': -84.27010785...",2023-04-13 16:11:01-04:00,,,False,,True,6,6,0,"['NEWBERRY DR 364', 'NEWBERRY DR 626', 'NEWBER...",2023-04-13T17:24:18.9530000-04:00,,unknown,Amicalola EMC,2023-04-13 18:24:19-04:00,34.640947,-84.270108
4,2023-04-13-1044,2023-04-13 16:54:13-04:00,12.58,27.58,20.08,2.00,63.17,93.17,78.17,2023-04-13-1044,"{'lat': 34.400304776099645, 'lng': -84.2707052...",2023-04-13 16:41:38-04:00,,,False,,False,2,2,0,"['LOON LN 25', 'LOVELADY RD 505']",2023-04-13T16:42:03-04:00,,30107,Amicalola EMC,2023-04-13 16:54:13-04:00,34.400305,-84.270705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9046,2024-02-02-0018,2024-02-02 14:36:23-05:00,68.38,83.38,75.88,5.00,341.67,416.67,379.17,2024-02-02-0018,"{'lat': 34.714668737578464, 'lng': -84.4774361...",2024-02-02 13:28:00-05:00,,,False,,True,5,5,0,"['BOARDTOWN RD 1064', 'BOARDTOWN RD 1066', 'VA...",2024-02-02T13:33:18.5070000-05:00,,30540,Amicalola EMC,2024-02-02 14:36:23-05:00,34.714669,-84.477436
9047,2024-02-02-0019,2024-02-02 14:51:14-05:00,72.38,87.38,79.88,31.00,2246.47,2711.47,2478.97,2024-02-02-0019,"{'lat': 34.54793254252141, 'lng': -84.04290642...",2024-02-02 13:38:51-05:00,,,False,,True,31,31,0,"['HOMER EDWARDS RD 385', 'HOMER EDWARDS RD 425...",2024-02-02T13:47:42-05:00,,30533,Amicalola EMC,2024-02-02 14:51:14-05:00,34.547933,-84.042906
9048,2024-02-02-0022,2024-02-02 16:51:15-05:00,51.57,66.57,59.07,1.00,51.53,66.53,59.03,2024-02-02-0022,"{'lat': 34.612873511729674, 'lng': -84.0811178...",2024-02-02 15:59:41-05:00,,,False,,True,1,1,0,['SUNRISE RIDGE/ETOWAH TRL'],2024-02-02T16:03:33.6600000-05:00,,unknown,Amicalola EMC,2024-02-02 16:51:15-05:00,34.612874,-84.081118
9049,2024-02-02-0024,2024-02-02 17:36:26-05:00,47.37,62.37,54.87,1.00,47.18,62.18,54.68,2024-02-02-0024,"{'lat': 34.54980869474821, 'lng': -84.04115022...",2024-02-02 16:49:04-05:00,,,False,,True,1,1,0,['HORTON RD 90'],2024-02-02T16:54:59.6570000-05:00,,30533,Amicalola EMC,2024-02-02 17:36:26-05:00,34.549809,-84.041150


In [40]:
df = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/uszips.csv')

In [42]:
df

Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,601,18.18027,-66.75266,Adjuntas,PR,Puerto Rico,True,,16834.0,100.9,72001,Adjuntas,"{""72001"": 98.73, ""72141"": 1.27}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,602,18.36075,-67.17541,Aguada,PR,Puerto Rico,True,,37642.0,479.2,72003,Aguada,"{""72003"": 100}",Aguada,72003,False,False,America/Puerto_Rico
2,603,18.45744,-67.12225,Aguadilla,PR,Puerto Rico,True,,49075.0,551.7,72005,Aguadilla,"{""72005"": 99.76, ""72099"": 0.24}",Aguadilla|Moca,72005|72099,False,False,America/Puerto_Rico
3,606,18.16585,-66.93716,Maricao,PR,Puerto Rico,True,,5590.0,48.7,72093,Maricao,"{""72093"": 82.27, ""72153"": 11.66, ""72121"": 6.06}",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,610,18.29110,-67.12243,Anasco,PR,Puerto Rico,True,,25542.0,265.7,72011,Añasco,"{""72011"": 96.7, ""72099"": 2.81, ""72083"": 0.37, ...",Añasco|Moca|Las Marías|Aguada,72011|72099|72083|72003,False,False,America/Puerto_Rico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33783,99923,55.98043,-130.03803,Hyder,AK,Alaska,True,,25.0,0.6,2198,Prince of Wales-Hyder,"{""02198"": 100}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33784,99925,55.55398,-132.96276,Klawock,AK,Alaska,True,,920.0,6.6,2198,Prince of Wales-Hyder,"{""02198"": 100}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33785,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,,1465.0,4.3,2198,Prince of Wales-Hyder,"{""02198"": 100}",Prince of Wales-Hyder,02198,False,False,America/Metlakatla
33786,99927,56.33305,-133.60044,Point Baker,AK,Alaska,True,,14.0,1.2,2198,Prince of Wales-Hyder,"{""02198"": 100}",Prince of Wales-Hyder,02198,False,False,America/Sitka


In [43]:
zipcode_to_county_name = df.set_index('zip')['county_name'].to_dict()

json_file_path = '/Users/xuanedx1/github/outage-data-scraper/app/pipeline/zip_to_county_map.json'  # Path for the output JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(zipcode_to_county_name, json_file)

In [45]:
zipcode_to_county_fips = df.set_index('zip')['county_fips'].to_dict()

json_file_path = '/Users/xuanedx1/github/outage-data-scraper/app/pipeline/zip_to_county_fips.json'  # Path for the output JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(zipcode_to_county_fips, json_file)

In [44]:
zipcode_to_state_map = df.set_index('zip')['state_name'].to_dict()

json_file_path = '/Users/xuanedx1/github/outage-data-scraper/app/pipeline/zip_to_state_map.json'  # Path for the output JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(zipcode_to_state_map, json_file)