In [9]:
import pandas as pd
import ast
import pytz
import os
from datetime import datetime

pd.set_option('display.max_columns', None)

**Notes on Metrics**

start_time = group['timestamp'].iloc[0]

end_time = group['timestamp'].iloc[-1]

duration = (end_time - start_time).total_seconds() / 60

duration_min_est = 15 * (len(group) - 1) + 7.5

duration_max_est = 15 * (len(group) + 1)

duration_mean_est = (duration_min_est + duration_max_est) / 2

cust_a_mean = group['cust_a'].mean()

outage_cnt = group['n_out'].max()

outage_cnt_x_cust_a = outage_cnt * cust_a_mean

cust_served_cnt_mean = group['cust_s'].mean()

cust_out_time_est_a1 = 15 * (group['cust_a'].sum() - group['cust_a'].iloc[0]) + 1 * group['cust_a'].iloc[0]

cust_out_time_est_a2 = 15 * (group['cust_a'].sum() - group['cust_a'].iloc[0]) + 7.5 * group['cust_a'].iloc[0]

cust_out_time_est_b = 15 * group['cust_a'].sum()

cust_out_time_est_c = 15 * group['cust_a'].sum() + 15 * group['cust_a'].iloc[0] + 15 * group['cust_a'].iloc[-1]

cust_out_time_est_d = (cust_out_time_est_a1 + cust_out_time_est_a2 + cust_out_time_est_b + cust_out_time_est_c) / 4


In [22]:
# Zip code -> county lookup table for Georgia.
# skiprows=1 drops the first line of the file and header=0 then uses the next
# line as the column names; the columns read below are 'Zip Code' and 'County'.
# NOTE(review): absolute, machine-specific path -- confirm it exists before a fresh run.
zip_county = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/scripts/Georgia_Zip_Code_County_Lookup.csv', header=0, skiprows=1)
# Plain dict {zip code: county}; passed as geo_map to the pipeline calls further down.
ga_zip_county_map = zip_county.set_index('Zip Code')['County'].to_dict()

In [18]:
def compute(group, interval=15):
    '''Compute summary metrics for one outage (a run of consecutive snapshots).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single outage, in time order. The columns read are
        'timestamp' (datetime-like), 'cust_a', 'n_out' and 'cust_s'.
        ('cust_a' is presumably the number of customers affected -- confirm
        against the scraper.)
    interval : int or float, default 15
        Assumed spacing between snapshots, in minutes. Only used for the
        duration estimates.

    Returns
    -------
    pandas.Series
        start_time / end_time     first and last timestamp of the group
        duration_by_diff          end_time - start_time, in minutes
        duration_est_min          interval * (n - 1) + interval / 2
        duration_est_max          interval * (n + 1)
        duration_est_mean         midpoint of the two estimates above
        cust_a_mean               mean of 'cust_a'
        outage_cnt                maximum of 'n_out'
        outage_cnt_x_cust_a       outage_cnt * cust_a_mean
        cust_served_cnt_mean      mean of 'cust_s'
        where n is the number of rows in the group. For an empty group every
        entry is a missing value (NaT / NaN).

    The cust_out_time_est_* variants listed in the "Notes on Metrics" cell are
    currently not computed.
    '''
    n_rows = len(group)

    if n_rows == 0:
        # Without this guard the return statement below would reference
        # variables that were never assigned and raise UnboundLocalError.
        missing = float('nan')
        return pd.Series({
            'start_time': pd.NaT,
            'end_time': pd.NaT,
            'duration_by_diff': missing,
            'duration_est_min': missing,
            'duration_est_max': missing,
            'duration_est_mean': missing,
            'cust_a_mean': missing,
            'outage_cnt': missing,
            'outage_cnt_x_cust_a': missing,
            'cust_served_cnt_mean': missing
        })

    start_time = group['timestamp'].iloc[0]
    end_time = group['timestamp'].iloc[-1]
    duration = (end_time - start_time).total_seconds() / 60

    # The outage is only observed at snapshot times, so its true length is
    # bracketed: at least half an interval beyond the observed span, at most
    # one full interval before the first and after the last snapshot.
    duration_min_est = interval * (n_rows - 1) + interval / 2
    duration_max_est = interval * (n_rows + 1)
    duration_mean_est = (duration_min_est + duration_max_est) / 2

    cust_a_mean = group['cust_a'].mean()
    outage_cnt = group['n_out'].max()
    outage_cnt_x_cust_a = outage_cnt * cust_a_mean
    cust_served_cnt_mean = group['cust_s'].mean()

    return pd.Series({
        'start_time': start_time,
        'end_time': end_time,
        'duration_by_diff': duration,
        'duration_est_min': duration_min_est,
        'duration_est_max': duration_max_est,
        'duration_est_mean': duration_mean_est,
        'cust_a_mean': cust_a_mean,
        'outage_cnt': outage_cnt,
        'outage_cnt_x_cust_a': outage_cnt_x_cust_a,
        'cust_served_cnt_mean': cust_served_cnt_mean
    })

In [19]:
def get_outages(df, key, max_gap_minutes=17):
    '''Split the snapshots of one 'name' value into outages and summarise each.

    Parameters
    ----------
    df : pandas.DataFrame
        Snapshot table with at least the columns 'name' and 'timestamp', plus
        whatever columns ``compute`` reads.
    key : hashable
        Value of the 'name' column to select.
    max_gap_minutes : int or float, default 17
        Two consecutive snapshots further apart than this start a new outage.
        (17 presumably allows ~2 minutes of jitter on a 15-minute scrape
        cadence -- confirm.)

    Returns
    -------
    pandas.DataFrame
        One row per outage, as produced by ``compute``, with an extra 'name'
        column set to ``key``.
    '''
    subset = df[df['name'] == key].copy()
    subset['timestamp'] = pd.to_datetime(subset['timestamp'])

    # diff() compares each row with the one before it, so the rows must be in
    # time order; an unsorted input would yield negative gaps and merge
    # unrelated outages. mergesort is stable, so ties keep their file order.
    subset = subset.sort_values('timestamp', kind='mergesort')

    # True marks the first snapshot of a new outage; the running sum turns
    # those markers into one integer label per outage.
    starts_new_outage = subset['timestamp'].diff() > pd.Timedelta(minutes=max_gap_minutes)
    grouped = subset.groupby(starts_new_outage.cumsum())

    result = grouped.apply(compute).reset_index(drop=True)
    result['name'] = key
    return result

In [25]:
def parse_zip(key):
    '''Return the leading zip code of a "<zip> (<city>)" label as an int.

    Only the first whitespace-separated token is used, so multi-word cities
    such as '30349 (COLLEGE PARK)' and labels without a city part ('31408')
    are both accepted. Non-string keys are converted with ``str`` first.

    Raises ValueError when the first token is not an integer literal and
    IndexError when the label is empty.
    '''
    # The city part is not needed by any caller, so it is no longer parsed;
    # this also avoids shadowing the builtin ``zip``.
    return int(str(key).split(maxsplit=1)[0])

In [30]:
def parse_emc_name(file_path):
    '''
    Parses the EMC name from a given file path. The EMC name is defined as the
    text between the last underscore '_' and the trailing '.csv' extension,
    e.g. '.../per_zipcode_Georgia Power.csv' -> 'Georgia Power'.

    If the path contains no underscore the whole path (minus a trailing
    '.csv') is returned; if it does not end in '.csv' nothing is stripped.
    '''
    extension = '.csv'
    # Everything after the last underscore
    emc_name = file_path.rsplit('_', 1)[-1]
    # Strip only the trailing extension: a blanket str.replace would also
    # delete any '.csv' that happens to occur inside the name itself.
    if emc_name.endswith(extension):
        emc_name = emc_name[:-len(extension)]
    return emc_name

In [51]:
def extract(df, emc='', geo_map=None):
    '''Get all outage records from a dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Snapshot table; must contain a 'name' column plus the columns needed
        by ``get_outages``.
    emc : str, default ''
        Utility label written to the 'EMC' column. It also selects how the
        zip code is obtained when ``geo_map`` is given.
    geo_map : dict or None, default None
        Mapping from zip code to county. When non-empty, a 'zipcode' and a
        'County' column are added.

    Returns
    -------
    pandas.DataFrame
        One row per outage across all distinct 'name' values. An input with no
        rows yields an empty frame instead of raising.
    '''
    per_name = [get_outages(df, name) for name in df['name'].unique()]

    if per_name:
        records = pd.concat(per_name, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list, so start from an
        # empty frame and let the column assignments below run unchanged.
        records = pd.DataFrame({'name': pd.Series(dtype=object)})

    records['EMC'] = emc

    if geo_map:
        if emc == 'Georgia Power':
            # Labels look like '31408 (SAVANNAH)'.
            records['zipcode'] = records['name'].apply(parse_zip)
        elif emc == 'Cobb EMC':
            # 'name' already holds the zip code; it is renamed, not copied.
            records = records.rename(columns={'name': 'zipcode'})
        else:
            # Previously no 'zipcode' column existed on this path and the
            # lookup below raised KeyError.
            # NOTE(review): assumes 'name' is already a zip code for other
            # utilities -- confirm per data source.
            records['zipcode'] = records['name']

        records['County'] = records['zipcode'].map(geo_map)

    return records

In [32]:
def run_pipepline(input_file_path, output_file_path='', geo_map=None):
    '''Load one per-zipcode CSV, extract its outage records and optionally save them.

    The utility label is derived from the input file name via
    ``parse_emc_name`` and forwarded, together with ``geo_map``, to
    ``extract``. When ``output_file_path`` is non-empty the records are also
    written there as CSV. The records DataFrame is returned in either case.
    '''
    raw = pd.read_csv(input_file_path)
    outages = extract(raw, parse_emc_name(input_file_path), geo_map=geo_map)

    should_save = bool(output_file_path)
    if should_save:
        outages.to_csv(output_file_path)
    return outages

----
**Test**

In [60]:
# Load the raw per-zipcode Georgia Power snapshot table for a quick sanity check.
# NOTE(review): absolute, machine-specific path -- confirm it exists before a fresh run.
pous2023 = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_4/per_zipcode_Georgia Power.csv')
# Bare last expression so the notebook renders the frame.
pous2023

Unnamed: 0,key,name,cust_a,cust_s,percent_cust_a,etr,etr_confidence,areaId,n_out,timestamp,EMC
0,Zip (City),31408 (SAVANNAH),6,6145,0.10,2023-03-16T01:45:00Z,LOW,REAG|31408 (SAVANNAH)|Zip (City),1,03-16-2023 01:17:50,Georgia Power
1,Zip (City),31201 (MACON),4,9047,0.01,2023-03-16T03:00:00Z,LOW,REAG|31201 (MACON)|Zip (City),1,03-16-2023 01:17:50,Georgia Power
2,Zip (City),30315 (ATLANTA),5,16491,0.03,2023-03-16T03:45:00Z,LOW,REAG|30315 (ATLANTA)|Zip (City),1,03-16-2023 01:17:50,Georgia Power
3,Zip (City),31419 (SAVANNAH),4,26440,0.01,2023-03-16T02:45:00Z,LOW,REAG|31419 (SAVANNAH)|Zip (City),2,03-16-2023 01:17:50,Georgia Power
4,Zip (City),30349 (COLLEGE PARK),10,27568,0.04,2023-03-16T02:30:00Z,LOW,REAG|30349 (COLLEGE PARK)|Zip (City),2,03-16-2023 01:17:50,Georgia Power
...,...,...,...,...,...,...,...,...,...,...,...
978467,Zip (City),31410 (SAVANNAH),4,11777,0.01,2024-02-03T14:00:00Z,LOW,REAG|31410 (SAVANNAH)|Zip (City),1,02-03-2024 03:03:04,Georgia Power
978468,Zip (City),31711 (ANDERSONVILLE),4,237,0.42,2024-02-04T19:30:00Z,LOW,REAG|31711 (ANDERSONVILLE)|Zip (City),1,02-03-2024 03:03:04,Georgia Power
978469,Zip (City),30033 (DECATUR),141,17068,0.83,2024-02-03T03:00:00Z,LOW,REAG|30033 (DECATUR)|Zip (City),1,02-03-2024 03:03:04,Georgia Power
978470,Zip (City),30088 (STONE MOUNTAIN),4,10976,0.01,2024-02-03T05:30:00Z,LOW,REAG|30088 (STONE MOUNTAIN)|Zip (City),1,02-03-2024 03:03:04,Georgia Power


In [63]:
# Parse the 'timestamp' strings into datetimes
pous2023['timestamp'] = pd.to_datetime(pous2023['timestamp'])

# Sort all rows chronologically so the per-'name' differences below run forward in time
pous2023.sort_values(by='timestamp', inplace=True)

# Minutes elapsed since the previous record with the same 'name'
# (NaN for the first record of each 'name')
pous2023['TimeDiff'] = pous2023.groupby('name')['timestamp'].diff().dt.total_seconds() / 60

# Mean gap between consecutive records, one row per 'name'
average_intervals = pous2023.groupby('name')['TimeDiff'].mean().reset_index()

# average_intervals still carries the string 'name' column, so restrict the
# overall mean to numeric columns: without numeric_only=True older pandas
# emits a warning here and pandas >= 2.0 raises TypeError.
print(average_intervals.mean(numeric_only=True))

TimeDiff    550.571596
dtype: float64


  print(average_intervals.mean())


In [53]:
# Georgia Power
# The utility label is taken from the file name by parse_emc_name. Because a
# geo_map is passed and the label is 'Georgia Power', extract() parses the zip
# code out of each 'name' label and adds 'zipcode' and 'County' columns.
input_path = '/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_4/per_zipcode_Georgia Power.csv'
georgia_power_z = run_pipepline(input_path, geo_map=ga_zip_county_map)
georgia_power_z

Unnamed: 0,start_time,end_time,duration_by_diff,duration_est_min,duration_est_max,duration_est_mean,cust_a_mean,outage_cnt,outage_cnt_x_cust_a,cust_served_cnt_mean,name,EMC,zipcode,County
0,2023-03-16 01:17:50,2023-03-16 01:17:50,0.000000,7.5,30,18.75,6.0,1,6.0,6145.0,31408 (SAVANNAH),Georgia Power,31408,Chatham
1,2023-03-17 19:32:52,2023-03-17 19:32:52,0.000000,7.5,30,18.75,7.0,2,14.0,6145.0,31408 (SAVANNAH),Georgia Power,31408,Chatham
2,2023-03-17 20:32:52,2023-03-17 20:32:52,0.000000,7.5,30,18.75,7.0,2,14.0,6145.0,31408 (SAVANNAH),Georgia Power,31408,Chatham
3,2023-03-21 11:32:46,2023-03-21 11:32:46,0.000000,7.5,30,18.75,4.0,1,4.0,6146.0,31408 (SAVANNAH),Georgia Power,31408,Chatham
4,2023-03-27 17:02:35,2023-03-27 17:32:34,29.983333,37.5,60,48.75,4.0,1,4.0,6145.0,31408 (SAVANNAH),Georgia Power,31408,Chatham
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330225,2023-11-28 20:18:06,2023-11-28 20:33:12,15.100000,22.5,45,33.75,4.0,1,4.0,498.0,31093 (WARNER ROBINS),Georgia Power,31093,Houston
330226,2023-11-28 22:33:15,2023-11-28 23:03:15,30.000000,37.5,60,48.75,4.0,1,4.0,498.0,31093 (WARNER ROBINS),Georgia Power,31093,Houston
330227,2024-01-24 17:48:28,2024-01-24 18:03:15,14.783333,22.5,45,33.75,9.0,1,9.0,505.0,31093 (WARNER ROBINS),Georgia Power,31093,Houston
330228,2024-01-24 18:33:04,2024-01-24 18:33:04,0.000000,7.5,30,18.75,9.0,1,9.0,505.0,31093 (WARNER ROBINS),Georgia Power,31093,Houston


In [52]:
# Cobb EMC
# With a geo_map and the label 'Cobb EMC', extract() renames 'name' to
# 'zipcode' (so the result has no 'name' column) and adds 'County'.
input_path = '/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_4/per_zipcode_Cobb EMC.csv'
cobb_county_z = run_pipepline(input_path, geo_map=ga_zip_county_map)
cobb_county_z

Unnamed: 0,start_time,end_time,duration_by_diff,duration_est_min,duration_est_max,duration_est_mean,cust_a_mean,outage_cnt,outage_cnt_x_cust_a,cust_served_cnt_mean,zipcode,EMC,County
0,2023-03-15 20:32:21,2023-03-15 20:32:21,0.000000,7.5,30,18.75,4.0,1,4.0,16814.0,30102,Cobb EMC,Cherokee
1,2023-03-16 13:47:21,2023-03-16 13:47:21,0.000000,7.5,30,18.75,5.0,2,10.0,16814.0,30102,Cobb EMC,Cherokee
2,2023-03-16 14:32:21,2023-03-16 15:02:22,30.016667,37.5,60,48.75,4.0,1,4.0,16814.0,30102,Cobb EMC,Cherokee
3,2023-03-16 15:32:23,2023-03-16 15:32:23,0.000000,7.5,30,18.75,4.0,1,4.0,16814.0,30102,Cobb EMC,Cherokee
4,2023-03-16 17:02:22,2023-03-16 17:32:22,30.000000,37.5,60,48.75,4.0,2,8.0,16814.0,30102,Cobb EMC,Cherokee
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1169,2024-01-05 16:17:33,2024-01-05 16:17:33,0.000000,7.5,30,18.75,4.0,1,4.0,1358.0,30082,Cobb EMC,Cobb
1170,2024-01-05 19:32:45,2024-01-05 19:32:45,0.000000,7.5,30,18.75,4.0,1,4.0,1358.0,30082,Cobb EMC,Cobb
1171,2024-01-05 20:48:03,2024-01-05 20:48:03,0.000000,7.5,30,18.75,4.0,1,4.0,1358.0,30082,Cobb EMC,Cobb
1172,2024-01-06 05:17:32,2024-01-06 05:17:32,0.000000,7.5,30,18.75,25.0,1,25.0,1358.0,30082,Cobb EMC,Cobb


In [39]:
# Austin Energy
# No geo_map is passed, so the result keeps the raw 'name' column and gets no
# 'zipcode' / 'County' columns.
input_path = '/Users/xuanedx1/github/outage-data-scraper/data/s3/tx/layout_5/per_zipcode_Austin Energy.csv'
austin_energy_z = run_pipepline(input_path)
austin_energy_z

Unnamed: 0,start_time,end_time,duration_by_diff,duration_est_min,duration_est_max,duration_est_mean,cust_a_mean,outage_cnt,outage_cnt_x_cust_a,cust_served_cnt_mean,name,EMC
0,2023-04-06 01:19:47,2023-04-06 01:19:47,0.000000,7.5,30,18.75,6.000000,1,6.000000,14542.0,78746,Austin Energy
1,2023-04-17 13:29:28,2023-04-17 15:14:28,105.000000,112.5,135,123.75,1.000000,1,1.000000,14542.0,78746,Austin Energy
2,2023-04-17 15:59:27,2023-04-17 17:14:28,75.016667,82.5,105,93.75,2.000000,1,2.000000,14542.0,78746,Austin Energy
3,2023-04-17 21:44:26,2023-04-18 00:44:27,180.016667,187.5,210,198.75,1.692308,2,3.384615,14542.0,78746,Austin Energy
4,2023-04-18 14:59:27,2023-04-18 16:14:28,75.016667,82.5,105,93.75,1.000000,1,1.000000,14542.0,78746,Austin Energy
...,...,...,...,...,...,...,...,...,...,...,...,...
10216,2023-10-06 16:29:29,2023-10-06 16:29:29,0.000000,7.5,30,18.75,7.000000,1,7.000000,7.0,78737,Austin Energy
10217,2023-09-25 03:59:32,2023-09-25 07:59:30,239.966667,247.5,270,258.75,9.000000,1,9.000000,9.0,78681,Austin Energy
10218,2023-09-25 13:29:31,2023-09-25 16:29:30,179.983333,187.5,210,198.75,1.076923,2,2.153846,9.0,78681,Austin Energy
10219,2024-01-15 18:14:30,2024-01-15 19:14:30,60.000000,67.5,90,78.75,1.000000,1,1.000000,9.0,78681,Austin Energy


In [56]:
# CPS Energy
# No geo_map is passed, so the result keeps the raw 'name' column and gets no
# 'zipcode' / 'County' columns.
input_path = '/Users/xuanedx1/github/outage-data-scraper/data/s3/tx/layout_5/per_zipcode_CPS Energy.csv'
cps_energy_z = run_pipepline(input_path)
cps_energy_z

Unnamed: 0,start_time,end_time,duration_by_diff,duration_est_min,duration_est_max,duration_est_mean,cust_a_mean,outage_cnt,outage_cnt_x_cust_a,cust_served_cnt_mean,name,EMC
0,2023-04-06 01:20:24,2023-04-06 01:20:24,0.000000,7.5,30,18.75,4.0,1,4.0,16157.0,78227,CPS Energy
1,2023-04-13 11:44:59,2023-04-13 13:15:06,90.116667,97.5,120,108.75,4.0,1,4.0,16158.0,78227,CPS Energy
2,2023-04-13 18:15:01,2023-04-13 18:30:02,15.016667,22.5,45,33.75,32.0,1,32.0,16158.0,78227,CPS Energy
3,2023-04-13 19:00:01,2023-04-13 19:00:01,0.000000,7.5,30,18.75,4.0,1,4.0,16158.0,78227,CPS Energy
4,2023-04-13 19:30:03,2023-04-13 20:00:01,29.966667,37.5,60,48.75,4.0,1,4.0,16158.0,78227,CPS Energy
...,...,...,...,...,...,...,...,...,...,...,...,...
23628,2024-01-24 14:00:05,2024-01-24 14:30:05,30.000000,37.5,60,48.75,4.0,1,4.0,16.0,78065,CPS Energy
23629,2023-09-10 03:59:59,2023-09-10 03:59:59,0.000000,7.5,30,18.75,4.0,1,4.0,5.0,78150,CPS Energy
23630,2023-09-10 05:59:57,2023-09-10 05:59:57,0.000000,7.5,30,18.75,4.0,1,4.0,5.0,78150,CPS Energy
23631,2023-12-28 22:45:04,2023-12-28 22:45:04,0.000000,7.5,30,18.75,4.0,1,4.0,5.0,78150,CPS Energy


In [57]:
# Oncor Electric Delivery Co.
# No geo_map is passed, so the result keeps the raw 'name' column and gets no
# 'zipcode' / 'County' columns.
# NOTE(review): the saved output shows a warning raised at pd.read_csv for this
# file -- possibly mixed column dtypes; confirm and set dtypes explicitly if so.
input_path = '/Users/xuanedx1/github/outage-data-scraper/data/s3/tx/layout_5/per_zipcode_Oncor Electric Delivery Co..csv'
oncor_electric_z = run_pipepline(input_path)
oncor_electric_z

  data = pd.read_csv(input_file_path)


Unnamed: 0,start_time,end_time,duration_by_diff,duration_est_min,duration_est_max,duration_est_mean,cust_a_mean,outage_cnt,outage_cnt_x_cust_a,cust_served_cnt_mean,name,EMC
0,2023-04-06 01:21:41,2023-04-06 01:21:41,0.000000,7.5,30,18.75,1.0,1,1.0,1149.0,76534,Oncor Electric Delivery Co.
1,2023-04-13 15:01:23,2023-04-13 16:46:17,104.900000,112.5,135,123.75,1.0,1,1.0,1149.0,76534,Oncor Electric Delivery Co.
2,2023-04-13 17:31:23,2023-04-13 19:46:21,134.966667,142.5,165,153.75,7.0,1,7.0,1149.0,76534,Oncor Electric Delivery Co.
3,2023-04-13 20:16:17,2023-04-13 20:31:21,15.066667,22.5,45,33.75,7.0,1,7.0,1149.0,76534,Oncor Electric Delivery Co.
4,2023-04-17 14:16:19,2023-04-17 14:46:23,30.066667,37.5,60,48.75,5.0,1,5.0,1149.0,76534,Oncor Electric Delivery Co.
...,...,...,...,...,...,...,...,...,...,...,...,...
108284,2023-11-08 15:46:33,2023-11-08 17:31:29,104.933333,112.5,135,123.75,1.0,1,1.0,50.0,75158,Oncor Electric Delivery Co.
108285,2023-11-10 16:01:31,2023-11-10 19:31:27,209.933333,217.5,240,228.75,1.0,1,1.0,50.0,75158,Oncor Electric Delivery Co.
108286,2023-11-28 17:01:21,2023-11-28 21:46:29,285.133333,292.5,315,303.75,1.0,1,1.0,49.0,75158,Oncor Electric Delivery Co.
108287,2023-11-30 16:01:26,2023-11-30 18:16:35,135.150000,142.5,165,153.75,1.0,1,1.0,49.0,75158,Oncor Electric Delivery Co.


In [58]:
# Texas-New Mexico Power Co.
# No geo_map is passed, so the result keeps the raw 'name' column and gets no
# 'zipcode' / 'County' columns.
input_path = '/Users/xuanedx1/github/outage-data-scraper/data/s3/tx/layout_5/per_zipcode_Texas-New Mexico Power Co..csv'
texas_nm_power_z = run_pipepline(input_path)
texas_nm_power_z

Unnamed: 0,start_time,end_time,duration_by_diff,duration_est_min,duration_est_max,duration_est_mean,cust_a_mean,outage_cnt,outage_cnt_x_cust_a,cust_served_cnt_mean,name,EMC
0,2023-04-06 01:21:06,2023-04-06 01:21:06,0.000000,7.5,30,18.75,1.000000,1,1.000000,9049.0,77591,Texas-New Mexico Power Co.
1,2023-04-13 19:00:43,2023-04-13 21:30:43,150.000000,157.5,180,168.75,15.363636,2,30.727273,9048.0,77591,Texas-New Mexico Power Co.
2,2023-04-15 16:00:43,2023-04-15 17:15:44,75.016667,82.5,105,93.75,2.000000,1,2.000000,9046.0,77591,Texas-New Mexico Power Co.
3,2023-04-15 17:45:46,2023-04-15 23:00:46,315.000000,322.5,345,333.75,2.000000,1,2.000000,9046.0,77591,Texas-New Mexico Power Co.
4,2023-04-16 08:30:36,2023-04-16 11:15:40,165.066667,172.5,195,183.75,1.000000,1,1.000000,9048.0,77591,Texas-New Mexico Power Co.
...,...,...,...,...,...,...,...,...,...,...,...,...
10879,2024-01-11 19:01:19,2024-01-11 19:01:19,0.000000,7.5,30,18.75,1.000000,1,1.000000,40.0,76459,Texas-New Mexico Power Co.
10880,2024-01-11 19:31:19,2024-01-11 19:31:19,0.000000,7.5,30,18.75,1.000000,1,1.000000,40.0,76459,Texas-New Mexico Power Co.
10881,2024-01-11 20:01:21,2024-01-11 20:01:21,0.000000,7.5,30,18.75,1.000000,1,1.000000,40.0,76459,Texas-New Mexico Power Co.
10882,2024-01-11 20:31:30,2024-01-11 21:16:24,44.900000,52.5,75,63.75,1.000000,1,1.000000,40.0,76459,Texas-New Mexico Power Co.
