In [49]:
import pandas as pd
import ast
import pytz
import os
import json
import yaml
from dateutil import tz
from datetime import datetime
from IPython.display import display

pd.set_option('display.max_columns', None)

class BasePipeline:
    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        self.geomap = {}
        self._data = pd.DataFrame({})
    
    def construct_file_path(self):
        #TODO: add type to prefix mapping
        file_prefix = 'per_outage' if self.config['type'] == 'o' else 'per_county'
        file_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/{file_prefix}_{self.config['name']}.csv"
        return file_path.replace('//', '/')

    def load_data(self):
        # Implementation for loading data
        try:
            file_path = self.construct_file_path()
            print(file_path)
            self._data = pd.read_csv(file_path)
            with open('zip_to_county_name.json', 'r') as json_file:
                self.geomap['zip_to_county_name'] = json.load(json_file)
            with open('zip_to_county_fips.json', 'r') as json_file:
                self.geomap['zip_to_county_fips'] = json.load(json_file)
            with open('zip_to_state_name.json', 'r') as json_file:
                self.geomap['zip_to_state_name'] = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")
            
    def transform(self):
        # Base transformation method
        raise NotImplementedError

    def standardize(self):
        # Base transformation method
        self.load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')
        
        self._data['utility_provider'] = self.config['name']
        self._data['state'] = self.config['state']
        self._data['county_name'] = self._data['zipcode'].map(self.geomap['zip_to_county_name'])
        self._data['county_fips'] = self._data['zipcode'].map(self.geomap['zip_to_county_fips'])
        
        self._data = self._data[[
            'utility_provider', 'state', 'county_name', 'county_fips', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng', 
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean', 'total_customer_outage_time', 'total_customer_outage_time_max', 'total_customer_outage_time_mean'
        ]]
        
        return self._data
    
    def output_data(self, standard_data):
        # TODO: Output unified data
        pass
    
    def get_dataframe(self):
        return self._data
    
    def _compute_metrics(self, group):
        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        duration_max = duration + 15
        duration_mean = (duration + duration_max) / 2
        customer_affected_mean = group['customer_affected'].mean()
        
        total_customer_outage_time = 15 * (group['customer_affected'].sum() - group['customer_affected'].iloc[0]) + (group['timestamp'].iloc[0] - group['start_time'].iloc[0]).total_seconds() / 60 * group['customer_affected'].iloc[0]
        total_customer_outage_time_max = total_customer_outage_time + 15 * group['customer_affected'].iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })
        
    def check_vars(self):
        # TODO: Check other useful variables
        pass


class GA1TX8(BasePipeline):
    def transform(self):
        try:
            # Convert timestamps
            eastern = tz.gettz('US/Eastern')
            utc = tz.gettz('UTC')
            self._data['timestamp'] = pd.to_datetime(self._data['timestamp'], utc=True).dt.tz_convert(eastern)
            self._data['outageStartTime'] = pd.to_datetime(self._data['outageStartTime'], utc=True).dt.tz_convert(eastern)
            self._data['end_time'] = self._data.groupby('outageRecID')['timestamp'].transform('max')
            
            # extract lat and long
            self._data['outagePoint'] = self._data['outagePoint'].apply(lambda x: json.loads(x.replace("'", '"')))
            self._data[['lat', 'lng']] = self._data['outagePoint'].apply(lambda x: pd.Series([x['lat'], x['lng']]))
            self._data = self._data.rename(columns={
                'outageRecID':'outage_id',
                'outageStartTime': 'start_time',
                'customersOutNow':'customer_affected',
                'zip':'zipcode'
            })
        except Exception as e:
            print(f"An error occurred during transformation: {e}")

In [50]:
with open('/Users/xuanedx1/github/outage-data-scraper/app/pipeline/config.yaml', 'r') as file:
    config = yaml.safe_load(file)
    base_file_path = config['globals']['LOCAL_FILE_BASE_PATH']

# Instantiate a BasePipeline object for each provider in the configuration
for provider in config['providers']:
    pipeline = GA1TX8(provider, base_file_path)
    pipeline.standardize()
    display(pipeline._data)


/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_1/per_outage_Washington EMC.csv


  self._data = pd.read_csv(file_path)


Unnamed: 0,utility_provider,state,county_name,county_fips,zipcode,outage_id,start_time,end_time,lat,lng,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean
0,Washington EMC,ga,Baldwin,13009.0,31061,2023-03-15-0268,2023-03-15 16:30:29-04:00,NaT,33.138102,-83.155119,,,,3.0,,,
1,Washington EMC,ga,,,unknown,2023-04-13-0173,2023-04-13 08:49:32-04:00,2023-04-13 11:24:15-04:00,32.967522,-82.914201,154.72,169.72,162.22,1.0,451.32,466.32,458.82
2,Washington EMC,ga,Washington,13303.0,31082,2023-04-13-0174,2023-04-13 09:12:19-04:00,2023-04-13 10:39:09-04:00,32.902566,-82.969101,86.83,101.83,94.33,1.0,263.53,278.53,271.03
3,Washington EMC,ga,Washington,13303.0,31082,2023-04-13-0175,2023-04-13 09:16:54-04:00,2023-04-13 10:39:09-04:00,32.903070,-82.970877,82.25,97.25,89.75,1.0,258.95,273.95,266.45
4,Washington EMC,ga,,,unknown,2023-04-13-0176,2023-04-13 09:18:57-04:00,2023-04-13 13:22:00-04:00,33.155703,-83.115466,243.05,258.05,250.55,1.0,706.92,721.92,714.42
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4814,Washington EMC,ga,Baldwin,13009.0,31061,2024-02-02-0017,2024-02-02 10:03:49-05:00,2024-02-02 11:05:52-05:00,33.126038,-83.134496,62.05,77.05,69.55,3.0,186.20,231.20,208.70
4815,Washington EMC,ga,Baldwin,13009.0,31061,2024-02-02-0019,2024-02-02 10:51:32-05:00,2024-02-02 11:05:52-05:00,33.112744,-83.186230,14.33,29.33,21.83,4.0,57.33,117.33,87.33
4816,Washington EMC,ga,,,unknown,2024-02-02-0022,2024-02-02 11:19:03-05:00,2024-02-02 11:50:52-05:00,32.739141,-82.808476,31.82,46.82,39.32,3.0,95.50,140.50,118.00
4817,Washington EMC,ga,,,unknown,2024-02-02-0023,2024-02-02 13:53:51-05:00,2024-02-02 16:35:52-05:00,32.738211,-82.625030,162.02,177.02,169.52,2.0,324.07,354.07,339.07


/Users/xuanedx1/github/outage-data-scraper/data/s3/tx/layout_8/per_outage_Bluebonnet Electric Coop, Inc..csv


Unnamed: 0,utility_provider,state,county_name,county_fips,zipcode,outage_id,start_time,end_time,lat,lng,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean
0,"Bluebonnet Electric Coop, Inc.",tx,,,77633,2023-04-06-0284,2023-04-06 01:49:40-04:00,2023-04-06 10:54:48-04:00,30.111487,-96.422005,545.13,560.13,552.63,3.0,1635.40,1680.40,1657.90
1,"Bluebonnet Electric Coop, Inc.",tx,Washington,48477.0,77426,2023-04-06-0315,2023-04-06 02:55:33-04:00,2023-04-06 10:54:48-04:00,30.129897,-96.210223,479.25,494.25,486.75,2.0,958.50,988.50,973.50
2,"Bluebonnet Electric Coop, Inc.",tx,Washington,48477.0,77426,2023-04-06-0316,2023-04-06 02:51:17-04:00,2023-04-06 10:54:48-04:00,30.130576,-96.207740,483.52,498.52,491.02,1.0,483.52,498.52,491.02
3,"Bluebonnet Electric Coop, Inc.",tx,Washington,48477.0,77833,2023-04-06-0364,2023-04-06 05:13:24-04:00,2023-04-06 10:54:48-04:00,30.074719,-96.389438,341.40,356.40,348.90,1.0,341.40,356.40,348.90
4,"Bluebonnet Electric Coop, Inc.",tx,Washington,48477.0,77426,2023-04-06-0385,2023-04-06 06:08:23-04:00,2023-04-06 10:54:48-04:00,30.141756,-96.283367,286.42,301.42,293.92,1.0,286.42,301.42,293.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10929,"Bluebonnet Electric Coop, Inc.",tx,Bastrop,48021.0,78953,2024-02-02-0103,2024-02-02 21:39:20-05:00,2024-02-02 22:25:10-05:00,29.890107,-97.350345,45.83,60.83,53.33,1.0,45.83,60.83,53.33
10930,"Bluebonnet Electric Coop, Inc.",tx,Caldwell,48055.0,78616,2024-02-02-0104,2024-02-02 21:41:16-05:00,2024-02-02 22:25:10-05:00,29.903496,-97.505146,43.90,58.90,51.40,1.0,43.90,58.90,51.40
10931,"Bluebonnet Electric Coop, Inc.",tx,Bastrop,48021.0,78662,2024-02-02-0106,2024-02-02 21:50:09-05:00,2024-02-02 21:55:10-05:00,29.872914,-97.488201,5.02,20.02,12.52,1.0,5.02,20.02,12.52
10932,"Bluebonnet Electric Coop, Inc.",tx,Caldwell,48055.0,78616,2024-02-02-0108,2024-02-02 21:52:49-05:00,2024-02-02 22:25:10-05:00,29.903196,-97.505283,32.35,47.35,39.85,1.0,32.40,47.40,39.90


------
**dev**

In [173]:
df = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_1/per_outage_Amicalola EMC.csv')

  df = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_1/per_outage_Amicalola EMC.csv')


In [174]:
df.head()

Unnamed: 0,outageRecID,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,timestamp,zip,EMC
0,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,04-13-2023 19:50:53,30705,Amicalola EMC
1,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,04-13-2023 19:54:01,30705,Amicalola EMC
2,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13T15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,04-13-2023 19:54:01,30536,Amicalola EMC
3,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13T15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,04-13-2023 19:58:02,30705,Amicalola EMC
4,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13T15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,04-13-2023 19:58:02,30536,Amicalola EMC


In [175]:
df.isna().sum()

outageRecID                     0
outageName                      0
outagePoint                     0
outageStartTime                 0
estimatedTimeOfRestoral    247410
outageEndTime              244075
verified                        0
cause                      246589
crewAssigned                    0
customersOutInitially           0
customersOutNow                 0
customersRestored               0
streetsAffected                 0
outageModifiedTime              0
outageWorkStatus           247379
timestamp                       0
zip                             0
EMC                             0
dtype: int64

In [176]:
len(df[df['outageRecID'] != df['outageName']])

0

In [177]:
len(df[df['customersOutInitially'] < df['customersOutNow']])

13

In [178]:
len(df[df['customersOutInitially'] == df['customersRestored']])

2673

In [179]:
def transform(df):
    # Convert timestamps
    eastern = tz.gettz('US/Eastern')
    utc = tz.gettz('UTC')
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True).dt.tz_convert(eastern)
    df['outageStartTime'] = pd.to_datetime(df['outageStartTime'], utc=True).dt.tz_convert(eastern)
    df['end_time'] = df.groupby('outageRecID')['timestamp'].transform('max')
    
    # extract lat and long
    df['outagePoint'] = df['outagePoint'].apply(lambda x: json.loads(x.replace("'", '"')))
    df[['lat', 'lng']] = df['outagePoint'].apply(lambda x: pd.Series([x['lat'], x['lng']]))
    
    return df

In [180]:
df = transform(df)

In [181]:
df

Unnamed: 0,outageRecID,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,timestamp,zip,EMC,end_time,lat,lng
0,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:50:53-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
1,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:54:01-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
2,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 15:54:01-04:00,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477
3,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:58:02-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
4,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 15:58:02-04:00,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247405,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02 18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-02 19:36:12-05:00,30143,Amicalola EMC,2024-02-02 20:36:11-05:00,34.374642,-84.443316
247406,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02 18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-02 19:51:11-05:00,30143,Amicalola EMC,2024-02-02 20:36:11-05:00,34.374642,-84.443316
247407,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02 18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-02 20:06:11-05:00,30143,Amicalola EMC,2024-02-02 20:36:11-05:00,34.374642,-84.443316
247408,2024-02-02-0025,2024-02-02-0025,"{'lat': 34.374641655676704, 'lng': -84.4433162...",2024-02-02 18:36:10-05:00,,,False,,True,3,3,0,"['LOWER DOWDA MILL RD 1052', 'LOWER DOWDA MILL...",2024-02-02T18:40:53.5800000-05:00,,2024-02-02 20:21:12-05:00,30143,Amicalola EMC,2024-02-02 20:36:11-05:00,34.374642,-84.443316


In [96]:
# len(t[t['outageStartTime'] > df['timestamp']])

0

In [164]:
def compute_metrics(group):
    duration = (group['end_time'] - group['outageStartTime']).dt.total_seconds() / 60
    # duration_min_est = 15 * (len(group) - 1) + 7.5
    # duration_max_est = 15 * (len(group) + 1)
    # duration_mean_est = (duration_|min_est + duration_max_est) / 2
    duration_max = duration + 15
    duration_mean = (duration + duration_max) / 2
    customer_affected_mean = group['customersOutNow'].mean()
    
    total_customer_outage_time = 15 * (group['customersOutNow'].sum() - group['customersOutNow'].iloc[0]) + (group['timestamp'].iloc[0] - group['outageStartTime'].iloc[0]).total_seconds() / 60 * group['customersOutNow'].iloc[0]
    total_customer_outage_time_max = total_customer_outage_time + 15 * group['customersOutNow'].iloc[-1]
    total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2
    
    # total_customer_outage_time_a1 = 15 * (group['customersOutNow'].sum() - group['customersOutNow'].iloc[0]) + 1 * group['customersOutNow'].iloc[0]
    # total_customer_outage_time_a2 = 15 * (group['customersOutNow'].sum() - group['customersOutNow'].iloc[0]) + 7.5 * group['customersOutNow'].iloc[0]
    # total_customer_outage_time_b = 15 * group['customersOutNow'].sum()
    # total_customer_outage_time_c = 15 * group['customersOutNow'].sum() + 15 * group['customersOutNow'].iloc[0] + 15 * group['customersOutNow'].iloc[-1]
    # total_customer_outage_time_d = (cust_out_time_est_a1 + cust_out_time_est_a2 + cust_out_time_est_b + cust_out_time_est_c) / 4

    return pd.Series({
        'timestamp': group['end_time'].iloc[-1],
        'duration': duration.iloc[-1],
        'duration_max': duration_max.iloc[-1],
        'duration_mean': duration_mean.iloc[-1],
        'customer_affected_mean': customer_affected_mean,
        'total_customer_outage_time': total_customer_outage_time,
        'total_customer_outage_time_max': total_customer_outage_time_max,
        'total_customer_outage_time_mean': total_customer_outage_time_mean
    })

In [165]:
df.head()

Unnamed: 0,outageRecID,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,timestamp,zip,EMC,end_time,lat,lng
0,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:50:53-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
1,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:54:01-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
2,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 15:54:01-04:00,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477
3,2023-04-13-1040,2023-04-13-1040,"{'lat': 34.812996172514474, 'lng': -84.6029181...",2023-04-13 15:44:42-04:00,,,False,,False,2,2,0,"['COHUTTA FOREST RD 145', 'OLD CCC CAMP RD 112...",2023-04-13T15:44:59-04:00,,2023-04-13 15:58:02-04:00,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.812996,-84.602918
4,2023-04-13-1041,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,False,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:48:10-04:00,,2023-04-13 15:58:02-04:00,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477


In [166]:
df_grouped = df.groupby('outageRecID').apply(compute_metrics).reset_index().round(2)

In [167]:
df_grouped

Unnamed: 0,outageRecID,timestamp,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean
0,2023-04-13-1040,2023-04-13 19:58:21-04:00,253.65,268.65,261.15,16.50,12357.37,12762.37,12559.87
1,2023-04-13-1041,2023-04-13 17:24:17-04:00,96.48,111.48,103.98,1.00,276.22,291.22,283.72
2,2023-04-13-1042,2023-04-13 22:39:07-04:00,402.12,417.12,409.62,1.00,1178.90,1193.90,1186.40
3,2023-04-13-1043,2023-04-13 18:24:19-04:00,133.30,148.30,140.80,5.81,2252.33,2342.33,2297.33
4,2023-04-13-1044,2023-04-13 16:54:13-04:00,12.58,27.58,20.08,2.00,63.17,93.17,78.17
...,...,...,...,...,...,...,...,...,...
9046,2024-02-02-0018,2024-02-02 14:36:23-05:00,68.38,83.38,75.88,5.00,341.67,416.67,379.17
9047,2024-02-02-0019,2024-02-02 14:51:14-05:00,72.38,87.38,79.88,31.00,2246.47,2711.47,2478.97
9048,2024-02-02-0022,2024-02-02 16:51:15-05:00,51.57,66.57,59.07,1.00,51.53,66.53,59.03
9049,2024-02-02-0024,2024-02-02 17:36:26-05:00,47.37,62.37,54.87,1.00,47.18,62.18,54.68


In [168]:
df_merged = pd.merge(df_grouped, df, on=['outageRecID', 'timestamp'], how='inner')

In [169]:
df_merged

Unnamed: 0,outageRecID,timestamp,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean,outageName,outagePoint,outageStartTime,estimatedTimeOfRestoral,outageEndTime,verified,cause,crewAssigned,customersOutInitially,customersOutNow,customersRestored,streetsAffected,outageModifiedTime,outageWorkStatus,zip,EMC,end_time,lat,lng
0,2023-04-13-1040,2023-04-13 19:58:21-04:00,253.65,268.65,261.15,16.50,12357.37,12762.37,12559.87,2023-04-13-1040,"{'lat': 34.815493607300034, 'lng': -84.5984642...",2023-04-13 15:44:42-04:00,,,False,400 Decay/age of material/equipment,True,27,27,0,"['CHATSWORTH HWY', 'COHUTTA FOREST 401', 'COHU...",2023-04-13T19:28:09-04:00,,30705,Amicalola EMC,2023-04-13 19:58:21-04:00,34.815494,-84.598464
1,2023-04-13-1041,2023-04-13 17:24:17-04:00,96.48,111.48,103.98,1.00,276.22,291.22,283.72,2023-04-13-1041,"{'lat': 34.65504147851155, 'lng': -84.25847692...",2023-04-13 15:47:48-04:00,,,False,,True,1,1,0,['ROCKWATER RD 215'],2023-04-13T15:57:30.3270000-04:00,,30536,Amicalola EMC,2023-04-13 17:24:17-04:00,34.655041,-84.258477
2,2023-04-13-1042,2023-04-13 22:39:07-04:00,402.12,417.12,409.62,1.00,1178.90,1193.90,1186.40,2023-04-13-1042,"{'lat': 34.76022275601547, 'lng': -84.43656079...",2023-04-13 15:57:00-04:00,,,False,,True,1,1,0,['POCASET DR 150'],2023-04-13T20:08:15-04:00,,30540,Amicalola EMC,2023-04-13 22:39:07-04:00,34.760223,-84.436561
3,2023-04-13-1043,2023-04-13 18:24:19-04:00,133.30,148.30,140.80,5.81,2252.33,2342.33,2297.33,2023-04-13-1043,"{'lat': 34.64094744509421, 'lng': -84.27010785...",2023-04-13 16:11:01-04:00,,,False,,True,6,6,0,"['NEWBERRY DR 364', 'NEWBERRY DR 626', 'NEWBER...",2023-04-13T17:24:18.9530000-04:00,,unknown,Amicalola EMC,2023-04-13 18:24:19-04:00,34.640947,-84.270108
4,2023-04-13-1044,2023-04-13 16:54:13-04:00,12.58,27.58,20.08,2.00,63.17,93.17,78.17,2023-04-13-1044,"{'lat': 34.400304776099645, 'lng': -84.2707052...",2023-04-13 16:41:38-04:00,,,False,,False,2,2,0,"['LOON LN 25', 'LOVELADY RD 505']",2023-04-13T16:42:03-04:00,,30107,Amicalola EMC,2023-04-13 16:54:13-04:00,34.400305,-84.270705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9046,2024-02-02-0018,2024-02-02 14:36:23-05:00,68.38,83.38,75.88,5.00,341.67,416.67,379.17,2024-02-02-0018,"{'lat': 34.714668737578464, 'lng': -84.4774361...",2024-02-02 13:28:00-05:00,,,False,,True,5,5,0,"['BOARDTOWN RD 1064', 'BOARDTOWN RD 1066', 'VA...",2024-02-02T13:33:18.5070000-05:00,,30540,Amicalola EMC,2024-02-02 14:36:23-05:00,34.714669,-84.477436
9047,2024-02-02-0019,2024-02-02 14:51:14-05:00,72.38,87.38,79.88,31.00,2246.47,2711.47,2478.97,2024-02-02-0019,"{'lat': 34.54793254252141, 'lng': -84.04290642...",2024-02-02 13:38:51-05:00,,,False,,True,31,31,0,"['HOMER EDWARDS RD 385', 'HOMER EDWARDS RD 425...",2024-02-02T13:47:42-05:00,,30533,Amicalola EMC,2024-02-02 14:51:14-05:00,34.547933,-84.042906
9048,2024-02-02-0022,2024-02-02 16:51:15-05:00,51.57,66.57,59.07,1.00,51.53,66.53,59.03,2024-02-02-0022,"{'lat': 34.612873511729674, 'lng': -84.0811178...",2024-02-02 15:59:41-05:00,,,False,,True,1,1,0,['SUNRISE RIDGE/ETOWAH TRL'],2024-02-02T16:03:33.6600000-05:00,,unknown,Amicalola EMC,2024-02-02 16:51:15-05:00,34.612874,-84.081118
9049,2024-02-02-0024,2024-02-02 17:36:26-05:00,47.37,62.37,54.87,1.00,47.18,62.18,54.68,2024-02-02-0024,"{'lat': 34.54980869474821, 'lng': -84.04115022...",2024-02-02 16:49:04-05:00,,,False,,True,1,1,0,['HORTON RD 90'],2024-02-02T16:54:59.6570000-05:00,,30533,Amicalola EMC,2024-02-02 17:36:26-05:00,34.549809,-84.041150


In [40]:
df = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/uszips.csv')

In [42]:
df

Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,601,18.18027,-66.75266,Adjuntas,PR,Puerto Rico,True,,16834.0,100.9,72001,Adjuntas,"{""72001"": 98.73, ""72141"": 1.27}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,602,18.36075,-67.17541,Aguada,PR,Puerto Rico,True,,37642.0,479.2,72003,Aguada,"{""72003"": 100}",Aguada,72003,False,False,America/Puerto_Rico
2,603,18.45744,-67.12225,Aguadilla,PR,Puerto Rico,True,,49075.0,551.7,72005,Aguadilla,"{""72005"": 99.76, ""72099"": 0.24}",Aguadilla|Moca,72005|72099,False,False,America/Puerto_Rico
3,606,18.16585,-66.93716,Maricao,PR,Puerto Rico,True,,5590.0,48.7,72093,Maricao,"{""72093"": 82.27, ""72153"": 11.66, ""72121"": 6.06}",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,610,18.29110,-67.12243,Anasco,PR,Puerto Rico,True,,25542.0,265.7,72011,Añasco,"{""72011"": 96.7, ""72099"": 2.81, ""72083"": 0.37, ...",Añasco|Moca|Las Marías|Aguada,72011|72099|72083|72003,False,False,America/Puerto_Rico
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33783,99923,55.98043,-130.03803,Hyder,AK,Alaska,True,,25.0,0.6,2198,Prince of Wales-Hyder,"{""02198"": 100}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33784,99925,55.55398,-132.96276,Klawock,AK,Alaska,True,,920.0,6.6,2198,Prince of Wales-Hyder,"{""02198"": 100}",Prince of Wales-Hyder,02198,False,False,America/Sitka
33785,99926,55.12617,-131.48928,Metlakatla,AK,Alaska,True,,1465.0,4.3,2198,Prince of Wales-Hyder,"{""02198"": 100}",Prince of Wales-Hyder,02198,False,False,America/Metlakatla
33786,99927,56.33305,-133.60044,Point Baker,AK,Alaska,True,,14.0,1.2,2198,Prince of Wales-Hyder,"{""02198"": 100}",Prince of Wales-Hyder,02198,False,False,America/Sitka


In [43]:
zipcode_to_county_name = df.set_index('zip')['county_name'].to_dict()

json_file_path = '/Users/xuanedx1/github/outage-data-scraper/app/pipeline/zip_to_county_map.json'  # Path for the output JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(zipcode_to_county_name, json_file)

In [45]:
zipcode_to_county_fips = df.set_index('zip')['county_fips'].to_dict()

json_file_path = '/Users/xuanedx1/github/outage-data-scraper/app/pipeline/zip_to_county_fips.json'  # Path for the output JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(zipcode_to_county_fips, json_file)

In [44]:
zipcode_to_state_map = df.set_index('zip')['state_name'].to_dict()

json_file_path = '/Users/xuanedx1/github/outage-data-scraper/app/pipeline/zip_to_state_map.json'  # Path for the output JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(zipcode_to_state_map, json_file)