In [18]:
import pandas as pd
import ast
import pytz
import os
import json
import yaml
import glob
from dateutil import tz
from datetime import datetime
from IPython.display import display

pd.set_option('display.max_columns', None)

class BasePipeline:
    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        self.geomap = {}
        self._data = pd.DataFrame({})
    
    def construct_file_path(self):
        #TODO: add type to prefix mapping
        file_prefix = 'per_outage' if self.config['type'] == 'o' else 'per_county'
        file_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/{file_prefix}_{self.config['name']}.csv"
        return file_path.replace('//', '/')

    def load_data(self):
        try:
            file_path = self.construct_file_path()
            print(file_path)
            self._data = pd.read_csv(file_path)
            with open('zip_to_county_name.json', 'r') as json_file:
                self.geomap['zip_to_county_name'] = json.load(json_file)
            with open('zip_to_county_fips.json', 'r') as json_file:
                self.geomap['zip_to_county_fips'] = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")
            
    def transform(self):
        raise NotImplementedError

    def standardize(self):
        self.load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')
        
        self._data['state'] = self.config['state']
        if self.config['state'] != 'ca':
            self._data['utility_provider'] = self.config['name'] 
            self._data['county'] = self._data['zipcode'].map(self.geomap) 
        
        self._data = self._data[[
            'utility_provider', 'state', 'county', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng', 
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean', 'total_customer_outage_time', 'total_customer_outage_time_max', 'total_customer_outage_time_mean'
        ]]
        
        return self._data
    
    def output_data(self, standard_data):
        # TODO: Output unified data
        pass
    
    def get_dataframe(self):
        return self._data
    
    def _compute_metrics(self, group):
        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        duration_max = duration + 15
        duration_mean = (duration + duration_max) / 2
        customer_affected_mean = group['customer_affected'].mean()
        
        total_customer_outage_time = 15 * (group['customer_affected'].sum() - group['customer_affected'].iloc[0]) + (group['timestamp'].iloc[0] - group['start_time'].iloc[0]).total_seconds() / 60 * group['customer_affected'].iloc[0]
        total_customer_outage_time_max = total_customer_outage_time + 15 * group['customer_affected'].iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })
        
    def check_vars(self):
        # TODO: Check other useful variables
        pass

In [19]:
class CA1(BasePipeline):
    def load_data(self):
        try:
            dir_path = f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}/"
            csv_files = glob.glob(os.path.join(dir_path, "*.csv"))
            df_list = [pd.read_csv(file) for file in csv_files]
            self._data = pd.concat(df_list, ignore_index=True)
            
            with open('zip_to_county_name.json', 'r') as json_file:
                self.geomap['zip_to_county_name'] = json.load(json_file)
            with open('zip_to_county_fips.json', 'r') as json_file:
                self.geomap['zip_to_county_fips'] = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")
    
    def transform(self):
        try:
            # Convert timestamps
            eastern = tz.gettz('US/Eastern')
            utc = tz.gettz('UTC')
            self._data['StartDate'] = pd.to_datetime(self._data['StartDate'], utc=True).dt.tz_convert(eastern)
            self._data['timestamp'] = pd.to_datetime(self._data['timestamp'], utc=True).dt.tz_convert(eastern)
            
            # Since there's no direct 'end_time' in the new dataset, assuming 'EstimatedRestoreDate' serves a similar purpose
            self._data['end_time'] = self._data.groupby('OBJECTID')['timestamp'].transform('max')
            
            self._data['zipcode'] = "000000"
            
            self._data = self._data.rename(columns={
                'x': 'lat',
                'y': 'lng',
                'OBJECTID': 'outage_id',
                'StartDate': 'start_time',
                'ImpactedCustomers': 'customer_affected',
                'UtilityCompany': 'utility_provider',
                'County': 'county'
            })
        except Exception as e:
            print(f"An error occurred during transformation: {e}")

class CA2(BasePipeline):
    def tranform(self):
        # Specific transformation for CA2
        pass

In [20]:
with open('/Users/xuanedx1/github/outage-data-scraper/app/pipeline/config.yaml', 'r') as file:
    config = yaml.safe_load(file)
    base_file_path = config['globals']['LOCAL_FILE_BASE_PATH']

# Instantiate a BasePipeline object for each provider in the configuration
for provider in config['providers']:
    pipeline = CA1(provider, base_file_path)
    pipeline.standardize()
    display(pipeline._data)

  df_list = [pd.read_csv(file) for file in csv_files]


Unnamed: 0,utility_provider,state,county,zipcode,outage_id,start_time,end_time,lat,lng,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean
0,LAWP,ca,LOS ANGELES,000000,2164236,2023-03-31 06:47:03.414000-04:00,2023-03-31 07:05:06-04:00,-118.442245,34.150712,18.04,33.04,25.54,9.17,20491.80,20506.80,20499.30
1,LAWP,ca,LOS ANGELES,000000,2164360,2023-03-31 06:47:03.414000-04:00,2023-03-31 07:05:06-04:00,-118.352399,34.079090,18.04,33.04,25.54,2.30,5146.80,5161.80,5154.30
2,LAWP,ca,LOS ANGELES,000000,2164361,2023-03-31 06:47:03.414000-04:00,2023-03-31 07:05:06-04:00,-118.442245,34.150712,18.04,33.04,25.54,7.93,17716.80,17731.80,17724.30
3,LAWP,ca,LOS ANGELES,000000,2164432,2023-03-31 05:46:44.425000-04:00,2023-03-31 05:50:06-04:00,-118.352399,34.079090,3.36,18.36,10.86,13.04,28171.80,28186.80,28179.30
4,LAWP,ca,LOS ANGELES,000000,2167231,2023-03-30 03:04:00.284000-04:00,2023-03-30 03:05:07-04:00,-118.298986,34.001976,1.11,16.11,8.61,2.68,1497.57,1512.57,1505.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840091,PGE,ca,SANTA CRUZ,000000,29416381,2024-02-02 22:03:07-05:00,2024-02-02 22:21:09-05:00,-122.036320,37.167310,18.03,33.03,25.53,256.00,4616.53,8456.53,6536.53
2840092,PGE,ca,HUMBOLDT,000000,29416382,2024-02-02 22:06:35-05:00,2024-02-02 22:21:09-05:00,-123.873390,41.350810,14.57,29.57,22.07,1.00,14.57,29.57,22.07
2840093,PGE,ca,MONTEREY,000000,29416383,2024-02-02 22:06:54-05:00,2024-02-02 22:21:09-05:00,-121.610240,36.677880,14.25,29.25,21.75,65.00,926.25,1901.25,1413.75
2840094,PGE,ca,ALAMEDA,000000,29416384,2024-02-02 22:08:00-05:00,2024-02-02 22:21:09-05:00,-122.274790,37.842250,13.15,28.15,20.65,1.00,13.15,28.15,20.65


In [21]:
result = pipeline.get_dataframe()

In [22]:
result

Unnamed: 0,utility_provider,state,county,zipcode,outage_id,start_time,end_time,lat,lng,duration,duration_max,duration_mean,customer_affected_mean,total_customer_outage_time,total_customer_outage_time_max,total_customer_outage_time_mean
0,LAWP,ca,LOS ANGELES,000000,2164236,2023-03-31 06:47:03.414000-04:00,2023-03-31 07:05:06-04:00,-118.442245,34.150712,18.04,33.04,25.54,9.17,20491.80,20506.80,20499.30
1,LAWP,ca,LOS ANGELES,000000,2164360,2023-03-31 06:47:03.414000-04:00,2023-03-31 07:05:06-04:00,-118.352399,34.079090,18.04,33.04,25.54,2.30,5146.80,5161.80,5154.30
2,LAWP,ca,LOS ANGELES,000000,2164361,2023-03-31 06:47:03.414000-04:00,2023-03-31 07:05:06-04:00,-118.442245,34.150712,18.04,33.04,25.54,7.93,17716.80,17731.80,17724.30
3,LAWP,ca,LOS ANGELES,000000,2164432,2023-03-31 05:46:44.425000-04:00,2023-03-31 05:50:06-04:00,-118.352399,34.079090,3.36,18.36,10.86,13.04,28171.80,28186.80,28179.30
4,LAWP,ca,LOS ANGELES,000000,2167231,2023-03-30 03:04:00.284000-04:00,2023-03-30 03:05:07-04:00,-118.298986,34.001976,1.11,16.11,8.61,2.68,1497.57,1512.57,1505.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840091,PGE,ca,SANTA CRUZ,000000,29416381,2024-02-02 22:03:07-05:00,2024-02-02 22:21:09-05:00,-122.036320,37.167310,18.03,33.03,25.53,256.00,4616.53,8456.53,6536.53
2840092,PGE,ca,HUMBOLDT,000000,29416382,2024-02-02 22:06:35-05:00,2024-02-02 22:21:09-05:00,-123.873390,41.350810,14.57,29.57,22.07,1.00,14.57,29.57,22.07
2840093,PGE,ca,MONTEREY,000000,29416383,2024-02-02 22:06:54-05:00,2024-02-02 22:21:09-05:00,-121.610240,36.677880,14.25,29.25,21.75,65.00,926.25,1901.25,1413.75
2840094,PGE,ca,ALAMEDA,000000,29416384,2024-02-02 22:08:00-05:00,2024-02-02 22:21:09-05:00,-122.274790,37.842250,13.15,28.15,20.65,1.00,13.15,28.15,20.65


In [3]:
df = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/s3/ca/layout_investor/per_outage_investor_owned.csv')

In [9]:
df.head()

Unnamed: 0,OBJECTID,UtilityCompany,StartDate,EstimatedRestoreDate,Cause,ImpactedCustomers,County,OutageStatus,OutageType,GlobalID,OutageTypeColor,OutageStatusColor,IncidentId,x,y,timestamp
0,2164236,LAWP,2023-03-29 21:48:19.243,2023-03-29 20:00:00,Unknown,1,LOS ANGELES,Active,Not Planned,84a9a053-1659-4302-a48b-e882df433da3,#ffaa00,,6,-118.442245,34.150712,03-29-2023 22:05:07
1,2164360,LAWP,2023-03-29 21:48:19.243,2023-03-29 17:30:00,Unknown,1,LOS ANGELES,Active,Not Planned,8be57f57-ff46-4563-9fc0-dff6d2dfdcca,#ffaa00,,7,-118.352399,34.07909,03-29-2023 22:05:07
2,2164361,LAWP,2023-03-29 21:48:19.243,2023-03-29 17:30:00,Unknown,1,LOS ANGELES,Active,Not Planned,9c21f562-6a0f-453a-bd9e-075d233298cf,#ffaa00,,8,-118.442245,34.150712,03-29-2023 22:05:07
3,2164432,LAWP,2023-03-29 21:48:19.243,2023-03-29 21:30:00,Unknown,1,LOS ANGELES,Active,Not Planned,096422f7-eb86-4099-a5f0-a7a2715a7df7,#ffaa00,,9,-118.352399,34.07909,03-29-2023 22:05:07
4,2167231,LAWP,2023-03-29 21:48:19.243,,Unknown,7,LOS ANGELES,Active,Not Planned,839f6d96-4d8c-4c2b-9b31-35b16492a4f9,#ffaa00,,5,-118.298986,34.001976,03-29-2023 22:05:07


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1029470 entries, 0 to 1029469
Data columns (total 16 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   OBJECTID              1029470 non-null  int64  
 1   UtilityCompany        1029470 non-null  object 
 2   StartDate             1029470 non-null  object 
 3   EstimatedRestoreDate  921070 non-null   object 
 4   Cause                 818813 non-null   object 
 5   ImpactedCustomers     1029470 non-null  int64  
 6   County                1027143 non-null  object 
 7   OutageStatus          1029470 non-null  object 
 8   OutageType            1029470 non-null  object 
 9   GlobalID              1029470 non-null  object 
 10  OutageTypeColor       1029470 non-null  object 
 11  OutageStatusColor     0 non-null        float64
 12  IncidentId            1029470 non-null  object 
 13  x                     1029470 non-null  float64
 14  y                     1029470 non-

In [7]:
len(df.OBJECTID.unique()) == len(df.GlobalID.unique())

True

In [10]:
def transform_data(data):
    try:
        # Convert timestamps
        eastern = tz.gettz('US/Eastern')
        utc = tz.gettz('UTC')
        data['timestamp'] = pd.to_datetime(data['timestamp'], utc=True).dt.tz_convert(eastern)
        data['StartDate'] = pd.to_datetime(data['StartDate'], utc=True).dt.tz_convert(eastern)
        data['end_time'] = data.groupby('OBJECTID')['timestamp'].transform('max')
        
        # Rename columns according to new specification
        data = data.rename(columns={
            'OBJECTID': 'outage_id',
            'StartDate': 'start_time',
            'ImpactedCustomers': 'customer_affected',
            'x': 'lat',
            'y': 'lng'
        })
        
        # The dataset already contains 'lat' and 'lng' so no need to extract from 'outagePoint'
        
    except Exception as e:
        print(f"An error occurred during transformation: {e}")
    
    return data

In [11]:
transform_data(df)

Unnamed: 0,outage_id,UtilityCompany,start_time,EstimatedRestoreDate,Cause,customer_affected,County,OutageStatus,OutageType,GlobalID,OutageTypeColor,OutageStatusColor,IncidentId,lat,lng,timestamp,end_time
0,2164236,LAWP,2023-03-29 17:48:19.243000-04:00,2023-03-29 20:00:00,Unknown,1,LOS ANGELES,Active,Not Planned,84a9a053-1659-4302-a48b-e882df433da3,#ffaa00,,6,-118.442245,34.150712,2023-03-29 18:05:07-04:00,2023-03-31 07:05:06-04:00
1,2164360,LAWP,2023-03-29 17:48:19.243000-04:00,2023-03-29 17:30:00,Unknown,1,LOS ANGELES,Active,Not Planned,8be57f57-ff46-4563-9fc0-dff6d2dfdcca,#ffaa00,,7,-118.352399,34.079090,2023-03-29 18:05:07-04:00,2023-03-31 07:05:06-04:00
2,2164361,LAWP,2023-03-29 17:48:19.243000-04:00,2023-03-29 17:30:00,Unknown,1,LOS ANGELES,Active,Not Planned,9c21f562-6a0f-453a-bd9e-075d233298cf,#ffaa00,,8,-118.442245,34.150712,2023-03-29 18:05:07-04:00,2023-03-31 07:05:06-04:00
3,2164432,LAWP,2023-03-29 17:48:19.243000-04:00,2023-03-29 21:30:00,Unknown,1,LOS ANGELES,Active,Not Planned,096422f7-eb86-4099-a5f0-a7a2715a7df7,#ffaa00,,9,-118.352399,34.079090,2023-03-29 18:05:07-04:00,2023-03-31 05:50:06-04:00
4,2167231,LAWP,2023-03-29 17:48:19.243000-04:00,,Unknown,7,LOS ANGELES,Active,Not Planned,839f6d96-4d8c-4c2b-9b31-35b16492a4f9,#ffaa00,,5,-118.298986,34.001976,2023-03-29 18:05:07-04:00,2023-03-30 03:05:07-04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029465,2229613,PGE,2023-06-04 09:50:00-04:00,2023-06-04 15:30:00,,1,SANTA CRUZ,Active,Not Planned,414a4e73-c299-4e1b-ac11-e5f0440fde7b,#ffaa00,,2128302,-121.997820,37.153850,2023-06-04 10:21:40-04:00,2023-06-04 10:21:40-04:00
1029466,2229614,PGE,2023-06-04 07:44:10-04:00,2023-06-04 17:45:00,,28,SONOMA,Active,Not Planned,84b3903b-90ca-4695-b482-1100b3bde112,#ffaa00,,2128303,-123.063650,38.350130,2023-06-04 10:21:40-04:00,2023-06-04 10:21:40-04:00
1029467,2229615,PGE,2023-06-04 09:50:00-04:00,2023-06-04 17:30:00,,1589,MADERA,Active,Not Planned,56e81e6c-8dfa-40c2-90d6-70cf998bd265,#ffaa00,,2128307,-119.710260,37.228700,2023-06-04 10:21:40-04:00,2023-06-04 10:21:40-04:00
1029468,2229616,SDGE,2023-06-04 03:11:00-04:00,2023-06-04 15:00:00,Upgrading the electric system in your area req...,1,SAN DIEGO,Active,Planned,dadc8429-7131-43e3-8d31-b6a4e89a140a,#a8a800,,1011382,-116.675996,33.247166,2023-06-04 10:21:40-04:00,2023-06-04 10:21:40-04:00
