In [2]:
import pandas as pd
import ast
import pytz
import os
import json
import yaml
import glob
from dateutil import tz
from datetime import datetime
from IPython.display import display

pd.set_option('display.max_columns', None)

import pandas as pd
import ast
import pytz
import os
import json
import yaml
from dateutil import tz
from datetime import datetime
from IPython.display import display

pd.set_option('display.max_columns', None)

class BasePipeline:
    """Template for turning one utility's scraped outage CSV into a standard per-outage table.

    Subclasses implement ``transform``. After it runs, ``self._data`` must hold
    one row per snapshot with at least the columns ``outage_id``, ``timestamp``,
    ``start_time``, ``end_time`` (datetime-like), ``customer_affected``,
    ``zipcode``, ``lat`` and ``lng``. For ``state == 'ca'`` it must also provide
    ``utility_provider`` and ``county``, because ``standardize`` does not fill
    them in for that state.

    Args:
        config: Mapping with the keys ``type``, ``state``, ``layout`` and ``name``.
        base_file_path: Directory (or URL prefix) under which the scraped CSVs live.
    """

    # Minutes assumed to separate two consecutive snapshot rows of one outage.
    # NOTE(review): presumably the scraper's polling cadence -- confirm.
    SNAPSHOT_INTERVAL_MINUTES = 15

    def __init__(self, config, base_file_path):
        self.config = config
        self.base_file_path = base_file_path
        # Lookup tables filled by load_data(): {'zip_to_county_name': {...}, 'zip_to_county_fips': {...}}
        self.geomap = {}
        self._data = pd.DataFrame({})

    def construct_file_path(self):
        """Return the CSV location for this pipeline's state, layout and provider name.

        Doubled slashes produced by joining the pieces are collapsed, but the
        ``//`` of a URL scheme such as ``s3://`` is left untouched.
        """
        #TODO: add type to prefix mapping
        file_prefix = 'per_outage' if self.config['type'] == 'o' else 'per_county'
        file_path = (
            f"{self.base_file_path}/{self.config['state']}/layout_{self.config['layout']}"
            f"/{file_prefix}_{self.config['name']}.csv"
        )
        scheme, separator, remainder = file_path.partition('://')
        if not separator:
            return file_path.replace('//', '/')
        return scheme + separator + remainder.replace('//', '/')

    def load_data(self):
        """Read the raw CSV into ``self._data`` and the zip-code lookups into ``self.geomap``.

        The two JSON lookup files are opened relative to the current working
        directory. Any failure is printed and swallowed, leaving whatever was
        loaded so far in place.
        """
        # NOTE(review): swallowing every exception hides the real cause; a failed load
        # only surfaces later as a missing-column error in standardize().
        try:
            file_path = self.construct_file_path()
            print(file_path)
            self._data = pd.read_csv(file_path)
            with open('zip_to_county_name.json', 'r') as json_file:
                self.geomap['zip_to_county_name'] = json.load(json_file)
            with open('zip_to_county_fips.json', 'r') as json_file:
                self.geomap['zip_to_county_fips'] = json.load(json_file)
        except Exception as e:
            print(f"An error occurred during file loading: {e}")

    def transform(self):
        """Reshape the raw frame into the columns listed in the class docstring (subclass hook)."""
        raise NotImplementedError

    def standardize(self):
        """Load, transform and aggregate the data into one standardized row per outage.

        Returns:
            The standardized DataFrame, which is also stored in ``self._data``.
        """
        self.load_data()
        self.transform()
        grouped = self._data.groupby('outage_id').apply(self._compute_metrics).reset_index().round(2)
        # Keep only the snapshot whose timestamp equals the outage's final end_time,
        # so each outage contributes its last row alongside the computed metrics.
        self._data = pd.merge(grouped, self._data, on=['outage_id', 'timestamp'], how='inner')

        self._data['state'] = self.config['state']
        if self.config['state'] != 'ca':
            self._data['utility_provider'] = self.config['name']
            # geomap holds several lookup tables; county names live under
            # 'zip_to_county_name'. Mapping through geomap itself matched nothing.
            zip_to_county = self.geomap.get('zip_to_county_name', {})
            # JSON object keys are always strings, so compare zip codes as text
            # (and drop the '.0' pandas adds when a numeric column contains NaN).
            zip_keys = self._data['zipcode'].astype(str).str.replace(r'\.0$', '', regex=True)
            self._data['county'] = zip_keys.map(zip_to_county)

        self._data = self._data[[
            'utility_provider', 'state', 'county', 'zipcode',
            'outage_id', 'start_time', 'end_time', 'lat', 'lng',
            'duration', 'duration_max', 'duration_mean', 'customer_affected_mean',
            'total_customer_outage_time', 'total_customer_outage_time_max',
            'total_customer_outage_time_mean'
        ]]

        return self._data

    def output_data(self, standard_data):
        """Placeholder for writing the standardized data out; currently does nothing."""
        # TODO: Output unified data
        pass

    def get_dataframe(self):
        """Return the pipeline's current DataFrame."""
        return self._data

    def _compute_metrics(self, group):
        """Summarise the snapshot rows of one outage into duration and customer-time metrics.

        Uses the first and last rows of ``group`` positionally.
        NOTE(review): this assumes rows are ordered by ``timestamp`` -- confirm upstream.

        Returns:
            A Series with ``timestamp`` (the last ``end_time``, used as the merge key
            in ``standardize``), the duration figures in minutes, the mean number of
            customers affected and the customer-minutes totals.
        """
        interval = self.SNAPSHOT_INTERVAL_MINUTES
        customers = group['customer_affected']

        duration = (group['end_time'] - group['start_time']).dt.total_seconds() / 60
        # Upper bound: the outage may have lasted up to one more interval after the last row.
        duration_max = duration + interval
        duration_mean = (duration + duration_max) / 2
        customer_affected_mean = customers.mean()

        # Every row after the first counts for one full interval; the first row counts
        # for the time between the outage start and that first snapshot.
        minutes_before_first_snapshot = (
            group['timestamp'].iloc[0] - group['start_time'].iloc[0]
        ).total_seconds() / 60
        total_customer_outage_time = (
            interval * (customers.sum() - customers.iloc[0])
            + minutes_before_first_snapshot * customers.iloc[0]
        )
        total_customer_outage_time_max = total_customer_outage_time + interval * customers.iloc[-1]
        total_customer_outage_time_mean = (total_customer_outage_time + total_customer_outage_time_max) / 2

        return pd.Series({
            'timestamp': group['end_time'].iloc[-1],
            'duration': duration.iloc[-1],
            'duration_max': duration_max.iloc[-1],
            'duration_mean': duration_mean.iloc[-1],
            'customer_affected_mean': customer_affected_mean,
            'total_customer_outage_time': total_customer_outage_time,
            'total_customer_outage_time_max': total_customer_outage_time_max,
            'total_customer_outage_time_mean': total_customer_outage_time_mean
        })

    def check_vars(self):
        """Placeholder for sanity checks on pipeline variables; currently does nothing."""
        # TODO: Check other useful variables
        pass

In [4]:
class GA11TX12(BasePipeline):
    """Pipeline stub named for GA layout 11 / TX layout 12 (presumably; not implemented yet)."""

    def standardize(self, outage_data=None):
        """Placeholder override; currently does nothing and returns ``None``.

        ``outage_data`` is optional so this override can also be called with no
        arguments, like ``BasePipeline.standardize``.
        """
        # Specific transformation for GA11TX12
        pass

In [3]:
# Root of the local data mirror. Set OUTAGE_DATA_DIR to run this notebook on another
# machine; the default keeps the original author's location.
data_root = os.environ.get('OUTAGE_DATA_DIR', '/Users/xuanedx1/github/outage-data-scraper/data/s3')
df = pd.read_csv(os.path.join(data_root, 'ga', 'layout_11', 'per_outage_Mitchell EMC.csv'))

  df = pd.read_csv('/Users/xuanedx1/github/outage-data-scraper/data/s3/ga/layout_11/per_outage_Mitchell EMC.csv')


In [6]:
# Show the loaded frame; pandas abbreviates long frames to their first and last rows.
df


Unnamed: 0,substation,feeder,incident_id,alias,outage_comment,estimated_restore_time,formatted_ert,start_date,duration,consumers_affected,lon,lat,opt_code,service_index_name,outages,NumConsumers,zip_code,isHighTraffic,updateTime,EMC,timestamp,poly
0,0.0,0.0,G437006,,Meter Shop,,,03/15 07:14 am,09 hr 10 min,0,,,,Electric,1,25259,unknown,False,"Mar 15, 3 25, pm",Mitchell EMC,,
1,8.0,4.0,C483513,,,,,03/15 02:24 pm,02 hr 00 min,1,-84.374671,31.214144,,Electric,1,25259,31730,False,"Mar 15, 3 25, pm",Mitchell EMC,,
2,0.0,0.0,G437006,,Meter Shop,,,03/15 07:14 am,09 hr 25 min,0,,,,Electric,0,25259,unknown,False,"Mar 15, 3 40, pm",Mitchell EMC,,
3,0.0,0.0,G437006,,Meter Shop,,,03/15 07:14 am,09 hr 40 min,0,,,,Electric,0,25259,unknown,False,"Mar 15, 3 55, pm",Mitchell EMC,,
4,0.0,0.0,G437006,,Meter Shop,,,03/15 07:14 am,09 hr 55 min,0,,,,Electric,0,25259,unknown,False,"Mar 15, 4 10, pm",Mitchell EMC,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129141,4.0,4.0,D501452,,DAKOTA - MDR,,,02/02 03:59 pm,03 hr 26 min,1,-84.548668,31.377461,C- Other Planned,Electric,0,25448,39862,False,"Feb 2, 6 25, pm",Mitchell EMC,02-03-2024 00:25:38,"[{'lon': '-84.5475030825143', 'lat': '31.37716..."
129142,4.0,4.0,D501452,,DAKOTA - MDR,,,02/02 03:59 pm,03 hr 40 min,1,-84.548668,31.377461,C- Other Planned,Electric,0,25448,39862,False,"Feb 2, 6 40, pm",Mitchell EMC,02-03-2024 00:40:39,"[{'lon': '-84.5475030825143', 'lat': '31.37716..."
129143,4.0,4.0,D501452,,DAKOTA - MDR,,,02/02 03:59 pm,03 hr 55 min,1,-84.548668,31.377461,C- Other Planned,Electric,0,25448,39862,False,"Feb 2, 6 55, pm",Mitchell EMC,02-03-2024 00:55:38,"[{'lon': '-84.5475030825143', 'lat': '31.37716..."
129144,4.0,4.0,D501452,,DAKOTA - MDR,,,02/02 03:59 pm,04 hr 10 min,1,-84.548668,31.377461,C- Other Planned,Electric,0,25448,39862,False,"Feb 2, 7 10, pm",Mitchell EMC,02-03-2024 01:10:37,"[{'lon': '-84.5475030825143', 'lat': '31.37716..."


In [6]:
# Number of distinct incident ids; dropna=False counts a missing id as its own value.
df['incident_id'].nunique(dropna=False)

7649

In [12]:
# Non-null value count of every column, per incident.
rows_by_incident = df.groupby('incident_id')
rows_by_incident.count()

Unnamed: 0_level_0,substation,feeder,alias,outage_comment,estimated_restore_time,formatted_ert,start_date,duration,consumers_affected,lon,lat,opt_code,service_index_name,outages,NumConsumers,zip_code,isHighTraffic,updateTime,EMC,timestamp,poly
incident_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C483513,1,1,0,0,0,0,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0
C483521,1,1,0,0,0,0,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0
C483523,6,6,0,0,0,0,6,6,6,6,6,0,6,6,6,6,6,6,6,0,0
C483525,6,6,0,0,0,0,6,6,6,6,6,0,6,6,6,6,6,6,6,0,0
C483528,1,1,0,0,0,0,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D501452,18,18,0,18,0,0,18,18,18,18,18,18,18,18,18,18,18,18,18,18,18
G437006,28265,28265,0,28265,0,0,28265,28265,28265,10936,10936,0,28265,28265,28265,28265,28265,28265,28265,27035,19222
G484576,1,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
G495149,4,4,0,4,0,0,4,4,4,4,4,0,4,4,4,4,4,4,4,4,4


In [17]:
# Longest reported duration for each (incident_id, start_date) pair.
# The duration column holds text such as '09 hr 10 min' or '5 days 01 hr 32 min'
# (format taken from the values displayed in this notebook). Taking max() on the raw
# strings compares them alphabetically, so e.g. '1 days 00 hr 05 min' would lose to
# '23 hr 43 min'. Rank the rows by parsed minutes instead and report the original text.
duration_parts = df['duration'].str.extract(
    r'(?:(?P<days>\d+)\s*days?\s+)?(?P<hours>\d+)\s*hr\s+(?P<minutes>\d+)\s*min'
).astype(float)
duration_minutes = (
    duration_parts['days'].fillna(0) * 24 * 60
    + duration_parts['hours'] * 60
    + duration_parts['minutes']
)
(
    df.assign(duration_minutes=duration_minutes)
    # Unparseable durations sort first so they are chosen only if nothing else exists.
    .sort_values('duration_minutes', na_position='first')
    .groupby(['incident_id', 'start_date'])['duration']
    .last()
)

incident_id  start_date    
C483513      03/15 02:24 pm           02 hr 00 min
C483521      03/16 07:22 am           00 hr 02 min
C483523      03/16 01:27 pm           01 hr 42 min
C483525      03/17 06:04 am           01 hr 49 min
C483528      03/17 08:34 am           00 hr 05 min
                                      ...         
G437006      12/27 07:41 am           23 hr 43 min
             12/28 07:22 am    5 days 01 hr 32 min
G484576      04/14 10:25 am           00 hr 30 min
G495149      08/30 09:40 am           01 hr 00 min
G495228      08/30 11:12 am           05 hr 12 min
Name: duration, Length: 7941, dtype: object

In [20]:
# How many rows have the outages column equal to 1.
is_single_outage = df['outages'] == 1
int(is_single_outage.sum())

14470

In [14]:
# Number of incident ids that appear with more than one distinct start_date.
start_dates_per_incident = df.groupby('incident_id')['start_date'].nunique()
start_dates_per_incident.gt(1).sum()

25