In [None]:
# Import the data from acquire

import pandas as pd
from common import timeStampFields, csvFileName

# Column names of the timestamp fields: the first element of every
# timeStampFields entry.
timeFields = [entry[0] for entry in timeStampFields]

# Load the acquired CSV, letting pandas parse the timestamp columns as dates.
pdOutages = pd.read_csv(csvFileName, header="infer", parse_dates=timeFields)

# Convert the time fields to America/Vancouver timezone
pdOutages = pdOutages.assign(
  **{
    name: pdOutages[name].dt.tz_convert("America/Vancouver")
    for name in timeFields
  }
)

### Merge Duplicates

Sometimes, outages are duplicated in BC Hydro's own data (i.e., the same outage is reported twice __with two different IDs__).

We should be able to fix this, though, by grouping all outages in the same area whose start and end times overlap.

In [None]:

# Running total of rows that were folded into an interval opened by an
# earlier row; updated by every matchIntervals() call.
removedDups = 0


def _epochSeconds(stamps: pd.Series) -> pd.Series:
  """Convert timestamps to float seconds since the epoch, mapping NaT to NaN."""
  return stamps.apply(
    lambda stamp: stamp.timestamp() if pd.notna(stamp) else float("nan")
  )


def matchIntervals(areaEntries: pd.DataFrame) -> pd.DataFrame:
  """Number the runs of consecutive, time-overlapping rows in one area.

  "dateOff" is used as the start of each row's time span and "dateOn" as its
  end. A row opens a new interval when the earlier of its own and the previous
  row's "dateOn" lies strictly before the later of the two "dateOff" values,
  i.e. when the two spans do not overlap; otherwise it shares the previous
  row's interval number. Numbering starts at 0.

  A missing (NaT) timestamp is ignored by the rolling min/max instead of
  raising, so a row without a "dateOn" is compared using its neighbour's
  value only.

  NOTE(review): only adjacent rows are compared, so the rows are presumably
  expected to arrive in chronological order -- confirm against the CSV.

  Args:
    areaEntries: rows with "dateOn" and "dateOff" timestamp columns.

  Returns:
    A new frame equal to ``areaEntries`` plus an integer "interval" column.
    The input frame is left unmodified.

  Side effects:
    Adds the number of rows that joined an already-open interval to the
    module-level ``removedDups`` counter.
  """
  global removedDups

  # Group into intervals
  # https://stackoverflow.com/a/48243958
  # rolling() needs numeric input, hence the conversion to epoch seconds.
  restoredAt = _epochSeconds(areaEntries["dateOn"])
  startedAt = _epochSeconds(areaEntries["dateOff"])

  # window=2 pairs each row with its predecessor; min_periods=1 lets the
  # first row (and rows next to a NaN) be evaluated on a single value.
  earliestRestore = restoredAt.rolling(window=2, min_periods=1).min()
  latestStart = startedAt.rolling(window=2, min_periods=1).max()

  # A negative gap means the pair does not overlap -> start a new interval.
  startsNewInterval = (earliestRestore - latestStart) < 0
  intervals = startsNewInterval.cumsum()

  removedDups += len(intervals) - intervals.nunique()

  # assign() returns a new frame, so the group handed in is not mutated.
  return areaEntries.assign(interval=intervals)


# Merge duplicated outages in two steps:
#   1. tag every row with the overlap-interval number computed per area by
#      matchIntervals();
#   2. collapse every (area, interval) group into a single row.
#
# Step 1 iterates the groups explicitly instead of using groupby().apply():
# apply() operating on the grouping column is deprecated since pandas 2.2,
# and without the "area" column in the applied frame the second groupby
# below could not find it after the index is discarded.
mergedPdOutages = (
  pd.concat(
    [matchIntervals(areaEntries) for _, areaEntries in pdOutages.groupby("area")],
    ignore_index=True,
  )
  .groupby(by=["area", "interval"])
  .aggregate(
    {
      "id": "first",  # Maybe not the best idea, perhaps combine ids?
      "gisId": "first",
      "regionId": "first",
      "municipality": "first",
      "area": "first",
      "cause": "first",
      "numCustomersOut": "max",
      "crewEta": "max",
      "crewEtr": "max",
      # The merged outage spans from the earliest "dateOff" to the latest
      # "dateOn" of its members.
      "dateOff": "min",
      "dateOn": "max",
      "estDateOn": "max",
      "lastUpdated": "max",
      "regionName": "first",
      "latitude": "first",
      "longitude": "first",
      "polygon": "last",
      "interval": "max"
    }
  )
  # The (area, interval) index is redundant: both are also kept as columns.
  .reset_index(drop=True)
)

print(f"Merged {removedDups} outages with overlapping start/end times and areas")