# Outage Distances Computation

Goal: Compute the distance from each power outage to its nearest landmark feature.
- There are two kinds of landmark features we care about:
  1. Substations
  2. Populated places

## Define Functions for Computing Distance

In [None]:
import pandas as pd
import numpy as np


def haversine(lat1, lon1, lat2, lon2):
  """
    Returns the great-circle distance between two coordinate points in km.

    All four arguments are in decimal degrees (they are converted to radians
    below). They may be scalars or NumPy arrays; arrays are combined with the
    usual NumPy broadcasting rules, so passing an (n, 1) column for the first
    point and a flat (m,) array for the second yields an (n, m) distance matrix.
  """
  # reference : exercise 3, calc_distance.py haversine formula
  # Note : the radius of Earth is 6371 km
  # https://stackoverflow.com/a/21623206
  r = 6371 # km
  p = np.pi / 180 # degrees -> radians

  a = 0.5 - np.cos((lat2-lat1)*p)/2 + np.cos(lat1*p) * np.cos(lat2*p) * (1-np.cos((lon2-lon1)*p))/2
  # floating-point rounding can push `a` a hair outside [0, 1] for (nearly)
  # identical or antipodal points, which would turn sqrt/arcsin into NaN
  a = np.clip(a, 0.0, 1.0)
  # np.arcsin exists in every NumPy release; the np.asin alias is NumPy >= 2.0 only
  return 2 * r * np.arcsin(np.sqrt(a))


def distance_between_feature_outage(outages: pd.DataFrame, features: pd.DataFrame):
  """
    Returns the distance in km between every outage and every feature.

    Both dataframes must have "latitude" and "longitude" columns. The result
    has one row per outage and one column per feature, in the same order as
    the rows of the two inputs.
  """
  # outage coordinates become single-column 2-D arrays so that they broadcast
  # against the flat feature arrays and give every outage/feature pairing
  outageLatColumn = outages["latitude"].values.reshape(-1, 1)
  outageLongColumn = outages["longitude"].values.reshape(-1, 1)

  featureLat = features["latitude"].values
  featureLong = features["longitude"].values

  return haversine(outageLatColumn, outageLongColumn, featureLat, featureLong)


# finds the closest feature for each outage
def shortest_distance(outages: pd.DataFrame, features: pd.DataFrame, featureName: str):
  """
    Returns a dataframe with one row per outage and three columns:
      - "outage_id": the outage's "id"
      - "nearest_<featureName>_id": the "id" of the closest feature
      - "nearest_<featureName>_distance_km": the distance to that feature

    Both inputs need "id", "latitude" and "longitude" columns.
  """
  # rows are outages, columns are features
  pairwiseDistances = distance_between_feature_outage(outages, features)

  # position (within `features`) of the closest feature for each outage
  closestPositions = pairwiseDistances.argmin(axis=1)
  # and the distance to it
  closestDistances = pairwiseDistances.min(axis=1)
  # look the winning rows up positionally and keep only their ids
  closestFeatureIds = features.iloc[closestPositions]["id"].values

  idColumn = f"nearest_{featureName}_id"
  distanceColumn = f"nearest_{featureName}_distance_km"

  return pd.DataFrame(
    {
      "outage_id": outages["id"],
      idColumn: closestFeatureIds,
      distanceColumn: closestDistances,
    }
  )

## Apply Functions

In [None]:
from common import get_dataframe_from_pipeline

# load the outages from the pipeline file and the two landmark datasets
outages = get_dataframe_from_pipeline("./pipeline/2.csv.gz")
features = pd.read_csv("./_datasets/BCSubstationLocations.csv")
populatedPlaces = pd.read_csv("./_datasets/BCPopulatedPlaces.csv")

# drop these substation columns before the substations are used any further
features = features.drop(
  columns=[
    "state",
    "zip",
    "county",
    "countyfips",
    "country",
    "naics_code",
    "naics_desc",
    "source",
    "sourcedate",
    "val_method",
    "val_date",
    "lines",
    "max_volt",
    "min_volt",
    "max_infer",
    "min_infer",
    "status",
    "type"
  ],
)

# the distance functions read lowercase "latitude" / "longitude" columns
populatedPlaces = populatedPlaces.rename(
  columns={
    "Latitude": "latitude",
    "Longitude": "longitude",
  },
)

# nearest substation and nearest populated place for every outage
closestSubstationToOutages = shortest_distance(outages, features, "substation")
closestPopulatedPlaceToOutages = shortest_distance(outages, populatedPlaces, "populated_place")

# display the result in the notebook
closestPopulatedPlaceToOutages

In [None]:
# Merge dataframes together
# NOTE(review): only the substation results are merged and written out here;
# closestPopulatedPlaceToOutages is not part of mergedData - confirm whether
# the populated-place distances should be exported as well.
outages = outages.rename(columns={"id": "outage_id"})

mergedData = (
  outages
  # attach the nearest substation id and distance to every outage
  .merge(closestSubstationToOutages, on="outage_id", how="left")
  # then pull in the details of that substation; columns present on both
  # sides get the "_outage" / "_substation" suffixes
  .merge(
    features,
    left_on="nearest_substation_id",
    right_on="id",
    how="left",
    suffixes=("_outage", "_substation"),
  )
  # 'id' here is the substation id, which is identical to the
  # 'nearest_substation_id' column and is therefore redundant and confusing
  .drop(columns=["id"])
  .rename(columns={
    "outage_id": "outageId",
    "regionId": "outageRegionId",
    "municipality": "outageMunicipality",
    "area": "outageArea",
    "cause": "outageCause",

    "latitude_outage": "outageLatitude",
    "longitude_outage": "outageLongitude",

    "nearest_substation_id": "nearestSubstationId",
    "nearest_substation_distance_km": "outageToSubstationDistance",

    "name": "substationName",
    "city": "substationCity",
    "latitude_substation": "substationLatitude",
    "longitude_substation": "substationLongitude",
  })
)

# write the merged data out as a gzip-compressed CSV
mergedData.to_csv(
  "./pipeline/3.csv.gz", index=False, compression="gzip"
)

mergedData