# Outage Distances Computation

Goal: Compute the distance from each power outage to its closest landmark feature of each kind
- There are two kinds of landmark features we care about:
  1. Substations
  2. Populated places

## Define Functions for Computing Distance

In [None]:
import pandas as pd
import numpy as np


def haversine(lat1, lon1, lat2, lon2):
  """
    Great-circle distance in km between (lat1, lon1) and (lat2, lon2).

    Coordinates are given in decimal degrees. Only NumPy functions and plain
    arithmetic are used, so scalars and arrays are both accepted and array
    arguments are combined following NumPy broadcasting.
  """
  # reference : exercise 3, calc_distance.py haversine formula
  # cosine-only form of the formula: https://stackoverflow.com/a/21623206
  earthRadiusKm = 6371
  degToRad = np.pi / 180

  # contribution of the latitude difference
  latTerm = 0.5 - np.cos((lat2 - lat1) * degToRad) / 2
  # contribution of the longitude difference, scaled by the cosine of both latitudes
  lonTerm = np.cos(lat1 * degToRad) * np.cos(lat2 * degToRad) * (1 - np.cos((lon2 - lon1) * degToRad)) / 2

  return 2 * earthRadiusKm * np.arcsin(np.sqrt(latTerm + lonTerm))


def distance_between_feature_outage(outages: pd.DataFrame, features: pd.DataFrame):
  """
    Pairwise haversine distances (km) between every outage and every feature.

    Both frames must provide "latitude" and "longitude" columns; the values
    are handed to haversine, which treats them as decimal degrees.

    Returns a 2-D array with one row per outage and one column per feature,
    i.e. element [i, j] is the distance from outage i to feature j.
  """
  # Column vectors of shape (n_outages, 1). Broadcasting them against the flat
  # (n_features,) arrays below produces the full (n_outages, n_features)
  # matrix in one vectorised call, without an explicit Python loop.
  outagesLat = outages["latitude"].to_numpy().reshape(-1, 1)
  outagesLong = outages["longitude"].to_numpy().reshape(-1, 1)

  return haversine(
    outagesLat,
    outagesLong,
    features["latitude"].to_numpy(),
    features["longitude"].to_numpy(),
  )


def shortest_distance(outages: pd.DataFrame, features: pd.DataFrame, featureName: str):
  """
    For every outage, find the closest feature and the distance to it.

    featureName is only used to label the output columns. The returned
    DataFrame has the columns "outage_id", "nearest_<featureName>_id" and
    "nearest_<featureName>_distance_km", and takes its index from outages["id"].
  """
  # one row per outage, one column per feature
  distanceMatrix = distance_between_feature_outage(outages, features)

  # smallest distance within each outage's row ...
  minDistancePerOutage = distanceMatrix.min(axis=1)
  # ... and the position of the feature that produced it
  nearestFeaturePositions = distanceMatrix.argmin(axis=1)

  # translate the positions into the ids of the corresponding features
  nearestFeatureIds = features["id"].values[nearestFeaturePositions]

  resultColumns = {
    "outage_id": outages["id"],
    f"nearest_{featureName}_id": nearestFeatureIds,
    f"nearest_{featureName}_distance_km": minDistancePerOutage,
  }
  return pd.DataFrame(resultColumns)

## Apply Functions

In [None]:
from common import get_dataframe_from_pipeline

# Outages come from ./pipeline/2.csv.gz (presumably the output of the previous pipeline step)
outages = get_dataframe_from_pipeline("./pipeline/2.csv.gz")

# Nearest substation for every outage
substations = pd.read_csv("./_datasets/BCSubstationLocations.csv")
closestSubstationToOutages = shortest_distance(
  outages=outages, features=substations, featureName="substation"
)

# Nearest populated place for every outage
populatedPlaces = pd.read_csv("./_datasets/BCPopulatedPlaces.csv")
closestPopulatedPlaceToOutages = shortest_distance(
  outages=outages, features=populatedPlaces, featureName="populated_place"
)

In [None]:
# Attach the nearest-substation and nearest-populated-place columns to each
# outage row (matched on the outage id), then switch to the output column names.
mergedData = (
  outages
  .join(
    closestSubstationToOutages.set_index("outage_id"),
    on="id",
    how="left",
    rsuffix="_substation",
  )
  .join(
    closestPopulatedPlaceToOutages.set_index("outage_id"),
    on="id",
    how="left",
    rsuffix="_populated_place",
  )
  .rename(columns={
    # outage attributes
    "id": "outageId",
    "regionId": "outageRegionId",
    "municipality": "outageMunicipality",
    "area": "outageArea",
    "cause": "outageCause",
    "latitude": "outageLatitude",
    "longitude": "outageLongitude",
    # nearest substation
    "nearest_substation_id": "nearestSubstationId",
    "nearest_substation_distance_km": "outageToSubstationDistance",
    # nearest populated place
    "nearest_populated_place_id": "nearestPopulatedPlaceId",
    "nearest_populated_place_distance_km": "outageToPopulatedPlaceDistance",
  })
)

# Write the enriched outages out as a gzip-compressed CSV without the index column
mergedData.to_csv("./pipeline/3.csv.gz", index=False, compression="gzip")