# Substation Distance Computation:

*Goal: compute the distance from each power outage to its closest substation.*

## Define Functions for Computing Distance

In [None]:
import pandas as pd
import numpy as np


def haversine(lat1, lon1, lat2, lon2):
  """Return the great-circle distance in kilometres between two points.

  Uses the haversine formula on a sphere of radius 6371 km
  (reference: exercise 3, calc_distance.py).

  Args:
    lat1: Latitude of the first point(s), in degrees.
    lon1: Longitude of the first point(s), in degrees.
    lat2: Latitude of the second point(s), in degrees.
    lon2: Longitude of the second point(s), in degrees.

  Scalars and NumPy arrays are both accepted; array arguments are combined
  with NumPy broadcasting, so an (n, 1) array against an (m,) array yields
  an (n, m) matrix of distances.

  Returns:
    The distance(s) in kilometres, with the broadcast shape of the inputs.
  """
  # Note : the radius of Earth is 6371 km
  radiusOfEarth = 6371.0
  dLat = np.radians(lat2 - lat1)
  dLon = np.radians(lon2 - lon1)
  formulaA = (np.sin(dLat / 2) ** 2) + np.cos(np.radians(lat1)) * np.cos(
    np.radians(lat2)
  ) * (np.sin(dLon / 2) ** 2)
  # Mathematically the term lies in [0, 1], but floating-point rounding can
  # push it a hair above 1 for (near-)antipodal points, which would turn
  # sqrt(1 - a) into NaN. Clamping keeps the result finite.
  formulaA = np.clip(formulaA, 0.0, 1.0)
  formulaC = 2 * np.arctan2(np.sqrt(formulaA), np.sqrt(1 - formulaA))
  return radiusOfEarth * formulaC


def distance_between_substation_outage(outages, substations):
  """Build the outage-by-substation distance matrix.

  Args:
    outages: DataFrame with "latitude" and "longitude" columns.
    substations: DataFrame with "latitude" and "longitude" columns.

  Returns:
    Whatever `haversine` produces when outage coordinates shaped (n, 1) are
    broadcast against substation coordinates shaped (m,): an (n, m) array
    with one row per outage and one column per substation.
  """
  # Selecting a one-column frame gives an (n, 1) array, i.e. a column vector
  # that broadcasts against the flat substation arrays below.
  outageLatColumn = outages[["latitude"]].values
  outageLonColumn = outages[["longitude"]].values

  stationLats = substations["latitude"].values
  stationLons = substations["longitude"].values

  # every outage paired with every substation
  return haversine(outageLatColumn, outageLonColumn, stationLats, stationLons)


def shortest_distance(outages, substations):
  """Find the closest substation for each outage.

  Args:
    outages: DataFrame with "id", "latitude" and "longitude" columns.
    substations: DataFrame with "id", "latitude" and "longitude" columns.
      Must contain at least one row.

  Returns:
    DataFrame with one row per outage and the columns "outage_id",
    "nearest_substation_id" and "distance_km".

  Raises:
    ValueError: If `substations` has no rows.
  """
  if len(substations) == 0:
    raise ValueError("substations must contain at least one row")

  distances = distance_between_substation_outage(outages, substations)

  # A NaN distance (missing coordinates) would otherwise win every argmin,
  # so a single substation without coordinates would be reported as the
  # nearest one for all outages. Rank NaN as +inf instead.
  rankable = np.where(np.isnan(distances), np.inf, distances)
  # for each outage, get the index of the shortest distance
  shortestDistanceIndexes = rankable.argmin(axis=1)

  # Read the distance back from the unmasked matrix in one gather (no second
  # reduction). An outage whose own coordinates are missing therefore still
  # reports NaN rather than inf.
  rowNumbers = np.arange(distances.shape[0])
  nearestDistances = distances[rowNumbers, shortestDistanceIndexes]

  # get the id of the substation corresponding to the shortest distance
  nearestSubstationIds = substations["id"].values[shortestDistanceIndexes]

  closestSubstationsToOutages = pd.DataFrame(
    {
      "outage_id": outages["id"],
      "nearest_substation_id": nearestSubstationIds,
      "distance_km": nearestDistances,
    }
  )

  return closestSubstationsToOutages

## Apply Functions

In [None]:
from common import get_dataframe_from_pipeline

# Load the outages produced by the previous pipeline step and the raw
# substation listing.
outages = get_dataframe_from_pipeline("./pipeline/2.csv.gz")
substations = pd.read_csv("./_datasets/BCSubstationLocations.csv")

# Substation columns that are discarded before the distance computation.
droppedSubstationColumns = [
  "state",
  "zip",
  "county",
  "countyfips",
  "country",
  "naics_code",
  "naics_desc",
  "source",
  "sourcedate",
  "val_method",
  "val_date",
  "lines",
  "max_volt",
  "min_volt",
  "max_infer",
  "min_infer",
  "status",
]
substations.drop(columns=droppedSubstationColumns, inplace=True)

# Cast the coordinates of both frames to float, outages first.
for coordinateFrame in (outages, substations):
  for coordinateColumn in ("latitude", "longitude"):
    coordinateFrame[coordinateColumn] = coordinateFrame[coordinateColumn].astype(float)

closestSubstationsToOutages = shortest_distance(outages, substations)

# Join each outage with its nearest-substation result, then with the
# attributes of that substation.
outages.rename(columns={"id": "outage_id"}, inplace=True)

mergedData = (
  outages.merge(closestSubstationsToOutages, on="outage_id", how="left")
  .merge(
    substations,
    left_on="nearest_substation_id",
    right_on="id",
    how="left",
    suffixes=("_outage", "_substation"),
  )
  # After the join, 'id' is the substation id, which duplicates the
  # 'nearest_substation_id' column; it is redundant and confusing, so drop it.
  .drop(columns=["id"])
)

mergedData

# Rename the merged columns to camelCase headers, grouped by where each
# column came from.
outageColumnNames = {
  "outage_id": "outageId",
  "regionId": "outageRegionId",
  "municipality": "outageMunicipality",
  "area": "outageArea",
  "cause": "outageCause",
  "latitude_outage": "outageLatitude",
  "longitude_outage": "outageLongitude",
}
nearestColumnNames = {
  "nearest_substation_id": "nearestSubstationId",
  "distance_km": "outageToSubstationDistance",
}
substationColumnNames = {
  "name": "substationName",
  "city": "substationCity",
  "type": "typeOfSubstation",
  "latitude_substation": "substationLatitude",
  "longitude_substation": "substationLongitude",
}
mergedData.rename(
  columns={**outageColumnNames, **nearestColumnNames, **substationColumnNames},
  inplace=True,
)

# Write the result as the next pipeline step's gzip-compressed CSV.
mergedData.to_csv("./pipeline/3.csv.gz", index=False, compression="gzip")

### Column headers were previously:
outage_id, gisId, regionId  , municipality                           , area                                                                                     , cause                                   , numCustomersOut, crewEta                  , dateOff                  , estDateOn                , lastUpdated              , regionName      , crewEtr                  , latitude_outage, longitude_outage, dateOn                   , nearest_substation_id, distance_km        , name                    , city                    , type          , latitude_substation, longitude_substation