In [None]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import spearmanr


# Make the parent directory importable just long enough to import `common`.
# This extends sys.path (the working directory itself is left untouched).
# Very hacky, but there literally isn't a simpler way (in Jupyter).
import sys
sys.path.append("..")
try:
  from common import get_dataframe_from_pipeline
  outages = get_dataframe_from_pipeline("../pipeline/3.csv.gz")
finally:
  # Always restore the import path, even if the import or the load fails.
  sys.path.pop()

# Outage duration in hours (dateOn minus dateOff).
# NOTE(review): `.dt` assumes dateOn/dateOff are datetime64 columns - confirm against the pipeline.
outages['timeOut'] = (outages['dateOn'] - outages['dateOff']).dt.total_seconds() / 3600

# Preview the new column as the cell output.
outages['timeOut'].head()

In [None]:
# Load the populated-places table, then keep only the identifier and the
# population-related columns that are joined onto the outages later.
population_csv = pd.read_csv("../_datasets/BCPopulatedPlaces.csv")
population_csv_filtered = population_csv.loc[
    :,
    [
        'id',
        'population density per square kilometre, 2021',
        'population, 2021',
        'generic term',
    ],
]

In [None]:
# Attach the population attributes of each outage's nearest populated place.
population_by_id = population_csv_filtered.set_index('id')
outages = (
    outages
    # Drop columns left over from a previous run of this cell so that re-running
    # it does not raise on overlapping column names (no-op on the first run).
    .drop(columns=population_by_id.columns, errors='ignore')
    .join(population_by_id, on='nearestPopulatedPlaceId')
)
outages.columns

### Grouping the outages into different "parts" to analyse later:

In [None]:
def getPart(part, data=None):
  """Summarise outages into one row per group for the requested "part".

  Parameters
  ----------
  part : str
    Which grouping to build:
    - "municipality": group by `outageMunicipality`
    - "region": group by `regionName`
    - "substation": group by `outageToSubstationDistance` rounded to 0 decimals
    - "populated": group by `outageToPopulatedPlaceDistance` rounded to 0 decimals
    - "urban": group by `outageToUrbanAreaDistance` rounded to 0 decimals
  data : pandas.DataFrame, optional
    Outage records to summarise. Defaults to the notebook-level `outages` frame.
    Must contain the three distance columns above plus `timeOut` and
    `numCustomersOut`, and the grouping column for the chosen part.

  Returns
  -------
  pandas.DataFrame
    The group key as the first column, followed by `outage_count`,
    `mean_populated_distance`, `mean_substation_distance`,
    `mean_urban_distance`, `mean_timeout` and `total_people_affected`.

  Raises
  ------
  ValueError
    If `part` is not one of the supported names.

  Notes
  -----
  For the three distance-based parts the bin column (`subBin`, `placeBin` or
  `urbanAreaBin`) is written onto the frame being summarised, so it stays
  available on that frame after the call.
  """
  # part name -> (column to group by, distance column to bin or None)
  groupings = {
    "municipality": ("outageMunicipality", None),
    "region": ("regionName", None),
    "substation": ("subBin", "outageToSubstationDistance"),
    "populated": ("placeBin", "outageToPopulatedPlaceDistance"),
    "urban": ("urbanAreaBin", "outageToUrbanAreaDistance"),
  }
  if part not in groupings:
    raise ValueError(
      f"Unknown part {part!r}; expected one of {sorted(groupings)}"
    )

  frame = outages if data is None else data
  group_col, distance_col = groupings[part]

  if distance_col is not None:
    # Bin by rounding the distance to the nearest whole unit
    # (presumably kilometres - confirm against the pipeline).
    frame[group_col] = frame[distance_col].round(0)

  return frame.groupby(group_col).agg(
    # 'count' only counts rows with a non-null populated-place distance
    outage_count=('outageToPopulatedPlaceDistance', 'count'),
    mean_populated_distance=('outageToPopulatedPlaceDistance', 'mean'),
    mean_substation_distance=('outageToSubstationDistance', 'mean'),
    mean_urban_distance=('outageToUrbanAreaDistance', 'mean'),
    mean_timeout=('timeOut', 'mean'),
    total_people_affected=('numCustomersOut', 'sum'),
  ).reset_index()




The function below takes one of the grouped "parts" (municipality, region, or a distance bin) and analyses the data:

In [None]:
def analyse(df):
  """Print a normality check and Spearman rank correlations for a grouped summary.

  Parameters
  ----------
  df : pandas.DataFrame
    Grouped outage summary with the columns `outage_count`,
    `total_people_affected`, `mean_substation_distance`,
    `mean_populated_distance`, `mean_urban_distance` and `mean_timeout`.

  Returns
  -------
  None
    All results are printed.

  Notes
  -----
  A derived `outage_per_person` column (`outage_count` divided by
  `total_people_affected`) is added to `df` in place.
  """
  df['outage_per_person'] = df['outage_count'].div(df['total_people_affected'])

  # The normality test is skipped below 8 rows to avoid an error on tiny samples.
  if len(df.index) >= 8:
    normality_p = stats.normaltest(df['outage_count']).pvalue
    print(f"Outage count normal test p-value: {normality_p:.4f}")

  # (heading printed before the result, first column, second column)
  comparisons = (
    ("Spearman test  (mean substation distance vs outage count)",
     'mean_substation_distance', 'outage_count'),
    ("Spearman test  (mean substation distance vs outage per person)",
     'mean_substation_distance', 'outage_per_person'),
    ("Spearman test  (mean substation distance vs mean outage duration)",
     'mean_timeout', 'mean_substation_distance'),
    ("Spearman test  (mean populated distance vs outage counts)",
     'mean_populated_distance', 'outage_count'),
    ("Spearman test  (mean populated place distance vs outage per person)",
     'mean_populated_distance', 'outage_per_person'),
    ("Spearman test  (mean populated place distance vs mean outage duration)",
     'mean_timeout', 'mean_populated_distance'),
    ("Spearman test  (mean urban distance vs outage count)",
     'mean_urban_distance', 'outage_count'),
    ("Spearman test  (mean urban distance vs mean outage duration)",
     'mean_urban_distance', 'mean_timeout'),
    ("Spearman test  (mean urban distance vs outage per person)",
     'mean_urban_distance', 'outage_per_person'),
  )

  for heading, first_col, second_col in comparisons:
    print(heading)
    coefficient, p_value = spearmanr(df[first_col], df[second_col])
    print(f"Spearman correlation: {coefficient:.3f}, p-value: {p_value:.4f}")


In [None]:
# Outages binned by rounded outage-to-urban-area distance, then the correlation report.
df = getPart("urban")
analyse(df)

In [None]:
# Outages binned by rounded outage-to-populated-place distance, then the correlation report.
df = getPart('populated')
analyse(df)

In [None]:
# Outages binned by rounded outage-to-substation distance, then the correlation report.
df = getPart('substation')
analyse(df)

In [None]:
# Outages grouped by municipality, then the correlation report.
df = getPart('municipality')
analyse(df)

In [None]:
# Outages grouped by region, then the correlation report.
df = getPart('region')
analyse(df)