In [None]:
import sys

import matplotlib.pyplot as plt
import pandas as pd
import seaborn

seaborn.set_theme()

# The `common` module is imported from the parent directory, which is not on
# the import path by default. Temporarily append ".." to sys.path (this does
# NOT change the working directory) and always undo it afterwards, even when
# the import or the load fails, so re-running this cell never leaves stray
# entries behind.
# NOTE: both ".." and the data path are relative, so this cell assumes the
# kernel's working directory is the notebook's own directory.
sys.path.append("..")
try:
    from common import get_dataframe_from_pipeline
    # The load stays inside the window in case the helper imports lazily.
    outages = get_dataframe_from_pipeline("../pipeline/3.csv.gz")
finally:
    # Remove the entry by value rather than pop(): pop() would drop the wrong
    # entry if the import itself had appended something to sys.path.
    sys.path.remove("..")

### Box plot of distances

In [None]:
# Horizontal box plot of the 'outageToSubstationDistance' column over all outages.
fig, ax = plt.subplots()
ax.boxplot(outages['outageToSubstationDistance'], vert=False)
ax.set_title('Distribution of distances of outages to the nearest station')
ax.set_xlabel('Distance (km)')
# A single box is drawn at position 1; give it a name instead of the bare number.
ax.set_yticks([1])
ax.set_yticklabels(['Outages'])
plt.show()

Zooming in on the box:

In [None]:
# Same distance box plot as before, with the x-axis clipped so the box itself
# is readable instead of being squashed by far-away outliers.
fig, ax = plt.subplots()
ax.boxplot(outages['outageToSubstationDistance'], vert=False)
ax.set_title('Distribution of distances of outages to the nearest station')
ax.set_xlabel('Distance (km)')
ax.set_yticks([1])
ax.set_yticklabels(['Outages'])
ax.set_xlim(0, 20)  # show only the 0-20 range of the distance axis
plt.show()

### Box plot of timeouts

In [None]:
# Outage duration in hours: (dateOn - dateOff) converted from a timedelta to
# seconds and then to hours. The vectorised `.dt` accessor replaces the
# row-by-row `.apply(lambda ...)`.
# The column is stored on `outages` because later cells read outages['timeOut'].
# NOTE(review): assumes 'dateOn' and 'dateOff' are datetime-typed so that their
# difference is a timedelta Series - confirm against the pipeline loader.
outages['timeOut'] = (outages['dateOn'] - outages['dateOff']).dt.total_seconds() / 3600

# Horizontal box plot of the outage durations.
fig, ax = plt.subplots()
ax.boxplot(outages['timeOut'], vert=False)
ax.set_title('Distribution of the duration of the outage')
ax.set_xlabel('Timeout (hour)')
# A single box is drawn at position 1; give it a name instead of the bare number.
ax.set_yticks([1])
ax.set_yticklabels(['Outages'])
plt.show()

Zooming in on the median:

In [None]:
# Same duration box plot, with the x-axis clipped to the first 20 hours so the
# box and its median are visible.
fig, ax = plt.subplots()
ax.boxplot(outages['timeOut'], vert=False)
ax.set_title('Distribution of the duration of the outage')
ax.set_xlabel('Timeout (hour)')
ax.set_yticks([1])
ax.set_yticklabels(['Outages'])
ax.set_xlim(0, 20)  # show only durations between 0 and 20 hours
plt.show()

### Timeouts for different regions

In [None]:
# One list of outage durations per region, indexed by region name.
grouped_data = outages.groupby('regionName')['timeOut'].apply(list)

# One horizontal box per region, in the order of the grouped index.
fig, ax = plt.subplots()
ax.boxplot(grouped_data, vert=False)
ax.set_title('Distribution of the duration of the outage')
ax.set_xlabel('Timeout (hour)')
ax.set_ylabel('Region')
# Boxes sit at positions 1..N; label each one with its region name.
ax.set_yticks(range(1, len(grouped_data) + 1))
ax.set_yticklabels(grouped_data.index)
plt.show()

### Zooming in on the median for each region

In [None]:
# One list of outage durations per region, indexed by region name.
grouped_data = outages.groupby('regionName')['timeOut'].apply(list)

# Per-region box plot, zoomed in so the medians can be compared.
fig, ax = plt.subplots()
ax.boxplot(grouped_data, vert=False)
ax.set_title('Distribution of the duration of the outage')
ax.set_xlabel('Timeout (hour)')
ax.set_ylabel('Region')
# Boxes sit at positions 1..N; label each one with its region name.
ax.set_yticks(range(1, len(grouped_data) + 1))
ax.set_yticklabels(grouped_data.index)
ax.set_xlim(0, 30)  # only viewing up to 30 hours (the axis is in hours)
plt.show()

Very interesting that the median lies all the way at the low end, while there are still so many outages with much larger timeouts.

In [None]:
# One list of outage durations per region, indexed by region name.
grouped_data = outages.groupby('regionName')['timeOut'].apply(list)

# Per-region box plot, zoomed in further on the low end of the duration axis.
fig, ax = plt.subplots()
ax.boxplot(grouped_data, vert=False)
ax.set_title('Distribution of the duration of the outage')
ax.set_xlabel('Timeout (hour)')
ax.set_ylabel('Region')
# Boxes sit at positions 1..N; label each one with its region name.
ax.set_yticks(range(1, len(grouped_data) + 1))
ax.set_yticklabels(grouped_data.index)
ax.set_xlim(0, 13)  # only viewing up to 13 hours
fig.tight_layout()
plt.show()

In [None]:
# Seaborn version of the per-region duration plot: one horizontal box per
# region, coloured by region (hue) with the redundant legend switched off.
fig, ax = plt.subplots()
seaborn.boxplot(
    data=outages,
    x='timeOut',
    y='regionName',
    hue='regionName',
    palette='viridis',
    legend=False,
    orient='h',
    ax=ax,
)
ax.set_title('Distribution of Outage Durations by Region', fontsize=14, fontweight='bold')
ax.set_xlabel('Timeout (hours)')
ax.set_ylabel('Region')
ax.set_xlim(0, 10)  # only viewing up to 10 hours
fig.tight_layout()
plt.show()


### Distances per region:

In [None]:
# One horizontal box per region showing the 'outageToSubstationDistance'
# values; the x-axis is left unclipped so every outlier remains visible.
fig, ax = plt.subplots()
seaborn.boxplot(
    data=outages,
    x='outageToSubstationDistance',
    y='regionName',
    orient='h',
    ax=ax,
)
ax.set_xlabel('Distance to Substation (Km)')
ax.set_ylabel('Region')
ax.set_title('Distribution of Distances per Region')
fig.tight_layout()
plt.show()