### Tests relating to the different regions:

In [None]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np

# Allow importing from the parent directory by temporarily appending it to sys.path
# Very hacky, but there isn't a simpler way to do this in Jupyter
import sys
sys.path.append("..")
from common import get_dataframe_from_pipeline
outages = get_dataframe_from_pipeline("../pipeline/3.csv.gz")
# Outage duration in hours (dateOn - dateOff).
# The vectorised .dt accessor replaces a per-row Python lambda; pd.to_timedelta
# leaves timedelta64 data unchanged and guards against the subtraction yielding
# an object-dtype column of Timedelta values.
outages['timeOut'] = (
    pd.to_timedelta(outages['dateOn'] - outages['dateOff']).dt.total_seconds() / 3600
)
# Drop the path back down after import
sys.path.pop()

### Timeouts per region:

In [None]:
# Collect every outage duration into one list per region
timeouts_by_region = outages.groupby('regionName')['timeOut']
grouped_data = timeouts_by_region.apply(list)

# One row per region: the region name plus its list of durations
grouped_df = grouped_data.rename('timeOutList').reset_index()
grouped_df

### Checking if the timeouts for different regions are normally distributed:
It seems that they are very far from normal.

In [None]:
# Run scipy's normality test on each region's durations and print the p-value.
# Iterating over the rows themselves (instead of a hard-coded range(7)) keeps
# the loop correct whatever number of regions the data contains.
for region, timeouts in zip(grouped_df['regionName'], grouped_df['timeOutList']):
    p = stats.normaltest(timeouts).pvalue
    print(region, f"{p:.50f}")

### Transforming the timeouts:

In [None]:
# Natural-log transform of every region's durations, computed on the whole
# list at once rather than element by element.
# NOTE(review): np.log yields -inf for 0 and NaN for negative values — confirm
# that all durations are strictly positive before relying on these results.
grouped_df['logTimeOutList'] = grouped_df['timeOutList'].apply(
    lambda timeouts: np.log(timeouts).tolist()
)
# Check whether the transformed timeouts are normal.
# Iterates over the actual rows rather than assuming exactly 7 regions.
for region, log_timeouts in zip(grouped_df['regionName'], grouped_df['logTimeOutList']):
    p = stats.normaltest(log_timeouts).pvalue
    print(region, f"{p:.50f}")

### T-test: 
Comparing Lower Mainland timeouts to North VI gave a significant result, indicating that Lower Mainland indeed has shorter timeouts than North VI!
### Important: We cannot use the results of this test because Welch's t-test still assumes normality!

In [None]:
# The two samples under comparison: rows labelled 1 and 2 of grouped_df
first_sample = grouped_df['timeOutList'][1]
second_sample = grouped_df['timeOutList'][2]

# One-sided Welch's t-test (equal_var=False, alternative='less')
welch_result = stats.ttest_ind(
    first_sample,
    second_sample,
    equal_var=False,
    alternative='less',
)
t_stat, p_value = welch_result

# Levene's test for equality of variances between the same two samples
levene_p = stats.levene(first_sample, second_sample).pvalue
print("Levene Test p-value:", levene_p)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

### Anova test?
But ANOVA requires normality!
### We will use Kruskal instead
It is a non-parametric alternative to ANOVA! Below, since the p-value is very low, we reject the null hypothesis and proceed to perform a pairwise post-hoc test to see which regions differ.

In [None]:
# Kruskal-Wallis H-test across all regions: each region's list of durations
# is passed as a separate sample
groups = list(grouped_df['timeOutList'])
kruskal_result = stats.kruskal(*groups)
h_stat, p_value = kruskal_result
print(f"H-statistic: {h_stat}, p-value: {p_value}")

### Pairwise Games-Howell post-hoc test:
"Although rather similar to Tukey's test in its formulation, the Games-Howell test does not assume equal variances and sample sizes."
It is also a non-parametric test, so it doesn't need normality.
<p>
It is interesting that the p-value between Lower Mainland and Northern VI agrees with the p-value I got above (with the t-test), but it is not nearly as small!

In [None]:
import pingouin as pg

# Games-Howell pairwise comparison of outage duration between regions
gameshowell_raw = pg.pairwise_gameshowell(
    data=outages,
    dv='timeOut',
    between='regionName',
)
# Round to 4 decimals for readability, then persist the table to disk
tests_regions = gameshowell_raw.round(4)
tests_regions.to_csv("regions_compared.csv")

In [None]:
import plotly.graph_objects as go
df = tests_regions

# Build the header and body of the Plotly table separately, then assemble
table_header = dict(
    values=list(df.columns),
    fill_color='paleturquoise',
    align='left',
)
table_cells = dict(
    values=[df[col] for col in df.columns],  # one entry per column
    fill_color='lavender',
    align='left',
)
results_table = go.Table(header=table_header, cells=table_cells)
fig = go.Figure(data=[results_table])

fig.show()