In [None]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import sys
from scipy.stats import spearmanr

# Temporarily expose the parent directory so the shared `common` module can be imported.
sys.path.append("..")
from common import get_dataframe_from_pipeline

outages = get_dataframe_from_pipeline("../pipeline/3.csv.gz")

# `timeOut` is the difference `dateOn - dateOff`, converted from seconds to hours.
outage_duration = outages['dateOn'] - outages['dateOff']
outages['timeOut'] = outage_duration.apply(lambda delta: delta.total_seconds() / 3600)

# Remove the temporary path entry now that the import and load are done.
sys.path.pop()

Keep only the `id`, `Population density per square kilometre, 2021`, `Population, 2021` and `Generic Term` columns, so they can be joined onto the outages data.

In [None]:
# Columns of the populated-places table that are carried over to the outages data.
population_columns = [
    'id',
    'Population density per square kilometre, 2021',
    'Population, 2021',
    'Generic Term',
]

population_csv = pd.read_csv("../_datasets/BCPopulatedPlaces.csv")
# `filter(items=...)` keeps the listed columns that exist and ignores any that are missing.
population_csv_filtered = population_csv.filter(items=population_columns)

In [None]:
# Attach the populated-place attributes to every outage, matching the outage's
# `nearestPopulatedPlaceId` against the place `id`.
place_attributes = population_csv_filtered.set_index('id')

# Drop attribute columns left over from a previous run of this cell first:
# without this, re-running the cell raises because the joined columns overlap.
outages = (
    outages
    .drop(columns=place_attributes.columns, errors='ignore')
    .join(place_attributes, on='nearestPopulatedPlaceId')
)
outages.columns


### Checking normality:

In [None]:
# D'Agostino-Pearson normality test on the raw distances; a small p-value rejects normality.
raw_distance_normality = stats.normaltest(outages['outageToPopulatedPlaceDistance'])
raw_distance_normality.pvalue

In [None]:
# Same normality test after a log transform of the distances.
# NOTE(review): a distance of exactly 0 would become -inf here — confirm none are present.
log_distance = np.log(outages['outageToPopulatedPlaceDistance'])
stats.normaltest(log_distance).pvalue

### Trying distance bins again, similar to tests-distance.ipynb

In [None]:
n = 1000  # number of equal-width distance bins

# Assign every outage to one of `n` distance intervals.
distance_bins = pd.cut(outages['outageToPopulatedPlaceDistance'], bins=n)
outages['distance_bin'] = distance_bins

# Count outages per bin. `observed=False` is spelled out so that empty bins are
# kept with a count of 0 regardless of the pandas version's default; the tests
# further down compare per-bin counts and would change if empty bins were dropped.
outages_per_dist = (
    outages
    .groupby(['distance_bin'], observed=False)
    .size()
    .reset_index(name="# of outages")
)
outages_per_dist

In [None]:
# Largest outage-to-populated-place distance in the data, for reference against the bins.
max_distance = outages['outageToPopulatedPlaceDistance'].max()
max_distance

### The results of Welch's t-test still cannot be trusted, because the test assumes normality

In [None]:
# Split the bins at the middle bin index (n // 2): bins below it are "closer",
# the rest are "farther". Note this is the midpoint of the bin range, not a median.
median_bin = n // 2

count_col = '# of outages'
bin_codes = outages_per_dist['distance_bin'].cat.codes

# One-column frames of per-bin outage counts for each half (later cells index them by column).
closer_outages = outages_per_dist.loc[bin_codes < median_bin, [count_col]].reset_index(drop=True)
farther_outages = outages_per_dist.loc[bin_codes >= median_bin, [count_col]].reset_index(drop=True)

far_counts = farther_outages[count_col]
near_counts = closer_outages[count_col]

# Welch's t-test (unequal variances), one-sided: H1 is that farther bins have more outages.
t_stat, p_value = stats.ttest_ind(far_counts, near_counts, equal_var=False, alternative='greater')

# Levene's test for equal variances, reported alongside the t-test.
levene_result = stats.levene(far_counts, near_counts)
print("Levene Test p-value:", levene_result.pvalue)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

### U-test:

In [None]:
# One-sided Mann-Whitney U test: H1 is that per-bin counts in the farther bins are greater.
u_test = stats.mannwhitneyu(
    farther_outages['# of outages'],
    closer_outages['# of outages'],
    alternative='greater',
)
print(u_test.pvalue)
# The counts are extremely skewed, so it is unclear how far this result can be trusted.

### Very similar results to the distance-to-substation analysis!

### Correlation between timeouts and distance to the nearest city?

In [None]:
# Ordinary least-squares line with `timeOut` as x and distance as y.
fit = stats.linregress(outages['timeOut'], outages['outageToPopulatedPlaceDistance'])

fig, ax = plt.subplots()
# Raw observations as translucent blue dots.
ax.plot(outages["timeOut"], outages["outageToPopulatedPlaceDistance"], 'b.', alpha=0.5)
# Fitted line evaluated at the observed x values.
ax.plot(outages["timeOut"], outages["timeOut"] * fit.slope + fit.intercept, 'r-', linewidth=3)
ax.set_title('Timeout vs Distance')
ax.set_ylabel('Distance (km)')
ax.set_xlabel('TimeOut (hour)')
plt.show()

### Important: the fitted line cannot be trusted because the residuals are not normal

In [None]:
# Residuals of the linear fit: observed distance minus the distance predicted from `timeOut`.
y = outages["outageToPopulatedPlaceDistance"]
x = outages["timeOut"]
predicted = fit.slope * x + fit.intercept
residuals = y - predicted

fig, ax = plt.subplots()
ax.hist(residuals, bins=90)
ax.set_title("plot of the residuals", fontsize=14, fontweight='bold')
plt.show()

In [None]:
# Normality test on the regression residuals; a small p-value rejects normality.
residual_normality = stats.normaltest(residuals)
residual_normality.pvalue

### Trying new tests:

### Looking at outages in different **municipalities**:

In [None]:
# Group once by municipality and derive both per-municipality summaries from it.
by_municipality = outages.groupby('outageMunicipality')

# Number of outage rows in each municipality.
outages_per_municipality = by_municipality.size().reset_index(name='outage_count')
# Mean outage-to-populated-place distance in each municipality.
distances_per_municipality = (
    by_municipality['outageToPopulatedPlaceDistance']
    .mean()
    .reset_index(name='mean_distance')
)

# One row per municipality with both the count and the mean distance.
municipality_stats = outages_per_municipality.merge(distances_per_municipality, on='outageMunicipality')
municipality_stats

### testing normality of the outages per municipality and the distances per municipality:

In [None]:
# Normality test on the number of outages per municipality.
outage_count_normality = stats.normaltest(municipality_stats['outage_count'])
outage_count_normality.pvalue

In [None]:
# Normality test on the mean distance per municipality.
mean_distance_normality = stats.normaltest(municipality_stats['mean_distance'])
mean_distance_normality.pvalue

Both are still far from normal.

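### Using Spearman's correlation test:
Spearman's rank correlation does not need normality. Below, we use the alternative hypothesis that the correlation is less than 0. The p-value is 0.948, so there is no evidence of a negative correlation; a one-sided p-value this large also means the sample correlation itself is positive. Strictly speaking, a large p-value only means we fail to reject the null hypothesis, so it does not prove a positive relationship on its own (running the test with `alternative='greater'` would check that direction directly).
We proceed on the basis that the two have a small positive correlation.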

In [None]:

# One-sided Spearman rank correlation between mean distance and outage count per
# municipality; alternative='less' means the alternative hypothesis is rho < 0.
rho, pval = spearmanr(
    municipality_stats['mean_distance'],
    municipality_stats['outage_count'],
    alternative='less',
)
print(f"Spearman correlation: {rho:.3f}, p-value: {pval:.4f}")

### Permutation tests
https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html

The SciPy documentation for `spearmanr` suggests using a permutation test when there are fewer than about 500 observations.

Since we have around 350 regions, it is worth trying that too:

In [None]:
# Bind the two samples before defining `statistic`, which reads `y` from the
# notebook namespace; this keeps the cell correct when run top to bottom.
x = municipality_stats['mean_distance']
y = municipality_stats['outage_count']


def statistic(x):
    """Return Spearman's rho between a (permuted) `x` and the fixed global `y`.

    Only `x` is permuted by `permutation_test`; `y` is held fixed, which is
    enough to break the pairing between the two samples.
    """
    return stats.spearmanr(x, y).statistic


# Permutation p-value for H1: rho < 0. When the number of possible pairings
# exceeds `n_resamples`, SciPy draws random permutations instead of enumerating
# them all, so despite its name `res_exact` is then a randomized estimate.
# The seed makes the reported p-value reproducible across runs.
res_exact = stats.permutation_test(
    (x,),
    statistic,
    permutation_type='pairings',
    alternative='less',
    random_state=0,
)

# Asymptotic p-value from spearmanr itself, for comparison.
res_asymptotic = stats.spearmanr(x, y, alternative='less')

res_exact.pvalue, res_asymptotic.pvalue

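This also agrees with our findings: we fail to reject the null hypothesis, so there is no evidence of a negative correlation. That is consistent with a positive correlation between the mean distance to cities and the number of outages across regions (although failing to reject the null does not by itself prove it).
<p>
In other words, the data suggest that the higher the mean distance to cities in a municipality, the more outages that municipality experiences.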

### **Distance vs timeout**:

Trying Spearman again.
Note that since we have so many data points (this is run on the 3-year data), Spearman's test is a good option!

In [None]:
# One-sided Spearman rank correlation between `timeOut` and distance over all
# outages; alternative='less' means the alternative hypothesis is rho < 0.
rho, pval = spearmanr(
    outages['timeOut'],
    outages['outageToPopulatedPlaceDistance'],
    alternative='less',
)
print(f"Spearman correlation: {rho:.3f}, p-value: {pval:.4f}")

Striking results! They suggest that there is indeed a positive correlation between outage duration (`timeOut`) and distance.