In [None]:
import pandas as pd
from scipy import stats
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the precomputed distances table; later cells read its 'distance_km' column.
distances = pd.read_csv(filepath_or_buffer="../DataAcquisition/shortest_distances.csv")

### Checking if the distances are normally distributed:

In [None]:
# scipy's normaltest: the null hypothesis is that the sample comes from a
# normal distribution, so a small p-value is evidence against normality.
distance_normality = stats.normaltest(distances['distance_km'])
distance_normality.pvalue

### First Idea
1. Split the distances into **equal**-width **bins** and count how many outages fall into each distance interval. For example, the count for the interval (0, 3) is the number of outages whose distance to the nearest station is between 0 and 3.
2. Now we **group by** these intervals and aggregate by counting the number of outages in each interval. This gives us the number of outages per distance interval.
3. We can run a statistical test such as a **t-test** to see whether there is a significant difference between the first half of these outages and the second half.

In [None]:
# Toy example: with an integer `bins`, pd.cut splits the value range into that
# many equal-width intervals and labels every value with its interval.
data = pd.Series([2, 19, 1, 20, 13, 19, 24, 30])
bins = pd.cut(x=data, bins=3)
print(bins)

In [None]:
# Bin the distances into equal-width intervals and count the rows (outages)
# that fall into each interval.
n = 10  # number of bins
distance_bins = pd.cut(distances['distance_km'], bins=n)
distances['distance_bin'] = distance_bins

# 'distance_bin' is categorical. observed=False is spelled out so that empty
# intervals stay in the table with a count of 0 regardless of the pandas
# version's default (the implicit default emits a FutureWarning in recent
# releases). to_frame(name) replaces the DataFrame(...) + rename({0: ...}) pair.
outages_per_dist = (
    distances
    .groupby('distance_bin', observed=False)
    .size()
    .to_frame("#of outages")
)
outages_per_dist


### Checking for correlations: **distance vs timeout**
Checking if there is any correlation between the distance of the outage to the station and the time it took for the outage to be resolved.

In [None]:
# Load the outage records and derive how long each outage lasted.
outages = pd.read_csv("../DataAcquisition/outages.csv")

# Parse both ISO-format timestamp columns into datetime objects.
for timestamp_col in ('dateOn', 'dateOff'):
    outages[timestamp_col] = outages[timestamp_col].apply(datetime.fromisoformat)

# timeOut is the total number of minutes without power, computed as
# dateOn - dateOff (it could be converted to hours if that reads better).
outages['timeOut'] = (outages['dateOn'] - outages['dateOff']).apply(
    lambda delta: delta.total_seconds() / 60
)
outages['timeOut']

In [None]:
# Same normality check as for the distances, applied to the outage durations:
# a small p-value is evidence against the durations being normally distributed.
timeout_normality = stats.normaltest(outages['timeOut'])
timeout_normality.pvalue

In [None]:
# Least-squares line with outage duration on the x axis and distance on the y axis.
# NOTE(review): linregress pairs the two columns by position, so this assumes
# `outages` and `distances` list the same outages in the same order — confirm.
fit = stats.linregress(outages['timeOut'], distances['distance_km'])

# Explicit Axes so the figure carries labels and stands on its own when skimmed.
fig, ax = plt.subplots()
ax.plot(outages["timeOut"], distances["distance_km"], 'b.', alpha=0.5, label="outages")
ax.plot(
    outages["timeOut"],
    outages["timeOut"] * fit.slope + fit.intercept,
    'r-',
    linewidth=3,
    label="linear fit",
)
ax.set_xlabel("timeOut (minutes)")
ax.set_ylabel("distance_km")
ax.set_title("Outage duration vs. distance")
# Rotate the x tick labels via tick_params so the setting survives autoscaling.
ax.tick_params(axis='x', labelrotation=25)
ax.legend()
plt.show()
# This plot doesn't look good because the distance csv that was created for it
# was not very good. It would be nice to try it on the actual big dataset and
# the corresponding distances csv.

In [None]:
# Correlation between outage duration and distance on the raw values
# ("pearson" is the method pandas uses by default).
outages["timeOut"].corr(other=distances["distance_km"], method="pearson")

### Trying transformations:

In [None]:
# Square-root transform both variables, then correlate the transformed values.
sqrt_timeouts = outages["timeOut"].apply(np.sqrt)
sqrt_distances = distances["distance_km"].apply(np.sqrt)
sqrt_timeouts.corr(sqrt_distances)

### Log transformation:


In [None]:
# Natural-log transform both variables, then correlate the transformed values.
log_timeouts = outages["timeOut"].apply(np.log)
log_distances = distances["distance_km"].apply(np.log)
log_timeouts.corr(log_distances)

In [None]:
# Log-log version of the duration-vs-distance fit.
# NOTE(review): non-positive inputs become -inf/NaN under log and would turn
# the fitted slope/intercept into NaN — confirm the data has none.
timeouts_transformed = outages["timeOut"].apply(np.log)
distance_transformed = distances["distance_km"].apply(np.log)
# NOTE(review): linregress pairs the two columns by position — confirm that
# `outages` and `distances` are row-aligned.
fit = stats.linregress(timeouts_transformed, distance_transformed)

# Explicit Axes so the figure carries labels and stands on its own when skimmed.
fig, ax = plt.subplots()
ax.plot(timeouts_transformed, distance_transformed, 'b.', alpha=0.5, label="outages")
ax.plot(
    timeouts_transformed,
    timeouts_transformed * fit.slope + fit.intercept,
    'r-',
    linewidth=3,
    label="linear fit",
)
ax.set_xlabel("log(timeOut)")
ax.set_ylabel("log(distance_km)")
ax.set_title("Log outage duration vs. log distance")
# Rotate the x tick labels via tick_params so the setting survives autoscaling.
ax.tick_params(axis='x', labelrotation=25)
ax.legend()
plt.show()

Using the log transform also produces better bins!
But are the bins even meaningful?

In [None]:
# Same binning as before, but on log(distance): the intervals are equal-width
# in log space, so their edges are in log units rather than raw distance units.
n = 10  # number of bins
# NOTE(review): a distance of exactly 0 maps to -inf under log, and pd.cut
# refuses an integer bin count when the data contains infinity — confirm that
# the distances are all strictly positive.
distance_bins = pd.cut(distances['distance_km'].apply(np.log), bins=n)
distances['distance_bin'] = distance_bins

# 'distance_bin' is categorical. observed=False is spelled out so that empty
# intervals stay in the table with a count of 0 regardless of the pandas
# version's default (the implicit default emits a FutureWarning in recent
# releases). to_frame(name) replaces the DataFrame(...) + rename({0: ...}) pair.
outages_per_dist = (
    distances
    .groupby('distance_bin', observed=False)
    .size()
    .to_frame("#of outages")
)
outages_per_dist