In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
# from dart import ts_spikes
from statsmodels.stats.diagnostic import het_arch
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.seasonal import STL
from scipy.optimize import curve_fit

In [2]:
GRID_SIZE = 50
NUMBER = 1000
NUMBER_OF_GROUPS = 10
NUMBER_OF_TILES = NUMBER // NUMBER_OF_GROUPS

In [3]:
def create_data(df,grid_size, number_of_tiles, number):
    '''Creates the data from a given array'''
    df = df.sort_values(by = ['latitude', 'longitude']).reset_index(drop=True)
    df['temporal_group'] = np.tile(np.arange(number) // (number_of_tiles), grid_size * grid_size)
    return df

In [4]:
latitude = np.repeat(range(GRID_SIZE),GRID_SIZE * NUMBER)
longitude = np.tile(np.arange(GRID_SIZE),GRID_SIZE * NUMBER) 
power = np.random.random(GRID_SIZE*GRID_SIZE*NUMBER)
df = pd.DataFrame({'latitude' : latitude, 'longitude' : longitude, 'power' :  power})
df = create_data(df, GRID_SIZE, NUMBER_OF_TILES, NUMBER)

In [5]:
df.groupby(['latitude', 'longitude'])['temporal_group'].max()

latitude  longitude
0         0            9
          1            9
          2            9
          3            9
          4            9
                      ..
49        45           9
          46           9
          47           9
          48           9
          49           9
Name: temporal_group, Length: 2500, dtype: int64

In [47]:
# Creating aggregations
aggregated_dict = { 
                    'power' : [
                                'mean',
                                'var',
                                # slope,
                                # last_CSO_value,
                                'skew',
                                x_acf1,
                                diff1_acf1,
                                diff2_acf1,
                                # e_acf1,
                                x_acf10,
                                diff1_acf10,
                                diff2_acf10,
                                # e_acf10,
                                entropy,
                                crossing_points,
                                flat_spots,
                                nonlinear,
                                linearity,
                                curvature,
                                x_pacf5,
                                diff1_pacf5,
                                diff2_pacf5,
                                lumpiness,
                                stability,
                                arch_stat,
                                trend
                            ]
                    }
aggregated_data = df.groupby(['latitude', 'longitude']).agg(aggregated_dict)
aggregated_data

  lag, arch_stat, p_value, f_test = het_arch(x, maxlag=10)


Unnamed: 0_level_0,Unnamed: 1_level_0,power,power,power,power,power,power,power,power,power,power,power,power,power,power,power,power,power,power,power,power,power
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,var,skew,x_acf1,diff1_acf1,diff2_acf1,x_acf10,diff1_acf10,diff2_acf10,entropy,...,nonlinear,linearity,curvature,x_pacf5,diff1_pacf5,diff2_pacf5,lumpiness,stability,arch_stat,trend
latitude,longitude,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,0,0.480224,0.083460,0.076767,-0.037106,-0.511272,-0.673846,1.006728,1.273023,1.503124,364.853856,...,,,,0.912568,-0.282875,-0.892816,,,0.656494,0.052160
0,1,0.506503,0.082065,0.001629,0.052060,-0.486049,-0.669547,1.015216,1.253957,1.505451,362.453901,...,,,,1.042335,-0.240502,-0.902749,,,0.146294,0.060540
0,2,0.488894,0.087774,0.047771,0.010518,-0.512061,-0.682145,1.005324,1.276243,1.528818,355.343115,...,,,,1.041523,-0.257595,-0.894067,,,0.939799,0.073461
0,3,0.496995,0.084856,0.007271,0.023810,-0.487072,-0.654608,1.009520,1.251086,1.466608,357.517898,...,,,,1.083152,-0.313164,-0.907293,,,0.357230,0.079207
0,4,0.497579,0.084707,0.005146,-0.016521,-0.520909,-0.673413,1.010381,1.282173,1.492492,358.453335,...,,,,1.076921,-0.314825,-0.933823,,,0.372098,0.083073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49,45,0.510430,0.083715,-0.033505,0.032233,-0.480992,-0.640338,1.011867,1.251169,1.433355,357.862872,...,,,,1.111075,-0.233906,-0.893584,,,0.199607,0.079097
49,46,0.505560,0.086940,-0.031735,-0.006437,-0.515921,-0.678583,1.008451,1.277575,1.506224,352.704758,...,,,,0.987168,-0.301243,-0.913955,,,0.661778,0.062250
49,47,0.513288,0.084247,-0.055424,-0.043909,-0.503063,-0.660203,1.016531,1.275182,1.476668,355.996286,...,,,,0.894528,-0.324548,-0.914972,,,0.170467,0.050416
49,48,0.485466,0.080335,0.065583,-0.001047,-0.487571,-0.668763,1.014526,1.264626,1.518362,369.269242,...,,,,0.925504,-0.256119,-0.876133,,,0.228770,0.047034


In [None]:
_,group = next(iter(df.groupby(['temporal_group', 'latitude', 'longitude'])))

In [24]:
diff1_acf1

<function __main__.diff1_acf1(x)>

In [46]:
# Different aggregate functions
def difference_time_series(x):
    ''''''
    return x.diff().dropna()


def x_acf1(x):
    '''The autocorrelation function of the series'''
    return acf(x, nlags=1)[1]


def diff1_acf1(x):
    '''The autocorrelation function of the differenced series'''
    first_difference = difference_time_series(x)
    return acf(first_difference, nlags=1)[1]


def diff2_acf1(x):
    '''The autocorrelation function of the second-order differenced series'''
    first_difference = difference_time_series(x)
    second_difference = difference_time_series(first_difference)
    return acf(second_difference, nlags=1)[1]


def e_acf1(x):
    '''The autocorrelation function of the residuals'''
    return acf(data - x.rolling(window=2).mean().dropna(), nlags=1)[1]


def x_acf10(x):
    '''The sum of squares of the first 10 autocorrelation coefficients for series'''
    return sum(acf(x, nlags=10)**2)

def diff1_acf10(x):
    '''The sum of squares of the first 10 autocorrelation coefficients for the differenced series'''
    first_difference = difference_time_series(x)
    return sum(acf(first_difference, nlags=10)**2)

def diff2_acf10(x):
    '''The sum of squares of the first 10 autocorrelation coefficients for second-order the differenced series'''
    first_difference = difference_time_series(x)
    second_difference = difference_time_series(first_difference)
    return sum(acf(second_difference, nlags=10)**2)


def e_acf10(x,window_size=2):
    '''The sum of squares of the first 10 autocorrelation coefficients of the residuals'''
    # Calculate the rolling mean
    rolling_mean = data_pd.rolling(window=window_size).mean().dropna()

    # Calculate the residuals by subtracting the rolling mean from the original data
    residuals = data[:-window_size+1] - rolling_mean.values

    # Compute the autocorrelation function for the residuals
    e_acf = acf(residuals, nlags=10)

    # Calculate the sum of squares of the first 10 autocorrelation coefficients
    e_acf10 = np.sum(np.square(e_acf[:10]))

    return e_acf10


def entropy(x):
    '''The spectral entropy is the Shannon entropy'''
    return -np.sum(x * np.log2(x))


def crossing_points(x):
    '''The number of times the temporal data-set crosses the median line'''
    return len(np.where(np.diff((x > x.median())))[0])


def flat_spots(x):
    '''The temporal dataset is divided into ten equally blocks then the largest runlength represents the value of flat_spots'''
    pass


def nonlinear(x):
    '''The nonlinearity is estimated from a modified Teräsvirta’s test'''
    pass


def linearity(x):
    ''' The strength of curvature are estimated from the coefficients of the orthogonal quadratic regression.'''
    pass


def curvature(x):
    ''''''
    pass


def x_pacf5(x):
    '''Thee sum of the first 5 partial autocorrelation coefficients of the series'''
    return np.sum(pacf(x, nlags=5)[:5])


def diff1_pacf5(x):
    ''''The sum of the first 5 partial autocorrelation coefficients of the differenced series'''
    first_difference = difference_time_series(x)
    return np.sum(pacf(first_difference, nlags=5)[:5])
    

def diff2_pacf5(x):
    '''The sum of the first 5 partial autocorrelation coefficients of the second-order differenced series'''
    first_difference = difference_time_series(x)
    second_difference = difference_time_series(first_difference)
    return np.sum(pacf(second_difference, nlags=5)[:5])


def lumpiness(x):
    '''Temporal dataset is divided into non-overlapping windoes the variance of the mean of the tiled windows'''
    pass


def stability(x):
    '''Temporal dataset is divided into non-overlapping windoes the variance of the variance of the tiled windows'''
    pass


def arch_stat(x):
    '''Stability based on Lagrange Multiplier Test'''
    lag, arch_stat, p_value, f_test = het_arch(x, nlags=10)
    return arch_stat


def trend(x):
    '''The strength of the trend is found from Seasonal-Trend decomposition using LOESS'''
    stl = STL(x, period=12)
    res = stl.fit()

    # calculate trend strength measure
    trend_var, total_var = np.var(res.trend), np.var(x)
    return trend_var / total_var


def spike(x):
    '''Variance of the leave-one-out variances of the residuals'''
    model = ARIMA(x, order=(1,1,1))
    model_fit = model.fit()

    # calculate residuals
    residuals = pd.Series(model_fit.resid)

    # calculate variance of residuals
    residuals_var = np.var(residuals)

    # calculate leave-one-out variances of residuals
    loocv_variances = []
    for i in range(len(residuals)):
        # remove one observation from the data
        data_loo = x.drop(data.index[i])
        # fit ARIMA model to the leave-one-out data
        model_loo = ARIMA(data_loo, order=(1,1,1))
        model_fit_loo = model_loo.fit()
        # calculate residuals of the leave-one-out model
        residuals_loo = pd.Series(model_fit_loo.resid)
        # calculate variance of residuals of the leave-one-out model
        residuals_var_loo = np.var(residuals_loo)
        # add variance to the list
        loocv_variances.append(residuals_var_loo)

    # calculate spike as the variance of the leave-one-out variances of residuals
    spike = np.var(loocv_variances)
