In [93]:
import pandas as pd
import numpy as np


from utils.data_loader import load_from_tsfile_to_dataframe
from utils.regressor_tools import process_data
import mlflow
from tsfeatures import tsfeatures
np.set_printoptions(threshold=np.inf)
#pd.set_option('display.max_rows', None)  
#pd.set_option('display.max_columns', None) 


In [94]:
train_x, train_y = load_from_tsfile_to_dataframe("/home/sim/Desktop/TS Extrinsic Regression/data/AppliancesEnergy_TRAIN.ts", replace_missing_vals_with='NaN')
test_x, test_y = load_from_tsfile_to_dataframe("/home/sim/Desktop/TS Extrinsic Regression/data/AppliancesEnergy_TEST.ts", replace_missing_vals_with='NaN')



min_len = np.inf
for i in range(len(train_x)):
    x = train_x.iloc[i, :]
    all_len = [len(y) for y in x]
    min_len = min(min(all_len), min_len)

for i in range(len(test_x)):
    x = test_x.iloc[i, :]
    all_len = [len(y) for y in x]
    min_len = min(min(all_len), min_len)

train_x_p= process_data(train_x, normalise=None, min_len=min_len)
test_x_p = process_data(test_x, normalise=None, min_len=min_len)
train_x_p.shape

119it [00:02, 45.29it/s]
66it [00:01, 60.51it/s] 
100%|██████████| 95/95 [00:00<00:00, 820.20it/s]
100%|██████████| 42/42 [00:00<00:00, 853.22it/s]


(95, 144, 24)

In [97]:
# Test if reshaping from (len_flat_dim, dim) to (num_dp, len_ts, dim) is working
num_dp = 2
len_ts = 5
num_dim = 4

array = np.arange(num_dp * len_ts * num_dim).reshape(num_dp, len_ts, num_dim)
print(array)

# Reshape to (len_flat_dim, dim)
array_flatdim = array.reshape(num_dp * len_ts, num_dim)
print(array_flatdim)

# Put column after column. For gzip part.
print(array_flatdim.reshape(-1, order='F'))


# Reshape back to (num_dp, len_ts, dim)
array_back = array_flatdim.reshape(-1, len_ts, num_dim)
print(array_back)

print(array_back.flatten())

[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]
  [12 13 14 15]
  [16 17 18 19]]

 [[20 21 22 23]
  [24 25 26 27]
  [28 29 30 31]
  [32 33 34 35]
  [36 37 38 39]]]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]
 [24 25 26 27]
 [28 29 30 31]
 [32 33 34 35]
 [36 37 38 39]]
[ 0  4  8 12 16 20 24 28 32 36  1  5  9 13 17 21 25 29 33 37  2  6 10 14
 18 22 26 30 34 38  3  7 11 15 19 23 27 31 35 39]
[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]
  [12 13 14 15]
  [16 17 18 19]]

 [[20 21 22 23]
  [24 25 26 27]
  [28 29 30 31]
  [32 33 34 35]
  [36 37 38 39]]]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]


In [98]:
# Test binary conversion and compression
num_dp = 2
len_ts = 2
num_dim = 2

array = np.arange(num_dp * len_ts * num_dim).reshape(num_dp, len_ts, num_dim)
array_flat = array.reshape(num_dp * len_ts, num_dim)

# Test if .tobytes saves metadata of the np.array -> No it flattens the array row after row (and slice after slice), then just saves the content of the array as bytes!
print(array_flat.tobytes() == np.arange(8).tobytes())


byte_nparray = np.arange(5)
print(byte_nparray.dtype)

byte_nparray = byte_nparray.tobytes()

print(bytearray)
print(len(byte_nparray))

# len(byte-string) -> return number of bytes in byte-object!


True
int64
<class 'bytearray'>
40


Test if saving with tofile keeps the array the same!

In [99]:
# Test Saving in Files

np.save('time_series.npy', train_x_p)
train_x_p[0][0][0].dtype


flattened = train_x_p.flatten()
flattened.tofile("data.bin")

train_after = np.fromfile("data.bin", dtype=train_x_p[0][0][0].dtype)
train_after.shape
if np.array_equal(flattened, train_after):
    print("The arrays have the same content.")
else:
    print("The arrays do not have the same content.")

The arrays have the same content.


In [100]:
# Only preparing Data for Testing!

def prepare_data(data_x_p, data_y):

    # Swap the dimensions so that columns are stacked after each other. Copy since swapaxes only returns a view
    #(95, 144, 24) -> (95, 24, 144), first column gets first row etc. One Row is the ts of the belonging dimension.
    data_swapped = data_x_p.swapaxes(1, 2).copy()

    # Reshape to flattened ts. Stack the rows behind the other for each slice.
    data_x_flattend = data_swapped.reshape(data_swapped.shape[0], -1)
    

    prep_data = pd.DataFrame(data_x_flattend)
    prep_data['target'] = data_y
    prep_data.columns = prep_data.columns.astype(str) #fwiz or flaml needs string as columns!

   
    #data_x_p = data_x_p[0:2,...]

    num_datapoints = data_x_p.shape[0]
    len_timeseries = data_x_p.shape[1]
    num_dimensions = data_x_p.shape[2]
    num_features = 38

    all_features = np.ndarray((num_datapoints, num_features * num_dimensions))

    for i in range(0, num_datapoints):
        start_index = 0

        for j in range(0, num_dimensions):
            curr_ts = data_x_p[i,:,j]

            #print(curr_ts.size)

            timeseries_df = pd.DataFrame({'unique_id' : np.ones(len_timeseries),'ds': np.arange(0, len_timeseries) , 'y': curr_ts})
            
            feature_array = tsfeatures(timeseries_df, freq=1).fillna(0).values

            #print(feature_array.size)
            #print(np.isnan(feature_array).sum())

            end_index = start_index + feature_array.size
            all_features[i, start_index: end_index] = feature_array
            start_index = end_index
        

    all_features = pd.DataFrame(all_features)

    # name the features
    for i, col in enumerate(all_features.columns):
        # Generate the new column name
        new_col_name = 'f' + str(i + 1)
        # Rename the column
        all_features.rename(columns={col: new_col_name}, inplace=True)



    ts_and_features = pd.concat([prep_data, all_features], axis=1)


    all_features['target'] = data_y

    ts_and_features = pd.concat([prep_data.drop(columns=['target']), all_features], axis=1)
    
    #prep_data.columns = prep_data.columns.astype(str)

    return ts_and_features, all_features

Test preparing part function of load_and_prepare_everything

In [101]:
# Simple test:
#-> No a perfect test, but it seems to work! I have no idea about better tests!

len_ts = 3
ts_1_v = np.arange(len_ts)
np.random.seed(42)  # For reproducibility
ts_2_v = np.random.rand(len_ts)
x = np.linspace(0, 2 * np.pi, len_ts)
ts_3_v = np.sin(x)

ts_4_v =  ts_1_v + 0.2 * ts_2_v


# Reulting Feature matrix
ts_1 = pd.DataFrame({'unique_id' : np.ones(len_ts),'ds': np.arange(0,len_ts) , 'y': ts_1_v})
ts_2 = pd.DataFrame({'unique_id' : np.ones(len_ts),'ds': np.arange(0,len_ts) , 'y': ts_2_v})
ts_3 = pd.DataFrame({'unique_id' : np.ones(len_ts),'ds': np.arange(0,len_ts) , 'y': ts_3_v})
ts_4 = pd.DataFrame({'unique_id' : np.ones(len_ts),'ds': np.arange(0,len_ts) , 'y': ts_4_v})

feature_matrix = np.vstack((tsfeatures(ts_1, freq=1).fillna(0).values, tsfeatures(ts_2, freq=1).fillna(0).values, tsfeatures(ts_3, freq=1).fillna(0).values,tsfeatures(ts_4, freq=1).fillna(0).values))
#print(feature_matrix)


layer1 = np.stack((ts_1_v, ts_2_v), axis=1)
layer2 = np.stack((ts_3_v,ts_4_v), axis=1)

input = np.stack((layer1, layer2), axis=0)


print(input)
print(feature_matrix)
tsf, f = prepare_data(input, np.arange(2))
print(tsf)
#print(f)


[[[ 0.00000000e+00  3.74540119e-01]
  [ 1.00000000e+00  9.50714306e-01]
  [ 2.00000000e+00  7.31993942e-01]]

 [[ 0.00000000e+00  7.49080238e-02]
  [ 1.22464680e-16  1.19014286e+00]
  [-2.44929360e-16  2.14639879e+00]]]
[[ 1.00000000e+00  5.00000000e-01  3.00000000e+00  4.44089210e-16
   3.33333333e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   7.35200499e+02  0.00000000e+00  9.99999985e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00 -0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 1.00000000e+00  6.66801831e-01  3.00000000e+00 -2.75921612e+00
   3.33333333e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  

In [102]:
# Create Sample TS
linear_data = linear_data = np.arange(360).reshape(3, 30, 4)  # Creates an array with values from 0 to 1199
linear_data

np.random.seed(42)  # For reproducibility
random_data = np.random.rand(3, 30, 4)
random_y = np.random.rand(3)
random_data

prepare_data(linear_data, random_y)

(     0    1    2    3    4    5    6    7    8    9  ...      f144  f145  \
 0    0    4    8   12   16   20   24   28   32   36  ...  0.521199   1.0   
 1  120  124  128  132  136  140  144  148  152  156  ...  0.521199   1.0   
 2  240  244  248  252  256  260  264  268  272  276  ...  0.521199   1.0   
 
    f146  f147      f148      f149      f150      f151     f152    target  
 0   1.0   0.9  2.931081 -0.360864  0.333636 -0.568267  0.53624  0.388170  
 1   1.0   0.9  2.931081 -0.360864  0.333636 -0.568267  0.53624  0.643288  
 2   1.0   0.9  2.931081 -0.360864  0.333636 -0.568267  0.53624  0.458253  
 
 [3 rows x 273 columns],
     f1        f2    f3   f4        f5   f6   f7   f8        f9  f10  ...  \
 0  1.0  0.985647  30.0  0.0  1.098661  0.0  0.0  0.0  1.290323  0.0  ...   
 1  1.0  0.985647  30.0  0.0  1.098661  0.0  0.0  0.0  1.290323  0.0  ...   
 2  1.0  0.985647  30.0  0.0  1.098661  0.0  0.0  0.0  1.290323  0.0  ...   
 
        f144  f145  f146  f147      f148      f14