In [10]:
from tsfresh import extract_features, select_features, extract_relevant_features
from tsfresh.feature_extraction import extract_features, EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute

import numpy as np
import pandas as pd

from utils.data_loader import load_from_tsfile_to_dataframe
from utils.regressor_tools import process_data

# Print NumPy arrays in full: with threshold=np.inf the repr is never
# summarised with "..." regardless of how many elements the array has.
np.set_printoptions(threshold=np.inf)
#pd.set_option('display.max_rows', None)  
#pd.set_option('display.max_columns', None) 
from utils.personal_utils import load_dataset
import os
import matplotlib.pyplot as plt

In [11]:
# Location of the training split on the local machine.
# NOTE(review): this is an absolute, user-specific path; the notebook only
# runs where this file exists.
data_path = "/home/sim/Desktop/TS Extrinsic Regression/data/Covid3Month_TRAIN.ts"

# Dataset identifier = file name up to (not including) the first underscore.
_, ts_file_name = os.path.split(data_path)
dataset_id = ts_file_name.partition('_')[0]

# Load the series array and the regression targets via the project helper.
dataset_array, data_y = load_dataset(data_path)

153it [00:00, 16532.15it/s]
100%|██████████| 140/140 [00:00<00:00, 2088.65it/s]


In [12]:
# Build the long-format frame handed to tsfresh below:
#   * the 3-D array (unpacked here as samples, timesteps, dims) is flattened to
#     2-D so that the samples are stacked one below the other,
#   * every row carries the id of the sample it came from ('ts_ids', 1-based)
#     and its position inside that sample ('timesteps', 0-based),
#   * each dimension becomes its own column named dim_1, dim_2, ...
num_dp, len_ts, num_dim = dataset_array.shape

# Sample id for every row: each id is repeated len_ts times in a row.
ts_ids = np.repeat(np.arange(1, num_dp + 1), len_ts).tolist()

# Time index for every row: 0..len_ts-1, restarted for each sample.
timesteps_flattened = np.tile(np.arange(len_ts), num_dp)

# Value columns, one per dimension (copy so the frame does not alias the array).
column_names_dim = [f"dim_{dim_no}" for dim_no in range(1, num_dim + 1)]
array_flatdim = dataset_array.reshape(-1, num_dim).copy()
dataset_df = pd.DataFrame(array_flatdim, columns=column_names_dim)

# Both inserts go to position 0, so the final column order is
# ts_ids, timesteps, dim_1, ...
dataset_df.insert(0, 'timesteps', timesteps_flattened)
dataset_df.insert(0, 'ts_ids', ts_ids)


In [13]:
# Display the long-format frame to check its layout by eye.
dataset_df

Unnamed: 0,ts_ids,timesteps,dim_1
0,1,0,-0.297331
1,1,1,-0.297331
2,1,2,-0.297331
3,1,3,-0.297331
4,1,4,-0.297331
...,...,...,...
11755,140,79,0.444094
11756,140,80,1.147942
11757,140,81,2.555637
11758,140,82,4.667179


In [14]:
# Targets as a pandas Series whose index uses the same 1-based ids as the
# 'ts_ids' column, so that sample k in the frame lines up with y_ser[k].
y_ids = list(range(1, num_dp + 1))
y_ser = pd.Series(data_y, index=y_ids)

y_ser

1      0.000000
2      0.077586
3      0.000000
4      0.000000
5      0.154003
         ...   
136    0.006969
137    0.000000
138    0.021021
139    0.040038
140    0.005510
Length: 140, dtype: float64

In [15]:
# One-shot variant: extract the "efficient" tsfresh feature set for every
# series and filter it against the regression target in the same call.
selected = extract_relevant_features(
    timeseries_container=dataset_df,
    y=y_ser,
    column_id='ts_ids',
    column_sort='timesteps',
    ml_task='regression',
    default_fc_parameters=EfficientFCParameters(),
)
print(selected.shape)

Feature Extraction: 100%|██████████| 28/28 [00:01<00:00, 25.76it/s]


1
2
3
4
5
...
136
137
138
139
140


In [16]:
# Two-step variant, step 1: extract the full "efficient" feature set without
# any filtering, one row per 'ts_ids' value.
extracted = extract_features(
    timeseries_container=dataset_df,
    column_id='ts_ids',
    column_sort="timesteps",
    default_fc_parameters=EfficientFCParameters(),
)

# Called for its effect on `extracted`; the return value is discarded.
impute(extracted)

extracted

Feature Extraction: 100%|██████████| 28/28 [00:01<00:00, 24.63it/s]
 'dim_1__fft_coefficient__attr_"real"__coeff_44'
 'dim_1__fft_coefficient__attr_"real"__coeff_45'
 'dim_1__fft_coefficient__attr_"real"__coeff_46'
 'dim_1__fft_coefficient__attr_"real"__coeff_47'
 'dim_1__fft_coefficient__attr_"real"__coeff_48'
 'dim_1__fft_coefficient__attr_"real"__coeff_49'
 'dim_1__fft_coefficient__attr_"real"__coeff_50'
 'dim_1__fft_coefficient__attr_"real"__coeff_51'
 'dim_1__fft_coefficient__attr_"real"__coeff_52'
 'dim_1__fft_coefficient__attr_"real"__coeff_53'
 'dim_1__fft_coefficient__attr_"real"__coeff_54'
 'dim_1__fft_coefficient__attr_"real"__coeff_55'
 'dim_1__fft_coefficient__attr_"real"__coeff_56'
 'dim_1__fft_coefficient__attr_"real"__coeff_57'
 'dim_1__fft_coefficient__attr_"real"__coeff_58'
 'dim_1__fft_coefficient__attr_"real"__coeff_59'
 'dim_1__fft_coefficient__attr_"real"__coeff_60'
 'dim_1__fft_coefficient__attr_"real"__coeff_61'
 'dim_1__fft_coefficient__attr_"real"__coeff_62'
 

Unnamed: 0,dim_1__variance_larger_than_standard_deviation,dim_1__has_duplicate_max,dim_1__has_duplicate_min,dim_1__has_duplicate,dim_1__sum_values,dim_1__abs_energy,dim_1__mean_abs_change,dim_1__mean_change,dim_1__mean_second_derivative_central,dim_1__median,...,dim_1__fourier_entropy__bins_5,dim_1__fourier_entropy__bins_10,dim_1__fourier_entropy__bins_100,dim_1__permutation_entropy__dimension_3__tau_1,dim_1__permutation_entropy__dimension_4__tau_1,dim_1__permutation_entropy__dimension_5__tau_1,dim_1__permutation_entropy__dimension_6__tau_1,dim_1__permutation_entropy__dimension_7__tau_1,dim_1__query_similarity_count__query_None__threshold_0.0,dim_1__mean_n_absolute_max__number_of_maxima_7
1,0.0,0.0,1.0,1.0,2.664535e-15,84.0,0.405882,0.083976,0.042500,-0.297331,...,0.220352,0.469942,2.439543,0.601432,0.905449,1.154301,1.375494,1.495479,0.0,2.855756
2,1.0,0.0,1.0,1.0,4.329870e-15,84.0,0.212948,0.000000,0.000000,-0.172156,...,0.220352,0.220352,0.220352,0.262533,0.331045,0.334382,0.337792,0.341279,0.0,2.090468
3,1.0,0.0,1.0,1.0,1.387779e-15,84.0,0.265758,0.000000,0.000000,-0.131297,...,0.188113,0.188113,1.244400,0.245627,0.331045,0.400766,0.471727,0.476577,0.0,1.631835
4,0.0,0.0,1.0,1.0,3.885781e-15,84.0,0.386241,0.000000,-0.008885,-0.294906,...,0.647929,1.111263,2.796044,0.465464,0.647991,0.808611,0.917277,1.010331,0.0,3.035798
5,1.0,0.0,1.0,1.0,-8.881784e-16,84.0,0.204457,0.047634,0.010221,-0.477473,...,0.220352,0.220352,0.438435,1.266387,2.080562,2.664843,3.085776,3.306209,0.0,2.876924
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,0.0,0.0,1.0,1.0,4.635181e-15,84.0,0.411782,0.007353,-0.013025,-0.425044,...,0.297445,0.680168,2.445047,0.870278,1.365272,1.609656,1.642801,1.658946,0.0,2.626557
137,0.0,0.0,1.0,1.0,8.881784e-16,84.0,0.380770,0.021553,-0.032724,-0.241359,...,0.220352,0.525174,2.538545,0.519589,0.803049,1.018706,1.228109,1.393198,0.0,2.569768
138,0.0,0.0,1.0,1.0,-2.664535e-15,84.0,0.390093,0.020062,-0.000846,-0.393708,...,0.220352,0.329684,2.009197,1.023508,1.622437,1.887774,2.080845,2.142784,0.0,2.705300
139,1.0,0.0,1.0,1.0,1.942890e-15,84.0,0.243313,0.000564,-0.000076,-0.436307,...,0.297445,0.438435,2.085236,1.667742,2.699363,3.429537,3.860782,4.122049,0.0,2.207340


In [17]:
# Two-step variant, step 2: keep only the extracted features that tsfresh
# judges relevant for predicting the regression target.
sf = select_features(
    X=extracted,
    y=y_ser,
    ml_task='regression',
)

sf

1
2
3
4
5
...
136
137
138
139
140


In [18]:
# Names of the features kept by select_features; an empty Index here means
# that no feature was retained.
sf.columns

Index([], dtype='object')