In [1]:
# --- Imports: standard library, then third-party, then project-local ---
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# `extract_features` was previously imported twice (from `tsfresh` and from
# `tsfresh.feature_extraction`); one binding of the name is kept.
from tsfresh import extract_features, select_features, extract_relevant_features
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import impute

from utils.data_loader import load_from_tsfile_to_dataframe
from utils.personal_utils import load_dataset
from utils.regressor_tools import process_data

# --- Display configuration ---
# Print NumPy arrays in full instead of summarising them with "...".
# NOTE: this is a global setting, so any cell (or library message) that formats a
# large array will produce correspondingly long output.
np.set_printoptions(threshold=np.inf)
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)

In [2]:
# Location of the training file. The directory can be overridden with the
# TSER_DATA_DIR environment variable so the notebook is not tied to one machine;
# when the variable is unset, the original path is used unchanged.
DEFAULT_DATA_DIR = "/home/sim/Desktop/TS Extrinsic Regression/data"
DATASET_FILE = "AppliancesEnergy_TRAIN.ts"

data_path = os.path.join(os.environ.get("TSER_DATA_DIR", DEFAULT_DATA_DIR), DATASET_FILE)
if not os.path.isfile(data_path):
    # Fail early with an actionable message instead of deep inside the loader.
    raise FileNotFoundError(
        f"Dataset file not found: {data_path!r} "
        "(set TSER_DATA_DIR to the directory containing the .ts files)"
    )

# load_dataset is a project helper; it returns the series array and the targets.
dataset_array, data_y = load_dataset(data_path)

# Dataset name = file name up to the first underscore.
dataset_id = os.path.basename(data_path).split('_')[0]

119it [00:02, 47.36it/s]
100%|██████████| 95/95 [00:00<00:00, 499.49it/s]


In [3]:
# Convert from (sample, timesteps, dim) to a long table of shape
# (sample * timesteps, dim): the samples are stacked underneath each other.
# Two leading columns are then added:
#   - 'ts_ids'    : one id per sample, constant over that sample's block of rows
#   - 'timesteps' : position inside the sample, used as the sort column

num_dp, len_ts, num_dim = dataset_array.shape

# Stack all samples vertically and name one value column per dimension.
array_flatdim = dataset_array.reshape(-1, num_dim).copy()
column_names_dim = [f"dim_{i+1}" for i in range(num_dim)]
dataset_df = pd.DataFrame(array_flatdim, columns=column_names_dim)

# Time index within each sample: 0..len_ts-1, repeated once per sample.
timesteps_flattened = np.tile(np.arange(len_ts), num_dp)

# Sample ids, each repeated len_ts times. They are 1-based (1..num_dp) so they
# line up with the 1-based index used for the target Series (`y_ids`); 0-based
# ids here would not match that index when features and targets are joined by id.
ts_ids = np.repeat(np.arange(1, num_dp + 1), len_ts)

dataset_df.insert(0, 'timesteps', timesteps_flattened)
dataset_df.insert(0, 'ts_ids', ts_ids)

# Sanity checks on the long format: one row per (sample, timestep), one id per sample.
assert len(dataset_df) == num_dp * len_ts
assert dataset_df['ts_ids'].nunique() == num_dp


In [4]:
# Display the long-format frame: 'ts_ids', 'timesteps', then one column per dimension.
dataset_df

Unnamed: 0,ts_ids,timesteps,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_15,dim_16,dim_17,dim_18,dim_19,dim_20,dim_21,dim_22,dim_23,dim_24
0,1,0,1.441523,-2.797124,1.640701,-2.044184,-0.039561,-1.541150,1.480303,-2.271346,...,1.635063,-0.743721,1.907051,0.331959,1.321198,-2.565395,-1.203355,1.529866,0.688243,-1.077608
1,1,1,1.441523,-2.478552,1.568025,-1.941992,-0.039561,-1.472232,1.480303,-2.180667,...,1.551828,-0.743721,1.907051,0.362299,1.273468,-2.500097,-1.167875,1.348995,0.688243,-1.042313
2,1,2,1.441523,-2.315181,1.531687,-1.745272,-0.039561,-1.345882,1.422508,-1.958503,...,1.464630,-0.724709,1.907051,0.362299,1.225738,-2.434799,-1.132396,1.168124,0.688243,-1.007017
3,1,3,1.406570,-2.070125,1.466279,-1.540887,-0.039561,-1.345882,1.422508,-1.786213,...,1.345722,-0.691869,1.635362,0.331959,1.178008,-2.369501,-1.096916,0.987253,0.688243,-0.971722
4,1,4,1.336662,-1.980272,1.433574,-1.367160,-0.039561,-1.345882,1.358292,-1.591253,...,1.274378,-0.707425,1.635362,0.331959,1.130278,-2.304203,-1.061436,0.806383,0.688243,-0.936427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13675,95,139,1.273620,-3.371802,1.079540,-3.014006,-0.554115,-1.171041,1.041079,-2.586500,...,1.549734,-1.246361,2.528725,0.508217,1.508583,1.381264,-1.842273,1.071629,2.457532,-3.268416
13676,95,140,1.174375,-4.107478,1.039416,-3.402309,-0.329764,-2.126392,0.840424,-2.441176,...,1.461167,-1.301478,2.528725,0.560153,1.485757,1.397584,-1.791819,0.963338,2.457532,-2.980907
13677,95,141,1.119791,-4.535847,0.959169,-3.751425,-0.758070,-2.561607,0.710969,-2.666428,...,1.377028,-1.331938,2.528725,0.586121,1.462931,1.413904,-1.741365,0.855047,2.457532,-2.693398
13678,95,142,1.010622,-5.001465,0.826761,-3.897484,-0.951828,-2.619989,0.710969,-2.484773,...,1.297317,-1.363848,2.528725,0.586121,1.440105,1.430224,-1.690911,0.746756,2.457532,-2.405889


In [5]:
# Target values as a pandas Series indexed by sample id 1..num_dp.
# Per the original note, data_y is already in the right order — i.e. the same
# sample order as the rows of the dataset array (not re-verified here).
y_ids = list(range(1, num_dp + 1))
y_ser = pd.Series(data=data_y, index=y_ids)

y_ser

1     19.38
2     12.68
3      5.34
4     12.72
5     13.25
      ...  
91    14.71
92    13.69
93    13.87
94    17.66
95     8.75
Length: 95, dtype: float64

In [6]:
# Extraction and relevance filtering against the regression target in one call,
# using the "efficient" feature-calculator settings.
selected = extract_relevant_features(
    timeseries_container=dataset_df,
    y=y_ser,
    column_id='ts_ids',
    column_sort='timesteps',
    ml_task='regression',
    default_fc_parameters=EfficientFCParameters(),
)
print(selected.shape)

Feature Extraction: 100%|██████████| 30/30 [00:17<00:00,  1.75it/s]


(95, 96)


In [10]:
# Extract the feature matrix without any relevance filtering, grouping rows by
# 'ts_ids' and ordering each group by 'timesteps'.
extracted = extract_features(
    timeseries_container=dataset_df,
    column_id='ts_ids',
    column_sort="timesteps",
    default_fc_parameters=EfficientFCParameters(),
)
# impute() is called on `extracted` without re-assigning the result (tsfresh
# documents it as operating in place); as the last expression of the cell its
# return value is also what gets displayed.
impute(extracted)


Feature Extraction: 100%|██████████| 30/30 [00:18<00:00,  1.66it/s]
 'dim_1__fft_coefficient__attr_"real"__coeff_74'
 'dim_1__fft_coefficient__attr_"real"__coeff_75'
 'dim_1__fft_coefficient__attr_"real"__coeff_76'
 'dim_1__fft_coefficient__attr_"real"__coeff_77'
 'dim_1__fft_coefficient__attr_"real"__coeff_78'
 'dim_1__fft_coefficient__attr_"real"__coeff_79'
 'dim_1__fft_coefficient__attr_"real"__coeff_80'
 'dim_1__fft_coefficient__attr_"real"__coeff_81'
 'dim_1__fft_coefficient__attr_"real"__coeff_82'
 'dim_1__fft_coefficient__attr_"real"__coeff_83'
 'dim_1__fft_coefficient__attr_"real"__coeff_84'
 'dim_1__fft_coefficient__attr_"real"__coeff_85'
 'dim_1__fft_coefficient__attr_"real"__coeff_86'
 'dim_1__fft_coefficient__attr_"real"__coeff_87'
 'dim_1__fft_coefficient__attr_"real"__coeff_88'
 'dim_1__fft_coefficient__attr_"real"__coeff_89'
 'dim_1__fft_coefficient__attr_"real"__coeff_90'
 'dim_1__fft_coefficient__attr_"real"__coeff_91'
 'dim_1__fft_coefficient__attr_"real"__coeff_92'
 

(95, 18648)


In [14]:
# Display the extracted feature matrix (one row per sample id, one column per feature).
extracted

Unnamed: 0,dim_1__variance_larger_than_standard_deviation,dim_1__has_duplicate_max,dim_1__has_duplicate_min,dim_1__has_duplicate,dim_1__sum_values,dim_1__abs_energy,dim_1__mean_abs_change,dim_1__mean_change,dim_1__mean_second_derivative_central,dim_1__median,...,dim_24__fourier_entropy__bins_5,dim_24__fourier_entropy__bins_10,dim_24__fourier_entropy__bins_100,dim_24__permutation_entropy__dimension_3__tau_1,dim_24__permutation_entropy__dimension_4__tau_1,dim_24__permutation_entropy__dimension_5__tau_1,dim_24__permutation_entropy__dimension_6__tau_1,dim_24__permutation_entropy__dimension_7__tau_1,dim_24__query_similarity_count__query_None__threshold_0.0,dim_24__mean_n_absolute_max__number_of_maxima_7
1,1.0,1.0,1.0,1.0,3.552714e-14,144.0,0.045383,0.004074,0.000000,-0.189649,...,0.197570,0.197570,0.359959,0.896323,1.213385,1.550073,1.881835,2.206758,0.0,1.893495
2,0.0,1.0,0.0,1.0,7.105427e-13,144.0,0.055214,-0.007307,-0.000372,-0.285464,...,0.144565,0.144565,0.393377,0.853950,1.029231,1.205084,1.391396,1.578241,0.0,1.737151
3,1.0,1.0,1.0,1.0,-8.881784e-13,144.0,0.035675,-0.019565,0.000519,-0.291441,...,0.144565,0.144565,0.359959,0.722207,0.889739,1.066833,1.243463,1.419294,0.0,1.963860
4,0.0,1.0,1.0,1.0,-3.552714e-15,144.0,0.057926,0.017205,0.000000,-0.163220,...,0.216560,0.216560,0.216560,0.992603,1.316224,1.651004,1.982723,2.310422,0.0,1.691409
5,0.0,1.0,1.0,1.0,6.465939e-13,144.0,0.039837,-0.007350,0.000000,-0.115325,...,0.216560,0.216560,0.359959,0.818912,0.958040,1.107742,1.258217,1.409421,0.0,2.343835
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,1.0,1.0,0.0,1.0,1.421085e-13,144.0,0.052055,0.004821,0.000192,-0.112014,...,0.197570,0.197570,0.216560,0.854290,1.088198,1.331165,1.573013,1.813218,0.0,1.957547
92,0.0,1.0,1.0,1.0,1.887379e-13,144.0,0.045754,0.008245,-0.000178,-0.114486,...,0.216560,0.288359,0.483562,0.999050,1.356143,1.709327,2.057375,2.412511,0.0,1.869562
93,0.0,1.0,1.0,1.0,3.836931e-13,144.0,0.037779,-0.009261,0.000671,-0.202034,...,0.144565,0.144565,0.457404,0.936192,1.201439,1.475779,1.749071,2.020857,0.0,2.316670
94,0.0,1.0,1.0,1.0,-1.421085e-14,144.0,0.062485,0.012666,0.000000,-0.021907,...,0.340969,0.359959,0.573539,0.955501,1.264878,1.582562,1.898162,2.210897,0.0,2.247776


In [8]:
# Filter the extracted feature matrix against the regression target.
sf = select_features(
    extracted,
    y_ser,
    ml_task='regression',
)

sf

Unnamed: 0,dim_5__ar_coefficient__coeff_1__k_10,dim_6__ar_coefficient__coeff_2__k_10,dim_6__ar_coefficient__coeff_1__k_10,dim_5__lempel_ziv_complexity__bins_100,dim_5__ratio_value_number_to_time_series_length,dim_5__ar_coefficient__coeff_2__k_10,"dim_5__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.2","dim_5__change_quantiles__f_agg_""var""__isabs_False__qh_0.6__ql_0.2",dim_5__quantile__q_0.6,dim_5__percentage_of_reoccurring_datapoints_to_all_datapoints,...,dim_2__has_duplicate_max,dim_5__energy_ratio_by_chunks__num_segments_10__segment_focus_4,dim_6__autocorrelation__lag_3,"dim_5__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""max""","dim_6__change_quantiles__f_agg_""var""__isabs_False__qh_1.0__ql_0.4","dim_6__agg_linear_trend__attr_""rvalue""__chunk_len_50__f_agg_""var""","dim_6__change_quantiles__f_agg_""var""__isabs_True__qh_0.6__ql_0.4","dim_5__fft_coefficient__attr_""angle""__coeff_57","dim_6__change_quantiles__f_agg_""mean""__isabs_True__qh_1.0__ql_0.4","dim_6__fft_aggregated__aggtype_""variance"""
1,1.386988,-0.938208,1.710903,0.548611,0.368056,0.027513,0.002467,0.003035,-0.163321,0.756944,...,0.0,0.024295,0.456575,1.154854,0.186895,0.880439,0.004095,158.152239,0.231844,184.801323
2,1.568132,-0.870402,1.421868,0.541667,0.402778,-0.374047,0.003859,0.005961,-0.091734,0.701389,...,0.0,0.000683,0.719031,1.490546,0.210012,0.898746,0.004535,159.402941,0.188638,312.455420
3,0.901869,-0.142091,1.276991,0.444444,0.166667,0.033683,0.009727,0.012607,0.226804,0.951389,...,0.0,0.029076,0.931798,0.009406,0.011409,-0.542840,0.005833,-26.053306,0.066125,302.611276
4,1.153100,-0.297464,1.331890,0.423611,0.145833,-0.118289,0.021349,0.027916,0.012652,0.951389,...,0.0,0.014609,0.925466,0.905923,0.004878,-0.868020,0.001114,162.494940,0.043385,334.374965
5,1.171382,-0.274046,1.419483,0.583333,0.416667,-0.039614,0.001691,0.002010,0.000583,0.708333,...,0.0,0.000849,0.901434,0.444533,0.022654,-0.635845,0.007563,-22.337904,0.091215,238.916926
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,1.860718,-0.737978,1.644842,0.604167,0.534722,-1.131695,0.008066,0.009929,0.366689,0.583333,...,0.0,0.007472,0.625961,0.728735,0.070066,0.234392,0.001785,-28.396218,0.151131,363.661774
92,1.093274,-0.193937,1.334888,0.451389,0.180556,0.049684,0.011636,0.016880,-0.118989,0.916667,...,0.0,0.095543,0.950730,0.547682,0.006529,-0.737376,0.003594,164.037183,0.059675,186.246661
93,1.558330,-1.024922,1.750713,0.590278,0.409722,-0.340132,0.005841,0.007677,0.447375,0.743056,...,0.0,0.004216,0.797066,0.907221,0.075323,0.756420,0.010097,-20.335715,0.122882,206.567569
94,1.751047,-1.008034,1.780075,0.541667,0.430556,-0.690955,0.001251,0.001873,-0.396749,0.659722,...,0.0,0.011680,0.729411,0.894920,0.138623,0.859691,0.014091,165.353599,0.189655,176.884548


In [13]:
# Show the selected features with columns ordered by name (returns a sorted copy; `sf` is unchanged).
sf.sort_index(axis="columns")

Unnamed: 0,dim_2__has_duplicate_max,dim_2__number_peaks__n_1,dim_2__percentage_of_reoccurring_datapoints_to_all_datapoints,dim_2__percentage_of_reoccurring_values_to_all_values,dim_2__ratio_value_number_to_time_series_length,"dim_5__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""max""",dim_5__ar_coefficient__coeff_1__k_10,dim_5__ar_coefficient__coeff_2__k_10,dim_5__c3__lag_1,dim_5__c3__lag_2,...,dim_6__fourier_entropy__bins_100,dim_6__has_duplicate_max,dim_6__index_mass_quantile__q_0.6,dim_6__index_mass_quantile__q_0.7,dim_6__number_peaks__n_10,dim_6__partial_autocorrelation__lag_2,dim_6__percentage_of_reoccurring_datapoints_to_all_datapoints,dim_6__percentage_of_reoccurring_values_to_all_values,dim_6__quantile__q_0.8,dim_6__ratio_value_number_to_time_series_length
1,0.0,11.0,0.659722,0.409639,0.576389,1.154854,1.386988,0.027513,1.548690,1.317409,...,1.195274,0.0,0.763889,0.812500,2.0,-0.764844,0.638889,0.341772,0.736974,0.548611
2,0.0,18.0,0.277778,0.111111,0.812500,1.490546,1.568132,-0.374047,1.731992,1.298330,...,0.811303,0.0,0.611111,0.770833,2.0,-0.330962,0.513889,0.195402,0.742545,0.604167
3,0.0,4.0,0.784722,0.403846,0.361111,0.009406,0.901869,0.033683,0.395062,0.382114,...,0.216560,0.0,0.590278,0.687500,0.0,-0.102556,0.798611,0.482143,1.049354,0.388889
4,0.0,7.0,0.604167,0.313253,0.576389,0.905923,1.153100,-0.118289,0.519048,0.422749,...,0.359959,1.0,0.444444,0.520833,3.0,-0.089283,0.777778,0.438596,0.707822,0.395833
5,0.0,9.0,0.451389,0.185567,0.673611,0.444533,1.171382,-0.039614,0.578374,0.436641,...,0.650739,1.0,0.687500,0.743056,1.0,-0.779937,0.750000,0.462687,0.882541,0.465278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.0,13.0,0.527778,0.276596,0.652778,0.728735,1.860718,-1.131695,-0.490937,-0.541409,...,0.810197,0.0,0.722222,0.756944,2.0,-0.207584,0.298611,0.098214,0.328137,0.777778
92,0.0,12.0,0.493056,0.179775,0.618056,0.547682,1.093274,0.049684,0.217281,0.206687,...,0.359959,1.0,0.618056,0.798611,1.0,-0.126114,0.805556,0.533333,0.892508,0.416667
93,0.0,5.0,0.652778,0.264706,0.472222,0.907221,1.558330,-0.340132,0.437427,0.290349,...,0.483562,0.0,0.798611,0.847222,1.0,-0.600069,0.562500,0.267442,0.769372,0.597222
94,0.0,13.0,0.541667,0.266667,0.625000,0.894920,1.751047,-0.690955,1.348839,1.179302,...,1.069586,0.0,0.694444,0.826389,2.0,-0.823801,0.416667,0.160000,0.631497,0.694444
