In [1]:
import pandas as pd
import numpy as np
import glob

In [2]:
a = glob.glob('./features/*.txt')
features={}

for name in a:
    try:
        with open(name) as f:
            # read in the data 
            temp_df = pd.read_csv(name,delim_whitespace=True,header=None)
            # make the row names cycle as a variable
            temp_df.index.name = 'cycle'
            temp_df.reset_index(inplace=True)
            # transpose the data 
            temp_df_transposed= temp_df.T
            # make the seconds a variable, call it "time”
            temp_df_transposed.index.name = 'time'
            temp_df_transposed.reset_index(inplace=True)
            # add a prefix cycle in the column names to help with pivoting data (from wide to long)
            string = ' cycle'.join(str(e) for e in list(temp_df_transposed.columns))
            temp_df_transposed.columns = string.split(" ")
            # From wide to long to help with joining all the variables 
            temp_df_long = pd.wide_to_long(temp_df_transposed.iloc[1:,:],stubnames='cycle', i=['time'], j='c')
            temp_df_long.reset_index(inplace=True)
            # save each data variable in the long format into separate data frames in the dictionary called "features"
            # but exclude path and txt in the names
            features[name[9:-4]] = temp_df_long
                        
           
    
    # prevent a possible error
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise

In [3]:
for key in list(features.keys()):
    features[key].columns=['seconds','cycle',key]

In [4]:
dfs= [features['s/CP'],
      features['s/CE'],
      features['s/EPS1'],
      features['s/FS1'],
      features['s/FS2'],
      features['s/PS1'],
      features['s/PS2'],
      features['s/PS3'],
      features['s/PS4'],
      features['s/PS5'],
      features['s/PS6'],
      features['s/SE'],
      features['s/TS1'],
      features['s/TS2'],
      features['s/TS3'],
      features['s/TS4'],
     features['s/VS1']]

In [5]:
from functools import reduce
feats_join = reduce(lambda x,y: pd.merge(x,y,on=['seconds','cycle']),dfs)

In [6]:
label = pd.read_csv('profile.txt',delim_whitespace=True)
label.columns = ['cooler_condition', 'valve_condition', 'pump_leak', 'hydraulic_accumulator', 'stable_flag']

In [7]:
%pip install tsfresh

Collecting tsfresh
  Downloading tsfresh-0.17.0-py2.py3-none-any.whl (91 kB)
[K     |████████████████████████████████| 91 kB 3.2 MB/s eta 0:00:011
Collecting distributed>=2.11.0
  Downloading distributed-2021.1.0-py3-none-any.whl (671 kB)
[K     |████████████████████████████████| 671 kB 7.2 MB/s eta 0:00:01
Collecting dask[dataframe]>=2.9.0
  Downloading dask-2021.1.0-py3-none-any.whl (889 kB)
[K     |████████████████████████████████| 889 kB 9.0 MB/s eta 0:00:01
Collecting toolz>=0.8.2
  Downloading toolz-0.11.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.9 MB/s  eta 0:00:01
Collecting zict>=0.1.3
  Downloading zict-2.0.0-py3-none-any.whl (10 kB)
Collecting sortedcontainers!=2.0.0,!=2.0.1
  Downloading sortedcontainers-2.3.0-py2.py3-none-any.whl (29 kB)
Collecting tblib>=1.6.0
  Downloading tblib-1.7.0-py2.py3-none-any.whl (12 kB)
Collecting partd>=0.3.10; extra == "dataframe"
  Downloading partd-1.1.0-py3-none-any.whl (19 kB)
Collecting fsspec>=0.6.0

In [11]:
from azureml.core import Workspace, Experiment, Dataset, Model
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig
import joblib, pickle


from tsfresh.transformers import RelevantFeatureAugmenter
from sklearn.pipeline import Pipeline

from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_relevant_features
from tsfresh.feature_selection import select_features



In [9]:
# Automatic feautre extraction using the tsfresh package
extracted_features = extract_features(feats_join, column_id="cycle", column_sort="seconds")
# Impute any possible missing values 
impute(extracted_features)

Feature Extraction: 100%|██████████| 10/10 [20:51<00:00, 125.13s/it]
 's/PS5__fft_coefficient__attr_"real"__coeff_32'
 's/PS5__fft_coefficient__attr_"real"__coeff_33' ...
 's/PS4__fft_coefficient__attr_"angle"__coeff_97'
 's/PS4__fft_coefficient__attr_"angle"__coeff_98'
 's/PS4__fft_coefficient__attr_"angle"__coeff_99'] did not have any finite values. Filling with zeros.


Unnamed: 0,s/PS5__variance_larger_than_standard_deviation,s/PS5__has_duplicate_max,s/PS5__has_duplicate_min,s/PS5__has_duplicate,s/PS5__sum_values,s/PS5__abs_energy,s/PS5__mean_abs_change,s/PS5__mean_change,s/PS5__mean_second_derivative_central,s/PS5__median,...,s/PS4__fourier_entropy__bins_2,s/PS4__fourier_entropy__bins_3,s/PS4__fourier_entropy__bins_5,s/PS4__fourier_entropy__bins_10,s/PS4__fourier_entropy__bins_100,s/PS4__permutation_entropy__dimension_3__tau_1,s/PS4__permutation_entropy__dimension_4__tau_1,s/PS4__permutation_entropy__dimension_5__tau_1,s/PS4__permutation_entropy__dimension_6__tau_1,s/PS4__permutation_entropy__dimension_7__tau_1
0,0.0,0.0,0.0,1.0,597.793,5955.965499,0.013492,0.000000,-0.000250,9.9640,...,0.142506,0.142506,0.379535,0.563420,1.107653,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000
1,0.0,0.0,0.0,1.0,582.611,5657.283409,0.013356,0.000237,-0.000034,9.7100,...,0.142506,0.142506,0.379535,0.563420,1.107653,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000
2,0.0,0.0,1.0,1.0,576.258,5534.577636,0.012441,0.000237,0.000043,9.6040,...,0.142506,0.142506,0.379535,0.563420,1.107653,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000
3,0.0,0.0,0.0,1.0,570.020,5415.403256,0.013136,-0.000051,0.000216,9.5000,...,0.142506,0.142506,0.379535,0.563420,1.107653,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000
4,0.0,1.0,1.0,1.0,564.123,5303.934739,0.012034,0.000237,0.000112,9.4025,...,0.142506,0.142506,0.379535,0.563420,1.107653,-0.000000,-0.000000,-0.000000,-0.000000,-0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2200,0.0,0.0,0.0,1.0,598.589,5971.837225,0.013305,0.000017,-0.000147,9.9755,...,0.142506,0.142506,0.379535,0.611952,0.838120,1.759520,2.772239,3.576481,3.906512,3.963312
2201,0.0,0.0,0.0,1.0,598.424,5968.545638,0.012373,0.000475,0.000052,9.9720,...,0.142506,0.142506,0.283936,0.518700,1.195625,1.773311,2.837236,3.585825,3.830896,3.963312
2202,0.0,0.0,1.0,1.0,598.104,5962.162836,0.013017,-0.000169,0.000009,9.9680,...,0.142506,0.283936,0.283936,0.518700,1.062934,1.761302,2.867800,3.591893,3.846587,3.988984
2203,0.0,1.0,0.0,1.0,598.003,5960.148517,0.012695,0.000153,0.000121,9.9650,...,0.142506,0.142506,0.457102,0.457102,0.928839,1.778878,2.872232,3.650747,3.956922,3.988984


In [20]:
extracted_features.drop(extracted_features.tail(1).index,inplace=True)

In [21]:
features_filtered_accum = select_features(extracted_features, label['hydraulic_accumulator'])

features_filtered_flag = select_features(extracted_features, label['stable_flag'])

In [24]:
joint = features_filtered_accum.join(label)
joint.columns

Index(['s/PS1__cwt_coefficients__coeff_14__w_5__widths_(2, 5, 10, 20)',
       's/PS1__cwt_coefficients__coeff_13__w_5__widths_(2, 5, 10, 20)',
       's/PS1__cwt_coefficients__coeff_11__w_2__widths_(2, 5, 10, 20)',
       's/PS1__cwt_coefficients__coeff_12__w_5__widths_(2, 5, 10, 20)',
       's/PS1__ratio_beyond_r_sigma__r_1.5',
       's/PS1__cwt_coefficients__coeff_7__w_2__widths_(2, 5, 10, 20)',
       's/PS1__permutation_entropy__dimension_7__tau_1',
       's/PS1__permutation_entropy__dimension_6__tau_1',
       's/PS3__last_location_of_maximum',
       's/PS1__fft_coefficient__attr_"real"__coeff_1',
       ...
       's/PS3__lempel_ziv_complexity__bins_3',
       's/PS6__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"var"',
       's/CP__standard_deviation', 's/CP__variance',
       's/PS3__fft_coefficient__attr_"real"__coeff_19', 'cooler_condition',
       'valve_condition', 'pump_leak', 'hydraulic_accumulator', 'stable_flag'],
      dtype='object', length=5221)

In [31]:
automl_config = AutoMLConfig(
    experiment_timeout_minutes=70,
    task='classification',
    primary_metric='AUC_weighted',
    training_data=joint,
    label_column_name='stable_flag',
    n_cross_validations=2)

In [32]:
ws = Workspace.get(name="quick-starts-ws-135060")
exp = Experiment(workspace=ws, name="Ranga")

autoexp = Experiment(workspace=ws,name='AutoRanga')
remote_run = autoexp.submit(automl_config)
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)



In [None]:
best,fitted = remote_run.get_output()
fitted

#TODO: Save the best model
filename = 'best_automl_model.sav'
# pickle.dump(fitted, open(filename, 'wb'))

joblib.dump(fitted,filename)

In [15]:
pipeline = Pipeline([('augmenter', RelevantFeatureAugmenter(column_id="cycle", column_sort="seconds")),
            ('best automl', fitted)])

In [16]:
y_stable_flag = label['stable_flag']
x = pd.DataFrame(index = y_stable_flag.index)

In [17]:
pipeline.set_params(augmenter__timeseries_container=feats_join)
pipeline.fit(x,y_stable_flag)

Feature Extraction: 100%|██████████| 10/10 [27:21<00:00, 164.18s/it]
 's/CP__fft_coefficient__attr_"real"__coeff_32'
 's/CP__fft_coefficient__attr_"real"__coeff_33' ...
 's/VS1__fft_coefficient__attr_"angle"__coeff_97'
 's/VS1__fft_coefficient__attr_"angle"__coeff_98'
 's/VS1__fft_coefficient__attr_"angle"__coeff_99'] did not have any finite values. Filling with zeros.
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \




Pipeline(memory=None,
         steps=[('augmenter',
                 RelevantFeatureAugmenter(chunksize=None, column_id='cycle',
                                          column_kind=None,
                                          column_sort='seconds',
                                          column_value=None,
                                          default_fc_parameters=None,
                                          disable_progressbar=False,
                                          fdr_level=0.05,
                                          filter_only_tsfresh_features=True,
                                          hypotheses_independent=False,
                                          kind_to_fc_parameters=None,
                                          ml_task='auto', n_jobs=2,
                                          profile=Fals...
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, gamma=0,
             