# Aufbau der Notebooks [Philipp]

- Multivalue bei Notebooks angucken
- Wie installiere ich den "scheiß"
- Widgets: https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html


In [4]:
from enum import Enum
import ipywidgets as widgets
from pandas import read_csv
import pandas as pd
import asammdf
from IPython.display import display
import os
import numpy as np
import time
import pathlib as pl

In [42]:
# Globals

PATH_RAW_DATA = "./data/raw/"
PATH_FEATURE_DATA = "./data/"
PATH_EXPLORATION_DATA = "./exploration/"
PATH_MODEL = "./models/"
DATA_SOURCE_KIDAQ = ["TEST_NAME", "TEST_TYPE", "RPM", "FLOW_RATE", "P1", "P2"]
RAW_DATA_TYPE = ["KIDAQ", "VIB"]

DATA_SOURCE_VID = [
    "TEST_NAME",
    "TEST_TYPE",
    "RPM",
    "FLOW_RATE",
    "S1",
    "S2",
    "S3",
    "S4",
    "S5",
    "S6",
    "S7",
    "S8",
]
FEATURE = [
    "STD",
    "RANGE",
    "IQR",
    "MEAN_MEDIAN",
    "FFT",
]
OPERATING_POINT_FREQ = [725, 1450, 2175, 2900]
OPERATING_POINT_FLOW_RATE = [0, 25, 50, 75, 100]

LEARNER = Enum("LEARNER", ["DNN", "DT"])

DEFAULT_LEARNER = LEARNER.DNN
DEFAULT_RAW_DATA_TYPE = RAW_DATA_TYPE[0]
DEFAULT_RAW_DATA = DATA_SOURCE_KIDAQ

DEFAULT_CLASS_LABEL = "TEST_TYPE"

tags = widgets.TagsInput(
    value=DEFAULT_RAW_DATA, allowed_tags=DEFAULT_RAW_DATA, allow_duplicates=False
)
display(tags)

TagsInput(value=['TEST_NAME', 'TEST_TYPE', 'RPM', 'FLOW_RATE', 'P1', 'P2'], allow_duplicates=False, allowed_ta…

## 1. Aufgabe und Daten erklären/beschreiben [Philipp]


## 3. Preprocessing [Valerij]

Aufgeteilt nach KIDAQ und VIB (separat um es einfach zu halten)

- Auswahl der Fenstergröße in Millisekunden
- Auswahl der Abtastrate
- Multi-Selektion der Aggregation (avg, mean, std, ...)
- Frequenzanalyse
- Fourier-Transformation
- Fenstergröße nach Frequenzbereichen

### 3.1 Vorbereitung der Tainings- und Testdaten

Multi-Selektion für:

- Features
- Betriebspunkte (RPM/FLOWRATE)
- Klassifikationsarten (Szenario / Testdurchlauf)


In [43]:

WINDOW_SIZE_MS = ["100", "200", "300", "400", "500", "600", "700", "800","900","1000"]
AGGREGATIONS = ["std", "range", "iqr", "mean_median"]

win_sizes = widgets.Dropdown(
    placeholder= "Choose the window size in ms",
    options = WINDOW_SIZE_MS,
    description = "Window size in ms: ",
    ensure_option=True,
    disabled = False
)
display(win_sizes)

raw_data_type = widgets.Dropdown(
    placeholder= "Choose the raw data type",
    options = RAW_DATA_TYPE,
    description = "Raw data type: ",
    ensure_option=True,
    disabled = False
)
display(raw_data_type)

aggregations = widgets.SelectMultiple(
    options = AGGREGATIONS,
    description = "Aggregations:",
    disabled = False
)
display(aggregations)



Dropdown(description='Window size in ms: ', options=('100', '200', '300', '400', '500', '600', '700', '800', '…

Dropdown(description='Raw data type: ', options=('KIDAQ', 'VIB'), value='KIDAQ')

SelectMultiple(description='Aggregations:', options=('std', 'range', 'iqr', 'mean_median'), value=())

In [32]:
COLUMNS=[
    "p1",
    "p2",
    "a2",
    "T2",
    "T1"
]

CWD = pl.Path.cwd()

PATH_RAW_DATA = CWD / "data" / "raw"
PATH_TO_SETUP = PATH_RAW_DATA / "Setup-I"


kidaq_files = [file for file in PATH_TO_SETUP.glob("**/*.mf4")]



def process_file(file):
    
    error_type = file.parts[8]
    rpm = file.parts[10].split("r")[0]
    rpm_percent = file.parts[11].split("m")[1].split("%")[0]
    
    mdf = asammdf.MDF(file)
    df = mdf.to_dataframe()
    df = df.reset_index()

    df = df[COLUMNS]

    df_features = None

    windows = df.groupby(df.index // (int(win_sizes.value) * 20))

    for _, window in windows:

        features = {
            "Fehlertyp": error_type, 
            "rpm": rpm, 
            "rpm%": rpm_percent
            }

        for agg in aggregations.value:
            match agg:
                case "std":
                    features["p1_std"] = window["p1"].std()
                    features["p2_std"] = window["p2"].std()
                    features["a2_std"] = window["a2"].std()
                    features["T2_std"] = window["T2"].std()
                    features["T1_std"] = window["T1"].std()
                case "range":
                    features["p1_range"] =  window["p1"].max() - window["p1"].min()
                    features["p2_range"] =  window["p2"].max() - window["p2"].min()
                    features["a2_range"] =  window["a2"].max() - window["a2"].min()
                    features["T2_range"] =  window["T2"].max() - window["T2"].min()
                    features["T1_range"] =  window["T1"].max() - window["T1"].min()
                case "iqr":
                    features["p1_iqr"] = window["p1"].quantile(0.75) - window["p1"].quantile(0.25)
                    features["p2_iqr"] = window["p2"].quantile(0.75) - window["p2"].quantile(0.25)
                    features["a2_iqr"] = window["a2"].quantile(0.75) - window["a2"].quantile(0.25)
                    features["T2_iqr"] = window["T2"].quantile(0.75) - window["T2"].quantile(0.25)
                    features["T1_iqr"] = window["T1"].quantile(0.75) - window["T1"].quantile(0.25)
                case "median":
                    features["p1_median"] =  window["p1"].median()
                    features["p2_median"] =  window["p2"].median()
                    features["a2_median"] =  window["a2"].median()
                    features["T2_median"] =  window["T2"].median()
                    features["T1_median"] =  window["T1"].median()
                case _:
                    print("No aggregation selected")

        if df_features is None:
            df_features = pd.DataFrame(features, index=[0])    
        else:
            df_features = pd.concat([df_features, pd.DataFrame(features, index=[0])])

    return df_features


In [33]:
from tqdm.notebook import tqdm

RESULT_FILE = CWD / "data" / f"{raw_data_type.value}_features.csv"

for file in tqdm(kidaq_files):
    features = process_file(file)

    if features is not None:
        if not pl.Path(RESULT_FILE).exists():
            features.to_csv(RESULT_FILE, index=False)
        else:
            features.to_csv(RESULT_FILE, mode="a", header=False, index=False)

  0%|          | 0/20 [00:00<?, ?it/s]

## 4. Deskriptive/Explorative Datenanalyse [Philipp]

- Plots
- Beschreibung der Plots


## 5. Machine Learning [Kevin]

Multi-Selektion für:

- Auswahl der Featuredateien (Train/Testdaten)
- Auswahl des Learners
- Konfiguration der Hyperparameter
- Live-Validation des Models mit vorausgewählten Testdaten (Random-Search, ...)


In [44]:
# SELECT TRAINING AND TEST DATA
featureDataDir = list(filter(lambda x: os.path.isfile(os.path.join(PATH_FEATURE_DATA, x)) and DEFAULT_RAW_DATA_TYPE in x.upper(), os.listdir(PATH_FEATURE_DATA)))

modelDropdown = widgets.Dropdown(description="training features")
modelDropdown.options = featureDataDir
selectedModelFile = None
def onTrainigFileChange(change):
    global selectedModelFile
    selectedModelFile = change['new']
modelDropdown.observe(onTrainigFileChange, names='value')
display(modelDropdown)

testFileDropdown = widgets.Dropdown(description="test features")
testFileDropdown.options = featureDataDir
selectedTestFile = None
def onTestFileChange(change):
    global selectedTestFile
    selectedTestFile = change['new']
testFileDropdown.observe(onTestFileChange, names='value')
display(testFileDropdown)


Dropdown(description='training features', options=('KIDAQ_features.csv',), value=None)

Dropdown(description='test features', options=('KIDAQ_features.csv',), value=None)

In [45]:
# SET ML CONFIG
FEATURE_TRAIN_FILE = PATH_FEATURE_DATA + selectedModelFile
FEATURE_TEST_FILE = PATH_FEATURE_DATA + selectedTestFile

DEFAULT_OPERATING_POINTS = OPERATING_POINT_FREQ + OPERATING_POINT_FLOW_RATE


# DNN
DNN_EXPLORATION_TARGET_VAL_ACCURACY = 0.9
DNN_EXPLORATION_MAX_ITER = 1
DNN_EXPLORATION_HIDDEN_LAYERS_MIN = 2
DNN_EXPLORATION_HIDDEN_LAYERS_MAX = 4
DNN_EXPLORATION_NEURONS_MIN = 8
DNN_EXPLORATION_NEURONS_MAX = 64


DNN_EARLY_STOPPING_PATIENCE = 50
DNN_VERBOSE = 0
DNN_EPOCHS = 100
DNN_BATCH_SIZE = 32
DNN_BATCH_NORMALIZATION = True


# DT
DT_MAX_DEPTH = 6
DT_NUM_ESTIMATORS = 100

In [46]:
# LOAD DATA
train_data = read_csv(FEATURE_TRAIN_FILE, header=None, delimiter=";").values
test_data = read_csv(FEATURE_TEST_FILE, header=None, delimiter=";").values

train_x, train_y = train_data[:, 2:].astype('float32'), train_data[:, 1:2]
test_x, test_y = test_data[:, 2:].astype('float32'), test_data[:, 1:2]

In [47]:
# Train Decision Tree
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


labelEncoder = LabelEncoder()
labelEncoder = labelEncoder.fit(np.ravel(train_y))
label_encoded_train_y = labelEncoder.transform(np.ravel(train_y))
label_encoded_test_y = labelEncoder.transform(np.ravel(test_y))


xgb = XGBClassifier(
    tree_method="hist",
    enable_categorical=True,
    max_depth=DT_MAX_DEPTH,
    n_estimators=DT_NUM_ESTIMATORS,
)
# fit model
xgb.fit(train_x, label_encoded_train_y, eval_set=[(test_x, label_encoded_test_y)])


preds = xgb.predict(test_x)
accuracy = accuracy_score(label_encoded_test_y, preds)

XGBClassifier.save_model(xgb, PATH_MODEL+"dt."+str(round(time.time())))

ModuleNotFoundError: No module named 'sklearn'

In [77]:
# Train Neural Network
import json
import random
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, InputLayer, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

onehotencoder = OneHotEncoder()
onehotencoder = onehotencoder.fit(train_y)
onehot_encoded_train_y = onehotencoder.transform(train_y).toarray()
onehot_encoded_test_y = onehotencoder.transform(test_y).toarray()

exploration_results = []


n_features = train_x.shape[1]
categories = len(onehot_encoded_train_y[0])

test_acc = 0.0
model = Sequential()
interation = 0
while (
    test_acc < DNN_EXPLORATION_TARGET_VAL_ACCURACY
    and interation < DNN_EXPLORATION_MAX_ITER
):
    model = Sequential()
    model.add(InputLayer(input_shape=(n_features,)))
    model.add(BatchNormalization())
    dense_count = random.randint(
        DNN_EXPLORATION_HIDDEN_LAYERS_MIN, DNN_EXPLORATION_HIDDEN_LAYERS_MAX
    )
    dense_neurons = []
    for i in range(0, dense_count):
        neurons = random.randint(
            DNN_EXPLORATION_NEURONS_MIN, DNN_EXPLORATION_NEURONS_MAX
        )
        dense_neurons.append(neurons)
        model.add(Dense(neurons, activation="tanh"))
        model.add(BatchNormalization())
    model.add(Dense(categories, activation="sigmoid"))
    model.compile(
        optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
    )
    model.fit(
        train_x,
        onehot_encoded_train_y,
        epochs=DNN_EPOCHS,
        batch_size=DNN_BATCH_SIZE,
        verbose=1,
        validation_data=(test_x, onehot_encoded_test_y),
        callbacks=[
            EarlyStopping(
                monitor="val_loss",
                patience=DNN_EARLY_STOPPING_PATIENCE,
                restore_best_weights=True,
            )
        ],
    )

    train_loss, train_acc = model.evaluate(train_x, onehot_encoded_train_y)
    test_loss, test_acc = model.evaluate(test_x, onehot_encoded_test_y)

    exploration_results.append(
        {
            "dense_count": dense_count,
            "dense_neurons": dense_neurons,
            "train_loss": train_loss,
            "train_acc": train_acc,
            "test_loss": test_loss,
            "test_acc": test_acc,
        }
    )
    interation += 1

modelName = "dnn." + str(round(time.time()))
model.save(PATH_MODEL + modelName)

with open(PATH_EXPLORATION_DATA + modelName + ".exploration_results.json", "w") as f:
    json.dump(exploration_results, f, indent=4)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100




INFO:tensorflow:Assets written to: ./models/dnn.1685239363\assets


INFO:tensorflow:Assets written to: ./models/dnn.1685239363\assets


## 6. Modelanalyse des Learners [Kevin]

- Vorherige Auswahl eines Learners
- Feature Importance
- Korrelationsmatrix
- Konfusionsmatrix
- Post-Validation des Models mit auswählbaren Daten


In [81]:
# SELECT LEARNER AND TEST DATA
featureDataDir = list(filter(lambda x: os.path.isfile(os.path.join(PATH_FEATURE_DATA, x)) and DEFAULT_RAW_DATA_TYPE in x.upper(), os.listdir(PATH_FEATURE_DATA)))
modelDir = os.listdir(PATH_MODEL)

modelDropdown = widgets.Dropdown(description="model")
modelDropdown.options = modelDir
selectedModelFile = None
def onTrainigFileChange(change):
    global selectedModelFile
    selectedModelFile = change['new']
modelDropdown.observe(onTrainigFileChange, names='value')
display(modelDropdown)

testFileDropdown = widgets.Dropdown(description="test data")
testFileDropdown.options = featureDataDir
selectedTestFile = None
def onTestFileChange(change):
    global selectedTestFile
    selectedTestFile = change['new']
testFileDropdown.observe(onTestFileChange, names='value')
display(testFileDropdown)


Dropdown(description='model', options=('dnn.1685239363', 'dt.1685239368'), value=None)

Dropdown(description='test data', options=('vib.setup1.csv', 'vib.setup2.csv'), value=None)

In [None]:
# ANALYZE MODEL
from matplotlib import pyplot
from tensorflow import math as tfmath
import tensorflow_probability as tfp
import eli5
from eli5.sklearn import PermutationImportance

conf_matrix = tfmath.confusion_matrix(np.argmax(y_test, axis=1), np.argmax(result, axis=1))
pyplot.matshow(conf_matrix, 1)
for (x, y), value in np.ndenumerate(conf_matrix):
    pyplot.text(y, x, f"{value:.2f}", va="center", ha="center")
pyplot.show()


print('Test Accuracy: %.3f' % test_acc)


corr_matrix = tfp.stats.correlation(X_test)
pyplot.matshow(corr_matrix)
pyplot.show()

perm = PermutationImportance(model, scoring="neg_mean_squared_error", random_state=1).fit(X_test, y_test)
print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=feature_names)))

## 7. Statische Interpretation des Resultats

- Welches Ergebnis haben wir erzielt und wie kann man es anwenden?
