# Aufbau der Notebooks [Philipp]

- Multivalue bei Notebooks angucken
- Wie installiere ich die benötigten Pakete und Widgets?
- Widgets: https://ipywidgets.readthedocs.io/en/stable/examples/Widget%20List.html


In [None]:
from enum import Enum
import ipywidgets as widgets
from pandas import read_csv
import pandas as pd
import asammdf
from IPython.display import display
import os
import numpy as np
import time
import pathlib as pl

In [None]:
# Global configuration shared by all notebook cells.

# --- Directory layout ---
PATH_RAW_DATA = "./data/raw/"
PATH_FEATURE_DATA = "./data/"
PATH_EXPLORATION_DATA = "./exploration/"
PATH_MODEL = "./models/"

# --- Raw data sources and their column sets ---
RAW_DATA_TYPE = ["KIDAQ", "VIB"]

# Columns that both data sources have in common.
_COMMON_COLUMNS = ("TEST_NAME", "TEST_TYPE", "RPM", "FLOW_RATE")
DATA_SOURCE_KIDAQ = [*_COMMON_COLUMNS, "P1", "P2"]
# Common columns followed by the channels S1 .. S8.
DATA_SOURCE_VID = [*_COMMON_COLUMNS, *(f"S{channel}" for channel in range(1, 9))]

# --- Feature names ---
FEATURE = ["STD", "RANGE", "IQR", "MEAN_MEDIAN", "FFT"]

# --- Operating points ---
OPERATING_POINT_FREQ = [725 * step for step in range(1, 5)]
OPERATING_POINT_FLOW_RATE = list(range(0, 101, 25))

# --- Defaults ---
# NOTE(review): the default type is RAW_DATA_TYPE[1] ("VIB") while the default
# column set is the KIDAQ one -- confirm that this combination is intended.
DEFAULT_RAW_DATA_TYPE = RAW_DATA_TYPE[1]
DEFAULT_RAW_DATA = DATA_SOURCE_KIDAQ
DEFAULT_CLASS_LABEL = "TEST_TYPE"

## 1. Aufgabe und Daten erklären/beschreiben [Philipp]


## 3. Preprocessing [Valerij]

Aufgeteilt nach KIDAQ und VIB (separat um es einfach zu halten)

- Auswahl der Fenstergröße in Millisekunden
- Auswahl der Abtastrate
- Multi-Selektion der Aggregation (avg, mean, std, ...)
- Frequenzanalyse
- Fourier-Transformation
- Fenstergröße nach Frequenzbereichen

### 3.1 Vorbereitung der Trainings- und Testdaten

Multi-Selektion für:

- Features
- Betriebspunkte (RPM/FLOWRATE)
- Klassifikationsarten (Szenario / Testdurchlauf)


### Vorauswahl der Feature Einstellungen und Auswahl der Daten

In [None]:
# Preselection widgets: window size, raw data type and raw data folder.
WINDOW_SIZE_MS = ["100", "200", "300", "400", "500", "600", "700", "800", "900", "1000"]
AGGREGATIONS = ["std", "range", "iqr", "median"]

# `placeholder` and `ensure_option` are traits of `widgets.Combobox`, not of
# `widgets.Dropdown`; passing them to Dropdown only triggers an
# "unrecognized arguments" deprecation warning, so they are omitted here.
win_sizes = widgets.Dropdown(
    options=WINDOW_SIZE_MS,
    description="Window size in ms: ",
    disabled=False,
)
display(win_sizes)

raw_data_type = widgets.Dropdown(
    options=RAW_DATA_TYPE,
    description="Raw data type: ",
    disabled=False,
)
display(raw_data_type)

# List all directories directly below the raw data path (depth 1). The
# iterator is closed via the context manager and the names are sorted so the
# preselected entry does not depend on the file system's listing order.
with os.scandir(PATH_RAW_DATA) as entries:
    raw_data_folders = sorted(entry.name for entry in entries if entry.is_dir())

raw_data_folder = widgets.Dropdown(
    options=raw_data_folders,
    description="Raw data folder: ",
    disabled=False,
)
display(raw_data_folder)





Dropdown(description='Window size in ms: ', options=('100', '200', '300', '400', '500', '600', '700', '800', '…

Dropdown(description='Raw data type: ', options=('KIDAQ', 'VIB'), value='KIDAQ')

Dropdown(description='Raw data folder: ', options=('Setup-I',), value='Setup-I')

### Definition der Funktionen zum Preprocessing der verschiedenen Datentypen

In [None]:

from tqdm.notebook import tqdm
# Signal columns that are kept from each KiDAQ recording.
COLUMNS_KiDAQ = ["p1", "p2", "a2", "T2", "T1"]

# From here on PATH_RAW_DATA is an absolute pathlib.Path below the current
# working directory.
CWD = pl.Path.cwd()
PATH_RAW_DATA = CWD.joinpath("data", "raw")
PATH_TO_SETUP = PATH_RAW_DATA.joinpath(raw_data_folder.value)

# Collect the raw files of the selected type: .mf4 files located below a
# "KiDAQ" directory, or .csv files located below a "Rohdaten CSV" directory.
if raw_data_type.value == "KIDAQ":
    raw_files = list(
        filter(lambda path: "KiDAQ" in path.parts, PATH_TO_SETUP.glob("**/*.mf4"))
    )
elif raw_data_type.value == "VIB":
    raw_files = list(
        filter(lambda path: "Rohdaten CSV" in path.parts, PATH_TO_SETUP.glob("**/*.csv"))
    )
def process_vib(file):
    """Aggregate one vibration CSV into per-second statistics (long format).

    The metadata is read from fixed positions of ``file.parts``:
    index 8 is the error type (first folder after the setup folder),
    index 10 the rpm folder, index 11 the rpm-percent folder and index 12 the
    version. The sensor name is the first space-separated token of the file
    name. NOTE(review): these absolute indices only line up for one specific
    directory depth of the working directory -- confirm before moving data.

    Args:
        file: ``pathlib.Path`` of a ';'-separated CSV whose first two lines
            are skipped and which has the columns "Timestamp [ns]" and
            "Value".

    Returns:
        DataFrame with the id columns "ID", "Fehlertyp",
        "Fehlertyp_allgemein", "rpm", "rpm%", "version" plus "Aggregation"
        (e.g. ``<sensor>_mean``) and "Value"; one row per one-second window
        and aggregation.

    Raises:
        IndexError: if the path has fewer than 12 components.
    """
    parts = file.parts
    error_type_with_number = parts[8]
    error_type = error_type_with_number.split(" ")[0]
    rpm = parts[10].split("r")[0]
    rpm_percent = parts[11].split("%")[0]
    sensor = parts[-1].split(" ")[0]
    # Indexing a tuple never yields None; a missing component used to raise
    # IndexError instead of falling back to '0'. Check the length instead.
    version = parts[12] if len(parts) > 12 else "0"

    df = read_csv(file, skiprows=2, encoding="ISO-8859-1", sep=";")
    df["Timestamp [ns]"] = pd.to_datetime(df["Timestamp [ns]"], unit="ns")
    resampled = df.set_index("Timestamp [ns]").resample("1s")

    mean = resampled.mean()  # reused for the mean-minus-median feature
    aggregates = {
        "mean": mean,
        "range": resampled.max() - resampled.min(),
        "std": resampled.std(),
        "iqr": resampled.quantile(0.75) - resampled.quantile(0.25),
        "mean_median": mean - resampled.median(),
    }
    df = pd.concat(
        [
            frame.rename(columns={"Value": f"{sensor}_{name}"})
            for name, frame in aggregates.items()
        ],
        axis=1,
    )

    df["Fehlertyp"] = error_type_with_number
    df["Fehlertyp_allgemein"] = error_type
    df["rpm"] = rpm
    df["rpm%"] = rpm_percent
    df["version"] = version

    # Replace the timestamp index by a running window id.
    df = df.reset_index(drop=True)
    df["ID"] = df.index

    return df.melt(
        id_vars=["ID", "Fehlertyp", "Fehlertyp_allgemein", "rpm", "rpm%", "version"],
        var_name="Aggregation",
        value_name="Value",
    )

    


def process_kidaq(file):
    """Compute windowed time-domain features for one KiDAQ .mf4 recording.

    The recording is cut into consecutive windows of
    ``int(win_sizes.value) * 20`` samples (presumably 20 samples per
    millisecond, i.e. a 20 kHz recording -- TODO confirm). For every window
    and every column in ``COLUMNS_KiDAQ`` the standard deviation, range,
    inter-quartile range and mean-minus-median are computed.

    The metadata is read from fixed positions of ``file.parts`` (index 8:
    error type, 10: rpm folder, 11: rpm-percent folder).

    Args:
        file: ``pathlib.Path`` of the .mf4 file.

    Returns:
        DataFrame with one row per window and the columns "Fehlertyp",
        "Fehlertyp_allgemein", "rpm", "rpm%" followed by
        ``<column>_<aggregation>`` features, or ``None`` if the recording
        contains no samples. Rows are numbered 0..n-1 (previously every row
        carried the index label 0).
    """
    parts = file.parts
    error_type_with_number = parts[8]
    error_type = error_type_with_number.split(" ")[0]
    rpm = parts[10].split("r")[0]
    rpm_percent = parts[11].split("m")[1].split("%")[0]

    mdf = asammdf.MDF(file)
    try:
        df = mdf.to_dataframe().reset_index()[COLUMNS_KiDAQ]
    finally:
        # Release the file handle once the data has been materialised.
        mdf.close()

    samples_per_window = int(win_sizes.value) * 20

    # Collect plain dicts and build the frame once; concatenating a frame per
    # window is quadratic in the number of windows.
    rows = []
    for _, window in df.groupby(df.index // samples_per_window):
        features = {
            "Fehlertyp": error_type_with_number,
            "Fehlertyp_allgemein": error_type,
            "rpm": rpm,
            "rpm%": rpm_percent,
        }
        aggregates = {
            "std": window.std(),
            "range": window.max() - window.min(),
            "iqr": window.quantile(0.75) - window.quantile(0.25),
            "mean_median": window.mean() - window.median(),
        }
        # Aggregation-major order keeps the original column layout
        # (p1_std, p2_std, ..., T1_std, p1_range, ...).
        for name, per_column in aggregates.items():
            for column in COLUMNS_KiDAQ:
                features[f"{column}_{name}"] = per_column[column]
        rows.append(features)

    if not rows:
        return None
    return pd.DataFrame(rows)


### Funktionen zum Preprocessing der Daten im Frequenzbereich und zur Generierung der Features

In [None]:
def process_freq_kidaq(file):
    """Append FFT-based per-second statistics of one KiDAQ file to fft_test.csv.

    The metadata is read from fixed positions of ``file.parts`` (index 8:
    error type, 10: rpm folder, 11: rpm-percent folder). The function has no
    return statement; its only result is the text appended to
    "fft_test.csv" in the current working directory.

    NOTE(review): this is an experimental first version; a function of the
    same name is defined again in a later cell and replaces this one.
    """

    error_type_with_number = file.parts[8]
    error_type = error_type_with_number.split(" ")[0]
    rpm = file.parts[10].split("r")[0]
    # Text between the first "m" and the following "%" of the folder name.
    rpm_percent = file.parts[11].split("m")[1].split("%")[0]
    
    mdf = asammdf.MDF(file)
    df = mdf.to_dataframe()
    # Interpret the frame's index as seconds so it can be resampled by time.
    df.index = pd.to_timedelta(df.index, unit="s")

    resample_time = df.resample("1s")

    # NOTE(review): the lambda returns the function object np.fft.fft instead
    # of calling it, and `r` is never used afterwards -- looks like a
    # left-over experiment; confirm and remove.
    r = resample_time.aggregate(lambda sample: np.fft.fft)

    
    # NOTE(review): np.fft.fft is called without an axis argument; per the
    # numpy documentation it then transforms along the last axis, which for a
    # (samples x channels) frame would be across the channels of each row --
    # confirm whether axis=0 (along time) was intended.
    df_fft = pd.DataFrame(np.fft.fft(df))

    # Re-attach the original labels; the spectrum rows are indexed by the
    # time index of the input again.
    df_fft.columns = df.columns
    df_fft.index = df.index
    df_fft.index = pd.to_timedelta(df_fft.index, unit="s")
    # Magnitude of the complex FFT output.
    df_fft = df_fft.apply(np.abs)
    resampled = df_fft.resample("1s")

    # Per-second statistics of the magnitudes, each converted to a numpy array.
    df_mean = resampled.mean().to_numpy()
    df_range = resampled.max() - resampled.min()
    df_range = df_range.to_numpy()
    df_std = resampled.std().to_numpy()
    df_iqr = resampled.quantile(0.75) - resampled.quantile(0.25)
    df_iqr = df_iqr.to_numpy()
    df_mean_median = resampled.mean() - resampled.median()
    df_mean_median = df_mean_median.to_numpy()

    # One output row per second: the four metadata values repeated for every
    # row, followed by the mean, range, std, iqr and mean-median blocks.
    data = np.concatenate((np.repeat([[error_type_with_number, error_type, rpm, rpm_percent]], df_mean.shape[0], axis=0), df_mean, df_range, df_std, df_iqr, df_mean_median), axis=1)

    # Append mode: rows of all processed files (and of repeated runs)
    # accumulate in the same file; no header line is written.
    with open("fft_test.csv", "a") as f:
        for feature in data:
            f.write(";".join(feature) + "\n")    

    



# Process every selected raw file. The function returns nothing, so `df` is
# None after each call.
for file in tqdm(raw_files):
    df = process_freq_kidaq(file) 

In [None]:

# Number of equally sized frequency bands each spectrum is split into.
FREQ_WINDOWS = 16


def _freq_features_path():
    """Return the output CSV path for the currently selected raw data type."""
    # Use the widget's value; formatting the widget object itself would put
    # its repr ("Dropdown(...)") into the file name.
    return f"data/freq_{raw_data_type.value}_features.csv"


def process_freq_kidaq(file):
    """Append one line of frequency-domain features for a KiDAQ recording.

    The recording is cut into windows of ``int(win_sizes.value) * 20``
    samples. For every window and every column in ``COLUMNS_KiDAQ`` the FFT
    magnitude spectrum is computed (frequency axis built for a sample spacing
    of 1/20000 s) and the following values are emitted: frequency and
    amplitude of the strongest bin, then max, average and mean amplitude of
    each of the ``FREQ_WINDOWS`` bands.

    The output line starts with error type (with number), general error
    type, rpm and rpm-percent, all read from fixed positions of
    ``file.parts``. All windows of a file end up on the same line, so the
    line length depends on the length of the recording.

    NOTE(review): the full two-sided spectrum including the DC bin is used,
    so the strongest bin may simply be the signal offset; ``np.average`` and
    ``np.mean`` yield the same value. Both are kept to preserve the layout.

    Args:
        file: ``pathlib.Path`` of the .mf4 file.
    """
    parts = file.parts
    error_type_with_number = parts[8]
    error_type = error_type_with_number.split(" ")[0]
    rpm = parts[10].split("r")[0]
    rpm_percent = parts[11].split("m")[1].split("%")[0]

    mdf = asammdf.MDF(file)
    df = mdf.to_dataframe()
    df = df.reset_index()
    df = df[COLUMNS_KiDAQ]

    # Plain list instead of repeated np.concatenate (which copies the whole
    # array on every append).
    values = [error_type_with_number, error_type, rpm, rpm_percent]

    windows = df.groupby(df.index // (int(win_sizes.value) * 20))
    for _, window in windows:
        for col in window.columns:
            amps = np.abs(np.fft.fft(window[col]))
            xf = np.fft.fftfreq(len(window[col]), 1 / 20000)

            strongest = np.argmax(amps)
            values += [xf[strongest], amps[strongest]]

            band_size = len(amps) / FREQ_WINDOWS
            for band in range(FREQ_WINDOWS):
                freq_window = amps[int(band * band_size):int((band + 1) * band_size)]
                if freq_window.size == 0:
                    # A short (trailing) window can leave a band without any
                    # bins; np.max would raise on it, so emit NaN instead.
                    values += [np.nan, np.nan, np.nan]
                    continue
                values += [
                    np.max(freq_window, axis=0),
                    np.average(freq_window, axis=0),
                    np.mean(freq_window, axis=0),
                ]

    # Append one line per file; mode "w" kept only the last processed file.
    with open(_freq_features_path(), "a") as f:
        f.write(";".join(map(str, values)) + "\n")


# Start each run with a fresh output file, then add one line per raw file.
_freq_output = pl.Path(_freq_features_path())
if _freq_output.exists():
    _freq_output.unlink()

for file in tqdm(raw_files):
    process_freq_kidaq(file)



  0%|          | 0/20 [00:00<?, ?it/s]

### Durch Ausführen der folgenden Zelle werden die Daten mit den ausgewählten Parametern vorbereitet und abgespeichert

## 4. Deskriptive/Explorative Datenanalyse [Philipp]

- Plots
- Beschreibung der Plots


## 5. Maschinelles Lernen [Kevin]

In diesem Bereich können mithilfe von drei verschiedenen Learnern Modelle anhand der generierten Trainings- und Testdaten erzeugt werden.

### 5.1 Trainings- und Testdaten wählen und laden

In [None]:
# SELECT TRAINING AND TEST DATA
# Feature files directly inside PATH_FEATURE_DATA whose upper-cased name
# contains the default raw data type; sorted for a stable dropdown order.
featureDataDir = sorted(
    name
    for name in os.listdir(PATH_FEATURE_DATA)
    if os.path.isfile(os.path.join(PATH_FEATURE_DATA, name))
    and DEFAULT_RAW_DATA_TYPE in name.upper()
)

trainFileDropdown = widgets.Dropdown(description="training data")
trainFileDropdown.options = featureDataDir
# Assigning the options already preselects an entry. Seed the variable from
# the widget so it is not None when the user keeps that preselection (the
# observer below only fires on later changes).
selectedTrainFile = trainFileDropdown.value


def onTrainigFileChange(change):
    """Store the newly selected training file name."""
    global selectedTrainFile
    selectedTrainFile = change['new']


trainFileDropdown.observe(onTrainigFileChange, names='value')
display(trainFileDropdown)

testFileDropdown = widgets.Dropdown(description="test data")
testFileDropdown.options = featureDataDir
# Same reasoning as for the training file above.
selectedTestFile = testFileDropdown.value


def onTestFileChange(change):
    """Store the newly selected test file name."""
    global selectedTestFile
    selectedTestFile = change['new']


testFileDropdown.observe(onTestFileChange, names='value')
display(testFileDropdown)


In [None]:
# LOAD DATA
# Read the selected ';'-separated feature files.
csvTrain = read_csv(PATH_FEATURE_DATA + selectedTrainFile, sep=";")
csvTest = read_csv(PATH_FEATURE_DATA + selectedTestFile, sep=";")

trainData = csvTrain.to_numpy()
testData = csvTest.to_numpy()

# Column 1 is used as the label (kept two-dimensional), columns 2.. are the
# features and are cast to float32. Column 0 is not used.
featureNames = list(csvTrain.columns[2:])

trainX = trainData[:, 2:].astype('float32')
trainY = trainData[:, 1:2]

testX = testData[:, 2:].astype('float32')
testY = testData[:, 1:2]

### 5.2 Entscheidungsbaum - sklearn (empfohlen)


In diesem Abschnitt wird mit der sklearn-Bibliothek ein Entscheidungsbaummodell trainiert. Dabei können verschiedene Parameter frei gewählt werden, wie die maximale Baumtiefe, die gewünschte Mindestgenauigkeit und die Anzahl der Suchiterationen.
Das Programm wird dann versuchen, ausgehend von einer Baumtiefe von eins, einen möglichst einfachen Entscheidungsbaum zu generieren, der die gewünschte Genauigkeit erreicht.

#### 5.2.1 Konfiguration

In [None]:
# Settings of the decision-tree random search.
DT_EXPLORATION_MAX_ITER = 100_000          # upper bound on random-search iterations
DT_EXPLORATION_TARGET_VAL_ACCURACY = 0.95  # accuracy at which the search stops
DT_MAX_DEPTH = 10                          # deepest tree the search may build

#### 5.2.2 Trainieren

In [None]:
# Train DecisionTreeClassifier
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from IPython.display import clear_output
import joblib

# Encode the string labels as integers; the encoder is fitted on the
# training labels only.
labelEncoder = LabelEncoder().fit(np.ravel(trainY))
labelEncodedTrainY = labelEncoder.transform(np.ravel(trainY))
labelEncodedTestY = labelEncoder.transform(np.ravel(testY))

testAccuracy = 0.0
trainAccuracy = 0.0
iterations = 0
depth = 1

# Number of iterations spent on each depth. At least 1, so an iteration
# budget smaller than DT_MAX_DEPTH does not divide by zero.
iterationsPerDepth = max(1, DT_EXPLORATION_MAX_ITER // DT_MAX_DEPTH)

# Remember the best tree seen so far: if the target accuracy is never
# reached, the best candidate is saved instead of whichever came last.
bestModel = None
bestScore = -1.0

while (
    testAccuracy < DT_EXPLORATION_TARGET_VAL_ACCURACY
    or trainAccuracy < DT_EXPLORATION_TARGET_VAL_ACCURACY
) and iterations < DT_EXPLORATION_MAX_ITER:
    # Random splits make every iteration produce a different tree.
    candidate = DecisionTreeClassifier(max_depth=depth, splitter="random")
    candidate.fit(trainX, labelEncodedTrainY)

    testPredictions = candidate.predict(testX)
    testAccuracy = accuracy_score(labelEncodedTestY, testPredictions)

    trainPredictions = candidate.predict(trainX)
    trainAccuracy = accuracy_score(labelEncodedTrainY, trainPredictions)
    iterations += 1

    # Both accuracies must reach the target, so rank by the weaker one.
    score = min(trainAccuracy, testAccuracy)
    if score > bestScore:
        bestScore = score
        bestModel = candidate

    # Show only the latest progress instead of four lines per iteration.
    clear_output(wait=True)
    print("Iteration: %d" % iterations)
    print("Depth: %d" % depth)
    print("Train Accuracy: %.2f%%" % (trainAccuracy * 100.0))
    print("Test Accuracy: %.2f%%" % (testAccuracy * 100.0))

    # Increase the depth step by step, never beyond the configured maximum.
    depth = min(DT_MAX_DEPTH, 1 + iterations // iterationsPerDepth)

if bestModel is None:
    raise RuntimeError("No decision tree was trained; check DT_EXPLORATION_MAX_ITER.")

dt = bestModel
print("Best min(train, test) accuracy: %.2f%%" % (bestScore * 100.0))

# Persist the encoder and the model in a timestamped folder.
modelName = "dtc." + str(round(time.time()))
os.makedirs(PATH_MODEL + modelName, exist_ok=True)
with open(PATH_MODEL + modelName + "/encoder.pickle", "wb") as f:
    joblib.dump(labelEncoder, f)
joblib.dump(dt, PATH_MODEL + modelName + "/dtc.model")

### 5.3 Tiefes neuronales Netz - tensorflow

Im folgenden Abschnitt wird mit der tensorflow-Bibliothek ein tiefes neuronales Netzwerk trainiert. Dabei können verschiedene Parameter eingestellt werden, wie die gewünschte Mindestgenauigkeit, die Anzahl der Suchiterationen, die minimale und maximale Anzahl an Schichten und Neuronen, die Toleranz für einen vorzeitigen Abbruch einer Iteration, die Ausgabeform, die maximale Anzahl an Epochen, die Batchgröße und die Batch Normalisierung.

Bitte beachte, dass die genaue Syntax und die verfügbaren Optionen von tensorflow abhängen können. Es ist empfehlenswert, die offizielle tensorflow-Dokumentation für detaillierte Informationen zu konsultieren.

#### 5.3.1 Konfiguration

In [None]:
# --- Random search over network architectures ---
DNN_EXPLORATION_MAX_ITER = 100              # upper bound on search iterations
DNN_EXPLORATION_TARGET_VAL_ACCURACY = 0.98  # accuracy at which the search stops
DNN_EXPLORATION_HIDDEN_LAYERS_MIN, DNN_EXPLORATION_HIDDEN_LAYERS_MAX = 1, 3  # hidden layers (min, max)
DNN_EXPLORATION_NEURONS_MIN, DNN_EXPLORATION_NEURONS_MAX = 4, 16             # neurons per layer (min, max)

# --- Training of a single candidate network ---
DNN_EPOCHS = 2_000                # maximum number of epochs
DNN_BATCH_SIZE = 128              # batch size
DNN_EARLY_STOPPING_PATIENCE = 50  # patience for early stopping
DNN_BATCH_NORMALIZATION = True    # add batch normalization layers
DNN_VERBOSE = 1                   # verbosity level passed to training

#### 5.3.2 Trainieren

In [None]:
# Train Neural Network
import json
import random
import joblib
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, InputLayer, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# One-hot encode the labels; the encoder is fitted on the training labels only.
onehotencoder = OneHotEncoder().fit(trainY)
onehotEncodedTrainY = onehotencoder.transform(trainY).toarray()
onehotEncodedTestY = onehotencoder.transform(testY).toarray()

explorationResults = []

n_features = trainX.shape[1]
categorieCount = onehotEncodedTrainY.shape[1]

testAccuracy = 0.0
trainAccuracy = 0.0
interation = 0

# Remember the best network seen so far: if the target accuracy is never
# reached, the best candidate is saved instead of whichever came last.
bestModel = None
bestScore = -1.0

while (
    testAccuracy < DNN_EXPLORATION_TARGET_VAL_ACCURACY
    or trainAccuracy < DNN_EXPLORATION_TARGET_VAL_ACCURACY
) and interation < DNN_EXPLORATION_MAX_ITER:
    # Build a random architecture within the configured bounds.
    candidate = Sequential()
    candidate.add(InputLayer(input_shape=(n_features,)))
    if DNN_BATCH_NORMALIZATION:
        candidate.add(BatchNormalization())
    denseCount = random.randint(
        DNN_EXPLORATION_HIDDEN_LAYERS_MIN, DNN_EXPLORATION_HIDDEN_LAYERS_MAX
    )
    denseNeurons = []
    for _ in range(denseCount):
        neuronCount = random.randint(
            DNN_EXPLORATION_NEURONS_MIN, DNN_EXPLORATION_NEURONS_MAX
        )
        denseNeurons.append(neuronCount)
        candidate.add(Dense(neuronCount, activation="tanh"))
        if DNN_BATCH_NORMALIZATION:
            candidate.add(BatchNormalization())
    # Categorical cross-entropy expects a probability distribution over the
    # classes, i.e. a softmax output; independent sigmoids do not sum to 1.
    candidate.add(Dense(categorieCount, activation="softmax"))
    candidate.compile(
        optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]
    )
    candidate.fit(
        trainX,
        onehotEncodedTrainY,
        epochs=DNN_EPOCHS,
        batch_size=DNN_BATCH_SIZE,
        verbose=DNN_VERBOSE,
        validation_data=(testX, onehotEncodedTestY),
        callbacks=[
            EarlyStopping(
                monitor="val_loss",
                patience=DNN_EARLY_STOPPING_PATIENCE,
                restore_best_weights=True,
            )
        ],
    )

    trainLoss, trainAccuracy = candidate.evaluate(trainX, onehotEncodedTrainY)
    testLoss, testAccuracy = candidate.evaluate(testX, onehotEncodedTestY)

    explorationResults.append(
        {
            "dense_count": denseCount,
            "dense_neurons": denseNeurons,
            "train_loss": trainLoss,
            "train_acc": trainAccuracy,
            "test_loss": testLoss,
            "test_acc": testAccuracy,
        }
    )

    # Both accuracies must reach the target, so rank by the weaker one.
    score = min(trainAccuracy, testAccuracy)
    if score > bestScore:
        bestScore = score
        bestModel = candidate

    interation += 1

if bestModel is None:
    raise RuntimeError("No network was trained; check DNN_EXPLORATION_MAX_ITER.")

model = bestModel
modelName = "dnn." + str(round(time.time()))

model.save(PATH_MODEL + modelName)

# The encoder is stored inside the folder created by model.save.
with open(PATH_MODEL + modelName + "/encoder.pickle", "wb") as f:
    joblib.dump(onehotencoder, f)

# Make sure the exploration folder exists before writing the search log.
os.makedirs(PATH_EXPLORATION_DATA, exist_ok=True)
with open(PATH_EXPLORATION_DATA + modelName + ".exploration_results.json", "w") as f:
    json.dump(explorationResults, f, indent=4)

### 5.4 Extremes Gradienten-Boosting - XGBoost

In diesem Abschnitt wird mit der xgboost-Bibliothek ein Modell mithilfe des "Extreme Gradient Boosting" trainiert. Dabei können verschiedene Parameter frei gewählt werden, wie die maximale Anzahl der Bäume, die gewünschte Mindestgenauigkeit und die Anzahl der Suchiterationen.

#### 5.4.1 Konfiguration

In [None]:
# Settings of the XGBoost search. The DT_* names are the same ones used by
# the decision-tree section and are (re)assigned here.
DT_EXPLORATION_MAX_ITER = 100_000          # upper bound on search iterations
DT_EXPLORATION_TARGET_VAL_ACCURACY = 0.95  # accuracy at which the search stops
DT_MAX_DEPTH = 10                          # maximum depth of each tree

# Number of estimators; None means "use the number of classes".
DT_NUM_OF_ESTIMATORS = None

#### 5.4.2 Trainieren

In [None]:
# Train XGBClassifier
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import joblib


# Encode the string labels as integers; the encoder is fitted on the
# training labels only.
labelEncoder = LabelEncoder().fit(np.ravel(trainY))
labelEncodedTrainY = labelEncoder.transform(np.ravel(trainY))
labelEncodedTestY = labelEncoder.transform(np.ravel(testY))

# Loop invariants: named feature frames (so the booster knows the feature
# names) and the number of estimators.
labeledTrainX = pd.DataFrame(trainX, columns=featureNames)
labeledTestX = pd.DataFrame(testX, columns=featureNames)
estimatorCount = (
    labelEncoder.classes_.size if DT_NUM_OF_ESTIMATORS is None else DT_NUM_OF_ESTIMATORS
)

testAccuracy = 0.0
trainAccuracy = 0.0
iterations = 0
# NOTE(review): only random_state changes between iterations; without
# row/column subsampling the trained model may be identical every time, in
# which case the loop just repeats until DT_EXPLORATION_MAX_ITER -- confirm.
while (
    testAccuracy < DT_EXPLORATION_TARGET_VAL_ACCURACY
    or trainAccuracy < DT_EXPLORATION_TARGET_VAL_ACCURACY
) and iterations < DT_EXPLORATION_MAX_ITER:
    xgb = XGBClassifier(
        tree_method="hist",
        enable_categorical=True,
        max_depth=DT_MAX_DEPTH,
        n_estimators=estimatorCount,
        # np.random.randint requires an upper bound; calling it without
        # arguments raises a TypeError.
        random_state=int(np.random.randint(0, 2**31 - 1)),
    )
    xgb.fit(
        labeledTrainX, labelEncodedTrainY, eval_set=[(labeledTestX, labelEncodedTestY)]
    )

    # Predict on the same named frames the model was fitted on.
    testPredictions = xgb.predict(labeledTestX)
    testAccuracy = accuracy_score(labelEncodedTestY, testPredictions)

    trainPredictions = xgb.predict(labeledTrainX)
    trainAccuracy = accuracy_score(labelEncodedTrainY, trainPredictions)
    iterations += 1

# Persist encoder, model and feature map in a timestamped folder.
modelName = "xgb." + str(round(time.time()))
os.makedirs(PATH_MODEL + modelName, exist_ok=True)

with open(PATH_MODEL + modelName + "/encoder.pickle", "wb") as f:
    joblib.dump(labelEncoder, f)

xgb.save_model(PATH_MODEL + modelName + "/xgb.model")

featureMap = xgb.get_booster().get_score(importance_type="gain")

# One line per feature: "<index>\t<name>\tq"; this file is passed as `fmap`
# when the trees are plotted.
with open(PATH_MODEL + modelName + "/feature_map.txt", "w") as file:
    for index, feature in enumerate(xgb.get_booster().feature_names):
        file.write(f"{index}\t{feature}\tq\n")

print("Accuracy: %.2f%%" % (testAccuracy * 100.0))

## 6. Modellanalyse des Learners [Kevin]

**In diesem Abschnitt können in Abhängigkeit von der Auswahl des zu analysierenden Modells verschiedene Analysen durchgeführt werden.**

- *Visuelle Darstellung (nur Entscheidungsbaum/Extremes Gradienten-Boosting)*: Eine visuelle Darstellung des Modells kann erstellt werden, um die Entscheidungslogik und Struktur intuitiv zu erfassen.

- *Konfusionsmatrix*: Eine Konfusionsmatrix wird erstellt, um die Leistung des Modells bei der Klassifikation zu bewerten. Sie zeigt, wie gut das Modell verschiedene Klassen korrekt vorhersagt und welche Fehler gemacht werden.

- *Test-Genauigkeit*: Die Test-Genauigkeit wird berechnet, um zu bewerten, wie gut das Modell auf unbekannten Daten abschneidet. Dies gibt einen Indikator dafür, wie zuverlässig die Vorhersagen des Modells sind.

- *Feature Importances*: Es wird eine Analyse der Feature Importances durchgeführt, um die relative Bedeutung der verschiedenen Merkmale bei der Vorhersage zu bestimmen. Dies ermöglicht Einblicke in die relevanten Merkmale und kann bei der Feature-Auswahl oder -Gewichtung helfen.

Diese Analysen bieten einen umfassenden Einblick in die Leistung und Funktionsweise des ausgewählten Modells und unterstützen bei der Interpretation der Ergebnisse.

In [None]:
# SELECT LEARNER AND TEST DATA
# Feature files directly inside PATH_FEATURE_DATA whose upper-cased name
# contains the default raw data type, and all entries of the model folder;
# both sorted for a stable dropdown order.
featureDataDir = sorted(
    name
    for name in os.listdir(PATH_FEATURE_DATA)
    if os.path.isfile(os.path.join(PATH_FEATURE_DATA, name))
    and DEFAULT_RAW_DATA_TYPE in name.upper()
)
modelDir = sorted(os.listdir(PATH_MODEL))

modelDropdown = widgets.Dropdown(description="model")
modelDropdown.options = modelDir
# Assigning the options already preselects an entry. Seed the variable from
# the widget so it is not None when the user keeps that preselection (the
# observer below only fires on later changes).
selectedModelFile = modelDropdown.value


def onTrainigFileChange(change):
    """Store the newly selected model folder name."""
    global selectedModelFile
    selectedModelFile = change['new']


modelDropdown.observe(onTrainigFileChange, names='value')
display(modelDropdown)

testFileDropdown = widgets.Dropdown(description="test data")
testFileDropdown.options = featureDataDir
# Same reasoning as for the model above.
selectedTestFile = testFileDropdown.value


def onTestFileChange(change):
    """Store the newly selected test file name."""
    global selectedTestFile
    selectedTestFile = change['new']


testFileDropdown.observe(onTestFileChange, names='value')
display(testFileDropdown)


In [None]:
# ANALYZE MODEL
import tensorflow as tf
from tensorflow import math as tfmath
import tensorflow_probability as tfp
import eli5
from eli5.sklearn import PermutationImportance
import joblib
from matplotlib import pyplot
from xgboost import XGBClassifier, plot_tree
from sklearn import tree
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# Fail early with a clear message instead of continuing with model=None and
# crashing later while plotting.
if not selectedModelFile or not selectedModelFile.startswith(("dnn", "xgb", "dtc")):
    raise ValueError(
        f"Unsupported model selection {selectedModelFile!r}; expected a model "
        "folder whose name starts with 'dnn', 'xgb' or 'dtc'."
    )

# Load the test data: column 1 is the label, columns 2.. are the features.
csvTest = read_csv(PATH_FEATURE_DATA + selectedTestFile, delimiter=";")
testData = csvTest.values
testX, testY = testData[:, 2:].astype('float32'), testData[:, 1:2]
featureNames = csvTest.columns.values[2:].tolist()

modelPath = PATH_MODEL + selectedModelFile

# Every model folder stores the label encoder used during training.
# NOTE: joblib.load unpickles arbitrary objects -- only open model folders
# from a trusted source.
with open(modelPath + '/encoder.pickle', 'rb') as f:
    encoder = joblib.load(f)

model = None
predictions = None
transformedTestY = None
confusionMatrix = None
classes = None
if selectedModelFile.startswith('dnn'):
    onehotencoder = encoder
    classes = onehotencoder.categories_[0]
    transformedTestY = onehotencoder.transform(testY).toarray()

    model = tf.keras.models.load_model(modelPath)
    predictions = model.predict(testX)

    # Labels and predictions are one-hot / per-class scores; compare the
    # index of the largest entry.
    expectedClasses = np.argmax(transformedTestY, axis=1)
    predictedClasses = np.argmax(predictions, axis=1)
    confusionMatrix = tf.math.confusion_matrix(expectedClasses, predictedClasses)
    equality = tf.math.equal(predictedClasses, expectedClasses)
    accuracy = float(tf.math.reduce_mean(tf.cast(equality, tf.float32)))
elif selectedModelFile.startswith('xgb'):
    labelEncoder = encoder
    classes = labelEncoder.classes_
    transformedTestY = labelEncoder.transform(np.ravel(testY))

    model = XGBClassifier()
    model.load_model(modelPath + '/xgb.model')
    predictions = model.predict(testX)
    confusionMatrix = tf.math.confusion_matrix(transformedTestY, predictions)

    # One high-resolution plot per tree index up to n_estimators.
    for i in range(model.n_estimators):
        plot_tree(model, num_trees=i, fmap=modelPath + '/feature_map.txt')
        pyplot.gcf().set_dpi(1200)
        pyplot.show()

    equality = tf.math.equal(predictions, transformedTestY)
    accuracy = float(tf.math.reduce_mean(tf.cast(equality, tf.float32)))
else:  # 'dtc'
    labelEncoder = encoder
    classes = labelEncoder.classes_
    transformedTestY = labelEncoder.transform(np.ravel(testY))

    model = joblib.load(modelPath + '/dtc.model')

    predictions = model.predict(testX)
    accuracy = accuracy_score(transformedTestY, predictions)
    confusionMatrix = tf.math.confusion_matrix(transformedTestY, predictions)
    pyplot.figure(figsize=(120, 40))
    tree.plot_tree(model, feature_names=featureNames, class_names=labelEncoder.classes_, filled=True)
    pyplot.show()

# Confusion matrix with class names on both axes and the value in each cell
# (first argument of confusion_matrix is on the rows).
mat = pyplot.matshow(confusionMatrix, 1)
mat.axes.set_xticks(np.arange(0, len(classes), 1))
mat.axes.set_yticks(np.arange(0, len(classes), 1))
mat.axes.set_xticklabels(classes, rotation=90)
mat.axes.set_yticklabels(classes)
for (x, y), value in np.ndenumerate(confusionMatrix):
    pyplot.text(y, x, f"{value:.2f}", va="center", ha="center")
pyplot.show()


print('Test Accuracy: %.3f' % accuracy)

# Permutation feature importance on the test data.
# NOTE(review): "neg_mean_squared_error" treats the integer class ids of the
# tree models as ordinal values; an accuracy-based scorer is probably more
# appropriate there -- confirm (the one-hot DNN targets may need MSE).
perm = PermutationImportance(model, scoring="neg_mean_squared_error", random_state=1).fit(testX, transformedTestY)
print(eli5.format_as_text(eli5.explain_weights(perm, feature_names=featureNames)))

## 7. Statische Interpretation des Resultats

- Welches Ergebnis haben wir erzielt und wie kann man es anwenden?
