# Daten Klassifizieren
---

## Aufgabenstellung

- 🏆 Aufgabe 12.4 (20%): Klassifikationsmodell für defekte Flaschen
Abgabeformalien: Fügen Sie ein Kapitel in Ihrer Dokumentation hinzu, in dem Sie die Ergebnisse der Klassifikation mittels Confusion Matrix und der unten gezeigten Tabelle dokumentieren.
- Erstellen Sie ein Klassifikationsmodell zur Vorhersage von defekten Flaschen anhand der Daten aus der Drop Vibration. Diese repräsentieren eine Zeitreihe der Vibrationen von Flaschen bei der Vereinzelung.
- Erstellen Sie eine Tabelle, welche die genutzten Spalten für die Vorhersage und den F1-Score für die jeweiligen Spalten enthält.
- Als Orientierung kann das Notebook `9_Classification_Python.ipynb` dienen, welches auch im nächsten Abschnitt vorgestellt wird.

## Imports

In [6]:
# Load NeuroKit and other useful packages
# If NeuroKit is not installed yet, install it from within the notebook: %pip install neurokit2

import neurokit2 as nk
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Two synthetic ECG traces with duration=10: a slow, noisier one (50 bpm)
# and a fast, cleaner one (100 bpm)
ecg50 = nk.ecg_simulate(heart_rate=50, noise=0.05, duration=10)
ecg100 = nk.ecg_simulate(heart_rate=100, noise=0.01, duration=10)

# Collect both traces in one frame and draw each in its own panel
ecg_df = pd.DataFrame({"ECG_100": ecg100, "ECG_50": ecg50})
nk.signal_plot(ecg_df, subplots=True)

---
## Simulation

In [None]:
MIN_HEART_RATE = 40  # sampled heart rates below this value are raised to it
N_PER_CLASS = 100    # number of recordings simulated per class


def simulate_class(label, mean_rate, std_rate, first_index, rng, n=N_PER_CLASS):
    """Simulate ``n`` ECG recordings for one class.

    Heart rates are drawn from a normal distribution and clamped from below
    at ``MIN_HEART_RATE`` (the clamp is a floor at 40, not merely a
    positivity check).

    Parameters
    ----------
    label : str
        Value stored under the "Type" key of every recording.
    mean_rate, std_rate : float
        Mean and standard deviation of the heart-rate distribution.
    first_index : int
        Key of the first recording; subsequent recordings count up from it.
    rng : numpy.random.Generator
        Source of randomness for the heart rates.
    n : int
        Number of recordings to simulate.

    Returns
    -------
    dict
        ``{index: {"EGK": simulated signal, "Type": label}}``.
    """
    rates = np.maximum(rng.normal(mean_rate, std_rate, size=n), MIN_HEART_RATE)
    return {
        first_index + offset: {
            "EGK": nk.ecg_simulate(duration=10, noise=0.1, heart_rate=float(rate)),
            "Type": label,
        }
        for offset, rate in enumerate(rates)
    }


# A dedicated, seeded generator makes the sampled heart rates identical on every run.
# NOTE(review): the noise drawn inside nk.ecg_simulate is not seeded here.
rng = np.random.default_rng(42)

data = {}

data.update(simulate_class("Ruhe EKG", 70, 35, first_index=0, rng=rng))
print(max(data))  # last index written so far

data.update(simulate_class("Belastung EKG", 140, 60, first_index=100, rng=rng))
print(max(data))  # last index written so far

---
## Visualisierung

In [None]:
# Compare the first "Ruhe" recording (key 0) with the first "Belastung"
# recording (key 100), each in its own panel.
# The bare `data[0]` expression that used to open this cell was removed: it was
# not the last expression of the cell, so it displayed nothing.
ecg_df = pd.DataFrame({"Ruhe_1": data[0]["EGK"], "Belastung_1": data[100]["EGK"]})

nk.signal_plot(ecg_df, subplots=True)


In [None]:
# Build the feature table: one row per recording (columns "EGK" and "Type"),
# extended by the mean and standard deviation of each signal
df = pd.DataFrame(data).T.assign(
    Mean=lambda frame: frame["EGK"].apply(np.mean),
    STD=lambda frame: frame["EGK"].apply(np.std),
)
df.head()

In [None]:
# Box plot of the per-recording signal mean, one box per recording type
sns.boxplot(data=df, x="Type", y="Mean")

In [None]:
# Box plot of the per-recording signal standard deviation, one box per recording type
sns.boxplot(data=df, x="Type", y="STD")

---
# Feature Engineering
Hiermit haben wir bereits den ersten Schritt im Feature Engineering gemacht und können Mean und STD als Features verwenden.

In [None]:
# Preview the feature table, which at this point holds the Mean and STD columns
df.head()

In [None]:
### Fourier Transform

# Add one column per frequency bin: the FFT magnitude at bins 1..99.
# Each signal's FFT is computed once and sliced, instead of being recomputed
# for every bin, and all columns are attached in a single concat rather than
# through 99 separate column insertions.
freq_columns = [f"Freq_{i}" for i in range(1, 100)]
spectra = [np.abs(np.fft.fft(signal))[1:100] for signal in df["EGK"]]
freq_df = pd.DataFrame(spectra, index=df.index, columns=freq_columns)

# Dropping already-existing Freq_* columns first keeps the cell safe to re-run.
df = pd.concat([df.drop(columns=freq_columns, errors="ignore"), freq_df], axis=1)
df.head()


In [None]:
# Plot the frequency spectrum of four recordings (rows 0, 1, 100, 101),
# one panel per recording so that each title matches the curve it describes.
# Previously all curves shared one axes and only the last title survived;
# the loop also rebound `plt` to an Axes object and shadowed the builtin `id`.
import matplotlib.pyplot as plt

rows_to_plot = [0, 1, 100, 101]
# Select the spectrum columns by name rather than by position.
freq_cols = [col for col in df.columns if str(col).startswith("Freq_")]

fig, axes = plt.subplots(
    len(rows_to_plot), 1, figsize=(10, 3 * len(rows_to_plot)), sharex=True
)
for ax, row in zip(axes, rows_to_plot):
    record = df.iloc[row]
    # A row taken across mixed-dtype columns is object-typed; cast for plotting.
    record[freq_cols].astype(float).plot(ax=ax, label=f"Recording {record.name}")
    ax.set_title(record["Type"])
    ax.set_xlabel("Frequenz")
    ax.set_ylabel("Amplitude")
    ax.legend()
fig.tight_layout()


In [None]:
# Reshape to long format: "Type" stays as identifier, every column from
# position 4 onwards becomes a (variable, value) pair
value_columns = df.columns[4:]
df_melted = df.melt(id_vars=["Type"], value_vars=value_columns)
df_melted.head()

In [None]:
# Line plot of the melted values over the frequency columns, one hue per Type
sns.lineplot(data=df_melted, x="variable", y="value", hue="Type")


---
## Daten aufteilen

In [None]:
from sklearn.model_selection import train_test_split

# Target is the recording type; features are only the two summary statistics.
# (Alternative feature set: every column except 'Type' and 'EGK'.)
y = df.loc[:, 'Type']
X = df.loc[:, ['Mean', 'STD']]

# test_size=0.2 holds out a fifth of the rows; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train.head()

In [None]:
# One-hot encode 'Type' into a separate frame and leave `df` untouched.
# Overwriting `df` here replaced the 'Type' column with indicator columns, which
# broke every subsequent df['Type'] access and made this cell fail on a re-run.
df_encoded = pd.get_dummies(df, columns=['Type'])

---
## Trainieren des Modells

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the training split
# (fit returns the estimator, so construction and fitting are chained)
model = LogisticRegression().fit(X_train, y_train)

# Learned coefficients, shown as the cell output
model.coef_

In [None]:
# Predict on the training split itself, i.e. on rows the model was fitted on
y_pred = model.predict(X_train)

# Show the predicted labels as the cell output
y_pred

In [None]:
# Table of true vs. predicted labels for the training rows, joined with the
# feature values on the shared row index.
# DataFrame.merge returns a new frame, so its result must be assigned; before,
# the merged frame was discarded and the features never appeared in the output.
df_result = pd.DataFrame({'Actual': y_train, 'Predicted': y_pred})
df_result = df_result.merge(X_train, left_index=True, right_index=True)

df_result


---
## Evaluieren

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels (y_train) against the predictions in y_pred
confusion_matrix(y_train, y_pred)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Render the training-split confusion matrix as a plot labelled with the model's classes
cm = confusion_matrix(y_train, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()


In [None]:
# Further error metrics for the training split
from sklearn.metrics import classification_report

train_report = classification_report(y_train, y_pred)
print(train_report)


---
## Evaluieren auf den Testdaten

In [None]:
# Predict on the held-out test split and plot the resulting confusion matrix
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

---
## k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with n_neighbors=3, fitted on the training split
model = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Training-split predictions and their confusion matrix
y_pred = model.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Text report for the predictions currently held in y_pred against y_train
# (the k-NN training-split predictions when the notebook is run top to bottom)
print(classification_report(y_train, y_pred))

In [None]:
# Score the fitted k-NN model on the held-out test split.
# The prediction previously went through `grid_search`, a name that is not
# defined anywhere in the visible notebook, so this cell raised NameError on a
# fresh kernel; `model` is the classifier that was actually fitted.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
### Decision Tree

from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters, fitted on the training split
model = DecisionTreeClassifier().fit(X_train, y_train)

# Training-split predictions and their confusion matrix
y_pred = model.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Text report for the predictions currently held in y_pred against y_train
# (the decision-tree training-split predictions when run top to bottom)
print(classification_report(y_train, y_pred))

In [None]:
# Score the current model on the held-out test split:
# confusion matrix plot first, then the text report
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

test_report = classification_report(y_test, y_pred)
print(test_report)


---
## Visualisierung decision tree

In [None]:
# Plot the decision tree
from sklearn import tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20,10))
# feature_names / class_names are passed as plain lists of str instead of a
# pandas Index and a NumPy array.
# NOTE(review): some scikit-learn releases validate these two parameters
# strictly and reject non-list inputs — confirm against the installed version.
tree.plot_tree(
    model,
    filled=True,
    feature_names=list(X.columns),
    class_names=[str(label) for label in model.classes_],
)
plt.show()

In [None]:
# Decision tree restricted to max_depth=2, fitted on the training split
model = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)

# Training-split predictions, confusion matrix plot and text report
y_pred = model.predict(X_train)
cm = confusion_matrix(y_train, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

train_report = classification_report(y_train, y_pred)
print(train_report)

In [None]:
# Test set: predictions of the current model and their confusion matrix
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=model.classes_)
disp.plot()

In [None]:
# Plot the decision tree

plt.figure(figsize=(20,10))

# feature_names / class_names are passed as plain lists of str instead of a
# pandas Index and a NumPy array.
# NOTE(review): some scikit-learn releases validate these two parameters
# strictly and reject non-list inputs — confirm against the installed version.
tree.plot_tree(
    model,
    filled=True,
    feature_names=list(X.columns),
    class_names=[str(label) for label in model.classes_],
)

plt.show()


---
# Weitere Kniffe

## Dummy Var

In [None]:
# Preview the current state of df before deriving dummy variables
df.head()

In [None]:
# One-hot encode the Type column; the result is only displayed, not stored.
# NOTE(review): requires df to still contain a 'Type' column when this cell runs — confirm.
pd.get_dummies(df['Type'])


### Normalisieren von Daten

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the Freq_1 column with a StandardScaler, written as two explicit
# steps: fit the scaler on the column, then write the transformed values back
scaler = StandardScaler().fit(df[['Freq_1']])
df['Freq_1'] = scaler.transform(df[['Freq_1']])

df.head()


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the Freq_2 column with a MinMaxScaler, written as two explicit
# steps: fit the scaler on the column, then write the transformed values back
scaler = MinMaxScaler().fit(df[['Freq_2']])
df['Freq_2'] = scaler.transform(df[['Freq_2']])

df.head()

import pandas as pd

# Load the bottle data set from the working directory.
# NOTE(review): this rebinds `df`, replacing the ECG feature table used so far.
df = pd.read_csv('bottle_data.csv')

# Preview the first rows with the rich notebook display instead of printing
# the entire frame as plain text.
df.head()