# IN PROGRESS!!!

# DNN Model for MIT data 
As we concluded before, for MIT data, we apply the following preprocessing:   
resampling: Oversampling \
rescaling: MinMax Scaler

If you don't have the original files: run the notebook `preprocessing_mit_minmax_oversampling.ipynb`     
Input file: (The preprocessed data)   
mitbih_train_clean_minmax_oversampling.csv   
mitbih_test_clean_minmax_oversampling.csv

Output: DNN model trained  
model_dnn_mit.pkl  

In [1]:
import sys
import os 

data_path = ''
model_output_path = ''
# check if the enviorment is Google Colab 

if 'google.colab' in sys.modules:
    print("Running on Google Colab")
    # Install required libraries
    !pip install scikit-learn -q
    !pip install pandas -q
    !pip install numpy -q
    !pip install imbalanced-learn -q

    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')
    # set the path where the csv file stored in your google drive. 
    data_path = '/content/drive/MyDrive/Heartbeat_Project/'
    model_output_path = data_path

else:
    print("Running on local environment")

    current_path = os.getcwd()
    print("Current working directory:", current_path)
    data_path = '../data/processed/'
    model_output_path = '../models/'

Running on local environment
Current working directory: g:\Meine Ablage\heartbeat-analysis-ai\notebooks


## Read data 


In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from imblearn.pipeline import Pipeline as ImbPipeline  # Use ImbPipeline for oversampling
import matplotlib.pyplot as plt
import seaborn as sns


RawFiles = dict({
    'train': data_path + 'mitbih_train_clean_minmax_oversampling.csv', 
    'test': data_path + 'mitbih_test_clean_minmax_oversamling.csv'  
})


OutputFiles = dict({
    'model': model_output_path +  'model_dnn_mit.pkl'
})

train = pd.read_csv(RawFiles.get('train'),sep=',',header=0)
test = pd.read_csv(RawFiles.get('test'),sep=',',header=0)

y_train = train['target']
X_train = train.drop('target', axis=1)

y_test = test['target']
X_test = test.drop('target', axis=1)



# DNN with MinMax Scaler and Oversampling

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# define model
model = Sequential() # Sequential model is used to add layers one by one to the model, in a linear manner

# Input Layer
model.add(Dense(32, activation='relu', input_dim=X_train.shape[1])) # relu, because we want to avoid negative values
# input_dim is the number of features in the dataset: 187

# Hidden Layers: 3 hidden layers with 64, 128, and 32 neurons respectively
model.add(Dense(64, activation='relu')) 
model.add(BatchNormalization()) # BatchNormalization is used to normalize the activations of the previous layer at each batch
model.add(Dropout(0.3)) # Dropout is used to prevent overfitting. 0.3 means 30% of the neurons will be turned off randomly

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Output Layer for binary classification of the target
model.add(Dense(1, activation='sigmoid')) # 1 means 1 neuron in the output layer, sigmoid, because we want to predict probabilities

# compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) # adam is a popular optimizer, binary_crossentropy is used for binary classification

# Callbacks for early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True) # patience is the number of epochs with no improvement after which training will be stopped
lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001) # the factor by which the learning rate will be reduced, min_lr is the lower bound for the learning rate

# Callbacks for plotting
train_loss = [] # to store training loss
val_loss = []
train_accuracy = []
val_accuracy = []


epochs = 1000 # number of epochs for training, 1000 is a large number, but early stopping will stop the training when the model stops improving
for epoch in range(epochs):
    history = model.fit(X_train, y_train, epochs=1, batch_size=10, validation_data=(X_test, y_test), 
                        verbose=0, callbacks=[early_stopping, lr_scheduler])
    
    # Append metrics to the respective lists
    train_loss.append(history.history['loss'][0])
    val_loss.append(history.history['val_loss'][0])
    train_accuracy.append(history.history['accuracy'][0])
    val_accuracy.append(history.history['val_accuracy'][0])
    
    # Print progress at each 10th epoch for feedback
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{epochs} - Loss: {train_loss[-1]:.4f}, Val Loss: {val_loss[-1]:.4f}, "
              f"Accuracy: {train_accuracy[-1]:.4f}, Val Accuracy: {val_accuracy[-1]:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 10/1000 - Loss: 0.1826, Val Loss: 0.1210, Accuracy: 0.9367, Val Accuracy: 0.9618
Epoch 20/1000 - Loss: 0.1605, Val Loss: 0.1345, Accuracy: 0.9453, Val Accuracy: 0.9520
Epoch 30/1000 - Loss: 0.1441, Val Loss: 0.0825, Accuracy: 0.9527, Val Accuracy: 0.9727
Epoch 40/1000 - Loss: 0.1347, Val Loss: 0.0955, Accuracy: 0.9552, Val Accuracy: 0.9684
Epoch 50/1000 - Loss: 0.1355, Val Loss: 0.1102, Accuracy: 0.9549, Val Accuracy: 0.9687
Epoch 60/1000 - Loss: 0.1275, Val Loss: 0.1584, Accuracy: 0.9592, Val Accuracy: 0.9607
Epoch 70/1000 - Loss: 0.1190, Val Loss: 0.1262, Accuracy: 0.9618, Val Accuracy: 0.9721
Epoch 80/1000 - Loss: 0.1170, Val Loss: 0.1250, Accuracy: 0.9626, Val Accuracy: 0.9713
Epoch 90/1000 - Loss: 0.1117, Val Loss: 0.2043, Accuracy: 0.9645, Val Accuracy: 0.9611
Epoch 100/1000 - Loss: 0.1107, Val Loss: 0.0993, Accuracy: 0.9642, Val Accuracy: 0.9729
Epoch 110/1000 - Loss: 0.1082, Val Loss: 0.4132, Accuracy: 0.9651, Val Accuracy: 0.9448
Epoch 120/1000 - Loss: 0.1067, Val Loss: 

KeyboardInterrupt: 

# Save DNN Model

In [30]:
# Save model 
import pickle
# Save the model to a file
with open(OutputFiles.get('model'), 'wb') as model_file:
    pickle.dump(knn_best_model, model_file)