# Preparation

## Downloading the dataset from Google Drive [https://drive.google.com/drive/folders/1-vrY2Ln9XaTvRi3LqhEIsWBvcMdHNWp9?usp=sharing]

In [None]:
from google.colab import drive

# Mount Google Drive under /content/gdrive; force_remount re-mounts even if
# a mount is already present.
drive.mount('/content/gdrive/', force_remount=True)

from pathlib import Path

import os


# Use the absolute path of the mounted drive: the relative 'gdrive/MyDrive'
# only resolves while the working directory is /content, so re-running the
# cell after a successful chdir would raise FileNotFoundError.
os.chdir('/content/gdrive/MyDrive')

In [None]:
# Absolute path into the drive mounted at /content/gdrive/: a relative
# 'gdrive/MyDrive' fails with FileNotFoundError once the working directory is
# already inside the drive, which is the case after the previous cell ran.
os.chdir('/content/gdrive/MyDrive')
os.getcwd()

### Defining preprocessing functions

In [None]:
import glob
import os
from itertools import chain

import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import as_strided
from scipy.io import arff
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras import backend as K

# Fix the seed of NumPy's global random number generator.
np.random.seed(42)

def splitting_data(
    df, df_target=None, take_time_stamps=124, overlap=62, zero_padding=True, predict_n_future_timesteps=-1, **kwargs
):
    """Cut every row of ``df`` into fixed-size, overlapping windows.

    Args:
        df: DataFrame whose rows are individual time series.
        df_target: optional DataFrame holding one target row per row of ``df``.
        take_time_stamps: window length.
        overlap: number of samples shared by two consecutive windows.
        zero_padding: when true, one extra zero-padded window built from the
            tail of each row is appended after the full windows.
        predict_n_future_timesteps: ``-1`` or ``0`` disables future
            prediction; any other value ``p`` additionally produces windows
            taken from the rows shifted by ``p`` samples.
        **kwargs: accepted for call compatibility and ignored.

    Returns:
        * ``(None, None)`` (after printing a message) when the rows are
          shorter than ``take_time_stamps``;
        * ``(list_of_window_arrays, None)`` when ``df_target`` is ``None``;
        * ``(data, data_target)`` when future prediction is disabled, where
          ``data`` is a 2-D array with one window per row and ``data_target``
          is a flat list with the row target repeated for each window;
        * ``(data, data_y, data_target)`` when future prediction is enabled,
          ``data_y`` holding the shifted windows in the shape of ``data``.
    """
    predict_future = predict_n_future_timesteps not in (-1, 0)
    window_step = take_time_stamps - overlap

    def _make_windows(row):
        # windowed_view takes exactly (arr, window, overlap); passing any
        # further argument raises TypeError.
        windows = windowed_view(row, take_time_stamps, overlap)
        if zero_padding:
            padded_tail = add_zero_padding(row, take_time_stamps, overlap)
            return np.append(windows, padded_tail, axis=0)
        return windows

    def _count_windows(row):
        # Must mirror the number of rows produced by _make_windows.
        full_windows = (row.shape[-1] - overlap) // window_step
        return full_windows + 1 if zero_padding else full_windows

    vals = df.values
    if vals.shape[1] < take_time_stamps:
        print("Not enough samples")
        return None, None

    if df_target is None:
        return [_make_windows(row) for row in vals], None

    # When predicting the future, the last p samples are reserved for the
    # shifted series, so the inputs are built from the trimmed rows.
    inputs = vals[:, :-predict_n_future_timesteps] if predict_future else vals

    per_row_windows = []
    per_row_targets = []
    for row, target in zip(inputs, df_target.values):
        per_row_windows.append(_make_windows(row))
        per_row_targets.append(np.array(list(target) * _count_windows(row)))

    data = np.array(per_row_windows)
    data = data.reshape(data.shape[0] * data.shape[1], data.shape[-1])
    data_target = list(chain(*per_row_targets))
    assert data.shape[0] == len(
        data_target
    ), "Target and data rows are different size!"
    if not predict_future:
        return data, data_target

    # Windows over the rows shifted by p samples, aligned one-to-one with data.
    shifted = vals[:, predict_n_future_timesteps:]
    data_y = np.array([_make_windows(row) for row in shifted]).reshape(data.shape)
    # Sanity check on the first and last window: the shifted window must
    # overlap the input window with an offset of p samples.
    for idx in (0, -1):
        assert np.all(
            data_y[idx][:-predict_n_future_timesteps] == data[idx][predict_n_future_timesteps:]
        ), "Y Data is not equal!"
    return data, data_y, data_target


def windowed_view(arr, window, overlap):
    """Return overlapping windows taken along the last axis of ``arr``.

    The result is a strided view: it shares memory with ``arr`` and
    consecutive windows alias the same elements, so copy it before writing.

    Args:
        arr: array-like; windows are taken along its last axis.
        window: number of samples per window.
        overlap: number of samples shared by two consecutive windows; must be
            smaller than ``window``.

    Returns:
        Array of shape ``arr.shape[:-1] + (n_windows, window)``. Trailing
        samples that do not fill a whole window are dropped, and an input
        shorter than one window yields zero windows.

    Raises:
        ValueError: if ``overlap >= window`` (the step between windows would
            be zero or negative).
    """
    arr = np.asarray(arr)
    window_step = window - overlap
    if window_step <= 0:
        # A non-positive step would divide by zero or stride backwards and
        # read outside the buffer.
        raise ValueError(
            f"overlap ({overlap}) must be smaller than window ({window})"
        )
    # Clamp at zero so inputs shorter than the overlap give an empty result
    # instead of a negative dimension.
    n_windows = max((arr.shape[-1] - overlap) // window_step, 0)
    new_shape = arr.shape[:-1] + (n_windows, window)
    new_strides = arr.strides[:-1] + (window_step * arr.strides[-1],) + arr.strides[-1:]
    return as_strided(arr, shape=new_shape, strides=new_strides)


def add_zero_padding(arr, window, overlap):
    """Build the final, zero-padded window for a 1-D series.

    The window starts where the next window would start after all full
    windows (same window count as ``windowed_view``), contains the remaining
    tail of ``arr`` and is filled with zeros up to ``window`` samples.

    Args:
        arr: 1-D array-like series.
        window: number of samples per window.
        overlap: number of samples shared by two consecutive windows; must be
            smaller than ``window``.

    Returns:
        Array of shape ``(1, window)`` with the dtype of ``arr``.

    Raises:
        ValueError: if ``arr`` is not 1-D or ``overlap >= window``.
    """
    arr = np.asarray(arr)
    if arr.ndim != 1:
        raise ValueError(f"expected a 1-D series, got {arr.ndim} dimensions")
    window_step = window - overlap
    if window_step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than window ({window})"
        )
    # Number of full windows; clamped so a series shorter than the overlap
    # is padded from its first sample.
    number_of_els = max((arr.shape[-1] - overlap) // window_step, 0)
    tail = arr[number_of_els * window_step:]
    # The tail is always shorter than one window, so it fits at the front of
    # the zero-filled row.
    padded_arr = np.zeros((1, window), dtype=arr.dtype)
    padded_arr[0, : tail.shape[0]] = tail
    return padded_arr


def load_preprocessed_datasets_and_processe(
    main_data_folder,
    exclude_dataset_for_testing,
    save_result_folder=None,
    windows_size=128,
    overlap=64,
    zero_padding=False,
    predict_n_future_timesteps=-1,
):
    """Window every dataset found in ``main_data_folder``.

    Each sub-folder is expected to contain ``train.csv``, ``train_target.csv``,
    ``test.csv`` and ``test_target.csv``. Every dataset is cut into windows
    with ``splitting_data``.

    Args:
        main_data_folder: folder with one sub-folder per dataset.
        exclude_dataset_for_testing: name of a dataset folder that is skipped.
        save_result_folder: when given, the windowed data is written as CSV to
            ``{save_result_folder}/w_{windows_size}_o_{overlap}_p_{0|1}/{dataset}``
            and the returned data dictionaries stay empty; otherwise the
            windowed data is returned in the dictionaries.
        windows_size: window length.
        overlap: overlap between consecutive windows.
        zero_padding: whether a zero-padded tail window is added.
        predict_n_future_timesteps: ``-1`` or ``0`` disables the shifted
            ("future") windows; otherwise they are produced and, when saving,
            written to ``train_y.csv`` / ``test_y.csv``.

    Returns:
        ``(train_data_dict, test_data_dict, train_target_dict,
        test_target_dict, exceptions)``. ``exceptions`` maps a dataset name to
        the message or exception that caused it to be skipped.
    """
    train_data_dict = {}
    test_data_dict = {}
    train_target_dict = {}
    test_target_dict = {}
    exceptions = {}
    predict_future_param_is_set = predict_n_future_timesteps not in (-1, 0)
    additional_data_params = (
        {'predict_n_future_timesteps': predict_n_future_timesteps} if predict_future_param_is_set else {}
    )
    # Rows must hold at least one window (plus the shifted samples when the
    # future-prediction mode is enabled).
    min_row_length = windows_size
    if predict_future_param_is_set:
        min_row_length = max(windows_size, windows_size + predict_n_future_timesteps)

    data_folders = os.listdir(main_data_folder)
    print(f"Total datasets {len(data_folders)}")
    for f in data_folders:
        try:
            dataset_dir = f"{main_data_folder}/{f}"
            test_df = pd.read_csv(f"{dataset_dir}/test.csv")
            target_test_df = pd.read_csv(f"{dataset_dir}/test_target.csv")
            if test_df.shape[1] < min_row_length:
                exceptions[
                    f
                ] = f"Not enough samples in row, found {test_df.shape[1]}, expected (window size) {windows_size}"
                continue
            if f == exclude_dataset_for_testing:
                # The excluded dataset is skipped entirely.
                continue

            train_df = pd.read_csv(f"{dataset_dir}/train.csv")
            target_train_df = pd.read_csv(f"{dataset_dir}/train_target.csv")
            if train_df.shape[1] < min_row_length:
                # Checked before anything is written so that a dataset is
                # never left half-saved.
                exceptions[
                    f
                ] = f"Not enough samples in row, found {train_df.shape[1]}, expected (window size) {windows_size}"
                continue

            splitted_train_data = splitting_data(
                train_df,
                target_train_df,
                take_time_stamps=windows_size,
                overlap=overlap,
                zero_padding=zero_padding,
                **additional_data_params,
            )
            splitted_test_data = splitting_data(
                test_df,
                target_test_df,
                take_time_stamps=windows_size,
                overlap=overlap,
                zero_padding=zero_padding,
                **additional_data_params,
            )
            if predict_future_param_is_set:
                splitted_train, splitted_train_y, splitted_train_target = splitted_train_data
                splitted_test, splitted_test_y, splitted_test_target = splitted_test_data
            else:
                splitted_train, splitted_train_target = splitted_train_data
                splitted_test, splitted_test_target = splitted_test_data

            if save_result_folder:
                dataset_folder = (
                    f"{save_result_folder}/w_{windows_size}_o_{overlap}_p_{int(zero_padding)}/{f}"
                )
                # makedirs creates every missing level, including a nested
                # save_result_folder, and tolerates existing folders.
                os.makedirs(dataset_folder, exist_ok=True)

                csv_outputs = [
                    ("test.csv", splitted_test),
                    ("test_target.csv", splitted_test_target),
                ]
                if len(splitted_train):
                    csv_outputs.append(("train.csv", splitted_train))
                    csv_outputs.append(("train_target.csv", splitted_train_target))
                if predict_future_param_is_set:
                    csv_outputs.append(("train_y.csv", splitted_train_y))
                    csv_outputs.append(("test_y.csv", splitted_test_y))
                for file_name, payload in csv_outputs:
                    pd.DataFrame(payload).to_csv(
                        f"{dataset_folder}/{file_name}", index=False, encoding="utf-8"
                    )
            else:
                test_target_dict[f] = splitted_test_target
                test_data_dict[f] = splitted_test
                if len(splitted_train):
                    train_target_dict[f] = splitted_train_target
                    train_data_dict[f] = splitted_train

        except Exception as e:
            # Best effort: one broken dataset must not abort the whole run.
            print(f"Error with {f}: {e}")
            exceptions[f] = e
    return (
        train_data_dict,
        test_data_dict,
        train_target_dict,
        test_target_dict,
        exceptions,
    )

### Data loading, model training functions

#### create_CNN_architecture - creates autoencoder CNN
#### load_data - loads data
#### train_model - trains model with selected hyperparameters
#### load_model - for model retraining
#### save_model_data - saves model data 
#### save_detailed_data - saves info about hyperparameters

In [None]:
import time
import numpy as np
import os
from keras import layers, models, callbacks, regularizers, optimizers

# from keras.layers import advanced_activations
from contextlib import redirect_stdout
import absl.logging
# Limit absl log output to messages of ERROR severity.
absl.logging.set_verbosity(absl.logging.ERROR)


def create_CNN_architecture(
    window_size,
    number_of_layers_in_encoder,
    encoder_filters,
    activation_functions,
    kernel_sizes,
    batch_normalizations,
    max_poolings,
    max_pooling_size=2,
    allowed_bottleneck_sizes=(16, 24, 32),
    **kwargs,
):
    """Build a 1-D convolutional autoencoder.

    The encoder stacks ``number_of_layers_in_encoder`` Conv1D layers (each
    optionally followed by max pooling and batch normalization), followed by
    a ``Dense(1)`` + batch-normalization bottleneck named ``"embedding"``.
    The decoder mirrors the encoder layers in reverse order with
    Conv1DTranspose / UpSampling1D and ends in a single-filter
    Conv1DTranspose.

    Args:
        window_size: number of time steps of the (single-channel) input.
        number_of_layers_in_encoder: number of encoder (and decoder) layers.
        encoder_filters: filters per encoder layer.
        activation_functions: activation per encoder layer.
        kernel_sizes: kernel size per encoder layer.
        batch_normalizations: per layer, whether to add batch normalization.
        max_poolings: per layer, whether to add max pooling (encoder) and the
            matching up-sampling (decoder).
        max_pooling_size: pooling / up-sampling factor.
        allowed_bottleneck_sizes: accepted lengths of the bottleneck.
        **kwargs: extra keys are accepted and ignored.

    Returns:
        ``(autoencoder, bottleneck_shape)``: the Keras model and the length of
        the bottleneck along the time axis.

    Raises:
        ValueError: if the bottleneck length is not in
            ``allowed_bottleneck_sizes``.
    """
    TIMESTEPS = window_size
    num_inputs = 1
    input_placeholder = layers.Input(shape=[TIMESTEPS, num_inputs])

    encoded = input_placeholder
    for i in range(number_of_layers_in_encoder):
        encoded = layers.Conv1D(
            encoder_filters[i],
            kernel_size=kernel_sizes[i],
            padding="same",
            activation=activation_functions[i],
        )(encoded)
        if max_poolings[i]:
            encoded = layers.MaxPool1D(max_pooling_size)(encoded)
        if batch_normalizations[i]:
            encoded = layers.BatchNormalization()(encoded)

    # bottleneck
    encoded = layers.Dense(1, activation="relu")(encoded)
    encoded = layers.BatchNormalization(name="embedding")(encoded)
    bottleneck_shape = list(encoded.shape)[1]
    if bottleneck_shape not in allowed_bottleneck_sizes:
        raise ValueError(f"Wrong bottleneck shape: {bottleneck_shape}")

    decoded = encoded
    for i in reversed(range(number_of_layers_in_encoder)):
        decoded = layers.Conv1DTranspose(
            encoder_filters[i],
            kernel_size=kernel_sizes[i],
            padding="same",
            activation=activation_functions[i],
        )(decoded)
        if batch_normalizations[i]:
            decoded = layers.BatchNormalization()(decoded)
        # Up-sample exactly where the mirrored encoder layer pooled; using the
        # flag of the last encoder layer for every decoder layer would break
        # the output length when the flags differ between layers.
        if max_poolings[i]:
            decoded = layers.UpSampling1D(max_pooling_size)(decoded)

    # The output layer uses the kernel size of the first encoder layer (the
    # last one visited by the reversed decoder loop).
    decoded = layers.Conv1DTranspose(
        filters=1, kernel_size=kernel_sizes[0], padding="same"
    )(decoded)

    autoencoder = models.Model(inputs=input_placeholder, outputs=decoded)
    return autoencoder, bottleneck_shape


import pandas as pd
import numpy as np


def load_data(main_data_folder, exclude_dataset_for_testing, predict_n_future_steps=False):
    """Load and concatenate the windowed CSV files of all datasets.

    Every sub-folder of ``main_data_folder`` except
    ``exclude_dataset_for_testing`` is read. A dataset that cannot be read
    completely is skipped as a whole and recorded in ``exceptions`` so that
    the returned frames stay aligned row by row.

    Args:
        main_data_folder: folder with one sub-folder per dataset, each holding
            ``train.csv`` and ``test.csv`` (plus ``train_y.csv`` and
            ``test_y.csv`` when ``predict_n_future_steps`` is truthy).
        exclude_dataset_for_testing: name of the dataset folder to skip.
        predict_n_future_steps: when truthy, the shifted target windows are
            loaded as well.

    Returns:
        ``(train_data_df, test_data_df, exceptions)`` or, when
        ``predict_n_future_steps`` is truthy,
        ``(train_data_df, test_data_df, train_y_df, test_y_df, exceptions)``.
        ``exceptions`` maps a dataset name to the exception raised while
        reading it.
    """
    data_folders = os.listdir(main_data_folder)
    print(f"Total datasets {len(data_folders)}")

    train_frames = []
    test_frames = []
    train_y_frames = []
    test_y_frames = []
    exceptions = {}
    for f in data_folders:
        if f == exclude_dataset_for_testing:
            continue
        dataset_dir = f"{main_data_folder}/{f}"
        try:
            # Read everything first; only append once all files of the
            # dataset were read, so a failure cannot leave partial data.
            test_df = pd.read_csv(f"{dataset_dir}/test.csv")
            train_df = pd.read_csv(f"{dataset_dir}/train.csv")
            if predict_n_future_steps:
                train_y_df = pd.read_csv(f"{dataset_dir}/train_y.csv")
                test_y_df = pd.read_csv(f"{dataset_dir}/test_y.csv")
        except Exception as e:
            exceptions[f] = e
            continue
        train_frames.append(train_df)
        test_frames.append(test_df)
        if predict_n_future_steps:
            train_y_frames.append(train_y_df)
            test_y_frames.append(test_y_df)

    def _concat(frames):
        # pd.concat rejects an empty list; fall back to an empty frame.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    train_data_df = _concat(train_frames)
    test_data_df = _concat(test_frames)
    if not predict_n_future_steps:
        return train_data_df, test_data_df, exceptions
    return (
        train_data_df,
        test_data_df,
        _concat(train_y_frames),
        _concat(test_y_frames),
        exceptions,
    )
        

def compile_model(model, optimizer, loss="mse"):
    """Compile ``model`` with the given optimizer and loss and return it.

    Args:
        model: object exposing ``compile(optimizer=..., loss=...)``.
        optimizer: optimizer handed to ``model.compile``.
        loss: loss handed to ``model.compile``.

    Returns:
        The same ``model`` object that was passed in.
    """
    compile_options = {"optimizer": optimizer, "loss": loss}
    model.compile(**compile_options)
    return model



def train_model(
    model,
    model_name,
    train_data,
    test_data,
    main_model_folder,
    epochs=100,
    batch_size=32,
    use_early_stopping=True,
    train_data_shifted=None,
    test_data_shifted=None
):
    """Fit ``model`` and checkpoint it after every epoch.

    Checkpoints are written below
    ``{main_model_folder}/{model_name}/callbacks``: one file per epoch plus
    ``best.tf`` (saved with ``save_best_only=True``).

    Args:
        model: compiled Keras model.
        model_name: sub-folder name used for the checkpoints.
        train_data: training inputs.
        test_data: validation inputs.
        main_model_folder: root folder for all models.
        epochs: number of epochs passed to ``fit``.
        batch_size: batch size.
        use_early_stopping: when true, training stops after 10 epochs without
            ``val_loss`` improvement.
        train_data_shifted: optional training targets; when ``None`` the
            inputs are used as targets.
        test_data_shifted: optional validation targets; when ``None`` the
            validation inputs are used as targets.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    train_data_y = train_data_shifted if train_data_shifted is not None else train_data
    test_data_y = test_data_shifted if test_data_shifted is not None else test_data

    checkpoint_dir = f"{main_model_folder}/" + model_name + "/callbacks"
    training_callbacks = [
        # "{epoch...}" / "{val_loss...}" are left unformatted on purpose:
        # ModelCheckpoint fills them in for every epoch.
        callbacks.ModelCheckpoint(
            checkpoint_dir + "/epoch{epoch:02d}-loss{val_loss:.3f}.tf"
        ),
        callbacks.ModelCheckpoint(
            checkpoint_dir + "/best.tf",
            save_best_only=True,
        ),
    ]
    if use_early_stopping:
        training_callbacks.append(
            callbacks.EarlyStopping(
                monitor='val_loss',
                min_delta=0,
                patience=10,
                verbose=1,
                mode='auto',
            )
        )

    # One fit call for both modes, so the shifted targets are honoured whether
    # or not early stopping is enabled.
    history = model.fit(
        train_data,
        train_data_y,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(test_data, test_data_y),
        callbacks=training_callbacks,
        verbose=1,
    )
    return history

def load_model(model_path):
    """Load the ``best.tf`` checkpoint stored inside ``model_path``.

    Args:
        model_path: folder that contains the ``best.tf`` checkpoint.

    Returns:
        The model returned by ``tf.keras.models.load_model``.
    """
    # NOTE(review): learning phase 0 presumably selects inference mode, while
    # the notebook goes on to retrain the loaded model - confirm this call is
    # intended (and still available in the installed Keras version).
    K.set_learning_phase(0)
    best_checkpoint = f'{model_path}/best.tf'
    return tf.keras.models.load_model(best_checkpoint)
    

def save_model_data(model, history, main_model_folder):
    """Write the model structure, its summary and the training history.

    Three files are created inside ``main_model_folder`` (the folder and any
    missing parents are created first):

    * ``model_structure.json`` - the output of ``model.to_json()``;
    * ``model_summary.txt`` - whatever ``model.summary()`` prints to stdout;
    * ``history.csv`` - ``history.history`` as a table.

    Args:
        model: model exposing ``to_json()`` and ``summary()``.
        history: object whose ``history`` attribute is a dict of per-epoch
            metric lists.
        main_model_folder: destination folder.
    """
    # makedirs also creates missing parent folders and is a no-op when the
    # folder already exists.
    os.makedirs(main_model_folder, exist_ok=True)

    with open(f'{main_model_folder}' + '/model_structure.json', mode='w', encoding='utf-8') as ofile:
        ofile.write(model.to_json())

    # The summary is printed, not returned, so stdout is redirected to a file.
    with open(f"{main_model_folder}/model_summary.txt", "w", encoding="utf-8") as f:
        with redirect_stdout(f):
            model.summary()

    pd.DataFrame.from_dict(history.history).to_csv(f"{main_model_folder}/history.csv")
    

def save_detailed_data(main_model_folder, **kwargs):
    """Store the given keyword arguments in ``details.csv``.

    Every keyword becomes one CSV row: the keyword name is the row label and
    ``str(value)`` is the single cell of that row.

    Args:
        main_model_folder: existing folder that receives ``details.csv``.
        **kwargs: arbitrary settings to record.
    """
    single_row = {str(name): [str(value)] for name, value in kwargs.items()}
    # Transpose so that each setting ends up on its own row.
    details = pd.DataFrame(single_row).T
    details.to_csv(f'{main_model_folder}/details.csv')

## Preparing data (if needed)

In [None]:
# Windowing parameters shared with the later cells of the notebook.
# NOTE(review): the data-loading cell further down excludes "InsectSound",
# not "InsectWingbeat" - confirm that excluding different datasets in the two
# steps is intended.
exclude_dataset_for_testing = "InsectWingbeat"
windows_size = 128
overlap = 64
zero_padding = False
# -1 disables the shifted ("future") target windows.
predict_n_future_timesteps = -1

# Because save_result_folder is given, the windowed data is written to CSV
# files below 'fully_processed_data' and the four returned data dictionaries
# stay empty; only `exceptions` is filled.
(
    train_data_dict,
    test_data_dict,
    train_target_dict,
    test_target_dict,
    exceptions,
) = load_preprocessed_datasets_and_processe(
    "processed_datasets", exclude_dataset_for_testing,
    save_result_folder='fully_processed_data', windows_size=windows_size, overlap=overlap, zero_padding=zero_padding,
    predict_n_future_timesteps=predict_n_future_timesteps
)

# Defining model architecture, loading data

In [None]:
# Name used for the model's output folder and checkpoints.
unique_model_name = 'model_x_3'

# Keyword arguments for create_CNN_architecture. "name" and "input" are not
# named parameters of that function; they are absorbed by its **kwargs and
# only serve as bookkeeping.
model_arch = {
    "name": unique_model_name,
    "window_size": 128,
    "number_of_layers_in_encoder": 2,
    "input": 128,
    "encoder_filters": [128, 64],
    "kernel_sizes": [13, 7],
    "activation_functions": ["relu"] * 2,
    "batch_normalizations": [False] * 2,
    "max_poolings": [True] * 2,
}
# `emb` is the bottleneck length returned by create_CNN_architecture.
model, emb = create_CNN_architecture(**model_arch)
#save_detailed_data

In [None]:
# Print the layer-by-layer summary of the autoencoder built above.
model.summary()

In [None]:
# Collects the run settings that are later written out by save_detailed_data.
info = {}

### Loading data

In [None]:
# Set parameters if they are not already set (the preprocessing cell is
# optional). globals().get replaces the bare `except:` blocks, which would
# also have hidden unrelated errors.
windows_size = globals().get('windows_size', 128)
overlap = globals().get('overlap', 64)
zero_padding = globals().get('zero_padding', False)
predict_n_future_timesteps = globals().get('predict_n_future_timesteps', -1)

info['windows_size'] = windows_size
info['overlap'] = overlap
info['zero_padding'] = zero_padding
info['predict_n_future_timesteps'] = predict_n_future_timesteps

# NOTE(review): the preprocessing cell excludes "InsectWingbeat" while this
# cell excludes "InsectSound" - confirm which dataset is meant to be held out.
exclude_dataset_for_testing = "InsectSound"
folder_name = f"fully_processed_data/w_{windows_size}_o_{overlap}_p_{int(zero_padding)}"
train_data_df, test_data_df, exceptions = load_data(
    folder_name, exclude_dataset_for_testing
)
# Add a trailing channel axis: (samples, timesteps) -> (samples, timesteps, 1),
# matching the single-channel input of the autoencoder.
train_data = train_data_df.values
train_data = train_data.reshape(train_data.shape[0], train_data.shape[1], 1)
test_data = test_data_df.values
test_data = test_data.reshape(test_data.shape[0], test_data.shape[1], 1)
main_model_folder = 'trained_models'

In [None]:
# Training hyperparameters.
main_model_folder = 'trained_models'
BATCH_SIZE = 32
learning_rate = .00001
opt = optimizers.Adam(learning_rate=learning_rate)
epochs = 100
loss = 'mse'

# Record the settings so they can be written out by save_detailed_data.
info['batch_size'] = BATCH_SIZE
info['learning_rate']  = learning_rate
info['opt'] =  opt
info['epochs'] = epochs

# dict.update merges the architecture description into the run info
# (dict has no `updated` method).
info.update(model_arch)

# Model training

In [None]:
k = model_arch['name']
# showing model info
print(f'INFO:')
# Separate loop names: reusing `k` here would overwrite the model name with
# the last key of model_arch and send all outputs to the wrong folder.
for arch_key, arch_value in model_arch.items():
    print(f'{arch_key}: {arch_value}')

model.compile(optimizer=opt, loss=loss)
folder_name = f'{main_model_folder}/{k}'
history = train_model(model, k, train_data, test_data, main_model_folder, epochs=epochs)
save_model_data(model, history, folder_name)
save_detailed_data(folder_name, **info)
# The autoencoder is trained to reconstruct its input (see train_model), so
# the inputs are passed as targets for the evaluation as well.
re = model.evaluate(test_data, test_data)
hist_df = pd.DataFrame.from_dict(history.history)
# Row of the epoch with the lowest validation loss.
lowest_test_val_loss = hist_df.iloc[hist_df['val_loss'].argmin()]
print(f'Model {k} results {re}')
print(f"Val loss: train {lowest_test_val_loss['loss']} test: {lowest_test_val_loss['val_loss']}")
print("========================= Finished training model =========================")
print('\n')

### Retraining loaded model

In [None]:
# Placeholders: set model_path to the folder that contains best.tf and k to
# the name of the model being retrained.
model_path = ""
model = load_model(model_path)
k = 'model_name'

# NOTE(review): train_model writes its checkpoints below
# f'{main_model_folder}/{k}/callbacks', while the summary and history go to
# the '_retrained' folder - confirm that overwriting the original checkpoints
# is intended.
folder_name = f'{main_model_folder}/{k}_retrained'
history = train_model(model, k, train_data, test_data, main_model_folder, epochs=epochs)
save_model_data(model, history, folder_name)
# The autoencoder is trained to reconstruct its input (see train_model), so
# the inputs are passed as targets for the evaluation as well.
re = model.evaluate(test_data, test_data)
hist_df = pd.DataFrame.from_dict(history.history)
# Row of the epoch with the lowest validation loss.
lowest_test_val_loss = hist_df.iloc[hist_df['val_loss'].argmin()]
print(f'Model {k} results {re}')
print(f"Val loss: train {lowest_test_val_loss['loss']} test: {lowest_test_val_loss['val_loss']}")
print("========================= Finished training model =========================")
print('\n')