In [1]:
import os
from pathlib import Path

from google.colab import drive

# Mount Google Drive into the Colab filesystem (remount so a stale mount is refreshed).
drive.mount('/content/gdrive/', force_remount=True)

# Use an absolute path: the original relative 'gdrive/MyDrive' only resolves while the
# working directory is still /content, so re-running this cell after the first chdir failed.
os.chdir('/content/gdrive/MyDrive')

Mounted at /content/gdrive/


In [2]:
import glob
import os
from itertools import chain

import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import as_strided
from scipy.io import arff
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Seed NumPy's legacy global random generator so NumPy-based randomness is repeatable.
np.random.seed(42)

def splitting_data(
    df, df_target=None, take_time_stamps=124, overlap=62, zero_padding=True
):
    """Cut every row of ``df`` into overlapping fixed-length windows.

    Each row of ``df.values`` is handed to ``windowed_view``; when
    ``zero_padding`` is true, the zero-padded tail built by
    ``add_zero_padding`` is appended as one additional window.

    Args:
        df: DataFrame whose rows are the series to split.
        df_target: Optional DataFrame with one target row per row of ``df``.
            Every target row is repeated once per window created from the
            matching data row.
        take_time_stamps: Window length.
        overlap: Number of values shared by two consecutive windows.
        zero_padding: Whether to append the zero-padded tail window.

    Returns:
        ``(None, None)`` (after printing a message) when ``df`` has fewer
        than ``take_time_stamps`` columns. Without ``df_target``:
        ``(windows, None)`` where ``windows`` is a list with one 2-D array per
        row. With ``df_target``: ``(data, targets)`` where ``data`` is a 2-D
        array of all windows stacked row after row and ``targets`` is a flat
        list of the repeated target values.

    Raises:
        AssertionError: If the number of windows and the number of repeated
            target values differ.
    """

    def _row_windows(series):
        # Full windows for one row, optionally followed by the padded tail.
        chunks = windowed_view(series, take_time_stamps, overlap)
        if not zero_padding:
            return chunks
        tail = add_zero_padding(series, take_time_stamps, overlap)
        return np.append(chunks, tail, axis=0)

    def _window_count(series):
        # Number of windows _row_windows produces for this row.
        hop = take_time_stamps - overlap
        full = (series.shape[-1] - overlap) // hop
        return full + 1 if zero_padding else full

    rows = df.values
    if rows.shape[1] < take_time_stamps:
        print("Not enough samples")
        return None, None

    if df_target is None:
        return [_row_windows(series) for series in rows], None

    pairs = [
        [_row_windows(series), np.array(list(label) * _window_count(series))]
        for series, label in zip(rows, df_target.values)
    ]
    data, data_target = zip(*pairs)
    data = np.array(data)
    stacked_shape = data.shape
    # Collapse (row, window, value) into (row * window, value).
    data = data.reshape(stacked_shape[0] * stacked_shape[1], stacked_shape[-1])
    data_target = list(chain(*data_target))
    assert (
        data.shape[0] == len(data_target)
    ), "Target and data rows are different size!"
    return data, data_target


def windowed_view(arr, window, overlap):
    """Return overlapping windows over the last axis of ``arr`` as a strided view.

    Args:
        arr: Array-like input; converted with ``np.asarray``.
        window: Length of each window.
        overlap: Number of values shared by two consecutive windows.

    Returns:
        An ``as_strided`` view whose shape is the leading axes of ``arr``
        followed by ``(number_of_windows, window)``. Values that do not fill
        a complete final window are not included.
    """
    source = np.asarray(arr)
    hop = window - overlap
    count = (source.shape[-1] - overlap) // hop
    item_stride = source.strides[-1:]
    view_shape = source.shape[:-1] + (count, window)
    # Moving one window forward advances ``hop`` elements along the last axis.
    view_strides = source.strides[:-1] + (hop * source.strides[-1],) + item_stride
    return as_strided(source, shape=view_shape, strides=view_strides)


def add_zero_padding(arr, window, overlap):
    """Build the final, zero-padded window for a 1-D series.

    The tail starts where the next window after the last complete one would
    begin, and is filled with zeros up to ``window`` values.

    Args:
        arr: 1-D array holding the series.
        window: Length of each window.
        overlap: Number of values shared by two consecutive windows.

    Returns:
        Array of shape ``(1, window)`` containing the tail values followed by
        zeros.
    """
    hop = window - overlap
    full_windows = (arr.shape[-1] - overlap) // hop
    tail_start = full_windows * hop
    leftover = len(arr) - tail_start
    filler = [0] * (window - leftover)
    padded = np.array(list(arr[tail_start:]) + filler).reshape(1, window)
    expected_shape = (1, window)
    assert (
        padded.shape == expected_shape
    ), f"Wrong dimensions after zero padding, expected (1, {window}), got {padded.shape}"
    return padded


def load_preprocessed_datasets_and_processe(
    main_data_folder,
    exclude_dataset_for_testing,
    save_result_folder=None,
    windows_size=128,
    overlap=64,
    zero_padding=False,
):
    """Window every dataset under ``main_data_folder`` and save or return the result.

    Each sub-folder is expected to contain ``test.csv`` and ``test_target.csv``
    and, unless it is the excluded dataset, ``train.csv`` and
    ``train_target.csv``. The rows are split with ``splitting_data``.

    Args:
        main_data_folder: Folder containing one sub-folder per dataset.
        exclude_dataset_for_testing: Dataset name whose training split is
            skipped; only its test split is processed.
        save_result_folder: When truthy, results are written as CSV files to
            ``{save_result_folder}/w_{windows_size}_o_{overlap}_p_{0|1}/{dataset}/``
            and the returned data dictionaries stay empty. Otherwise the
            results are collected in the returned dictionaries.
        windows_size: Window length passed to ``splitting_data``.
        overlap: Overlap passed to ``splitting_data``.
        zero_padding: Whether ``splitting_data`` appends a zero-padded tail.

    Returns:
        ``(train_data_dict, test_data_dict, train_target_dict,
        test_target_dict, exceptions)``. ``exceptions`` maps a dataset name to
        either a message (rows shorter than the window) or the exception that
        was raised while processing it.
    """
    train_data_dict = {}
    test_data_dict = {}
    train_target_dict = {}
    test_target_dict = {}
    exceptions = {}
    data_folders = os.listdir(main_data_folder)
    print(f"Total datasets {len(data_folders)}")
    split_options = dict(
        take_time_stamps=windows_size, overlap=overlap, zero_padding=zero_padding
    )
    for f in data_folders:
        # Best-effort loop: a failing dataset is recorded and the rest still run.
        try:
            source_folder = f"{main_data_folder}/{f}"
            # test.csv is read exactly once (it used to be read a second time
            # in the training branch).
            test_df = pd.read_csv(f"{source_folder}/test.csv")
            target_test_df = pd.read_csv(f"{source_folder}/test_target.csv")
            row_length = test_df.shape[1]
            if row_length < windows_size:
                exceptions[
                    f
                ] = f"Not enough samples in row, found {row_length}, expected (window size) {windows_size}"
                continue

            splitted_train, splitted_train_target = None, None
            if f != exclude_dataset_for_testing:
                train_df = pd.read_csv(f"{source_folder}/train.csv")
                target_train_df = pd.read_csv(f"{source_folder}/train_target.csv")
                splitted_train, splitted_train_target = splitting_data(
                    train_df, target_train_df, **split_options
                )
                if splitted_train is None:
                    # splitting_data rejected the training rows as shorter than
                    # the window. Record that explicitly instead of failing
                    # later with a TypeError from len(None); the test split is
                    # still processed, as before.
                    message = (
                        f"Not enough samples in train row, found {train_df.shape[1]}, "
                        f"expected (window size) {windows_size}"
                    )
                    print(f"Error with {f}: {message}")
                    exceptions[f] = message

            splitted_test, splitted_test_target = splitting_data(
                test_df, target_test_df, **split_options
            )
            has_train = splitted_train is not None and len(splitted_train) > 0

            if save_result_folder:
                dataset_folder = (
                    f"{save_result_folder}"
                    f"/w_{windows_size}_o_{overlap}_p_{int(zero_padding)}"
                    f"/{f}"
                )
                # makedirs creates every missing level at once and tolerates
                # folders that already exist.
                os.makedirs(dataset_folder, exist_ok=True)

                pd.DataFrame(splitted_test).to_csv(
                    f"{dataset_folder}/test.csv", index=False, encoding="utf-8"
                )
                pd.DataFrame(splitted_test_target).to_csv(
                    f"{dataset_folder}/test_target.csv", index=False, encoding="utf-8"
                )
                if has_train:
                    pd.DataFrame(splitted_train).to_csv(
                        f"{dataset_folder}/train.csv", index=False, encoding="utf-8"
                    )
                    pd.DataFrame(splitted_train_target).to_csv(
                        f"{dataset_folder}/train_target.csv",
                        index=False,
                        encoding="utf-8",
                    )
            else:
                test_target_dict[f] = splitted_test_target
                test_data_dict[f] = splitted_test
                if has_train:
                    train_target_dict[f] = splitted_train_target
                    train_data_dict[f] = splitted_train

        except Exception as e:
            print(f"Error with {f}: {e}")
            exceptions[f] = e
    return (
        train_data_dict,
        test_data_dict,
        train_target_dict,
        test_target_dict,
        exceptions,
    )

In [3]:
# Window every dataset in "processed_datasets" (window 128, overlap 32, no zero padding)
# and skip the training split of the excluded dataset.
# Because save_result_folder is set, the results are written as CSV files under
# fully_processed_data/w_128_o_32_p_0/<dataset>/ and the four returned data dicts stay empty;
# `exceptions` lists the datasets that were skipped or failed.
exclude_dataset_for_testing = "InsectWingbeat"
(
    train_data_dict,
    test_data_dict,
    train_target_dict,
    test_target_dict,
    exceptions,
) = load_preprocessed_datasets_and_processe(
    "processed_datasets", exclude_dataset_for_testing,
    save_result_folder='fully_processed_data', windows_size=128, overlap=32
)

Total datasets 120


KeyboardInterrupt: ignored

In [4]:
import time
import numpy as np
import os
from keras import layers, models, callbacks, regularizers, optimizers

# from keras.layers import advanced_activations
from contextlib import redirect_stdout
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)


def create_CNN_architecture(
    window_size,
    number_of_layers_in_encoder,
    encoder_filters,
    activation_functions,
    kernel_sizes,
    batch_normalizations,
    max_poolings,
    max_pooling_size=2,
    allowed_bottleneck_sizes=(16, 24, 32),
    **kwargs,
):
    """Build a 1-D convolutional autoencoder from per-layer settings.

    The encoder stacks ``number_of_layers_in_encoder`` Conv1D layers (each
    optionally followed by max pooling and batch normalization), then a
    ``Dense(1)`` layer and a batch-normalization layer named ``"embedding"``.
    The decoder walks the same settings in reverse order with Conv1DTranspose
    layers and up-sampling, and ends with a single-filter Conv1DTranspose.

    Args:
        window_size: Number of time steps of the single-channel input.
        number_of_layers_in_encoder: Number of convolutional encoder layers;
            must be at least 1.
        encoder_filters: Filter count per encoder layer.
        activation_functions: Activation per encoder layer.
        kernel_sizes: Kernel size per encoder layer.
        batch_normalizations: Per layer, whether to add batch normalization.
        max_poolings: Per layer, whether to add max pooling (and the matching
            up-sampling in the decoder).
        max_pooling_size: Pool size / up-sampling factor.
        allowed_bottleneck_sizes: Accepted numbers of time steps at the
            bottleneck.
        **kwargs: Ignored; lets a configuration dict carry extra keys such as
            ``"name"``.

    Returns:
        ``(autoencoder, bottleneck_shape)`` where ``bottleneck_shape`` is the
        number of time steps at the embedding layer.

    Raises:
        ValueError: If ``number_of_layers_in_encoder`` is below 1, if a
            per-layer list has fewer entries than layers, or if the
            bottleneck size is not in ``allowed_bottleneck_sizes``.
    """
    # Validate up front so a bad configuration fails with a clear message
    # instead of an IndexError/NameError halfway through building the graph.
    if number_of_layers_in_encoder < 1:
        raise ValueError(
            f"number_of_layers_in_encoder must be >= 1, got {number_of_layers_in_encoder}"
        )
    per_layer_settings = {
        "encoder_filters": encoder_filters,
        "activation_functions": activation_functions,
        "kernel_sizes": kernel_sizes,
        "batch_normalizations": batch_normalizations,
        "max_poolings": max_poolings,
    }
    for setting_name, setting_values in per_layer_settings.items():
        if len(setting_values) < number_of_layers_in_encoder:
            raise ValueError(
                f"{setting_name} has {len(setting_values)} entries, "
                f"expected at least {number_of_layers_in_encoder}"
            )

    TIMESTEPS = window_size
    num_inputs = 1
    input_placeholder = layers.Input(shape=[TIMESTEPS, num_inputs])
    encoded = input_placeholder
    for i in range(number_of_layers_in_encoder):
        encoded = layers.Conv1D(
            encoder_filters[i],
            kernel_size=kernel_sizes[i],
            padding="same",
            activation=activation_functions[i],
        )(encoded)
        if max_poolings[i]:
            encoded = layers.MaxPool1D(max_pooling_size)(encoded)
        if batch_normalizations[i]:
            encoded = layers.BatchNormalization()(encoded)
    # bottleneck: Dense(1) leaves one value per remaining time step.
    encoded = layers.Dense(1, activation="relu")(encoded)
    encoded = layers.BatchNormalization(name="embedding")(encoded)
    bottleneck_shape = list(encoded.shape)[1]
    if bottleneck_shape not in allowed_bottleneck_sizes:
        raise ValueError(f"Wrong bottleneck shape: {bottleneck_shape}")

    decoded = encoded

    # Mirror the encoder: same settings, applied from the last layer to the first.
    for i in reversed(range(number_of_layers_in_encoder)):
        decoded = layers.Conv1DTranspose(
            encoder_filters[i],
            kernel_size=kernel_sizes[i],
            padding="same",
            activation=activation_functions[i],
        )(decoded)
        if batch_normalizations[i]:
            decoded = layers.BatchNormalization()(decoded)
        if max_poolings[i]:
            decoded = layers.UpSampling1D(max_pooling_size)(decoded)

    # The output layer reuses the first encoder layer's kernel size. This was
    # previously taken implicitly from the leaked loop variable.
    decoded = layers.Conv1DTranspose(
        filters=1, kernel_size=kernel_sizes[0], padding="same"
    )(decoded)

    autoencoder = models.Model(inputs=input_placeholder, outputs=decoded)
    return autoencoder, bottleneck_shape


import pandas as pd
import numpy as np


def load_data(main_data_folder, exclude_dataset_for_testing):
    """Concatenate the train and test CSVs of every dataset folder.

    Args:
        main_data_folder: Folder containing one sub-folder per dataset, each
            with ``train.csv`` and ``test.csv``.
        exclude_dataset_for_testing: Name of the dataset folder to skip.

    Returns:
        ``(train_data_df, test_data_df, exceptions)``. The frames hold the
        rows of all successfully read datasets (empty frames if none were
        read); ``exceptions`` maps a folder name to the exception raised while
        reading it.

    Raises:
        AssertionError: If the concatenated frames do not contain every row
            that was read.
    """
    data_folders = os.listdir(main_data_folder)
    train_frames = []
    test_frames = []
    exceptions = {}
    print(f"Total datasets {len(data_folders)}")
    for f in data_folders:
        if f == exclude_dataset_for_testing:
            # Skip before touching the disk: the excluded dataset is never used.
            continue
        try:
            test_df = pd.read_csv(f"{main_data_folder}/{f}/test.csv")
            train_df = pd.read_csv(f"{main_data_folder}/{f}/train.csv")
        except Exception as e:
            exceptions[f] = e
            continue
        # Only count a dataset once BOTH files were read. Previously the test
        # rows were counted before train.csv was read, so a missing train.csv
        # made the final consistency check fail.
        train_frames.append(train_df)
        test_frames.append(test_df)

    train_length = sum(len(frame) for frame in train_frames)
    test_length = sum(len(frame) for frame in test_frames)
    # A single concat instead of growing the frame inside the loop.
    train_data_df = (
        pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()
    )
    test_data_df = (
        pd.concat(test_frames, ignore_index=True) if test_frames else pd.DataFrame()
    )
    assert train_length == len(
        train_data_df
    ), "Not all training data was appended to final training set"
    assert test_length == len(
        test_data_df
    ), "Not all testing data was appended to final testing set"
    return train_data_df, test_data_df, exceptions

def compile_model(model, optimizer, loss="mse"):
    """Compile ``model`` with the given optimizer and loss and return the same object."""
    compile_settings = {"optimizer": optimizer, "loss": loss}
    model.compile(**compile_settings)
    return model



def train_model(
    model,
    model_name,
    train_data,
    test_data,
    main_model_folder,
    epochs=100,
    batch_size=32,
):
    """Fit ``model`` as an autoencoder and return the Keras training history.

    The inputs double as targets for both training and validation. Three
    callbacks are attached: a checkpoint written for every epoch, a
    ``best.tf`` checkpoint created with ``save_best_only=True``, and early
    stopping on ``val_loss`` with a patience of 10 epochs. Checkpoints go to
    ``{main_model_folder}/{model_name}/callbacks``.

    Args:
        model: Compiled Keras model.
        model_name: Sub-folder name used for this model's checkpoints.
        train_data: Training inputs (also used as training targets).
        test_data: Validation inputs (also used as validation targets).
        main_model_folder: Root folder for all trained models.
        epochs: Maximum number of epochs.
        batch_size: Batch size passed to ``fit``.

    Returns:
        The object returned by ``model.fit``.
    """
    checkpoint_dir = f"{main_model_folder}/" + model_name + "/callbacks"

    # The placeholders in this template are filled in by Keras, not by Python.
    per_epoch_checkpoint = callbacks.ModelCheckpoint(
        checkpoint_dir + "/epoch{epoch:02d}-loss{val_loss:.3f}.tf"
    )
    best_checkpoint = callbacks.ModelCheckpoint(
        checkpoint_dir + "/best.tf",
        save_best_only=True,
    )
    early_stopping = callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=10,
        verbose=1,
        mode="auto",
    )

    return model.fit(
        train_data,
        train_data,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(test_data, test_data),
        callbacks=[per_epoch_checkpoint, best_checkpoint, early_stopping],
        verbose=1,
    )


def save_model_data(model, history, main_model_folder):
    """Write a model's structure, summary and training history to a folder.

    Creates ``main_model_folder`` (including missing parent folders) and
    writes three files into it:

    * ``model_structure.json`` - the output of ``model.to_json()``
    * ``model_summary.txt`` - whatever ``model.summary()`` prints
    * ``history.csv`` - ``history.history`` as a table

    Args:
        model: Object providing ``to_json()`` and ``summary()``.
        history: Object whose ``history`` attribute is a dict of metric lists.
        main_model_folder: Destination folder.
    """
    # makedirs handles nested paths and an already existing folder, which the
    # previous exists()/mkdir() pair did not.
    os.makedirs(main_model_folder, exist_ok=True)

    with open(f"{main_model_folder}/model_structure.json", mode="w") as ofile:
        ofile.write(model.to_json())

    # model.summary() prints to stdout, so capture that output into the file.
    with open(f"{main_model_folder}/model_summary.txt", "w") as summary_file:
        with redirect_stdout(summary_file):
            model.summary()

    pd.DataFrame.from_dict(history.history).to_csv(
        f"{main_model_folder}/history.csv"
    )


In [5]:
import time
import numpy as np
import os
from keras import layers, models, callbacks, regularizers, optimizers
#from keras.layers import advanced_activations


# Hand-written autoencoder graph: single-channel input of TIMESTEPS steps.
# Create graph structure.
TIMESTEPS = 128
num_inputs = 1
input_placeholder = layers.Input(shape=[TIMESTEPS, num_inputs])
# Encoder: four Conv1D layers; the first three are each followed by
# MaxPool1D(2) and batch normalization.
# Encoder.
encoded = layers.Conv1D(512, kernel_size=23, padding="same", activation='relu')(input_placeholder)
encoded = layers.MaxPool1D(2)(encoded)
encoded = layers.BatchNormalization()(encoded)
encoded = layers.Conv1D(
            filters=256, kernel_size=13, padding="same", activation='relu')(encoded)

encoded = layers.MaxPool1D(2)(encoded)
encoded = layers.BatchNormalization()(encoded)
encoded = layers.Conv1D(
            filters=256, kernel_size=7, padding="same", activation='relu')(encoded)
encoded = layers.MaxPool1D(2)(encoded)
encoded = layers.BatchNormalization()(encoded)
encoded = layers.Conv1D(
            filters=128, kernel_size=3, padding="same", activation='relu')(encoded)
# Bottleneck: Dense(1) followed by the batch-normalization layer named 'embedding'.
# Presumably 16 time steps remain here (128 halved by three poolings) - TODO confirm.
encoded = layers.Dense(1, activation='relu')(encoded)
encoded = layers.BatchNormalization(name='embedding')(encoded)
# Decoder: Conv1DTranspose layers with three UpSampling1D(2) steps, one per pooling above.
# NOTE(review): the decoder kernel sizes (3, 7, 7, 13, 13) do not mirror the
# encoder's (23, 13, 7, 3) - confirm this asymmetry is intended.
# Decoder.

decoded = layers.Conv1DTranspose(128, kernel_size=3, padding="same", activation='relu')(encoded)
decoded = layers.Conv1DTranspose(256, kernel_size=7, padding="same", activation="relu")(decoded)
decoded = layers.BatchNormalization()(decoded)

decoded = layers.UpSampling1D(2)(decoded)
decoded = layers.Conv1DTranspose(256, kernel_size=7, padding="same", activation="relu")(decoded)
decoded = layers.BatchNormalization()(decoded)

decoded = layers.UpSampling1D(2)(decoded)
decoded = layers.Conv1DTranspose(512, kernel_size=13, padding="same", activation="relu")(decoded)

decoded = layers.BatchNormalization()(decoded)

decoded = layers.UpSampling1D(2)(decoded)

# Output layer: a single filter and no activation.
decoded = layers.Conv1DTranspose(filters=1, kernel_size=13, padding="same")(decoded)

# `encoder` ends at the embedding layer; `model` is the full autoencoder.
encoder = models.Model(inputs=input_placeholder, outputs=encoded)
model = models.Model(inputs=input_placeholder, outputs=decoded)

# Disabled configuration kept as a bare string literal; being the cell's last
# expression, its text is echoed as the cell output.
# NOTE(review): it declares 4 encoder layers but lists only 3 entries per
# setting, and its "name" is "model_8" while the variable is model_7 - fix
# before re-enabling.
"""model_7 = {
    "name": "model_8",
    "window_size": 128,
    "number_of_layers_in_encoder": 4,
    "input": 128,
    "encoder_filters": [512, 256, 128],
    "kernel_sizes": [23, 13, 3],
    "activation_functions": ["relu"] * 3,
    "batch_normalizations": [False] * 3,
    "max_poolings": [True] * 3,
}"""

#model, emb = create_CNN_architecture(**model_7)

'model_7 = {\n    "name": "model_8",\n    "window_size": 128,\n    "number_of_layers_in_encoder": 4,\n    "input": 128,\n    "encoder_filters": [512, 256, 128],\n    "kernel_sizes": [23, 13, 3],\n    "activation_functions": ["relu"] * 3,\n    "batch_normalizations": [False] * 3,\n    "max_poolings": [True] * 3,\n}'

In [6]:
# Configuration for a two-layer encoder: 128 then 64 filters, kernel sizes 13 and 7,
# ReLU activations, max pooling after both layers and no batch normalization.
# "name" and "input" have no matching parameter in create_CNN_architecture and are
# absorbed by its **kwargs.
model_4 = {
    "name": "model_4",
    "window_size": 128,
    "number_of_layers_in_encoder": 2,
    "input": 128,
    "encoder_filters": [128, 64],
    "kernel_sizes": [13, 7],
    "activation_functions": ["relu"] * 2,
    "batch_normalizations": [False] * 2,
    "max_poolings": [True] * 2,
}
# Rebinds `model` to the generated autoencoder; `emb` is the bottleneck size it reports.
model, emb = create_CNN_architecture(**model_4)

In [7]:
# Print the layer-by-layer summary of the current `model` to check its output shapes.
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 128, 1)]          0         
                                                                 
 conv1d_4 (Conv1D)           (None, 128, 128)          1792      
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 64, 128)          0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 64, 64)            57408     
                                                                 
 max_pooling1d_4 (MaxPooling  (None, 32, 64)           0         
 1D)                                                             
                                                                 
 dense_1 (Dense)             (None, 32, 1)             65  

In [8]:
# Root folder for checkpoints and saved model data, plus training settings.
# NOTE(review): the training cell further down sets its own `epochs` and relies on
# train_model's default batch_size - confirm whether EPOCHS / BATCH_SIZE should be used there.
main_model_folder = 'trained_models'
#os.mkdir(f'{main_model_folder}/')
EPOCHS = 100
BATCH_SIZE = 32

In [None]:
# Load the windowed CSVs (window 128, overlap 32, no padding) for every dataset
# except the excluded one.
exclude_dataset_for_testing = "InsectWingbeat"
folder_name = "fully_processed_data/w_128_o_32_p_0"
train_data_df, test_data_df, exceptions = load_data(
    folder_name, exclude_dataset_for_testing
)
# Add a trailing channel axis of size 1: (rows, columns) -> (rows, columns, 1).
train_data = train_data_df.values
train_data = train_data.reshape(train_data.shape[0], train_data.shape[1], 1)
test_data = test_data_df.values
test_data = test_data.reshape(test_data.shape[0], test_data.shape[1], 1)
main_model_folder = 'trained_models'

Total datasets 97


In [None]:
# Train one autoencoder configuration, save its artefacts and report its losses.
opt = optimizers.Adam(learning_rate=.00001)
epochs = 100
k = 'model_4_32_1'  # run name: used for the checkpoint and output folders

# Architecture to build; set to None to reuse the `model` already in memory.
model_arch = model_4

print(f'Model iteration: 0 name: {k}')
if model_arch is not None:
    number_of_layers_in_encoder = model_arch['number_of_layers_in_encoder']
    encoder_filters = model_arch['encoder_filters']
    kernel_sizes = model_arch['kernel_sizes']

    model, embed = create_CNN_architecture(**model_arch)
    embedding_size = embed
    print(f'INFO: Layers: {number_of_layers_in_encoder} | embedding size {embedding_size} | Kernel filters {encoder_filters} | Kernel sizes {kernel_sizes}')
else:
    model, embed = model, 32#create_CNN_architecture(**model_arch)
model.compile(optimizer=opt, loss='mse')
folder_name = f'{main_model_folder}/{k}'
history = train_model(model, k, train_data, test_data, main_model_folder, epochs=epochs)
save_model_data(model, history, folder_name)
# An autoencoder is scored against its own input, so the targets must be passed
# explicitly; evaluate(test_data) alone gives the loss nothing to compare with.
re = model.evaluate(test_data, test_data)
hist_df = pd.DataFrame.from_dict(history.history)
# Row of the epoch with the lowest validation loss.
lowest_test_val_loss = hist_df.iloc[hist_df['val_loss'].argmin()]
print(f'Model {k} results {re}')
vals = lowest_test_val_loss.values
# Read the losses by column name rather than by position in the history table.
print(f"Val loss: train {lowest_test_val_loss['loss']} test: {lowest_test_val_loss['val_loss']}")
print("========================= Finished training model =========================")
print('\n')

Model iteration: 0 name: model_4_32
INFO: Layers: 2 | embedding size 32 | Kernel filters [128, 64] | Kernel sizes [13, 7]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
 3348/16410 [=====>........................] - ETA: 1:33 - loss: 0.0533