In [4]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import keras_tuner as kt


In [5]:
# set random seed for reproducibility
np.random.seed(69)
tf.random.set_seed(69)

In [6]:
# function to load and process data
def load_augmented_sequences(folders):
    """ 
    load csvs as sequences, split each into 4 smaller examples and remove examples where >50% of columns are all zeros 
    """
    data, labels = [], []

    for folder, label in folders:
        if os.path.exists(folder):
            for file in os.listdir(folder):
                if file.endswith('.csv'):
                    file_path = os.path.join(folder, file)
                    df = pd.read_csv(file_path)

                    # remove 'frame' column and keep only movement data
                    features = df.iloc[:, 2:].values  
                    num_splits = 4
                    split_size = len(features) // num_splits

                    # split into 4 equal parts
                    for i in range(num_splits):
                        sub_features = features[i * split_size: (i + 1) * split_size]

                        # check if more than 50% of the columns are completely zero
                        zero_columns = np.sum(sub_features == 0, axis=0) == sub_features.shape[0]
                        if np.mean(zero_columns) > 0.5:
                            continue  # skip this example

                        data.append(sub_features)
                        labels.append(label)
        else:
            print(f"warning folder {folder} not found")

    return np.array(data), np.array(labels)

In [7]:
# define training folders with labels
train_folders = [
    ('../rat_dance_csv/train', 1),
    ('../neg_control_csv/train', 0)
]

# load dataset (with augmentation)
X, y = load_augmented_sequences(train_folders)

# normalize features
scaler = StandardScaler()
X = np.array([scaler.fit_transform(sample) for sample in X])  # normalize each sequence separately

# shuffle dataset
indices = np.random.permutation(len(X))
X, y = X[indices], y[indices]

# reshape X for RNN (samples, timesteps, features)
timesteps, features = X.shape[1], X.shape[2]
X = X.reshape(len(X), timesteps, features)

# function to build the model for hyperparameter tuning
def build_model(hp):
    model = keras.Sequential([
        keras.layers.LSTM(
            units=hp.Int("units", min_value=16, max_value=128, step=16),
            return_sequences=False,
            recurrent_dropout=hp.Float("recurrent_dropout", 0.1, 0.5, step=0.1),
            input_shape=(timesteps, features),
        ),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(hp.Float("dropout", 0.3, 0.7, step=0.1)),
        keras.layers.Dense(hp.Int("dense_units", 16, 64, step=16), activation="relu", kernel_regularizer=l2(0.01)),
        keras.layers.Dense(1, activation="sigmoid"),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=hp.Choice("learning_rate", [0.001, 0.0005, 0.0001])
        ),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

# use keras tuner to search for best hyperparameters
tuner = kt.tuners.RandomSearch(
    hypermodel=build_model,
    objective="val_loss", 
    max_trials=15,
    executions_per_trial=2,
    directory="delete_me_post_search",
    project_name="lstm_tuning",
)

# split data for tuning
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
train_index, val_index = next(kf.split(X, y))  # use the first fold for tuning
X_train, X_val = X[train_index], X[val_index]
y_train, y_val = y[train_index], y[val_index]

# search for best hyperparameters
tuner.search(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), verbose=1)

# get best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# print best values
print(f"Best LSTM units: {best_hps.get('units')}")
print(f"Best dropout: {best_hps.get('dropout')}")
print(f"Best recurrent dropout: {best_hps.get('recurrent_dropout')}")
print(f"Best dense units: {best_hps.get('dense_units')}")
print(f"Best learning rate: {best_hps.get('learning_rate')}")

# create final model using best hyperparameters
final_model = tuner.hypermodel.build(best_hps)

# define early stopping
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

Trial 15 Complete [00h 00m 06s]
val_loss: 1.1839964985847473

Best val_loss So Far: 0.8906864523887634
Total elapsed time: 00h 03m 49s
Best LSTM units: 16
Best dropout: 0.4
Best recurrent dropout: 0.2
Best dense units: 32
Best learning rate: 0.0001


# Best I have gotten

Best val_loss So Far: 0.8906864523887634

Best LSTM units: 16

Best dropout: 0.4

Best recurrent dropout: 0.2

Best dense units: 32

Best learning rate: 0.0001

In [13]:
# train final model on full dataset
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
cv_accuracies = []

for train_index, val_index in kf.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    final_model.fit(X_train, y_train, epochs=15, batch_size=64, verbose=0, validation_data=(X_val, y_val), callbacks=[early_stopping])

    # evaluate the model
    val_loss, val_acc = final_model.evaluate(X_val, y_val, verbose=0)
    cv_accuracies.append(val_acc)

# print cross-validation accuracy
cross_val_acc = np.mean(cv_accuracies)
print(f"Final Cross-Validation Accuracy: {cross_val_acc:.4f}")

Final Cross-Validation Accuracy: 0.7714


In [12]:
# load validation dataset
val_folders = [
    ('../rat_dance_csv/val', 1),
    ('../neg_control_csv/val', 0)
]

X_val, y_val = load_augmented_sequences(val_folders)
X_val = np.array([scaler.transform(sample) for sample in X_val])  
X_val = X_val.reshape(len(X_val), timesteps, features)  

# evaluate final model on validation set
y_pred_prob = final_model.predict(X_val)
y_pred = (y_pred_prob > 0.5).astype(int)  

accuracy = accuracy_score(y_val, y_pred)
class_report = classification_report(y_val, y_pred, target_names=["negative control (0)", "ratdance (1)"])
conf_matrix = confusion_matrix(y_val, y_pred)

# show results
print(f"\nValidation Performance:")
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(class_report)
print("\nConfusion Matrix:")
print(conf_matrix)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step

Validation Performance:
Accuracy: 0.7170

Classification Report:
                      precision    recall  f1-score   support

negative control (0)       0.71      0.79      0.75        28
        ratdance (1)       0.73      0.64      0.68        25

            accuracy                           0.72        53
           macro avg       0.72      0.71      0.71        53
        weighted avg       0.72      0.72      0.72        53


Confusion Matrix:
[[22  6]
 [ 9 16]]
