In [5]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import keras_tuner as kt

In [6]:
# set random seed
np.random.seed(69)
tf.random.set_seed(69)

In [7]:
# load sequential data
def load_sequential_data(folders):
    """ load csvs as sequences removing the frame column and discarding the first 90 rows of each file """
    data, labels = [], []

    for folder, _ in folders:
        if os.path.exists(folder):
            for file in os.listdir(folder):
                if file.endswith('.csv'):
                    file_path = os.path.join(folder, file)
                    df = pd.read_csv(file_path)

                    # remove first 90
                    df = df.iloc[90:]

                    # remove frame and label
                    label = df.iloc[0, 0] 
                    features = df.iloc[:, 2:].values
                    
                    data.append(features)
                    labels.append(label)
        else:
            print(f"warning: folder {folder} not found.")

    return np.array(data), np.array(labels)

In [8]:
# define training folders
train_folders = [
    ('../rat_dance_csv/train', 1),
    ('../neg_control_csv/train', 0)
]

# load dataset
X, y = load_sequential_data(train_folders)

# normalize features
scaler = StandardScaler()
X = np.array([scaler.fit_transform(sample) for sample in X])  # normalize each sequence separately

# shuffle dataset
indices = np.random.permutation(len(X))
X, y = X[indices], y[indices]

# reshape X for RNN samples, timesteps, features
timesteps, features = X.shape[1], X.shape[2]
X = X.reshape(len(X), timesteps, features)

In [22]:
# function to build a simplified rnn model
def build_simple_rnn():
    model = keras.Sequential([
        # single bidirectional lstm layer
        keras.layers.Bidirectional(keras.layers.LSTM(
            units=64,  
            return_sequences=False,
            recurrent_dropout=0.2,
            input_shape=(timesteps, features),
        )),
        keras.layers.Dropout(0.4),

        # fully connected dense layer
        keras.layers.Dense(64, activation="relu", kernel_regularizer=l2(0.01)),
        keras.layers.BatchNormalization(),
        keras.layers.Dropout(0.4),

        # output layer
        keras.layers.Dense(1, activation="sigmoid"),
    ])

    # compile model with fixed learning rate
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.0001),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [24]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)
cv_accuracies = []

for train_index, val_index in kf.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # build model
    model = build_simple_rnn()

    # define callbacks
    early_stopping = keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=5, restore_best_weights=True
    )
    reduce_lr = keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6
    )

    # train model
    model.fit(
        X_train, y_train, epochs=20, batch_size=32, verbose=1,
        validation_data=(X_val, y_val), callbacks=[early_stopping, reduce_lr]
    )

    # evaluate model
    val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    cv_accuracies.append(val_acc)

# print cross-validation accuracy
cross_val_acc = np.mean(cv_accuracies)
print(f"cross-validation accuracy: {cross_val_acc:.4f}")


Epoch 1/20


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.4375 - loss: 1.9823 - val_accuracy: 0.3750 - val_loss: 1.6201 - learning_rate: 1.0000e-04
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step - accuracy: 0.4062 - loss: 2.0476 - val_accuracy: 0.3750 - val_loss: 1.6186 - learning_rate: 1.0000e-04
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 168ms/step - accuracy: 0.4688 - loss: 1.8661 - val_accuracy: 0.3750 - val_loss: 1.6171 - learning_rate: 1.0000e-04
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step - accuracy: 0.4062 - loss: 1.8689 - val_accuracy: 0.3750 - val_loss: 1.6155 - learning_rate: 1.0000e-04
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step - accuracy: 0.5000 - loss: 1.7033 - val_accuracy: 0.3750 - val_loss: 1.6141 - learning_rate: 1.0000e-04
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [25]:
# train final model on full dataset
final_model = build_simple_rnn()

# train with early stopping
final_model.fit(
    X, y, epochs=20, batch_size=32, verbose=1, callbacks=[early_stopping, reduce_lr]
)


Epoch 1/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 112ms/step - accuracy: 0.4354 - loss: 2.0202 - learning_rate: 1.0000e-04
Epoch 2/20
[1m1/2[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 141ms/step - accuracy: 0.3438 - loss: 1.9103

  current = self.get_monitor_value(logs)
  callback.on_epoch_end(epoch, logs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - accuracy: 0.3646 - loss: 1.9429 - learning_rate: 1.0000e-04
Epoch 3/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - accuracy: 0.5708 - loss: 1.7232 - learning_rate: 1.0000e-04
Epoch 4/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.5542 - loss: 1.7244 - learning_rate: 1.0000e-04
Epoch 5/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.5917 - loss: 1.7651 - learning_rate: 1.0000e-04
Epoch 6/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.4021 - loss: 1.8266 - learning_rate: 1.0000e-04
Epoch 7/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.5375 - loss: 1.7183 - learning_rate: 1.0000e-04
Epoch 8/20
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - accuracy: 0.5813 - loss: 1.7343 - learning_rate: 

<keras.src.callbacks.history.History at 0x454503b50>

In [26]:
# load validation dataset
val_folders = [
    ('../rat_dance_csv/val', 1),
    ('../neg_control_csv/val', 0)
]

X_val, y_val = load_sequential_data(val_folders)
X_val = np.array([scaler.transform(sample) for sample in X_val])  
X_val = X_val.reshape(len(X_val), timesteps, features)  

# evaluate on validation set
y_pred_prob = final_model.predict(X_val)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_val, y_pred)
class_report = classification_report(y_val, y_pred, target_names=["negative control (0)", "ratdance (1)"])
conf_matrix = confusion_matrix(y_val, y_pred)

# show results
print(f"\nvalidation performance:")
print(f"accuracy: {accuracy:.4f}")
print("\nclassification report:")
print(class_report)
print("\nconfusion matrix:")
print(conf_matrix)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 230ms/step

validation performance:
accuracy: 0.5714

classification report:
                      precision    recall  f1-score   support

negative control (0)       0.67      0.29      0.40         7
        ratdance (1)       0.55      0.86      0.67         7

            accuracy                           0.57        14
           macro avg       0.61      0.57      0.53        14
        weighted avg       0.61      0.57      0.53        14


confusion matrix:
[[2 5]
 [1 6]]


In [27]:
# load validation dataset
test_folders = [
    ('../rat_dance_csv/test', 1),
    ('../neg_control_csv/test', 0)
]

X_test, y_test = load_sequential_data(test_folders)
X_test = np.array([scaler.transform(sample) for sample in X_test])  
X_test = X_test.reshape(len(X_test), timesteps, features)  

# evaluate on validation set
y_pred_prob = final_model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, target_names=["negative control (0)", "ratdance (1)"])
conf_matrix = confusion_matrix(y_test, y_pred)

# show results
print(f"\nvalidation performance:")
print(f"accuracy: {accuracy:.4f}")
print("\nclassification report:")
print(class_report)
print("\nconfusion matrix:")
print(conf_matrix)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step

validation performance:
accuracy: 0.6429

classification report:
                      precision    recall  f1-score   support

negative control (0)       1.00      0.29      0.44         7
        ratdance (1)       0.58      1.00      0.74         7

            accuracy                           0.64        14
           macro avg       0.79      0.64      0.59        14
        weighted avg       0.79      0.64      0.59        14


confusion matrix:
[[2 5]
 [0 7]]
