# Building Fusion model

In [6]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import TensorBoard
from tensorboard.plugins.hparams import api as hp
import datetime

import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
import keras
import runai.ga


In [2]:
# build_rgb_stream
class Build_Fusion_Model:
    # build_ssd
    def build_ssd_stream(self):
        base_model = MobileNetV2(input_shape=(320, 320, 3), include_top=False, weights='imagenet')
        # Freeze the base model layers
        for layer in base_model.layers:
            layer.trainable = False
        
        x = base_model.output
        x = GlobalAveragePooling2D()(x)
        x = Dense(1024, activation='relu')(x)
        x = Dense(256, activation='relu')(x)
        x = layers.Dropout(0.5)(x)

        self.model = Model(inputs=base_model.input, outputs=x)
        # print(self.model.summary())
        return self.model
    
    # build_pose_stream
    def build_pose_stream(self,input_shape=(20, 17*2)):
        self.model = models.Sequential()

        # Add LSTM layers for processing pose keypoints over time
        self.model.add(layers.LSTM(64, input_shape=input_shape, return_sequences=True))
        self.model.add(layers.LSTM(128, return_sequences=True))
        self.model.add(layers.LSTM(256, return_sequences=True))

        self.model.add(layers.Dense(256, activation='relu'))
        self.model.add(layers.Dropout(0.5))

        return self.model

# build_fusion_model
    def build_fusion_model(self,ssd_stream, pose_stream, num_classes):
        # Combine the two streams
        self.ssd_stream_flattened = layers.Flatten()(ssd_stream.output)
        self.pose_stream_flattened = layers.Flatten()(pose_stream.output)
        combined_input = layers.concatenate([self.pose_stream_flattened, self.ssd_stream_flattened])

        # Add fully connected layers for fusion
        self.fusion_dense = layers.Dense(512, activation='relu')(combined_input) # replace with (combined_input)
        self.fusion_dense = layers.Dropout(0.5)(self.fusion_dense)

        self.fusion_output = layers.Dense(num_classes, activation='sigmoid')(self.fusion_dense)

        # Create the final model
        self.fusion_model = models.Model(inputs=[pose_stream.input, ssd_stream.input], outputs=self.fusion_output)

        return self.fusion_model

In [3]:
# Set the number of classes for your action recognition task
num_classes = 1
build_model = Build_Fusion_Model()
# Build the RGB stream
ssd_stream = build_model.build_ssd_stream()

# Build the Pose stream
pose_stream = build_model.build_pose_stream(input_shape=(10, 34))

# Build the Fusion model
fusion_model = build_model.build_fusion_model(ssd_stream, pose_stream, num_classes)


# Define hyperparameters
hp_epochs = hp.HParam('epochs', hp.IntInterval(10, 50))
hp_learning_rate = hp.HParam('learning_rate', hp.RealInterval(1e-4, 1e-2))

# Create a summary file for TensorBoard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Create a callback for hyperparameter logging
hparams_callback = hp.KerasCallback(log_dir, hparams={hp_epochs: 20, hp_learning_rate: 1e-3})


# Compile the model with an appropriate optimizer, loss, and metrics
optimizer = keras.optimizers.Adam()
optimizer = runai.ga.keras.optimizers.Adam(steps=128)
fusion_model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Display the model summary
# fusion_model.summary()




AttributeError: module 'runai.ga' has no attribute 'keras'

In [4]:
# tf.keras.utils.plot_model(fusion_model, show_shapes=True, show_layer_names=True)

In [5]:
fusion_model.input_shape

[(None, 10, 34), (None, 320, 320, 3)]

In [6]:
fusion_model.output_shape

(None, 1)

# Inference

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from video_processor import VideoProcessor

In [8]:
df = pd.read_csv('../datasets/KTH/dataset.csv')
df.head()

Unnamed: 0,clip_path,label
0,../datasets/KTH/boxing\person01_boxing_d1_unco...,boxing
1,../datasets/KTH/boxing\person01_boxing_d2_unco...,boxing
2,../datasets/KTH/boxing\person01_boxing_d3_unco...,boxing
3,../datasets/KTH/boxing\person01_boxing_d4_unco...,boxing
4,../datasets/KTH/boxing\person02_boxing_d1_unco...,boxing


In [9]:
df['label'].unique()

array(['boxing', 'handclapping', 'handwaving', 'jogging', 'running', 'walking'], dtype=object)

In [10]:
def three_class(x):
    if x not in ['golf', 'kick_ball', 'pushup', 'shoot_ball',  
             'shoot_bow', 'shoot_gun', 'swing_baseball', 
             'thow', 'BoxingPunchingBag', 'boxing', 'punch', 
             'kick', 'point', 'handwaving', 'wave']:
        return 0
    else:
        return 1

In [11]:
df['label'] = df['label'].apply(three_class)

In [12]:
classwise_df = []

for i in df['label'].unique():
    classwise_df.append(df[df['label']==i].reset_index())

# Inference on Dataset

In [13]:
from ultralytics import YOLO
import cv2

In [14]:
model = YOLO('models/yolov8m-pose.pt') 

In [15]:
total_df = pd.DataFrame()
for df in classwise_df:
    total_df = pd.concat([total_df, df])


In [16]:
for i, d in total_df.sample(5).groupby('label'):
    print(d)

     index                                          clip_path  label
369    569  ../datasets/KTH/walking\person18_walking_d3_un...      0
289    489  ../datasets/KTH/running\person23_running_d3_un...      0
56     156  ../datasets/KTH/handclapping\person15_handclap...      0
352    552  ../datasets/KTH/walking\person14_walking_d2_un...      0
    index                                          clip_path  label
88     88  ../datasets/KTH/boxing\person23_boxing_d1_unco...      1


# Training

In [17]:
import cv2
import numpy as np
import pandas as pd
from random import shuffle
import os

def data_generator(total_df, batch_size=1, shuffle_data=True, resize=320):
    """
    Yields the next training batch.
    Suppose `total_df` is a pandas DataFrame with columns 'clip_path' and 'label'.
    """
    num_samples = len(total_df)
    samples = list(total_df.iterrows())  # Convert DataFrame to a list of (index, row) tuples

    while True:  # Loop forever so the generator never terminates
        if shuffle_data:
            shuffle(samples)

        # Get index to start each batch: [0, batch_size, 2*batch_size, ..., max multiple of batch_size <= num_samples]
        for offset in range(0, num_samples, batch_size):
            # Get the samples you'll use in this batch
            batch_samples = samples[offset:offset+batch_size]

            # Initialise pose_buffer and labels arrays for this batch
            pose_buffer = []
            rbg_buffer = []
            labels = []

            # For each example
            for _, batch_sample in batch_samples:
                # Load video (X) and label (y)
                vid_path = batch_sample['clip_path']
                label = batch_sample['label']
                cap = cv2.VideoCapture(vid_path)
                frame_buffer = []
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        cap.release()
                        break

                    frame = cv2.resize(frame, (resize, resize))
                    if len(frame_buffer) < 10:
                        frame_buffer.append(frame)
                    elif len(frame_buffer) == 10:
                        batch = vp.process_video(frame_buffer)
                        batch = np.array(batch)[:, 0, :].reshape(-1, 10, 34)
                        pose_buffer.append(batch[0])
                        rbg_buffer.append(frame)
                        frame_buffer = []
                        labels.append(label)
                    else:
                        frame_buffer = []

                # labels.extend([label] * len(pose_buffer))  # Extend the labels list for this video's frames

            # Make sure they're numpy arrays (as opposed to lists)
            pose_buffer = np.array(pose_buffer)
            rbg_buffer = np.array(rbg_buffer)
            labels = np.array(labels)
            # print(len(labels))


            # res = [pose_buffer, rbg_buffer], labels
            # print(res[0][0].shape, res[0][1].shape, res[1].shape)

            # The generator-y part: yield the next training batch
            yield [pose_buffer, rbg_buffer], labels  # ([(None, 10, 34), (None, 640, 640, 3)], (None,1))


In [18]:
vp = VideoProcessor(model, max_frames=100, img_sz=320, show_stream=False)

In [19]:
from sklearn.model_selection import train_test_split

# total_df = total_df[:10]
train_df, test_df = train_test_split(total_df, test_size=0.2, random_state=42)
train_df, val_df  = train_test_split(total_df, test_size=0.25, random_state=42)


In [21]:
batch_size = 1
num_epochs = 4
train_generator = data_generator(train_df, batch_size=batch_size)
val_generator = data_generator(val_df, batch_size=batch_size)
steps_per_epoch = len(total_df) // batch_size
# steps_per_epoch = 10
# Now, you can use this generator to train your model
history = fusion_model.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=num_epochs,)
                        #    validation_data = val_generator, validation_steps=steps_per_epoch//2)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [22]:
batch_size = 1
num_epochs = 25
history1 = fusion_model.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=num_epochs,)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25

In [None]:
fusion_model.save('models/rbg_lstm_checkpoint_30_epochs.keras')