In [1]:
import json

import numpy as np
from tensorflow.keras.layers import (
    Attention,
    Concatenate,
    Dense,
    Dropout,
    Embedding,
    Input,
    LSTM,
    TimeDistributed,
)
from tensorflow.keras.models import Model

# Load preprocessed data: a JSON list of records, each read below through its
# 'SENTENCE_NAME' and 'PREPROCESSED_SENTENCE' keys.
# JSON text is UTF-8 by specification, so the encoding is pinned rather than
# left to the platform's locale default.
with open('./validation/preprocessed_sentences.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Load feature vectors: one JSON file per sentence name; the values of that
# JSON object are kept in file order as the per-frame feature list.
feature_vectors = {}
for item in data:
    sentence_name = item['SENTENCE_NAME']
    if sentence_name in feature_vectors:
        # Same name seen earlier: the file was already read, skip the reload.
        continue
    with open(f'./validation/NASNetLarge_feature_vectors/{sentence_name}.json', 'r', encoding='utf-8') as f:
        frame_data = json.load(f)
    feature_vectors[sentence_name] = list(frame_data.values())

# Prepare data for the model (teacher forcing): the decoder input is the
# sentence without its last token, the target is the sentence shifted by one.
# NOTE(review): np.array only yields rectangular arrays if every sentence has
# the same length and every clip has the same number of frames; presumably the
# preprocessing step pads them - confirm.
decoder_inputs = np.array([item['PREPROCESSED_SENTENCE'][:-1] for item in data])
decoder_targets = np.array([item['PREPROCESSED_SENTENCE'][1:] for item in data])
encoder_inputs = np.array([feature_vectors[item['SENTENCE_NAME']] for item in data])

# Adjust these parameters to match your data
feature_vector_length = 4032  # Length of feature vectors from NASNetLarge
max_seq_length = len(decoder_inputs[0])  # Maximum length of output sentences
vocab_size = 10000  # Size of your vocabulary (must exceed the largest token id)
dropout_rate = 0.5  # Rate of dropout, adjust as needed
lstm_units = 256  # Width of every LSTM layer (encoder and decoder must match)
embedding_dim = 256  # Size of the learned token embeddings fed to the decoder

# Inputs
video_input = Input(shape=(None, feature_vector_length))
# NOTE(review): assumes PREPROCESSED_SENTENCE holds integer token ids, which
# is what sparse_categorical_crossentropy and Embedding expect - confirm.
decoder_input = Input(shape=(max_seq_length,), dtype='int32')

# Frame level: per-frame LSTM followed by self-attention over the frames
lstm1 = LSTM(lstm_units, return_sequences=True)(video_input)
lstm1 = Dropout(dropout_rate)(lstm1)
temporal_attention = Attention()([lstm1, lstm1])

# Sequence level: collapse the attended frames into one video summary vector
lstm2 = LSTM(lstm_units)(temporal_attention)
lstm2 = Dropout(dropout_rate)(lstm2)

# Decoder. An LSTM needs 3-D input (batch, time, features), so the 2-D token
# ids are embedded first; feeding decoder_input directly raises a ValueError.
decoder_embedded = Embedding(vocab_size, embedding_dim)(decoder_input)
# The video summary seeds both the hidden and the cell state of the decoder.
# Without this link the output would not depend on video_input at all.
lstm3 = LSTM(lstm_units, return_sequences=True)(
    decoder_embedded, initial_state=[lstm2, lstm2]
)
lstm3 = Dropout(dropout_rate)(lstm3)

# Output: a distribution over the vocabulary at every decoder time step
dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
output = dense(lstm3)

# Model
model = Model(inputs=[video_input, decoder_input], outputs=output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Reshape targets to match the output shape of the model
decoder_targets = decoder_targets.reshape(*decoder_targets.shape, 1)

# Train the model
model.fit([encoder_inputs, decoder_inputs], decoder_targets, batch_size=32, epochs=10)


In [None]:
def hierarchical_lstm_model(input_shape_video, input_shape_description, lstm_units=64, num_classes=None):
    """Build (without compiling) a two-input hierarchical LSTM classifier.

    Args:
        input_shape_video: Shape passed to the video ``Input`` layer
            (batch dimension excluded).
        input_shape_description: Shape passed to the description ``Input``
            layer (batch dimension excluded).
        lstm_units: Number of units in each of the three LSTM layers.
        num_classes: Number of units in the softmax output layer. When
            omitted, a notebook-level ``num_classes`` variable is used if one
            exists, which is what earlier versions of this function relied on.

    Returns:
        A Keras ``Model`` taking ``[video_input, description_input]`` and
        producing one softmax vector of length ``num_classes`` per sample.

    Raises:
        ValueError: If ``num_classes`` is neither passed nor defined globally.
    """
    if num_classes is None:
        # Backward compatibility with the former implicit global lookup.
        num_classes = globals().get('num_classes')
        if num_classes is None:
            raise ValueError(
                "num_classes must be passed to hierarchical_lstm_model() "
                "or defined as a notebook-level variable"
            )

    # Input layers for video features and video descriptions
    video_input = Input(shape=input_shape_video, name='video_input')
    description_input = Input(shape=input_shape_description, name='description_input')

    # Layer 2: Temporal layer with attention mechanism.
    # Keras Attention takes [query, value]: the video features are the query
    # and the description is the value, so the result has one step per video
    # step and both inputs need the same last dimension.
    # NOTE(review): if the intent is to attend over video features, the two
    # arguments may need to be swapped - confirm.
    video_attention = Attention()([video_input, description_input])

    # Layer 3: LSTM layer for video features
    lstm_video = LSTM(lstm_units, return_sequences=True)(video_attention)

    # Layer 4: LSTM layer for video descriptions
    lstm_description = LSTM(lstm_units, return_sequences=True)(description_input)

    # Combine the outputs of the two LSTM layers along the time axis.
    # A Keras layer is used instead of tf.concat: `tf` is never imported in
    # this notebook, and layers are the supported way to join Keras tensors.
    lstm_combined = Concatenate(axis=1)([lstm_video, lstm_description])

    # Layer 5: Additional LSTM layer for the hierarchical LSTM
    lstm_hierarchical = LSTM(lstm_units, return_sequences=False)(lstm_combined)

    # Output layer (modify the units and activation function based on your task)
    output = Dense(num_classes, activation='softmax')(lstm_hierarchical)

    # Create the model
    model = Model(inputs=[video_input, description_input], outputs=output)

    return model
