In [None]:
from mlflow import MLFlow

# Location of the training annotations (JSON) and the folder handed to the
# processor as its second argument.
input_json_file = '../Dataset/train.json'
output_folder = '../Dataset/train/'
# processing_steps = ["crop", "reduce_noise", "add_noise", "rotate", "brightness", "contrast", "saturation"]

# NOTE(review): `MLFlow` is imported from a module called `mlflow`; presumably
# this is a project-local module rather than the public MLflow package - confirm.
processor = MLFlow(input_json_file, output_folder)
# The result is used by the following cells as a DataFrame-like object exposing
# `.shape`, `.iloc` and the columns `features` and `labels`.
vectors_df = processor.process_video_and_extract_features()

In [None]:
# Sanity check: display the dimensions of the extracted-features table.
vectors_df.shape

In [None]:
# Sequence lengths of the first two feature entries, one per line.
print(len(vectors_df.features[0]), len(vectors_df.features[1]), sep='\n')

# Sequence lengths of the matching label entries, one per line.
print(len(vectors_df.labels[0]), len(vectors_df.labels[1]), sep='\n')


In [None]:
import tensorflow as tf
from keras.layers import Input, LSTM, Dense, Attention, Concatenate, Reshape
from keras.models import Model

# Only the first two rows of `vectors_df.features` are used here.
feature_vectors = [vectors_df.features[0], vectors_df.features[1]]

# The longest selected sequence sets the common padded length.
max_sequence_length = max(map(len, feature_vectors))

# Right-pad ('post') every sequence to that length as float32.
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    feature_vectors,
    maxlen=max_sequence_length,
    padding='post',
    dtype='float32',
)

In [None]:
# Labels for the same two rows that were selected for the features.
label_vectors = [vectors_df.labels[0], vectors_df.labels[1]]

# Right-pad ('post') the labels to the feature sequence length as float32.
padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
    label_vectors,
    maxlen=max_sequence_length,
    padding='post',
    dtype='float32',
)


In [None]:
# Define the two model inputs: one for the video features, one for the description.
# NOTE(review): no masking layer is applied anywhere in this cell, so any padded
# positions are processed like real data - confirm this is intended.
# NOTE(review): the leading 3 in `shape` is a magic number; its meaning is not
# visible in this notebook - confirm it matches the data that is fed to the model.
video_input = Input(shape=(3, max_sequence_length), batch_size=32, name='video_input')

description_input = Input(shape=(3, max_sequence_length), batch_size=32, name='description_input')

# Attention over the two inputs; Keras' Attention layer expects [query, value],
# so the video input acts as the query and the description as the value.
video_attention = Attention()([video_input, description_input])

# Sequence encoder for the attended video representation (keeps every timestep).
lstm_video = LSTM(32, return_sequences=True)(video_attention)

# Sequence encoder for the raw description input (keeps every timestep).
lstm_description = LSTM(32, return_sequences=True)(description_input)

# Join both encodings along the last axis (Concatenate's default).
lstm_combined = Concatenate()([lstm_video, lstm_description])

# Second-level LSTM that collapses the combined sequence into a single vector.
lstm_hierarchical = LSTM(32, return_sequences=False)(lstm_combined)

# Linear regression head emitting `max_sequence_length` values per sample.
output = Dense(max_sequence_length, activation='linear')(lstm_hierarchical)

model = Model(inputs=[video_input, description_input], outputs=output)

# The 'mse' metric reports the same quantity as the loss.
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])

In [None]:
from sklearn.model_selection import train_test_split

# independent variables: every column except the first and the last
# NOTE(review): `padded_sequences` / `padded_labels` built in the earlier cells
# are not used here; confirm these raw DataFrame slices are what the model
# should be trained on.
ftrs = vectors_df.iloc[:, 1:-1]
# dependent variable: the last column
lbls = vectors_df.iloc[:, -1]

# Split the data into train and test sets (20% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(ftrs, lbls, test_size=0.2, random_state=42)

# Train the model
# NOTE(review): `y_train` is passed both as the second model input
# (description_input) and as the training target - confirm this is intended.
# NOTE(review): batch_size=64 here differs from the batch_size=32 declared on
# the model's Input layers - confirm.
model.fit([X_train, y_train], y_train, batch_size=64, epochs=10)

# Evaluate the model; two values are unpacked: the loss and the 'mse' metric
# requested in model.compile.
loss, mse = model.evaluate([X_test, y_test], y_test)
print(f"Mean Squared Error: {mse}")

In [None]:
import pickle
import numpy as np

# Load the saved tokenizer.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load tokenizer pickles that were produced by this project.
with open('../Dataset/train/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Generate predictions.
# The model is built with two inputs (video_input, description_input), so
# predict() must be given both, in the same order as in model.evaluate();
# passing X_test alone does not match the model's input signature.
# NOTE(review): feeding y_test as the description input mirrors evaluate(), but
# it means the labels are required at inference time - confirm this is intended.
predicted_vectors = model.predict([X_test, y_test])

# Convert predicted vectors back to text using the tokenizer
def vectors_to_text(predicted_vectors):
    """Decode model predictions into text using the notebook-level ``tokenizer``.

    Every prediction is converted to a list of integer token ids and handed to
    ``tokenizer.sequences_to_texts``:

    * 0-D / 1-D prediction (one regressed value per position, which is what a
      ``Dense(..., activation='linear')`` head produces): each value is rounded
      to the nearest integer id, and ids <= 0 are dropped because 0 is the
      value used when the label sequences are post-padded.
    * 2-D prediction (one score vector per position): the id for each position
      is the argmax over the last axis.

    Taking ``argmax`` of a 1-D prediction would yield a single scalar, which is
    not a valid token sequence, hence the separate handling.

    Args:
        predicted_vectors: iterable of array-likes, one entry per sample.

    Returns:
        list of str: one decoded string per sample, in input order (an empty
        list when there are no predictions).
    """
    texts = []
    for vector in predicted_vectors:
        scores = np.asarray(vector)
        if scores.ndim >= 2:
            # Per-position score vectors: pick the best-scoring id at each step.
            token_ids = np.argmax(scores, axis=-1).tolist()
        else:
            # Regressed ids: snap to the nearest integer and discard padding.
            rounded = np.rint(np.atleast_1d(scores).astype(float)).astype(int)
            token_ids = [int(token_id) for token_id in rounded if token_id > 0]
        texts.append(tokenizer.sequences_to_texts([token_ids])[0])
    return texts

# Decode the raw model output into readable strings
predicted_texts = vectors_to_text(predicted_vectors)

# Show every decoded string on its own line
for decoded in predicted_texts:
    print(decoded)
