<a href="https://colab.research.google.com/github/Rosalee830/Rosalee830/blob/main/YouTube_8m.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import tempfile

from matplotlib import pyplot as plt
import numpy as np # linear algebra
import tensorflow as tf

tmpdir = tempfile.mkdtemp()

In [2]:
physical_devices = tf.config.list_physical_devices('GPU')
for device in physical_devices:
  tf.config.experimental.set_memory_growth(device, True)

In [3]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from IPython.display import YouTubeVideo
import matplotlib.pyplot as plt
!pip install chart-studio
import chart_studio.plotly as py
import os
print(os.listdir("/YouTube-8m"))
# video level feature file
print(os.listdir("/YouTube-8m/video"))
# frame level features file
print(os.listdir("/YouTube-8m/frame"))


Collecting chart-studio
  Downloading chart_studio-1.1.0-py3-none-any.whl (64 kB)
[?25l[K     |█████                           | 10 kB 25.6 MB/s eta 0:00:01[K     |██████████▏                     | 20 kB 28.6 MB/s eta 0:00:01[K     |███████████████▎                | 30 kB 33.5 MB/s eta 0:00:01[K     |████████████████████▍           | 40 kB 37.3 MB/s eta 0:00:01[K     |█████████████████████████▍      | 51 kB 32.2 MB/s eta 0:00:01[K     |██████████████████████████████▌ | 61 kB 25.7 MB/s eta 0:00:01[K     |████████████████████████████████| 64 kB 3.0 MB/s 
Installing collected packages: chart-studio
Successfully installed chart-studio-1.1.0
['frame', 'video', '.ipynb_checkpoints']
['train01.tfrecord', 'train00.tfrecord']
['train01.tfrecord', 'train00.tfrecord']


In [4]:
# keras imports
from keras.layers import Dense, Input, LSTM, Dropout, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import BatchNormalization
from keras.layers.embeddings import Embedding
from keras.layers.merge import concatenate
from keras.callbacks import TensorBoard
from keras.models import load_model
from keras.models import Model
import operator
import time 
import gc
import os

Creating training and dev set

In [5]:
def create_train_dev_dataset(video_rgb, video_audio, frame_rgb, frame_audio, labels):
    """
    Method to created training and validation data
    """
    shuffle_indices = np.random.permutation(np.arange(len(labels)))
    video_rgb_shuffled = video_rgb[shuffle_indices]
    video_audio_shuffled = video_audio[shuffle_indices]
    frame_rgb_shuffled = frame_rgb[shuffle_indices]
    frame_audio_shuffled = frame_audio[shuffle_indices]
    labels_shuffled = labels[shuffle_indices]

    dev_idx = max(1, int(len(labels_shuffled) * validation_split_ratio))

    del video_rgb
    del video_audio
    del frame_rgb
    del frame_audio
    gc.collect()

    train_video_rgb, val_video_rgb = video_rgb_shuffled[:-dev_idx], video_rgb_shuffled[-dev_idx:]
    train_video_audio, val_video_audio = video_audio_shuffled[:-dev_idx], video_audio_shuffled[-dev_idx:]
    
    train_frame_rgb, val_frame_rgb = frame_rgb_shuffled[:-dev_idx], frame_rgb_shuffled[-dev_idx:]
    train_frame_audio, val_frame_audio = frame_audio_shuffled[:-dev_idx], frame_audio_shuffled[-dev_idx:]
    
    train_labels, val_labels = labels_shuffled[:-dev_idx], labels_shuffled[-dev_idx:]
    
    del video_rgb_shuffled, video_audio_shuffled, frame_rgb_shuffled, frame_audio_shuffled, labels_shuffled
    gc.collect()
    
    return (train_video_rgb, train_video_audio, train_frame_rgb, train_frame_audio, train_labels, val_video_rgb, val_video_audio, 
            val_frame_rgb, val_frame_audio, val_labels)

Defining Model parameters and creating architecture

In [6]:
max_frame_rgb_sequence_length = 10
frame_rgb_embedding_size = 1024

max_frame_audio_sequence_length = 10
frame_audio_embedding_size = 128

number_dense_units = 1000
number_lstm_units = 100
rate_drop_lstm = 0.2
rate_drop_dense = 0.2
activation_function='relu'
validation_split_ratio = 0.2
label_feature_size = 10

def create_model(video_rgb, video_audio, frame_rgb, frame_audio, labels):
    """Create and store best model at `checkpoint` path ustilising bi-lstm layer for frame level data of videos"""
    train_video_rgb, train_video_audio, train_frame_rgb, train_frame_audio, train_labels, val_video_rgb, val_video_audio, val_frame_rgb, val_frame_audio, val_labels = create_train_dev_dataset(video_rgb, video_audio, frame_rgb, frame_audio, labels) 
    
    # Creating 2 bi-lstm layer, one for rgb and other for audio level data
    lstm_layer_1 = Bidirectional(LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm))
    lstm_layer_2 = Bidirectional(LSTM(number_lstm_units, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm))
    
    # creating input layer for frame-level data
    frame_rgb_sequence_input = Input(shape=(max_frame_rgb_sequence_length, frame_rgb_embedding_size), dtype='float32')
    frame_audio_sequence_input = Input(shape=(max_frame_audio_sequence_length, frame_audio_embedding_size), dtype='float32')
    
    frame_x1 = lstm_layer_1(frame_rgb_sequence_input)
    frame_x2 = lstm_layer_2(frame_audio_sequence_input)
    
    # creating input layer for video-level data
    video_rgb_input = Input(shape=(video_rgb.shape[1],))
    video_rgb_dense = Dense(int(number_dense_units/2), activation=activation_function)(video_rgb_input)
    
    video_audio_input = Input(shape=(video_audio.shape[1],))
    video_audio_dense = Dense(int(number_dense_units/2), activation=activation_function)(video_audio_input)
    
    # merging frame-level bi-lstm output and later passed to dense layer by applying batch-normalisation and dropout
    merged_frame = concatenate([frame_x1, frame_x2])
    merged_frame = BatchNormalization()(merged_frame)
    merged_frame = Dropout(rate_drop_dense)(merged_frame)
    merged_frame_dense = Dense(int(number_dense_units/2), activation=activation_function)(merged_frame)
    
    # merging video-level dense layer output
    merged_video = concatenate([video_rgb_dense, video_audio_dense])
    merged_video = BatchNormalization()(merged_video)
    merged_video = Dropout(rate_drop_dense)(merged_video)
    merged_video_dense = Dense(int(number_dense_units/2), activation=activation_function)(merged_video)
    
    # merging frame-level and video-level dense layer output
    merged = concatenate([merged_frame_dense, merged_video_dense])
    merged = BatchNormalization()(merged)
    merged = Dropout(rate_drop_dense)(merged)
     
    merged = Dense(number_dense_units, activation=activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(rate_drop_dense)(merged)
    preds = Dense(label_feature_size, activation='sigmoid')(merged)
    
    model = Model(inputs=[frame_rgb_sequence_input, frame_audio_sequence_input, video_rgb_input, video_audio_input], outputs=preds)
    print(model.summary())
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    
    STAMP = 'lstm_%d_%d_%.2f_%.2f' % (number_lstm_units, number_dense_units, rate_drop_lstm, rate_drop_dense)

    checkpoint_dir = 'checkpoints/' + str(int(time.time())) + '/'

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    bst_model_path = checkpoint_dir + STAMP + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=False)
    tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))
  
    model.fit([train_frame_rgb, train_frame_audio, train_video_rgb, train_video_audio], train_labels,
              validation_data=([val_frame_rgb, val_frame_audio, val_video_rgb, val_video_audio], val_labels),
              epochs=200, batch_size=64, shuffle=False, callbacks=[early_stopping, model_checkpoint, tensorboard])    
    return model

Creating random data set for training 

Here I am creating a sample dataset of same size and dimension of training sample and will use it to train the model

In [7]:
import numpy as np
import random

sample_length = 1000

video_rgb = np.random.rand(sample_length, 1024)
video_audio = np.random.rand(sample_length, 128)

frame_rgb = np.random.rand(sample_length, 10, 1024)
frame_audio = np.random.rand(sample_length, 10, 128)

# Here I have considered that I have only 10 labels.
labels = np.zeros([sample_length,10])
for i in range(len(labels)):
    j = random.randint(0,9)
    labels[i][j] = 1 

Training Model

In [12]:
model = create_model(video_rgb, video_audio, frame_rgb, frame_audio, labels)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 10, 1024)]   0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 10, 128)]    0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 1024)]       0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 128)]        0                                            
____________________________________________________________________________________________

#As the model trains, the loss and accuracy metrics are displayed. This model reaches an accuracy of about 0.99 (or 99%) on the training data.#

In [14]:
model.summary()


Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 10, 1024)]   0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 10, 128)]    0                                            
__________________________________________________________________________________________________
input_7 (InputLayer)            [(None, 1024)]       0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 128)]        0                                            
____________________________________________________________________________________________

Testing with created random test data

In [15]:
test_video_rgb = np.random.rand(1, 1024)
test_video_audio = np.random.rand(1, 128)

test_frame_rgb = np.random.rand(1, 10, 1024)
test_frame_audio = np.random.rand(1, 10, 128)

preds = list(model.predict([test_frame_rgb, test_frame_audio, test_video_rgb, test_video_audio], verbose=1).ravel())
index, value = max(enumerate(preds), key=operator.itemgetter(1))
print("Predicted Label - %s with probability - %s" % (str(index), str(value)))

Predicted Label - 6 with probability - 0.118174404


Ploting the perfomance of model

SyntaxError: ignored

In [16]:
model.eval()

drop_remainder = True

scores = model.evaluate(x = (video_rgb, video_audio, frame_rgb, frame_audio), y = labels)
print("Bi-LSTM:test_loss: %f, accuracy: %f" % (scores[0], scores[1]) )

AttributeError: ignored

In [None]:
from sklearn import metrics
print("")
print("Confusion Matrix:")
confusion_matrix = metrics.confusion_matrix(labels, pred)
print(confusion_matrix)
normalised_confusion_matrix = np.array(confusion_matrix, dtype=np.float32)/np.sum(confusion_matrix)*100

print("")
print("Confusion matrix (normalised to % of total test data):")
print(normalised_confusion_matrix)
print("Note: training and testing data is not equally distributed amongst classes, ")
print("so it is normal that more than a 6th of the data is correctly classifier in the last category.")




Confusion Matrix:


NameError: ignored

In [None]:
testPredict = model.predict([test_frame_rgb, test_frame_audio, test_video_rgb, test_video_audio], verbose=1).ravel()
trainPredict = model.predict([frame_rgb, frame_audio, video_rgb, video_audio], verbose=1).ravel()



In [None]:
import numpy as np
def prediction(model):
    prediction = model.predict([test_frame_rgb, test_frame_audio, test_video_rgb, test_video_audio], verbose=1)
    return prediction
prediction_bilstm = prediction(model)
# Define a function to calculate MAE and RMSE
def evaluate_prediction(predictions, actual, model):
    errors = predictions - actual
    mse = np.square(errors).mean()
    rmse = np.sqrt(mse)
    mae = np.abs(errors).mean()

    print('Mean Absolute Error: {:.4f}'.format(mae))
    print('Root Mean Square Error: {:.4f}'.format(rmse))
    print('')
evaluate_prediction(prediction_bilstm, y_test, 'Bidirectional LSTM')

NameError: ignored