### Tech Excellence Advanced Data Science - Generative AI Video Classification Project

##### Project Authors: Tim Tieng, Afia Owusu-Forfie

**Objective**: Develop a model to classify video content into categories such as sports, news, movies, etc., and enhance this classification by generating descriptive captions or summaries that provide additional context about the content. This can be particularly useful for content curation platforms, accessibility applications (e.g., providing descriptions for the hearing impaired), or educational tools where supplementary information enhances learning.

**Data**: Public Dataset: Use a dataset like the YouTube-8M, which has a vast collection of labeled video data suitable for training video classification models.

In [None]:
# !pip install chardet

In [None]:
# Import Packages for project

# Standard Libraries
import csv
import numpy as np
import os
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithms, Modeling and Data Pre-processing
import feature_engine
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import YeoJohnsonTransformer
from scipy.stats import anderson, chi2_contingency
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score,precision_score, roc_auc_score,recall_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Deep Learning
import keras
from keras import layers
from keras.layers import RandomFlip, RandomRotation, Rescaling, BatchNormalization, Conv2D, MaxPooling2D, Dense, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam, SGD
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import callbacks
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Model Optimization and Hyperparameter Tuning
import hyperopt
from hyperopt import STATUS_OK, Trials, fmin, tpe, hp
import mlflow

import tensorboard

In [None]:
# Obtain the data - due to issues with the larger dataset, the project group is using
# the sample dataset provided via the Kaggle competition that used YouTube-8M data.

# Directories that hold the sample TFRecord shards
frame_dir = "../data/frame-sample/frame"
validate_dir = "../data/validate-sample/validate"

# Individual shard paths used by the cells below
frame_level_record1 = f"{frame_dir}/train00.tfrecord"
frame_level_record2 = f"{frame_dir}/train01.tfrecord"
validation_level_record1 = f"{validate_dir}/validate00.tfrecord"
validation_level_record2 = f"{validate_dir}/validate01.tfrecord"

# Sanity check: list what is actually present on disk in each directory
print(f"Frame Directory Data Present: {os.listdir(frame_dir)}")
print(f"Validation Directory Data Present: {os.listdir(validate_dir)}")


This confirms that we have frame-level data and validation data loaded into our project directory.

Extract Video-Level Information from the frame-level files: train00.tfrecord

### Tim Test/Experimental code

In [None]:
def parse_tfrecord(example_proto):
    """
    Decodes one serialized record from a TFRecord file into a dictionary of tensors.

    The record is parsed with `tf.io.parse_single_example` against a fixed schema. Each key of the
    schema names a feature expected in the record, and its value gives the dtype and shape used to
    decode it.

    Parameters:
    example_proto (tf.Tensor): A scalar string tensor holding one serialized `tf.train.Example` protobuf.

    Returns:
    dict: One entry per schema key:
        - 'id': scalar string tensor.
        - 'labels': sparse int64 tensor (variable length).
        - 'rgb': dense float32 tensor of shape [1024]; all zeros when the record carries no 'rgb' feature.
        - 'audio': dense float32 tensor of shape [128]; all zeros when the record carries no 'audio' feature.
        - 'segment_start_times', 'segment_end_times', 'segment_labels': sparse int64 tensors.
        - 'segment_scores': sparse float32 tensor.
      The segment keys are always part of the schema; a record without them yields empty sparse tensors.

    NOTE(review): YouTube-8M frame-level shards are presumably stored as `tf.train.SequenceExample`
    records (per-frame 'rgb'/'audio' in feature lists). If so, this Example-based schema will only ever
    return the zero defaults for 'rgb' and 'audio' - confirm against the dataset documentation.

    Example:
    Map this function over a dataset created from a TFRecord file:
    dataset = tf.data.TFRecordDataset("path_to_tfrecord_file.tfrecord")
    parsed_dataset = dataset.map(parse_tfrecord)
    """

    # Scalar id and variable-length label list
    schema = {
        'id': tf.io.FixedLenFeature([], tf.string),
        'labels': tf.io.VarLenFeature(tf.int64),
    }

    # Fixed-width float vectors; a zero vector is substituted when the feature is absent
    for key, width in (('rgb', 1024), ('audio', 128)):
        schema[key] = tf.io.FixedLenFeature(
            [width], tf.float32, default_value=np.zeros([width], dtype=np.float32)
        )

    # Segment-level features, all variable length
    for key, dtype in (
        ('segment_start_times', tf.int64),
        ('segment_end_times', tf.int64),
        ('segment_labels', tf.int64),
        ('segment_scores', tf.float32),
    ):
        schema[key] = tf.io.VarLenFeature(dtype)

    parsed = tf.io.parse_single_example(example_proto, schema)
    return parsed

In [None]:
def load_dataset(file_path):
    """
    Purpose: Load a tfrecord file as a dataset of parsed records for future manipulation
    Arguments: file_path - a filepath, or a variable that stores a filepath, to a tfrecord file
    Returns: the TFRecordDataset for file_path with parse_tfrecord() mapped over every record
    """
    # Read the raw serialized records, then decode each one with parse_tfrecord
    return tf.data.TFRecordDataset(file_path).map(parse_tfrecord)

In [None]:
# Create a dataset object for the first frame-level shard (train00.tfrecord);
# load_dataset() reads the file and maps parse_tfrecord() over every record
dataset = load_dataset(frame_level_record1)

In [None]:
# Inspect the structure (per-feature type specification) of the parsed train00.tfrecord dataset
dataset.element_spec

In [None]:
# Check which class of object load_dataset() returned
type(dataset)

In [None]:
# The dataset holds many observations, so preview only the first 10 parsed records
for index, item in enumerate(dataset.as_numpy_iterator()):
    if index >= 10:
        # Stop once ten records have been shown
        break
    print(item)

### Observations

We can see that each record can have numerous label values. The values in the label list of each item can be mapped to the names in the vocabulary data dictionary.

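**Audio** - this is an array of zeros. Note that `parse_tfrecord` supplies `np.zeros` as the `default_value` for this feature, so the zeros may simply mean that no `audio` feature was found in the record under this schema, rather than that zeros were pre-populated during the collection phase.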

**RGB** - like audio, this is an array of zeros (`np.zeros`).

**Segment_X** - these are all empty arrays. These features are labeled as optional as per the YouTube-8M documentation.

In [None]:
# Load the vocabulary from the 'vocabulary.csv' file into a dict that maps
# each label's index value (kept as a string, as read from the file) to its name
vocabulary = {}
# newline='' is what the csv module expects for files it reads; utf-8 matches the
# default encoding pandas uses when the same file is read with pd.read_csv
with open('../data/vocabulary.csv', mode='r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    # The file starts with a header row (the pandas cells rely on its 'Name' column);
    # skip it so the column titles are not stored as a vocabulary entry
    next(reader, None)
    for row in reader:
        if len(row) <= 3:
            # Blank or truncated line: no name column to read
            continue
        numerical_value = row[0]  # Index values are in the first column
        name = row[3]             # Names are in the fourth column (position 3)
        vocabulary[numerical_value] = name

In [None]:
# Display the dictionary to confirm that each index value was stored with its name
vocabulary

### EDA on Vocabulary.csv

In [None]:
# Read the vocabulary csv file into a DataFrame
vocab_path = "../data/vocabulary.csv"
vocab= pd.read_csv(vocab_path)
# NOTE(review): read_csv already returns a DataFrame, so this wrap looks redundant;
# kept because later cells may rely on `vocab` and `vocab_df` being separate objects
vocab_df = pd.DataFrame(vocab)

# Display the DataFrame
vocab_df

In [None]:
# Summarize columns, dtypes and non-null counts; memory_usage='deep' requests a
# real memory measurement instead of an estimate
vocab_df.info(memory_usage='deep')

In [None]:
# Fraction of null values per column (mean of the boolean isna() mask)
vocab_df.isna().mean()

In [None]:
# Count of null values per column
vocab_df.isna().sum()

In [None]:
# Get the distinct values of the 'Name' column in the vocabulary df
# (unique() keeps NaN as a value if the column has missing names)
unique_labels = vocab_df['Name'].unique()
unique_labels

In [None]:
# Number of distinct label names; nunique() leaves NaN out of the count by default
unique_label_count = vocab_df['Name'].nunique()
print(f"Unique Video Labels: {unique_label_count}")

In [None]:
# Get the top 50 video labels present

# Counter is not part of the notebook's import cell, so bring it in here;
# chain.from_iterable flattens the per-video label lists without a nested comprehension
from collections import Counter
from itertools import chain

# NOTE(review): `labels` and `label_mapping` are not defined in this cell. Presumably
# `labels` is an iterable of per-video label-id lists and `label_mapping` maps an
# integer label id to its name - confirm they are created before this cell runs.

n = 50  # although, we'll only show those that appear in the 1,000 for this competition

# (label id, occurrence count) pairs for the n most frequent labels, most common first
top_n = Counter(chain.from_iterable(labels)).most_common(n)
top_n_labels = [int(label_id) for label_id, _count in top_n]

# Filter out the labels that aren't in the 1,000 used for this competition
top_n_label_names = [label_mapping[x] for x in top_n_labels if x in label_mapping]
print(top_n_label_names)

In [None]:
# Visualization of the label frequencies

# Tabulate the (label, count) pairs: one row per label
labels_count_dict = dict(top_n)
labels_count_df = (
    pd.DataFrame
    .from_dict(labels_count_dict, orient='index')
    .reset_index()
)
labels_count_df.columns = ['label', 'count']

# Replace each label id with its name via label_mapping
labels_count_df['label'] = labels_count_df['label'].map(label_mapping, na_action='ignore')
TOP_labels = labels_count_df['label'].tolist()[:n]

# Horizontal bar chart: one bar per label, bar length = sample count
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=labels_count_df, x='count', y='label')
plt.title(f'Top {n} labels with sample count')