In [None]:
# Import Packages for project

# Standard Libraries
import csv
import cv2
import imageio
from IPython.display import Image
import glob
import numpy as np
import os
import pandas as pd
import re
import utils

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Algorithms, Modeling and Data Pre-processing
import feature_engine
from feature_engine.encoding import OrdinalEncoder
from feature_engine.transformation import YeoJohnsonTransformer
from scipy.stats import anderson, chi2_contingency
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score,precision_score, roc_auc_score,recall_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Deep Learning
import keras
from keras import layers
from keras.layers import RandomFlip, RandomRotation, Rescaling, BatchNormalization, Conv2D, MaxPooling2D, Dense, Input
from keras.models import Model, Sequential
from keras.optimizers import Adam, SGD
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import callbacks
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Model Optimization and Hyperparameter Tuning
import hyperopt
from hyperopt import STATUS_OK, Trials, fmin, tpe, hp
import mlflow

import tensorboard

In [None]:
# Obtain the Data
# Read the K400 annotation CSV (path is relative to the notebook's working directory).
filepath = '../data/K400/video_annotations.csv'
raw_csv = pd.read_csv(filepath)
# pd.read_csv already returns a DataFrame; this wraps it in a second DataFrame object.
k400_df = pd.DataFrame(raw_csv)

# Summary of columns, dtypes and non-null counts; 'deep' also measures object (string) columns.
k400_df.info(memory_usage='deep')

In [None]:
# Initial inspection of the dataframe: display its first five rows.
# `head` must be called; a bare `k400_df.head` only shows the bound-method repr.
k400_df.head()

In [None]:
# Check for null values/percentage of null values:
# The mean of the boolean null mask is the fraction (0.0 - 1.0) of missing entries per column.

k400_df.isna().mean()

In [None]:
# Absolute number of missing entries per column
k400_df.isna().sum()

### Observations 

1. No null values
2. Over 240k Observations
3. 6 Attributes of string/int datatypes

In [None]:
# Count the distinct values in each column; a column with fewer distinct
# values than rows contains repeated entries.
num_unique = k400_df.nunique()
num_unique

### Observations

1. There are 400 unique labels
2. There are about 20K youtube_id values, but only about 850 videos
3. Each video is only 10 seconds long, as shown by the difference between the time_start and time_end values

**Next Steps**: To reduce the dimensionality, I need to create a function that maps each video file to a youtube_id value in the video_annotations.csv file and builds a new dataframe containing only the rows where there is a match. The names of the video files require data cleaning first.

In [None]:
# Pull out the youtube_id column; .count() reports how many non-null entries it holds.
youtube_id_values = k400_df['youtube_id']
print(f"Total Youtube ID Values in Dataset: {youtube_id_values.count()}")

In [None]:
# Get the number of distinct youtube_id values
number_unique_id = youtube_id_values.nunique()
print(f"Unique Youtube ID Values: {number_unique_id}")

In [None]:
# Collect the distinct youtube_id values themselves and display them
unique_youtube_id = youtube_id_values.unique()
unique_youtube_id

In [None]:
# Get the number of video files we are working with
def count_video_files(directory):
    """
    Purpose - to get a video file count within a given directory
    Arguments - directory variable that holds the filepath to a video directory
    Returns - video_count of type integer (0 when the directory is empty or does not exist)

    Notes - only the top level of the directory is inspected (no recursion), the
            extension check is case-insensitive, and names starting with a dot
            are not matched by the '*' glob pattern.
    """
    # Allowed video file extensions; a tuple can be passed straight to str.endswith
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv')

    # Escape the directory so characters such as '[', '?' or '*' in the path are
    # matched literally instead of being interpreted as glob wildcards.
    pattern = os.path.join(glob.escape(directory), '*')

    # Count regular files (not sub-directories) whose name ends with a video extension
    return sum(
        1
        for file_path in glob.glob(pattern)
        if os.path.isfile(file_path) and file_path.lower().endswith(video_extensions)
    )


In [None]:
# Test functionality and return the video count

# Provide the directory path to count video files
directory_path = '../data/K400/videos'

# Call the function to count video files
num_videos = count_video_files(directory_path)
print(f'Total number of video files: {num_videos} videos present')

### Video Observations

1. The youtube_id values in the video_annotations.csv file appear to match the leading portion of the video file names.
2. The video file names carry a timestamp that records which 10-second window of the video was captured.

**Next Steps**: To load the local video data correctly, I need to use regular expressions to rename the video files so that their names exclude the timestamps.

In [None]:
def remove_timestamp(filename):
    """
    Purpose: to remove the timestamp suffix at the end of our local video files
    Arguments: filename - bare file name (no directory part), expected to look
               like '{youtube_id}_{start}_{end}{extension}'
    Returns: Cleaned filename with the trailing '_{start}_{end}' pair removed and
             the extension kept; names without such a suffix are returned unchanged

    Only a trailing pair of all-digit groups is stripped, so underscores and
    digit runs inside the id itself are preserved.
    """
    # Separate the extension first; otherwise the end timestamp stays glued to
    # it ('000011.mp4') and is never recognised as numeric.
    stem, extension = os.path.splitext(filename)

    # Drop the '_{start}_{end}' pair anchored at the very end of the stem
    cleaned_stem = re.sub(r'_\d+_\d+\Z', '', stem)

    # A name made of nothing but the timestamp pair is left untouched rather
    # than being reduced to a bare extension.
    if not cleaned_stem:
        return filename

    return cleaned_stem + extension


In [None]:
def rename_files(directory):
    """
    Purpose: To rename all the local video files in our directory for future loading
    Arguments: Filepath to the video directory
    Returns: None

    Other Functions: Calls the remove_timestamp()

    Notes: files are renamed on disk in place. A file is skipped (and reported)
           when its cleaned name is already taken, so an existing file is never
           overwritten.
    """
    # os.listdir returns a full list up front, so renaming while looping is safe
    for filename in os.listdir(directory):
        source_path = os.path.join(directory, filename)

        # Only regular files are renamed; sub-directories are left alone
        if not os.path.isfile(source_path):
            continue

        # Remove timestamp from the filename
        new_filename = remove_timestamp(filename)

        # Nothing to do when the name is already clean
        if new_filename == filename:
            continue

        target_path = os.path.join(directory, new_filename)

        # os.rename silently replaces an existing target on POSIX systems;
        # refuse to do that so two clips cannot collapse into one file.
        if os.path.exists(target_path):
            print(f'Skipped {filename}: {new_filename} already exists')
            continue

        os.rename(source_path, target_path)

In [None]:
# Apply the timestamp clean-up to the local video directory.
# NOTE: this renames files on disk in place.
video_directory = "../data/K400/videos"

rename_files(video_directory)

In [None]:
video_directory = '../data/K400/videos'

# List the directory once instead of once per annotation row; the in-memory
# list is kept in sync with every rename performed below.
video_filenames = os.listdir(video_directory)

# Iterate through each YouTube ID
for youtube_id in youtube_id_values:
    # Find the corresponding video file in the directory (first name containing the id)
    for position, filename in enumerate(video_filenames):
        if youtube_id in filename:
            # Extract the file extension
            file_extension = os.path.splitext(filename)[1]

            # Construct the new file name without the timestamp
            new_filename = youtube_id + file_extension

            # Skip files that already carry the target name: renaming a file
            # to itself does nothing and would only log a misleading message.
            if new_filename != filename:
                # Construct the full paths for old and new files
                old_filepath = os.path.join(video_directory, filename)
                new_filepath = os.path.join(video_directory, new_filename)

                # Rename the file and record the new name in the cached listing
                os.rename(old_filepath, new_filepath)
                video_filenames[position] = new_filename
                print(f'Renamed {filename} to {new_filename}')
            break

### Observations

Removed the start_time portion of the timestamp, but left the end_time timestamp in the video file name

### Define Hyperparameters

In [None]:
IMG_SIZE = 224  # side length in pixels of the square frames; used for resizing and as the feature extractor input shape
BATCH_SIZE = 64  # not referenced in the cells shown here; presumably the training batch size (TODO confirm)
EPOCHS = 10  # not referenced in the cells shown here; presumably the number of training epochs (TODO confirm)

MAX_SEQ_LENGTH = 20  # number of frame slots kept per video in the feature and mask arrays
NUM_FEATURES = 2048  # length of each per-frame feature vector; must match the feature extractor's output size

### Data Preparation

In [None]:
# Fraction of the annotation rows that is held out for testing
split_percent = .30

# Draw the held-out rows at random (seeded, so the split is repeatable);
# every row that was not drawn becomes training data.
test_df = k400_df.sample(frac=split_percent, random_state=42)
train_df = k400_df.drop(index=test_df.index).reset_index(drop=True)

# Renumber the held-out rows from zero as well. This happens last because the
# original index labels are needed above to carve out the training rows.
test_df = test_df.reset_index(drop=True)

In [None]:
# Number of rows held out for testing
len(test_df)

In [None]:
# Number of rows left for training
len(train_df)

### Notes

train_df has 13934 Rows with 6 Attributes

test_df has 5972 rows

In [None]:
def crop_center_square(frame):
    """
    Purpose: to cut a square region out of the middle of a frame
    Arguments: frame - array whose first two axes are rows and columns; any
               further axes (e.g. colour channels) are kept as they are
    Returns: slice of the frame whose height and width both equal the shorter
             of the two original dimensions
    """
    height, width = frame.shape[:2]
    side = min(frame.shape[:2])

    # Offsets of the square's top-left corner, measured from the frame centre
    top = height // 2 - side // 2
    left = width // 2 - side // 2

    rows = slice(top, top + side)
    cols = slice(left, left + side)
    return frame[rows, cols]

In [None]:
def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)):
    """
    Purpose: to read a video file into an array of square, resized frames
    Arguments: path - location of the video file
               max_frames - stop after this many frames; the default 0 never
                            matches a frame count, so the whole video is read
               resize - (width, height) passed to cv2.resize
    Returns: numpy array stacking the processed frames; empty when no frame
             could be read

    Other Functions: Calls the crop_center_square()
    """
    capture = cv2.VideoCapture(path)
    collected = []
    try:
        ok, raw_frame = capture.read()
        while ok:
            squared = crop_center_square(raw_frame)
            resized = cv2.resize(squared, resize)
            # Reverse the channel order of the last axis (BGR -> RGB for OpenCV frames)
            collected.append(resized[:, :, [2, 1, 0]])

            if len(collected) == max_frames:
                break
            ok, raw_frame = capture.read()
    finally:
        # Always free the capture handle, even if a frame fails to process
        capture.release()
    return np.array(collected)

In [None]:
def build_feature_extractor():
    """
    Purpose: to build the model that turns a single frame into a feature vector
    Arguments: None (reads the IMG_SIZE constant)
    Returns: keras Model named "feature_extractor" that applies the InceptionV3
             preprocessing and then the ImageNet-weighted InceptionV3 backbone
             (no classification head, average pooling)
    """
    frame_shape = (IMG_SIZE, IMG_SIZE, 3)

    backbone = keras.applications.InceptionV3(
        include_top=False,
        weights="imagenet",
        pooling="avg",
        input_shape=frame_shape,
    )

    frame_input = keras.Input(frame_shape)
    scaled = keras.applications.inception_v3.preprocess_input(frame_input)

    return keras.Model(frame_input, backbone(scaled), name="feature_extractor")


feature_extractor = build_feature_extractor()


In [None]:
# Lookup layer that maps label strings to integer class indices.
# The vocabulary is the set of distinct labels found in the training split
# (np.unique returns them sorted). num_oov_indices=0 reserves no
# out-of-vocabulary slot, so this presumably assumes every label that is
# looked up later also occurs in train_df (TODO confirm for the test split).
label_processor = keras.layers.StringLookup(
    num_oov_indices=0, vocabulary=np.unique(train_df["label"])
)
print(label_processor.get_vocabulary())


In [None]:
def _resolve_video_path(root_dir, video_id):
    """Return the path of the file for video_id in root_dir, trying '<id>.*' when the bare id is not a file."""
    exact_path = os.path.join(root_dir, video_id)
    if os.path.isfile(exact_path):
        return exact_path

    # The annotation ids carry no file extension while the files on disk do,
    # so fall back to any file named '<id>.<extension>'. Both parts are escaped
    # so glob metacharacters in them are matched literally.
    pattern = os.path.join(glob.escape(root_dir), glob.escape(video_id) + ".*")
    matches = sorted(glob.glob(pattern))

    # With no match, hand back the plain join so the caller behaves as before
    return matches[0] if matches else exact_path


def prepare_all_videos(df, root_dir):
    """
    Purpose: to turn every video listed in a dataframe into fixed-size frame
             features, padding masks and encoded labels
    Arguments: df - dataframe with "youtube_id" and "label" columns
               root_dir - directory that holds the video files
    Returns: ((frame_features, frame_masks), labels) where
             frame_features is float32 of shape (len(df), MAX_SEQ_LENGTH, NUM_FEATURES),
             frame_masks is bool of shape (len(df), MAX_SEQ_LENGTH) with True for
             real frames and False for padding, and
             labels is the output of label_processor converted to numpy

    Other Functions: Calls load_video() and the module-level feature_extractor
                     and label_processor

    Notes: a video that yields no frames keeps an all-zero feature row and an
           all-False mask row; the number of such videos is printed at the end.
    """
    num_samples = len(df)
    video_paths = df["youtube_id"].values.tolist()
    labels = df["label"].values
    labels = keras.ops.convert_to_numpy(label_processor(labels[..., None]))

    # `frame_masks` and `frame_features` are what we will feed to our sequence model.
    # `frame_masks` will contain a bunch of booleans denoting if a timestep is
    # masked with padding or not.
    frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool")
    frame_features = np.zeros(
        shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32"
    )

    videos_without_frames = 0

    # For each video.
    for idx, path in enumerate(video_paths):
        # Only the first MAX_SEQ_LENGTH frames are ever used, so stop decoding there
        frames = load_video(
            _resolve_video_path(root_dir, path), max_frames=MAX_SEQ_LENGTH
        )
        length = min(MAX_SEQ_LENGTH, frames.shape[0])

        if length == 0:
            # Missing or unreadable video: leave the zero padding in place
            videos_without_frames += 1
            continue

        # Extract the features of all kept frames in a single batched call
        frame_features[idx, :length] = feature_extractor.predict(
            frames[:length], verbose=0
        )
        frame_masks[idx, :length] = True  # True = not masked, False = masked

    if videos_without_frames:
        print(
            f"Warning: {videos_without_frames} of {num_samples} videos under "
            f"{root_dir!r} yielded no frames and were left as zero padding"
        )

    return (frame_features, frame_masks), labels

In [None]:
# Extract frame features, masks and encoded labels for both splits.
# NOTE(review): the root directories "train" and "test" do not appear anywhere
# else in the cells shown here; earlier cells work on '../data/K400/videos'.
# Confirm these folders exist, since load_video appears to return an empty
# array for a path it cannot read, which would leave the features all zero.
train_data, train_labels = prepare_all_videos(train_df, "train")
test_data, test_labels = prepare_all_videos(test_df, "test")

In [None]:
# train_data is the (frame_features, frame_masks) pair returned by prepare_all_videos
print(f"Frame features in train set: {train_data[0].shape}")
print(f"Frame masks in train set: {train_data[1].shape}")