In [1]:
import cv2
from sklearn.cluster import KMeans
from keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
import numpy as np
import pandas as pd
import os
from math import floor,sqrt
from moviepy.editor import VideoFileClip
import librosa
from sklearn.preprocessing import StandardScaler, LabelEncoder

### Annotations loader

In [2]:
# Load annotations
def load_annotations(anno_file, info_file):
    """
    Read the tab-separated annotation and info files into labelled DataFrames.

    Both files are read without a header row; the column names are assigned here.

    :param anno_file: Path to the annotation TSV (three columns).
    :param info_file: Path to the info TSV (five columns).
    :return: Tuple ``(annotations, info)`` of pandas DataFrames.
    """
    # Both files are parsed first, then labelled.
    annotations, info = (
        pd.read_csv(tsv_path, sep='\t', header=None)
        for tsv_path in (anno_file, info_file)
    )

    annotations.columns = ['video_id', 'category', 'importance_score']
    info.columns = ['category_code', 'video_id', 'title', 'url', 'length']

    return annotations, info

### Preprocessing and Feature Extraction


#### a. Video Processing and Frame Extraction


In [3]:
def extract_frames(video_path, frame_rate=25):
    """
    Sample frames from a video at a fixed stride.

    Despite its name, ``frame_rate`` is a sampling step, not frames-per-second:
    decoded frames 0, ``frame_rate``, ``2 * frame_rate``, ... are kept.

    :param video_path: Path to the video file, passed to ``cv2.VideoCapture``.
    :param frame_rate: Keep one frame out of every ``frame_rate`` decoded frames.
    :return: List of the kept frames, in decoding order (empty if no frame
        could be read).
    :raises ValueError: If ``frame_rate`` is smaller than 1.
    """
    if frame_rate < 1:
        raise ValueError(f"frame_rate must be at least 1, got {frame_rate!r}")

    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while True:
            success, image = video.read()
            if not success:
                break
            if count % frame_rate == 0:
                frames.append(image)
            count += 1
    finally:
        # Release the capture handle even if decoding raises part-way through.
        video.release()
    return frames

#### b. Audio Extraction


In [4]:

def extract_audio_from_video(video_path, output_audio_path):
    """
    Write the audio track of a video to ``output_audio_path``.

    :param video_path: Path to the video file, opened with ``VideoFileClip``.
    :param output_audio_path: Destination audio file; a missing parent
        directory is created first.
    :raises ValueError: If the clip exposes no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            raise ValueError(f"Video has no audio track: {video_path!r}")

        output_dir = os.path.dirname(output_audio_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        audio.write_audiofile(output_audio_path)
    finally:
        # Close the clip on every path, including failures above.
        video.close()

#### c. Audio Features

In [5]:
def extract_audio_features(audio_path):
    """
    Summarise an audio file as a single averaged MFCC vector.

    :param audio_path: Path to the audio file.
    :return: Array holding the mean of each of the 13 MFCCs.
    """
    n_coefficients = 13  # number of MFCCs requested from librosa

    signal, sample_rate = librosa.load(audio_path)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=n_coefficients)

    # Average the transposed MFCC matrix along its first axis so that one
    # value per coefficient remains.
    return np.mean(coefficients.T, axis=0)

#### d. Feature Extraction (example with visual features using a CNN)


In [6]:
# Shared VGG16 network with ImageNet weights and without its top layers;
# used by extract_visual_features as the frame feature extractor.
model = VGG16(include_top=False, weights='imagenet')

def extract_visual_features(frames, batch_size=32):
    """
    Compute one flattened VGG16 feature vector per frame.

    Frames are assumed to be BGR images as produced by OpenCV. They are resized
    to 224x224 and converted to RGB, because the VGG16 ``preprocess_input``
    expects RGB input. Frames are pushed through the network in batches rather
    than one ``predict`` call per frame.

    :param frames: Iterable of frames; ``None`` entries are skipped.
    :param batch_size: Maximum number of frames per ``predict`` call.
    :return: List of 1-D feature arrays, one per non-``None`` frame, in input order.
    """
    usable_frames = [frame for frame in frames if frame is not None]

    features = []
    for start in range(0, len(usable_frames), batch_size):
        chunk = usable_frames[start:start + batch_size]
        batch = np.stack([
            img_to_array(
                cv2.cvtColor(cv2.resize(frame, (224, 224)), cv2.COLOR_BGR2RGB)
            )
            for frame in chunk
        ])
        batch = preprocess_input(batch)

        # `use_multiprocessing` / `workers` are deliberately not passed: they
        # only concern generator inputs and are not accepted by Keras 3.
        outputs = model.predict(batch, batch_size=batch_size, verbose=0)
        features.extend(output.flatten() for output in outputs)

    return features

### Connect Audio/Annotation/Video

#### Padding Features to fit the model

In [8]:
def padding(visual_features, audio_features, annotation_features):
    """
    Zero-pad all feature vectors on the right to one shared length.

    The shared length is the largest element count found among the visual
    vectors, the audio vector and the annotation vectors.

    :param visual_features: Sequence of visual feature vectors (or scalars).
    :param audio_features: A single audio feature vector (or scalar).
    :param annotation_features: Sequence of annotation vectors (or scalars).
    :return: Tuple ``(visual_padded, audio_padded, annotation_padded)`` where
        the first and last items are lists of float arrays and the middle one
        is a single float array.
    """
    def as_float_array(values):
        # ndmin=1 promotes scalars to one-element arrays.
        return np.array(values, ndmin=1, dtype=float)

    visual_arrays = list(map(as_float_array, visual_features))
    audio_array = as_float_array(audio_features)
    annotation_arrays = list(map(as_float_array, annotation_features))

    longest_visual = max(arr.size for arr in visual_arrays)
    longest_annotation = max(arr.size for arr in annotation_arrays)
    max_length = max(longest_visual, audio_array.size, longest_annotation)

    def pad_right(arr):
        return np.pad(arr, (0, max_length - arr.size), 'constant')

    visual_padded = [pad_right(arr) for arr in visual_arrays]
    audio_padded = pad_right(audio_array)
    annotation_padded = [pad_right(arr) for arr in annotation_arrays]

    return visual_padded, audio_padded, annotation_padded


#### Annotation To List

In [9]:
def annotation2List(annotation_features):
    """
    Flatten a sequence of annotation entries into a list of floats.

    An entry may be a comma-separated string such as '1,1,1,3,2,2,4,4,1',
    a list of values, or a single value; everything is converted with ``float``.

    :param annotation_features: Iterable of annotation entries.
    :return: Flat list of floats in input order.
    """
    flattened = []
    for entry in annotation_features:
        # Strings are split on commas first, which turns them into lists.
        parts = entry.split(',') if isinstance(entry, str) else entry
        if isinstance(parts, list):
            flattened.extend(float(part) for part in parts)
        else:
            flattened.append(float(parts))
    return flattened

#### Kmeans and feature connection

In [10]:
def integrate_and_cluster_features(video_path, anno_file, info_file, num_clusters=1):
    """
    Integrate visual, audio, and annotation features from a video,
    and perform clustering on the combined per-frame features.

    Every sampled frame is described by its visual feature vector, the video's
    audio feature vector, and an annotation vector ``[category code, mean
    importance score at that frame]``. Only annotation rows whose ``video_id``
    equals the video's file name (without extension) are used; if there are
    none, the annotation vector is all zeros.

    :param video_path: Path to the video file.
    :param anno_file: Path to the annotation file.
    :param info_file: Path to the info file.
    :param num_clusters: Number of clusters to use in KMeans; ``None`` derives
        it from the number of sampled frames.
    :return: ``[frames, labels]`` - the sampled frames and one cluster label per frame.
    :raises ValueError: If no frame could be read, or if the number of visual
        feature vectors does not match the number of frames.
    """
    # Sampling step handed to extract_frames; also needed to map a sampled
    # frame back to its position in the original video.
    frame_step = 25

    # Extract frames from the video
    frames = extract_frames(video_path, frame_step)
    n_frames = len(frames)
    if n_frames == 0:
        raise ValueError(f"No frames could be read from {video_path!r}")

    # Extract visual features
    visual_features = extract_visual_features(frames)
    if len(visual_features) != n_frames:
        raise ValueError(
            f"Expected {n_frames} visual feature vectors for {video_path!r}, "
            f"got {len(visual_features)}"
        )

    # Extract audio and its features
    audio_output_path = 'datasets/extractedAudio/extracted_audio.wav'
    extract_audio_from_video(video_path, audio_output_path)
    audio_features = extract_audio_features(audio_output_path)

    # Load annotations; the info table is not needed for clustering
    annotations, _ = load_annotations(anno_file, info_file)

    # Encode the category over the whole file so codes are consistent across videos
    annotations['category'] = LabelEncoder().fit_transform(annotations['category'])

    # Keep only the annotation rows that belong to this video
    video_id = os.path.splitext(os.path.basename(video_path))[0]
    video_rows = annotations[annotations['video_id'].astype(str) == video_id]

    category_code = 0.0
    frame_importance = np.zeros(n_frames)
    if not video_rows.empty:
        category_code = float(video_rows['category'].iloc[0])

        # One list of scores per annotation row, parsed from the comma-separated string
        score_lists = [annotation2List([scores]) for scores in video_rows['importance_score']]
        n_scores = min(len(scores) for scores in score_lists)
        if n_scores > 0:
            mean_scores = np.mean([scores[:n_scores] for scores in score_lists], axis=0)
            # NOTE(review): assumes importance_score holds one value per frame
            # of the original video - TODO confirm against the dataset README.
            positions = np.minimum(np.arange(n_frames) * frame_step, n_scores - 1)
            frame_importance = mean_scores[positions]

    annotation_vectors = [[category_code, importance] for importance in frame_importance]

    # Pad once for the whole video instead of once per frame
    visual_padded, audio_padded, annotation_padded = padding(
        visual_features, audio_features, annotation_vectors
    )

    # Concatenate visual, audio, and annotation features for every frame
    combined_features_array = np.array([
        np.concatenate([visual_row, audio_padded, annotation_row])
        for visual_row, annotation_row in zip(visual_padded, annotation_padded)
    ])

    # Normalize the features
    # NOTE: the audio vector and category code are identical for every frame of
    # a video, so after standardization they do not influence the clustering.
    combined_features_normalized = StandardScaler().fit_transform(combined_features_array)

    # Calculate the number of clusters if not provided
    if num_clusters is None:
        num_clusters = floor(sqrt(n_frames / 1.3))
    # KMeans needs between 1 and n_frames clusters
    num_clusters = max(1, min(int(num_clusters), n_frames))

    # Perform clustering; a fixed seed keeps the labels reproducible across runs
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0)
    kmeans.fit(combined_features_normalized)

    # return the frames and the labels
    return [frames, kmeans.labels_]


### Summarization

In [11]:
def create_summary(frames, labels):
    """
    Build a summary from the frames of the largest cluster.

    :param frames: List of frames.
    :param labels: Cluster label of each frame, in the same order as ``frames``.
    :return: The frames whose label is the most frequent one, in input order.
    """
    from collections import Counter

    # Tally how many frames fall into each cluster and pick the biggest one.
    frames_per_cluster = Counter(labels)
    dominant_cluster = max(frames_per_cluster, key=frames_per_cluster.get)

    summary_frames = []
    for frame, label in zip(frames, labels):
        if label == dominant_cluster:
            summary_frames.append(frame)

    print(f"Number of frames selected for summary: {len(summary_frames)}")
    return summary_frames


#### Video Creator

In [12]:
def create_video_from_frames(frames, output_path, frame_rate=25):
    """
    Encode a sequence of frames into an 'mp4v' video file.

    :param frames: Frames to write; the first frame determines the video size.
    :param output_path: Destination video file; a missing parent directory is
        created first.
    :param frame_rate: Frames per second of the written video.
    :raises IOError: If the video writer cannot be opened for ``output_path``.
    """
    if frames is None or len(frames) == 0:
        print("No frames to create a video.")
        return

    # Determine the width and height from the first frame
    height, width = frames[0].shape[:2]

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, frame_rate, (width, height))
    try:
        # Without this check a writer that failed to open would silently
        # produce no output file.
        if not out.isOpened():
            raise IOError(f"Could not open video writer for {output_path!r}")

        # Write each frame to the video
        for frame in frames:
            out.write(frame)
    finally:
        # Release the VideoWriter object on every path
        out.release()

# Code to create summary video

#### Variables

In [13]:
# Tab-separated annotation file (video_id, category, importance_score) and
# info file (category_code, video_id, title, url, length) read by load_annotations.
annotation_path='datasets/ydata-tvsum50-v1_1/data/ydata-tvsum50-anno.tsv'
info_path='datasets/ydata-tvsum50-v1_1/data/ydata-tvsum50-info.tsv'

In [14]:
# Input video folder and output folder for the summary videos.
# The trailing slash matters: file names are appended to these strings directly.
video_path='datasets/ydata-tvsum50-v1_1/video/'
summary_video_path='datasets/summary_videos/'

##### Get the list of the videos in the folder

In [15]:
# os.listdir returns entries in arbitrary order; sort them so the videos are
# processed in the same order on every run.
video_list = sorted(video for video in os.listdir(video_path) if video.endswith('.mp4'))

#### Function for video summarization

In [16]:
def videoSummarize(annotation_path=None, info_path=None, video_path=None, summary_video_path=None,video_list=None):
    """
    Create a summary video for every file in ``video_list``.

    For each video the frames are clustered, the frames of the most populous
    cluster are selected, and those frames are written to a video of the same
    file name inside ``summary_video_path`` at 30 frames per second.

    :param annotation_path: Path to the annotation file.
    :param info_path: Path to the info file.
    :param video_path: Folder containing the input videos.
    :param summary_video_path: Folder receiving the summary videos.
    :param video_list: File names of the videos to process; ``None`` means
        there is nothing to do.
    """
    if video_list is None:
        return

    for video in video_list:
        # os.path.join works with or without a trailing separator on the folders
        source_video = os.path.join(video_path, video)
        target_video = os.path.join(summary_video_path, video)

        frames, labels = integrate_and_cluster_features(source_video, annotation_path, info_path)

        # Select frames from the most populous cluster
        summary_frames = create_summary(frames, labels)

        # Create a video from the selected frames
        create_video_from_frames(summary_frames, target_video, 30)
        

#### Code for dynamic annotation and video summarization

In [17]:
# Summarize every video found in the input folder.
videoSummarize(
    annotation_path=annotation_path,
    info_path=info_path,
    video_path=video_path,
    summary_video_path=summary_video_path,
    video_list=video_list,
)

MoviePy - Writing audio in datasets/extractedAudio/extracted_audio.wav


                                                                      

MoviePy - Done.
Number of frames selected for summary: 277
MoviePy - Writing audio in datasets/extractedAudio/extracted_audio.wav


                                                                      

MoviePy - Done.
Number of frames selected for summary: 142
MoviePy - Writing audio in datasets/extractedAudio/extracted_audio.wav


                                                                      

MoviePy - Done.


KeyboardInterrupt: 