In [1]:
from sklearnex import patch_sklearn
patch_sklearn()
import cv2
from sklearn.cluster import KMeans
from keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
import numpy as np
import pandas as pd
import os
from math import floor,sqrt,ceil,log2
from moviepy.editor import VideoFileClip
import librosa
from sklearn.preprocessing import StandardScaler, LabelEncoder
import h5py
from sklearn.metrics import precision_score, recall_score, f1_score
import objectDetection as od
from tabulate import tabulate
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
from keras.models import Sequential
import h5py
from sklearn.feature_extraction.text import TfidfVectorizer
import isodata
from sklearn.cluster import SpectralClustering

from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 

import spacy_sentence_bert
tokenizer = spacy_sentence_bert.load_model('en_stsb_roberta_large')

from sklearn.impute import SimpleImputer


# Load pre-trained model tokenizer (vocabulary)
import sys
sys.path.append('AutoEncoder')
from autoEncoder import reduce_features_with_autoencoder


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Video Summarization

### Preprocessing and Feature Extraction


#### a. Video Processing and Frame Extraction


In [2]:
def extract_frames(video_path, frame_rate=15):
    """Decode a video and keep every ``frame_rate``-th frame.

    Despite its name, ``frame_rate`` is used as a sampling stride: the frame
    at position ``k`` is kept when ``k % frame_rate == 0``.

    :param video_path: Path of the video handed to ``cv2.VideoCapture``.
    :param frame_rate: Sampling stride in frames (1 keeps every frame).
    :return: List of the kept frames, in decoding order.
    """
    capture = cv2.VideoCapture(video_path)
    kept_frames = []
    position = 0

    while True:
        ok, image = capture.read()
        on_stride = position % frame_rate == 0
        if not ok:
            # First failed read marks the end of the stream.
            break
        if on_stride:
            kept_frames.append(image)
        position += 1

    capture.release()
    return kept_frames

#### b. Audio Extraction


In [3]:

def extract_audio_from_video(video_path, output_audio_path):
    """Write the audio track of a video to ``output_audio_path``.

    :param video_path: Path of the video opened with ``VideoFileClip``.
    :param output_audio_path: Destination file for the extracted audio.
    :raises ValueError: If the clip exposes no audio track.
    """
    video = VideoFileClip(video_path)
    try:
        audio = video.audio
        if audio is None:
            # Fail with an explicit message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in video: {video_path}")
        audio.write_audiofile(output_audio_path)
    finally:
        # Always release the clip's reader, even when writing fails.
        video.close()

#### c. Audio Features

In [4]:
def extract_audio_features_for_each_frame(audio_path, frame_rate,num_frames=None):
    """Compute one averaged MFCC vector per video-frame-sized audio window.

    The audio is cut into consecutive windows of ``sr / frame_rate`` samples;
    for each window 130 MFCCs are computed and averaged over time.

    :param audio_path: Path of the audio file loaded with ``librosa.load``.
    :param frame_rate: Number of windows per second of audio.
    :param num_frames: Optional cap on the number of returned vectors;
        ``None`` (the default) returns every window.
    :return: List of 1-D MFCC vectors, one per window.
    """
    y, sr = librosa.load(audio_path)

    # Number of audio samples that correspond to one video frame.
    samples_per_frame = sr / frame_rate

    print("samples_per_frame",samples_per_frame)
    print("len(y)",len(y))
    print("sr",sr)
    print("frame_rate",frame_rate)

    mfccs_per_frame = []

    for frame in range(int(len(y) / samples_per_frame)):
        start_sample = int(frame * samples_per_frame)
        # Clamp so the final window never runs past the end of the signal.
        end_sample = min(int((frame + 1) * samples_per_frame), len(y))

        mfccs_current_frame = librosa.feature.mfcc(y=y[start_sample:end_sample], sr=sr, n_mfcc=130)
        # Average over the time axis to get a fixed-length vector per window.
        mfccs_per_frame.append(np.mean(mfccs_current_frame.T, axis=0))

    # Comparing against None would raise TypeError, so only truncate when a cap is given.
    if num_frames is not None and len(mfccs_per_frame) > num_frames:
        return mfccs_per_frame[:num_frames]
    return mfccs_per_frame


#### d. Visual Feature Extraction

In [5]:
import cv2
import numpy as np

def extract_color_features(frame, bins=512):
    """
    Build a per-channel color histogram descriptor for one frame.

    :param frame: Frame as a NumPy array with three color channels
        (the original code assumes BGR order).
    :param bins: Number of histogram bins per channel.
    :return: 1-D array holding the three normalized histograms back to back.
    """
    channel_histograms = []
    for channel in (0, 1, 2):
        histogram = cv2.calcHist([frame], [channel], None, [bins], [0, 256])
        # cv2.normalize writes into `histogram` and returns it.
        channel_histograms.append(cv2.normalize(histogram, histogram).flatten())

    return np.concatenate(channel_histograms)


In [6]:
def compute_optical_flow(prev_frame, curr_frame):
    """
    Dense Farneback optical flow between two consecutive BGR frames.

    :param prev_frame: Earlier frame.
    :param curr_frame: Later frame.
    :return: Tuple ``(magnitude, angle)`` of flattened per-pixel arrays;
        the magnitude is min-max normalized to [0, 1], the angle is left as
        returned by ``cv2.cartToPolar``.
    """
    gray_before = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
    gray_after = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY)

    flow = cv2.calcOpticalFlowFarneback(
        gray_before, gray_after, None, 0.5, 3, 15, 3, 5, 1.2, 0
    )

    # Split the flow field into horizontal and vertical components.
    flow_x = flow[..., 0]
    flow_y = flow[..., 1]
    magnitude, angle = cv2.cartToPolar(flow_x, flow_y)

    normalized_magnitude = cv2.normalize(magnitude, None, 0, 1, cv2.NORM_MINMAX)
    return normalized_magnitude.flatten(), angle.flatten()


In [7]:
# Alternative that downloads the ImageNet weights instead of reading a local file:
# model = VGG16(weights='imagenet', include_top=False)

# Build VGG16 without its classification head and load weights from a local file.
base_model = VGG16(weights=None, include_top=False)
weights_path = 'vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5' # Replace with the actual path
base_model.load_weights(weights_path)

# Wrap the VGG16 base in a Sequential model; `model` is the module-level
# feature extractor used by extract_visual_features.
model = Sequential()
model.add(base_model)

def extract_visual_features(frames):
    """Run every non-None frame through the module-level VGG16 ``model``.

    :param frames: Iterable of frames (NumPy arrays); ``None`` entries are skipped.
    :return: List of flattened feature vectors, one per processed frame.
    """
    def _to_vgg_input(frame):
        # Resize to 224x224, add a batch axis and apply VGG16 preprocessing.
        resized = img_to_array(cv2.resize(frame, (224, 224)))
        return preprocess_input(np.expand_dims(resized, axis=0))

    return [
        model.predict(_to_vgg_input(frame), use_multiprocessing=True, workers=4).flatten()
        for frame in frames
        if frame is not None
    ]


In [8]:

def integrate_features(frames,vgg_features=None,encoding_dim=256, autoencoder_epochs=50, autoencoder_batch_size=32):
    """
    Fuse color, motion and VGG features per frame and compress them.

    For every frame except the first, the color histogram, the optical flow
    (against the previous frame) and the VGG vector are concatenated. Missing
    values are mean-imputed and the result is passed to
    ``reduce_features_with_autoencoder``.

    :param frames: List of frames from the video.
    :param vgg_features: Optional precomputed VGG vectors indexed like
        ``frames``; computed with ``extract_visual_features`` when ``None``.
    :param encoding_dim: Forwarded to the autoencoder.
    :param autoencoder_epochs: Forwarded to the autoencoder.
    :param autoencoder_batch_size: Forwarded to the autoencoder.
    :return: Whatever ``reduce_features_with_autoencoder`` returns for the
        ``len(frames) - 1`` fused rows (row 0 belongs to frame 1).
    """
    if vgg_features is None:
        vgg_features = extract_visual_features(frames)

    fused_rows = []
    for idx, (previous, current) in enumerate(zip(frames, frames[1:]), start=1):
        color = extract_color_features(current)
        flow_magnitude, flow_angle = compute_optical_flow(previous, current)
        fused_rows.append(
            np.concatenate([color, flow_magnitude, flow_angle, vgg_features[idx]])
        )

    # Replace NaNs column-wise with the column mean before encoding.
    mean_imputer = SimpleImputer(strategy='mean')  # Can also use 'median' or 'most_frequent'
    imputed = mean_imputer.fit_transform(fused_rows)

    # integrated_features = StandardScaler().fit_transform(integrated_features)

    return reduce_features_with_autoencoder(
        np.array(imputed), encoding_dim, autoencoder_epochs, autoencoder_batch_size
    )



### Connect Audio/Annotation/Video

#### Title Tokenized

In [9]:
def tfTitle(info_file,video_path):
    """Look up a video's title in the info TSV and embed it.

    The video id is the file name of ``video_path`` up to its first dot.
    The title is embedded with the module-level ``tokenizer`` pipeline.

    :param info_file: Tab-separated file with ``video_id`` and ``title`` columns.
    :param video_path: Path of the video, using ``/`` as separator.
    :return: NumPy array with the title's embedding vector.
    """
    info_table = pd.read_csv(info_file, sep='\t')
    video_id = video_path.split('/')[-1].split('.')[0]
    matching_rows = info_table[info_table['video_id'] == video_id]

    # First matching row wins; raises IndexError when the id is unknown.
    title = matching_rows['title'].values[0]
    print('title:',title)

    embedding = tokenizer(title).vector
    print('title_features:',embedding.shape)
    return np.array(embedding)

#### Extract Data

In [10]:
def extractData(video_path, anno_file, info_file,flag_to_extract):
    """Extract the requested modalities from one video.

    :param video_path: Path of the video file.
    :param anno_file: Annotation file path (currently unused by this function).
    :param info_file: TSV info file used to look up the video title.
    :param flag_to_extract: Four booleans selecting, in order, frames,
        visual (VGG) features, audio features and title features.
    :return: List of four entries in the same order; each is ``[name, data]``
        with name in ``'frames'``, ``'visual'``, ``'audio'``, ``'title'``, or
        ``None`` when that modality was not requested.
    """
    want_frames = flag_to_extract[0]
    want_visual = flag_to_extract[1]
    want_audio = flag_to_extract[2]
    want_title = flag_to_extract[3]

    # Visual and audio extraction both need the decoded frames, so decode
    # them whenever any of the three is requested. Previously `frames` was
    # undefined (NameError) when only visual/audio were requested.
    frames = None
    if want_frames or want_visual or want_audio:
        frames = extract_frames(video_path)

    return_data = []

    # Frames are only reported back when they were explicitly requested.
    return_data.append(['frames', frames] if want_frames else None)

    if want_visual:
        return_data.append(['visual', extract_visual_features(frames)])
    else:
        return_data.append(None)

    if want_audio:
        audio_output_path = 'datasets/extractedAudio/extracted_audio.wav'
        # Make sure the scratch directory exists before writing the wav file.
        os.makedirs(os.path.dirname(audio_output_path), exist_ok=True)
        extract_audio_from_video(video_path, audio_output_path)

        # NOTE(review): 30 windows per second combined with a cap of
        # len(frames) (frames sampled every 15th frame by default) keeps only
        # the audio of the first len(frames) windows - confirm the intended
        # alignment between audio windows and sampled frames.
        audio_features = extract_audio_features_for_each_frame(audio_output_path,30,len(frames))
        return_data.append(['audio', audio_features])
    else:
        return_data.append(None)

    if want_title:
        return_data.append(['title', tfTitle(info_file, video_path)])
    else:
        return_data.append(None)

    return return_data


### feature connection

##### Detect objects and encode

In [11]:
def detectObjects(frames, yolo_model, classes, objects=None,encoded_objects=None,video=None):
    """Detect objects per frame (if needed) and embed their labels.

    When ``objects`` is ``None`` a YOLOv5 model is loaded with
    ``loadYOLOv5`` (replacing the ``yolo_model``/``classes`` arguments),
    detection runs on all frames and the result is cached with ``saveData``.
    Frames without detections get the placeholder label ``'None'``.
    When ``encoded_objects`` is ``None`` every label is embedded with the
    module-level ``tokenizer``.

    :param frames: Frames to run detection on (only used when ``objects`` is None).
    :param yolo_model: Overwritten when detection runs; otherwise unused.
    :param classes: Overwritten when detection runs; otherwise unused.
    :param objects: Optional precomputed per-frame label lists.
    :param encoded_objects: Optional precomputed per-frame embedding lists.
    :param video: Video id used as the cache folder name by ``saveData``.
    :return: Tuple ``(encoded_objects, objects)``.
    """
    if objects is None:
        print('Detecting objects in frames...')
        yolo_model, classes = loadYOLOv5()

        objects = od.detect_objects_in_all_frames(frames, yolo_model, classes)
        saveData('objects',objects,video)

    # Substitute a placeholder label for frames with no detections.
    objects = [labels if labels else ['None'] for labels in objects]

    count = sum(len(labels) for labels in objects)

    print(f'Total Encoded Obj: {count}')
    print("Objects:",len(objects))

    # Embed each label with the sentence-embedding pipeline.
    if encoded_objects is None:
        encoded_objects = [
            [tokenizer(label).vector for label in labels]
            for labels in objects
        ]

    return encoded_objects, objects

##### Save Data if necessary

In [12]:
import pickle
def saveData(name,feature,video):
    """Pickle ``feature`` to ``video_ext_data/<video>/<name>.pkl``.

    :param name: Base name of the pickle file (without extension).
    :param feature: Any picklable object.
    :param video: Video id used as the sub-folder name.
    """
    target_dir = f'video_ext_data/{video}'
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(target_dir, exist_ok=True)
    with open(f'{target_dir}/{name}.pkl', 'wb') as f:
        pickle.dump(feature,f)

##### Load Data if necessary

In [13]:
import pickle
def getData(feature,video):
    """Load a cached feature from ``video_ext_data/<video>/<feature>.pkl``.

    :param feature: Base name of the pickle file (without extension).
    :param video: Video id used as the sub-folder name.
    :return: The unpickled object, or ``None`` when the file does not exist.
    """
    cache_file = f'video_ext_data/{video}/{feature}.pkl'
    if not os.path.exists(cache_file):
        return None
    # The cache is written by saveData; do not point this at untrusted pickles.
    with open(cache_file, 'rb') as handle:
        return pickle.load(handle)

### Data Extraction Concat

In [14]:
def score_frames_with_title_object(integrated_features, title_vector, object_vectors, bins=32):
    """Score each frame by its similarity to the title and to its objects.

    For frame ``i`` the score is half of (similarity to the title plus the
    sum of similarities to every vector in ``object_vectors[i]``), where the
    similarity is whatever ``cosine_distance`` returns.

    :param integrated_features: Per-frame feature vectors.
    :param title_vector: Embedding of the video title.
    :param object_vectors: Per-frame lists of object embeddings, indexed like
        ``integrated_features``.
    :param bins: Unused.
    :return: List with one combined score per frame.
    """
    # NOTE(review): cosine_distance needs both vectors to have the same length;
    # confirm the frame features and the text embeddings share a dimension.
    scores = []
    for position, frame_vector in enumerate(integrated_features):
        to_title = cosine_distance(frame_vector, title_vector)
        to_objects = sum(
            cosine_distance(frame_vector, object_vector)
            for object_vector in object_vectors[position]
        )
        scores.append((to_title + to_objects) / 2)

    return scores

In [15]:
def DataExtraction(video_path, anno_file, info_file,getDataFlag=False):
    """
    Load or extract the per-video features, detect and encode objects,
    and print a per-frame relevance score.

    Cached features are read with ``getData``; unless ``getDataFlag`` is set,
    anything missing from the cache is extracted with ``extractData`` and
    written back with ``saveData``. Objects are then detected/encoded with
    ``detectObjects``, the visual features are fused with
    ``integrate_features`` and scored against the title and object vectors
    (the score is only printed, not returned).

    :param video_path: Path to the video file.
    :param anno_file: Path to the annotation file (forwarded to ``extractData``).
    :param info_file: Path to the info file (forwarded to ``extractData``).
    :param getDataFlag: When True, skip extraction and rely on the cache only.
    :return: ``[encoded_objects, title_features]``.
    """
    
    # Extract data from video
    objects=None
    
    # Video id = file name up to its first dot; used as the cache folder name.
    video=video_path.split('/')[-1].split('.')[0]
    
    # Read whatever is already cached (each call returns None on a cache miss).
    objects=getData('objects',video)
    frames=getData('frames',video)
    visual_features=getData('visual',video)
    audio_features=getData('audio',video)
    title_features=getData('title',video)
    encoded_objects=getData('encoded_objects',video)
    
    # Order: frames, visual, audio, title. A flag is cleared when the cache hit.
    flag_to_extract=[True,True,True,True]
    
    if(frames is not None):
        flag_to_extract[0]=False
    if(visual_features is not None):
        flag_to_extract[1]=False
    if(audio_features is not None):
        flag_to_extract[2]=False
    if(title_features is not None):
        flag_to_extract[3]=False

    
    if not getDataFlag:
        # Extract data from video and save it
        data=extractData(video_path, anno_file, info_file,flag_to_extract)
        # Save extracted Data
        for d in data:
            if d is not None:
                # extractData only emits 'frames', 'visual', 'audio' and 'title',
                # so the 'objects' branch is never taken for its output.
                if(d[0]=='objects'):
                    objects=d[1]
                elif(d[0]=='frames'):
                    frames=d[1]
                elif(d[0]=='visual'):
                    visual_features=d[1]
                elif(d[0]=='audio'):
                    audio_features=d[1]
                elif(d[0]=='title'):
                    title_features=d[1]
                
                saveData(d[0],d[1],video)
                
    # NOTE(review): the 2nd and 3rd arguments land in detectObjects'
    # yolo_model/classes parameters; detectObjects replaces them with
    # loadYOLOv5() when it runs detection and does not use them otherwise.
    if(objects is None):
        encoded_objects,objects = detectObjects(frames,len(visual_features),len(audio_features),encoded_objects=encoded_objects,video=video)
    else:
        encoded_objects,objects = detectObjects(frames,len(visual_features),len(audio_features),objects,encoded_objects=encoded_objects)
        
    saveData('encoded_objects',encoded_objects,video)
    

    # NOTE(review): integrate_features skips the first frame, so its row i
    # belongs to frame i+1 while encoded_objects[i] belongs to frame i - confirm.
    integrated_features = integrate_features(frames,visual_features)
    score=score_frames_with_title_object(integrated_features, title_features, encoded_objects, bins=32)
    print("SCORE:",score)
    
    
    return [encoded_objects,title_features]


### Summarization

#### Video Creator

In [16]:
def create_video_from_frames(frames, output_path, frame_rate=30):
    """Write a list of frames to an mp4v-encoded video file.

    :param frames: List of equally sized three-channel frames.
    :param output_path: Destination path of the video.
    :param frame_rate: Frames per second of the written video.
    :return: ``output_path``, or ``None`` when ``frames`` is empty.
    """
    if not frames:
        print("No frames to create a video.")
        return None

    # The first frame defines the output resolution.
    height, width, _channels = frames[0].shape

    codec = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(output_path, codec, frame_rate, (width, height))

    for image in frames:
        writer.write(image)

    writer.release()
    return output_path

#### Load .Mat file

In [17]:
def decode_titles(encoded_titles, hdf5_file):
    """Resolve HDF5 references to strings.

    Each reference is looked up in ``hdf5_file``; the referenced data is read
    as a sequence of rows whose first element is a character code.

    :param encoded_titles: Iterable of iterables of references.
    :param hdf5_file: Mapping from reference to character-code data.
    :return: Flat list of decoded strings, in reference order.
    """
    return [
        ''.join(chr(row[0]) for row in hdf5_file[reference])
        for reference_group in encoded_titles
        for reference in reference_group
    ]


def load_mat_file(file_path,videoID):
    """
    Return the user annotations of one video from the TVSum ``.mat`` file.

    The file is opened with h5py; video names are decoded with
    ``decode_titles`` and the first name that is a case-insensitive substring
    of ``videoID`` selects the annotation entry.

    :param file_path: Path to the .mat file.
    :param videoID: Video id (or a string containing it).
    :return: NumPy array with the annotation data of the matching video.
    :raises IndexError: If no video name matches ``videoID``.
    """
    with h5py.File(file_path, 'r') as file:
        user_anno_refs=file['tvsum50']['user_anno'][:] # type: ignore
        video_refs=file['tvsum50']['video'][:] # type: ignore

        decoded_videos = decode_titles(video_refs,file)

        wanted = videoID.lower()
        index = next(
            (i for i, name in enumerate(decoded_videos) if name.lower() in wanted),
            None,
        )
        if index is None:
            raise IndexError(f"No annotation entry matches video id {videoID!r}")

        # Dereference only the matching entry instead of loading every video's annotations.
        return np.array(file[user_anno_refs[index][0]])

#### Evaluation

In [18]:
def evaluate_summary(predicted_summary, user_summary, eval_method='avg'):
    """F-score (in percent) of a predicted summary against user summaries.

    Both inputs are treated as per-frame integer masks and combined with a
    bitwise AND; the shorter one is zero-padded to the longer length.

    :param predicted_summary: 1-D sequence with one value per frame.
    :param user_summary: 2-D array, one row per user.
    :param eval_method: ``'max'`` returns the best per-user F-score; anything
        else returns the average.
    :return: F-score multiplied by 100.
    """
    n_users, n_annotated = user_summary.shape[0], user_summary.shape[1]
    length = max(len(predicted_summary), n_annotated)

    predicted = np.zeros(length, dtype=int)
    predicted[:len(predicted_summary)] = predicted_summary
    predicted_total = sum(predicted)

    def _f_score(user_row):
        reference = np.zeros(length, dtype=int)
        reference[:n_annotated] = user_row
        hits = sum(predicted & reference)
        reference_total = sum(reference)

        precision = hits / predicted_total if predicted_total != 0 else 0
        recall = hits / reference_total if reference_total != 0 else 0
        if (precision + recall) == 0:
            return 0
        return 2 * precision * recall / (precision + recall)

    # Scores are reported as percentages.
    f_scores = [_f_score(user_summary[user]) * 100 for user in range(n_users)]

    if eval_method == 'max':
        return max(f_scores)
    return sum(f_scores) / len(f_scores)


In [19]:
def evaluate_frame_selection(ground_truth, summary_indices):
    """
    Summarize the ground-truth importance of the selected frames.

    Args:
    ground_truth: NumPy array of per-frame importance scores.
    summary_indices: Indices of the selected frames.

    Returns:
    Tuple of (mean score, max score, share of selected frames whose score is
    at least the floor of the max score); all zeros when nothing is selected.
    """
    picked_scores = ground_truth[summary_indices]

    if picked_scores.size == 0:
        return 0, 0, 0

    peak = np.max(picked_scores)
    mean_score = np.mean(picked_scores)
    # Fraction of selected frames that reach the (floored) peak score.
    share_at_peak = np.mean(picked_scores >= np.floor(peak))

    return mean_score, peak, share_at_peak


In [20]:
def Evaluation(ground_truth_path,summary_indices,videoID):
    """Print average and maximum F-score of a summary for one video.

    ``evaluate_summary`` works on per-frame masks, so the selected frame
    indices are first converted into a 0/1 vector.

    :param ground_truth_path: Path to the ground-truth .mat file.
    :param summary_indices: Indices of the frames included in the summary.
    :param videoID: Id of the evaluated video.
    """
    # Get the ground_truth
    ground_truth = np.array(load_mat_file(ground_truth_path, videoID))

    # Turn frame indices into a binary selection mask; previously the raw
    # indices were AND-ed bitwise with the annotations.
    selected = np.asarray(summary_indices, dtype=int).ravel()
    mask_length = ground_truth.shape[1]
    if selected.size:
        mask_length = max(mask_length, int(selected.max()) + 1)
    predicted_mask = np.zeros(mask_length, dtype=int)
    predicted_mask[selected] = 1

    # NOTE(review): evaluate_summary ANDs the mask with the annotation values
    # bitwise; confirm the loaded annotations are binary selections rather
    # than graded importance scores.
    f_score_max = evaluate_summary(predicted_mask, ground_truth, 'max')
    f_score_avg = evaluate_summary(predicted_mask, ground_truth)

    # '.2f' prints two decimals; a bare '.2' means two significant digits.
    print(f'F-scoreA: {f_score_avg:.2f}%')
    print(f'F-scoreM: {f_score_max:.2f}%')
    


#### KnapSack

In [21]:
def knapsack_for_video_summary(values, weights, capacity, scale_factor=5):
    """
    Apply the 0/1 Knapsack algorithm to select video segments for summarization.

    :param values: List of importance scores for each segment.
    :param weights: List of segment sizes (same unit as ``capacity``).
    :param capacity: Maximum total size allowed for the summary.
    :param scale_factor: Factor applied to weights and capacity before they
        are truncated to integers for the DP table.
    :return: Ascending indices of the segments to include in the summary.
    """
    scaled_weights = [int(w * scale_factor) for w in weights]
    scaled_capacity = int(capacity * scale_factor)

    n = len(values)
    if n == 0 or scaled_capacity <= 0:
        return []

    # table[i][w] = best total value using the first i items within weight w.
    table = [[0] * (scaled_capacity + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        item_value = values[i - 1]
        item_weight = scaled_weights[i - 1]
        previous_row = table[i - 1]
        current_row = table[i]
        for w in range(1, scaled_capacity + 1):
            best = previous_row[w]
            if item_weight <= w:
                best = max(item_value + previous_row[w - item_weight], best)
            current_row[w] = best

    # Backtrack by comparing table cells directly. Tracking a running
    # residual (res -= value) is unreliable with float values because the
    # subtraction does not reproduce the stored sums exactly.
    selected_indices = []
    w = scaled_capacity
    for i in range(n, 0, -1):
        if table[i][w] != table[i - 1][w]:
            selected_indices.append(i - 1)
            w -= scaled_weights[i - 1]

    selected_indices.reverse()
    return selected_indices


#### importance score

In [22]:
def euclidean_distance(vec1, vec2):
    """Return the L2 norm of the element-wise difference of two vectors."""
    difference = np.array(vec1) - np.array(vec2)
    return np.linalg.norm(difference)

def manhattan_distance(vec1, vec2):
    """Return the sum of absolute element-wise differences of two vectors."""
    difference = np.array(vec1) - np.array(vec2)
    return np.sum(np.abs(difference))

from sklearn.metrics.pairwise import cosine_similarity

def cosine_distance(vec1, vec2):
    """Return the cosine *similarity* of two NumPy vectors.

    Despite the name, the value comes from sklearn's ``cosine_similarity``
    (higher means more similar). Returns 0 when either vector consists
    entirely of NaNs.
    """
    # sklearn expects 2-D inputs of shape (1, n_features).
    row_a = vec1.reshape(1, -1)
    row_b = vec2.reshape(1, -1)

    all_nan_a = np.isnan(row_a).all()
    if all_nan_a or np.isnan(row_b).all():
        return 0  # or handle as needed

    similarity_matrix = cosine_similarity(row_a, row_b)
    return similarity_matrix[0][0]



In [23]:
def normalize_scores(scores):
    """Min-max normalize scores to the range [0, 1].

    :param scores: Sequence or array of numeric scores.
    :return: Float NumPy array of the same shape; all zeros when every score
        is equal, and an empty array for empty input.
    """
    # Work on a float array so lists and integer inputs behave the same and
    # the result can later hold fractional values.
    scores = np.asarray(scores, dtype=float)
    if scores.size == 0:
        return scores

    min_score = np.min(scores)
    max_score = np.max(scores)

    # Avoid division by zero in case all scores are the same
    if max_score == min_score:
        return np.zeros_like(scores)

    return (scores - min_score) / (max_score - min_score)

In [24]:
def calculate_importance_ManDistFrame(title, objects):
    """Per-frame importance from the distance between objects and the title.

    For each frame the Euclidean distances (via ``euclidean_distance``,
    despite the function name) between every object vector and the title
    vector are summed, min-max normalized over all frames and inverted, so
    smaller distances give higher importance. Exact zeros are replaced by
    0.0001.

    :param title: Title embedding.
    :param objects: Per-frame lists of object embeddings.
    :return: NumPy array with one importance score per frame.
    """
    title_vector = np.array(title)

    summed_distances = [
        sum(euclidean_distance(np.array(obj), title_vector) for obj in frame_objects)
        for frame_objects in objects
    ]

    # Invert after normalizing: a lower distance indicates higher similarity.
    importance_scores = 1 - normalize_scores(summed_distances)

    # Handle zero scores to avoid division by zero later
    importance_scores[importance_scores == 0] = 0.0001

    return importance_scores

#### Map labels with frames

In [25]:
def map_frames_to_labels_with_indices(frames, labels):
    """Group ``(index, frame)`` pairs by label.

    :param frames: Sequence of frames.
    :param labels: Sequence of labels aligned with ``frames``; NumPy array
        labels are converted to scalars with ``.item()``.
    :return: Dict mapping each label to its list of ``(frame_index, frame)``.
    """
    grouped = {}
    for (frame_index, frame), label in zip(enumerate(frames), labels):
        key = label.item() if isinstance(label, np.ndarray) else label
        grouped.setdefault(key, []).append((frame_index, frame))
    return grouped


##### Delete pkl Files

In [26]:
def deletePKLfiles(video):
    """Remove the cached ``frames.pkl`` of a video, if it exists.

    :param video: Video id used as the cache sub-folder name.
    """
    frames_cache = 'video_ext_data/' + video + '/' + 'frames.pkl'
    if not os.path.exists(frames_cache):
        return
    os.remove(frames_cache)
    return


# Code to create summary video

#### Variables

In [27]:
# TVSum annotation and video-info TSV files (paths relative to the notebook).
annotation_path='datasets/ydata-tvsum50-v1_1/data/ydata-tvsum50-anno.tsv'
info_path='datasets/ydata-tvsum50-v1_1/data/ydata-tvsum50-info.tsv'

In [28]:
# Input video folder and output folder for summaries; both are concatenated
# directly with file names, so the trailing slash is required.
video_path='datasets/ydata-tvsum50-v1_1/video/'
summary_video_path='datasets/summary_videos/'

In [29]:
# .mat file with the user annotations, read through h5py by load_mat_file.
ground_truth_path='datasets/ydata-tvsum50-v1_1/ground_truth/ydata-tvsum50.mat'

##### Get the list of the videos in the folder

In [30]:
# File names (not full paths) of every .mp4 directly inside video_path.
video_list = [video for video in os.listdir(video_path) if video.endswith('.mp4')]  # List comprehension

##### Yolo Model

In [31]:
import torch
def loadYOLOv5():
    """Load the YOLOv5s model from torch.hub and the class-name list.

    :return: Tuple ``(yolo_model, classes)`` where ``classes`` holds the
        stripped lines of ``yolo/coco.names``.
    """
    yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True,)

    with open("yolo/coco.names", "r") as names_file:
        classes = [entry.strip() for entry in names_file]

    return yolo_model,classes

#### Map the mat file with h5

In [32]:
# Get the last change point and total frames for each video in the .h5 file
def get_video_data_from_h5(file_path):
    """List ``[video_id, last_change_point_end]`` for every video in the .h5 file.

    The second element is the end value of the final ``change_points`` row.
    """
    with h5py.File(file_path, 'r') as h5:
        return [
            [video_id, h5[str(video_id)]['change_points'][-1][1]]
            for video_id in h5.keys()
        ]

# Get frame numbers from the .mat file
def get_frame_numbers(encoded_frames, hdf5_file):
    """Resolve HDF5 references to a flat list of integer frame counts.

    :param encoded_frames: Iterable of iterables of references.
    :param hdf5_file: Mapping from reference to rows whose first element is
        the numeric value.
    :return: List of ints, in reference order.
    """
    return [
        int(row[0])
        for reference_group in encoded_frames
        for reference in reference_group
        for row in hdf5_file[reference]
    ]

# Extract data from .mat file
def get_video_data_from_mat(file_path):
    """List ``[video_id, frame_count]`` for every video in the .mat file.

    Video ids are decoded with ``decode_titles`` and frame counts with
    ``get_frame_numbers``; the two lists are paired by position.
    """
    with h5py.File(file_path, 'r') as mat:
        tvsum = mat['tvsum50']
        video_refs = tvsum['video'][:]
        frame_count_refs = tvsum['nframes'][:]
        video_ids = decode_titles(video_refs, mat)
        frame_counts = get_frame_numbers(frame_count_refs, mat)
        return [
            [video_id, frame_counts[position]]
            for position, video_id in enumerate(video_ids)
        ]

# Comparing and mapping the data
# Locations of the change-point .h5 file and the annotation .mat file.
h5_file_path = 'datasets/ydata-tvsum50-v1_1/eccv16_dataset_tvsum_google_pool5.h5'
mat_file_path = 'datasets/ydata-tvsum50-v1_1/ground_truth/ydata-tvsum50.mat'

video_data_h5 = get_video_data_from_h5(h5_file_path)
video_data_mat = get_video_data_from_mat(mat_file_path)

# Map each .mat video id to the .h5 key whose last change-point end is one
# less than the .mat frame count. If several .h5 entries match, the last one
# wins. NOTE(review): videos with identical frame counts would collide.
video_id_map = {}
for video_mat in video_data_mat:
    for video_h5 in video_data_h5:
        if video_mat[1] == video_h5[1] + 1:
            video_id_map[video_mat[0]] = video_h5[0]


def getChangingPoints(video_id):
    """Read the full ``change_points`` dataset of one video from the .h5 file.

    :param video_id: Key of the video inside the file at ``h5_file_path``.
    :return: The dataset contents read with ``[:]``.
    """
    with h5py.File(h5_file_path, 'r') as h5:
        change_points = h5[video_id]['change_points']
        return change_points[:]

#### Map to 1/1 frame rate

In [33]:
def map_scores_to_original_frames(sampled_scores, frame_rate):
    """Expand sampled scores back to the original frame count.

    :param sampled_scores: One score per sampled frame.
    :param frame_rate: How many consecutive original frames each score covers.
    :return: List in which every score is repeated ``frame_rate`` times.
    """
    expanded = []
    for score in sampled_scores:
        expanded += [score] * frame_rate
    return expanded

#### Find the importances for each clip

In [34]:
def getClipImportances(importance, video):
    """Sum the per-frame importance inside every clip of a video.

    Clips come from the video's change points, which are treated as
    inclusive ``[start, end]`` pairs: the id mapping in this file matches a
    video when its frame count equals the last change-point end plus one, so
    the end value is itself a valid frame index.

    :param importance: Per-frame importance scores at the original frame rate.
    :param video: Video file name; the part before the first dot is the id.
    :return: Dict mapping clip index to ``(total_importance, num_frames)``.
    :raises IndexError: If ``importance`` is shorter than a clip's last frame.
    """
    changingPoints = getChangingPoints(video_id_map[video.split('.')[0]])

    clip_importances = {}
    for clip_index, (start_frame, end_frame) in enumerate(changingPoints):
        first = int(start_frame)
        # +1 so the clip's last frame is counted and scored as well.
        stop = int(end_frame) + 1
        num_frames = stop - first

        clip_importance = sum(importance[i] for i in range(first, stop))
        clip_importances[clip_index] = (clip_importance, num_frames)

    return clip_importances



#### Function for video summarization

In [35]:
def getSelectedIndicesFromClips(selectedClips,video):
    """Expand selected clip indices into the frame indices they cover.

    Change points are treated as inclusive ``[start, end]`` pairs (the id
    mapping in this file matches a video when its frame count equals the last
    change-point end plus one), so each clip contributes ``start..end``.

    :param selectedClips: Indices into the video's change-point list.
    :param video: Video file name; the part before the first dot is the id.
    :return: List of frame indices, clip by clip in the given order.
    """
    changingPoints = getChangingPoints(video_id_map[video.split('.')[0]])

    selected_indices = []
    for clip in selectedClips:
        first, last = changingPoints[clip][0], changingPoints[clip][1]
        # +1 so the clip's last frame is part of the summary too.
        selected_indices.extend(range(int(first), int(last) + 1))
    return selected_indices

In [36]:
def videoSumm(annotation_path=None, info_path=None, video_path=None, summary_video_path=None,video_list=None):
    """Build, evaluate and write a summary video.

    For the processed video: decode all frames, obtain object/title
    embeddings via ``DataExtraction``, turn them into per-frame importance,
    aggregate importance per clip, choose clips with a knapsack under a frame
    budget, print F-scores via ``Evaluation`` and write the selected frames
    to ``summary_video_path``.

    NOTE(review): the loop overrides ``video`` with a fixed file name and
    ends with ``break``, so exactly one hard-coded video is processed no
    matter what ``video_list`` contains - confirm whether this is leftover
    debugging.

    :param annotation_path: Annotation TSV path (forwarded to DataExtraction).
    :param info_path: Info TSV path (forwarded to DataExtraction).
    :param video_path: Folder prefix prepended to each video file name.
    :param summary_video_path: Folder prefix for the written summary videos.
    :param video_list: Iterable of video file names.
    """
    for video in video_list: 
        # Overrides the loop variable with one fixed video (see NOTE above).
        video='0tmA_C6XwfM.mp4'
        
        # if(os.path.exists(f'{summary_video_path}{video}')):
        #     continue
    
        print("VIDEO:",video)
        getDataFlag=False

        # Extract frames 1/1 from the video
        original_frames=extract_frames(video_path+video, frame_rate=1)
             
        # Extract Data; `objects` receives DataExtraction's first return
        # element, i.e. the per-frame encoded object vectors.
        objects,title_features = DataExtraction(video_path+video, annotation_path, info_path,getDataFlag=getDataFlag)
        
        # Calculate one importance score per sampled frame
        importance = calculate_importance_ManDistFrame(title_features, objects)
        
        # Mapping to 1/1 rate; 15 matches extract_frames' default sampling
        # stride used inside DataExtraction.
        importance=map_scores_to_original_frames(importance, 15)

        # Aggregate importance per clip
        clip_info = getClipImportances(importance,video)
        # Extracting values (importance scores) and weights (number of frames)
        values = [score for score, frames in clip_info.values()]
        weights = [frames for score, frames in clip_info.values()]

        # Calculate the total number of frames in the video
        total_frames = len(original_frames)
        print("Total Frames:",total_frames)

        # Calculate the capacity as 16% of the total number of frames
        capacity = int(0.16 * total_frames)
        print("Capacity:",capacity)

        # Now apply the knapsack algorithm
        selected_clips = knapsack_for_video_summary(values, weights, capacity)
        print("Summary Indices:",selected_clips)
        
        selected_indices=getSelectedIndicesFromClips(selected_clips,video)
        print('Sum Len Frame:',len(selected_indices))
        
        summary_frames=[original_frames[i] for i in selected_indices]
        
        # Evaluate
        Evaluation(ground_truth_path, selected_indices, video.split('.')[0])
        
        # print(tabulate([evaluated_metrics], headers=['Threshold', 'Precision', 'Recall', 'F1', 'Avg. Importance', 'Max. Importance', 'Prop. High Importance']))
        # This assignment has no effect: the flag is reset to False below
        # before it is read again.
        getDataFlag=True
            
        video_name=video.split('.')[0]
        
        if(not os.path.exists(f'video_ext_data/{video_name}')):
            os.makedirs(f'video_ext_data/{video_name}')
        
        # Create Summary Video
        create_video_from_frames(summary_frames,f"{summary_video_path}{video}" , 30)
        
        # Reset for the next video and drop the cached frames pickle
        getDataFlag=False
        deletePKLfiles(video_name)
        # Stops after the first iteration (see NOTE in the docstring).
        break
    # return

# Video summarization

In [37]:
# Run the summarization pipeline with the paths and video list configured above.
videoSumm(annotation_path, info_path, video_path, summary_video_path, video_list)

VIDEO: 0tmA_C6XwfM.mp4


Total Encoded Obj: 238
Objects: 236
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
1/8 [==>...........................] - ETA: 20s - loss: 7.0024

KeyboardInterrupt: 