In [2]:
import cv2
from sklearn.cluster import KMeans
from keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
import numpy as np
import pandas as pd
import os
from math import floor,sqrt
from moviepy.editor import VideoFileClip
import librosa
from sklearn.preprocessing import StandardScaler, LabelEncoder
import h5py
from sklearn.metrics import precision_score, recall_score, f1_score

## Video Summarization

### Annotations loader

In [3]:
# Load annotations
def load_annotations(anno_file, info_file):
    # Read the annotation and info files
    annotations = pd.read_csv(anno_file, sep='\t', header=None)
    info = pd.read_csv(info_file, sep='\t', header=None)
    
    # Rename columns for better understanding
    annotations.columns = ['video_id', 'category', 'importance_score']
    info.columns = ['category_code', 'video_id', 'title', 'url', 'length']
    
    # print( annotations, info)
    
    return annotations, info

### Preprocessing and Feature Extraction


#### a. Video Processing and Frame Extraction


In [4]:
def extract_frames(video_path, frame_rate=1):
    video = cv2.VideoCapture(video_path)
    count = 0
    success = True
    frames = []
    
    while success:
        success, image = video.read()
        if count % frame_rate == 0 and success:
            frames.append(image)
        count += 1

    video.release()
    return frames

#### b. Audio Extraction


In [5]:

def extract_audio_from_video(video_path, output_audio_path):
    video = VideoFileClip(video_path)
    audio = video.audio
    audio.write_audiofile(output_audio_path)
    video.close()

#### c.Audio Features

In [6]:
def extract_audio_features(audio_path):
    """
    Extract audio features from an audio file, specifically MFCCs.

    :param audio_path: Path to the audio file.
    :return: Array of MFCCs.
    """
    # Load the audio file
    y, sr = librosa.load(audio_path)

    # Extract MFCCs from the audio
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)  # You can change n_mfcc based on your needs

    # To capture the variation over time, you might compute statistics across MFCCs over time
    # Here, we compute the mean of the MFCCs across time
    mfccs_processed = np.mean(mfccs.T,axis=0)

    return mfccs_processed

#### d. Feature Extraction (example with visual features using a CNN)


In [7]:
model = VGG16(weights='imagenet', include_top=False)

def extract_visual_features(frames):
    features = []
    for frame in frames:
        if frame is not None:
            img = cv2.resize(frame, (224, 224))  # Resize frame to 224x224
            img = img_to_array(img)        # Convert to array
            img = np.expand_dims(img, axis=0)    # Add batch dimension
            img = preprocess_input(img)          # Preprocess for VGG16
            
            feature = model.predict(img,use_multiprocessing=True,workers=4)
            features.append(feature.flatten())

    return features

### Connect Audio/Annotation/Video

#### Padding Features to fit the model

In [8]:
def padding(visual_features, audio_features, annotation_features=None):
    # inputs are at least 1D arrays
    visual_features = [np.array(v, ndmin=1, dtype=float) for v in visual_features]
    audio_features = np.array(audio_features, ndmin=1, dtype=float)
    if annotation_features is not None:
        annotation_features = [np.array(a, ndmin=1, dtype=float) for a in annotation_features]

    # Find the maximum length of the features
    max_length = max(
        max(v.size for v in visual_features),
        audio_features.size,
        # max(a.size for a in annotation_features)
    )

    # Pad features to match the maximum length
    visual_padded = [np.pad(v, (0, max_length - v.size), 'constant') for v in visual_features]
    audio_padded = np.pad(audio_features, (0, max_length - audio_features.size), 'constant')
    # annotation_padded = [np.pad(a, (0, max_length - a.size), 'constant') for a in annotation_features]

    return visual_padded, audio_padded


#### Annotation To List

In [9]:
def annotation2List(annotation_features):
    # Make the string '1,1,1,3,2,2,4,4,1' to float list
    annotation_float_array=[]
    for annotation in annotation_features:
        
        if isinstance(annotation,str):
            annotation = annotation.split(',')
        if isinstance(annotation,list):
            for anno in annotation:
                annotation_float_array.append(float(anno))
        else:
            annotation_float_array.append(float(annotation))
    return annotation_float_array

#### Add weights to ignore padding

In [10]:
def apply_feature_weighting(combined_features, original_feature_length, original_feature_weight=1, padding_weight=0.1):
    """
    Apply different weights to the original and padded parts of the feature vectors.

    :param combined_features: List of combined feature vectors (including padding).
    :param original_feature_length: The length of the original (non-padded) part of the feature vectors.
    :param original_feature_weight: Weight to be applied to the original features.
    :param padding_weight: Weight to be applied to the padded features.
    :return: List of weighted feature vectors.
    """
    weighted_combined_features = []

    for feature in combined_features:
        # Create a weight vector: higher weight for original features, lower for padding
        weights = [original_feature_weight] * original_feature_length
        weights += [padding_weight] * (len(feature) - original_feature_length)
        
        # Apply weights to the feature vector
        weighted_feature = np.array(feature) * np.array(weights)
        weighted_combined_features.append(weighted_feature)

    return weighted_combined_features


#### Extract Data

In [11]:
frames = extract_frames('datasets/ydata-tvsum50-v1_1/video/-esJrBWj2d8.mp4')
visual_features=extract_visual_features(frames)



In [12]:
def extractData(video_path, anno_file, info_file):
    # Extract frames from the video
    # frames = extract_frames(video_path)
    # print('frames',len(frames)) 

    # # # Extract visual features
    # visual_features = extract_visual_features(frames) 

    # Extract audio
    audio_output_path = 'datasets/extractedAudio/extracted_audio.wav'
    extract_audio_from_video(video_path, audio_output_path) 

    # Extract audio features
    audio_features = extract_audio_features(audio_output_path) 

    # # Load annotations
    # annotations, info = load_annotations(anno_file, info_file)
    
    # return visual_features, frames
    return audio_features
    # return annotations,audio_features,visual_features,frames


#### Remove all zeros

In [13]:
def compact_non_zeros(array):
    """
    Move all non-zero elements of an array to the front, keeping their order.

    :param array: Input array.
    :return: Compacted array with non-zeros at the front.
    """
    non_zeros = array[array != 0]
    if len(non_zeros) == 0:  # If the array is all zeros
        print("All zeros")
        return array
    return np.pad(non_zeros, (0, len(array) - len(non_zeros)), 'constant')

#### Kmeans and feature connection

In [41]:
def integrate_and_cluster_features(video_path, anno_file, info_file, num_clusters=None):
    """
    Integrate visual, audio, and annotation features from a video,
    and perform clustering on the combined features.

    :param video_path: Path to the video file.
    :param anno_file: Path to the annotation file.
    :param info_file: Path to the info file.
    :param num_clusters: Number of clusters to use in KMeans.
    :return: Cluster labels for each data point.
    """
    
    # Extract data from video
    # visual_features,frames=extractData(video_path, anno_file, info_file)
    """

    audio_features=extractData(video_path, anno_file, info_file)

    # # If annotations include categorical data, convert it to numerical format
    # # May exclude it later 
    # label_encoder = LabelEncoder()
    # categorical_columns = ['category']
    # for column in categorical_columns:
    #     if column in annotations.columns:
    #         annotations[column] = label_encoder.fit_transform(annotations[column])

    # # Exclude non-numerical data from annotations if necessary
    # annotation_features = annotations.drop(columns=['video_id']).values


    # Determine the original feature length
    original_feature_length = len(visual_features) + len(audio_features) #+ len(annotation_features)

    # Combine features with padding
    combined_features = []
    
    # Remove temporaraily the audio feature
    visual_padded, audio_padded = padding(visual_features, audio_features)

    for i, frame in enumerate(frames):
        
        # annotation_float_array = annotation2List(annotation_features[i])
        

        combined_feature = np.concatenate([
            np.array(visual_padded[i], dtype=float),
            # np.array(audio_padded, dtype=float),
            # np.array(annotation_padded[i], dtype=float),
        ])
        combined_features.append(combined_feature)
    combined_features.append(audio_padded)

    # Apply feature weighting
    # weighted_combined_features = apply_feature_weighting(combined_features, original_feature_length)

    # Convert to 2D NumPy array and normalize
    combined_features_array = np.array(combined_features)
    
    combined_features_normalized = StandardScaler().fit_transform(combined_features_array)
   
    
    # Calculate the number of clusters if not provided
    if num_clusters is None:
        num_clusters = floor(sqrt(len(combined_features_normalized)))
    # """
    num_clusters = 9
        
    print("Number of clusters:", num_clusters)
    
    # Perform clustering
    print("Performing clustering...")
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(visual_features)
    
    return [frames, kmeans.labels_]
    # return the frames and the labels


### Summarization

#### Find bigger cluster

In [15]:
from collections import Counter
def getBiggestCluster(labels):
    cluster_counts = Counter(labels)

    # Find the cluster with the maximum number of frames
    return max(cluster_counts, key=cluster_counts.get),cluster_counts

#### Frames Selection

In [16]:
def frame_selection(frames, labels):
    """
    Create a summary by selecting frames from the most populous cluster.

    :param frames: List of frames.
    :param labels: Cluster labels for each frame.
    :return: List of frames belonging to the most populous cluster.
    """
    # Count the number of frames in each cluster
    max_cluster,cluster_counts = getBiggestCluster(labels)

    # Initialize lists for indices and frames
    summary_indices = []  # to measure the importance based on annotation
    summary_frames = []

    # Iterate and select frames and their indices belonging to the most populous cluster
    for index, (frame, label) in enumerate(zip(frames, labels)):
        if label == max_cluster:
            summary_indices.append(index)
            summary_frames.append(frame)

    print(f"Number of frames selected for summary: {len(summary_frames)}, Cluster: {max_cluster}")
    # print all clusters len
    print(f"Cluster counts: {cluster_counts}")
    return summary_frames,summary_indices


#### Video Creator

In [39]:
def create_video_from_frames(frames, output_path, frame_rate=30):
    if not frames:
        print("No frames to create a video.")
        return 
    # Determine the width and height from the first frame
    height, width, layers = frames[0].shape

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, frame_rate, (width, height))

    # Write each frame to the video
    for frame in frames:
        out.write(frame)

    # Release the VideoWriter object
    out.release()

#### Load .Mat file

In [18]:
def decode_titles(encoded_titles, hdf5_file):
    decoded_titles = []
    for ref_array in encoded_titles:
        # Handle the case where each ref_array might contain multiple references
        for ref in ref_array:
            # Dereference each HDF5 object reference to get the actual data
            title_data = hdf5_file[ref]
            # Decode the title
            decoded_title = ''.join(chr(char[0]) for char in title_data)
            decoded_titles.append(decoded_title)
    return decoded_titles


def load_mat_file(file_path,videoID):
    """
    Load a .mat file and return its contents.

    :param file_path: Path to the .mat file.
    :return: Contents of the .mat file.
    """
    with h5py.File(file_path, 'r') as file:
        user_anno_refs=file['tvsum50']['user_anno'][:] # type: ignore
        video_refs=file['tvsum50']['video'][:] # type: ignore

        decoded_videos = decode_titles(video_refs,file)
    
        annotations = []
        
        print(file['tvsum50'].keys())
        
        # Get the index from decoded video list to find the annotation for the video
        index = [i for i, x in enumerate(decoded_videos) if x.lower() in videoID.lower()][0]
        
        # Iterate over each reference
        for ref in user_anno_refs: # type: ignore
            # Dereference each HDF5 object reference
            ref_data = file[ref[0]]

            # Convert to NumPy array and add to the annotations list
            annotations.append(np.array(ref_data))
            
        return annotations[index]

#### f1score with ground_truth

In [19]:
def evaluate_frame_selection(ground_truth, summary_indices):

    # Evaluate the selected frames
    selected_importance_scores = ground_truth[summary_indices]
    
    # Calculate metrics
    average_importance = np.mean(selected_importance_scores) # Average importance score
    max_importance=np.max(selected_importance_scores) # Max importance score
    # print('max:',max_importance) # 2.4
    
    proportion_high_importance = np.mean(selected_importance_scores >= floor(max_importance))  # Proportion of frames with max importance score
    
    return average_importance, max_importance,proportion_high_importance


In [20]:
def kmeans_f1score(ground_truth_path, summary_indices,videoID):
    ground_truth = np.array(load_mat_file(ground_truth_path, videoID))
    n_annotators = ground_truth.shape[0]
    
    all_evaluations = []

    for i in range(n_annotators):
        individual_ground_truth_scores = ground_truth[i]

        threshold=floor(np.mean(np.array(individual_ground_truth_scores)))
        # Binary conversion
        binary_ground_truth = np.where(individual_ground_truth_scores >= threshold, 1, 0) 

        # Selected frames binary conversion
        selected_frames_binary = np.zeros_like(binary_ground_truth)
        selected_frames_binary[summary_indices] = 1

        # Calculate metrics
        precision = precision_score(binary_ground_truth, selected_frames_binary)
        recall = recall_score(binary_ground_truth, selected_frames_binary)
        f1 = f1_score(binary_ground_truth, selected_frames_binary, average='weighted')

        # Importance-based evaluation
        avg_importance, max_importance, prop_high_importance = evaluate_frame_selection(individual_ground_truth_scores, summary_indices)

        all_evaluations.append({
            "annotator": i,
            "precision": precision,
            "recall": recall,
            "weighted_f1": f1,
            "average_importance": avg_importance,
            "maximum_importance": max_importance,
            "proportion_high_importance": prop_high_importance
        })
        
    from tabulate import tabulate
            
    table_data = []
    headers = ["Annotator", "Precision", "Recall", "Weighted F1", "Avg Importance", "Max Importance", "Proportion High Importance"]

    for result in all_evaluations:
        table_data.append([
            result['annotator'],
            f"{result['precision']:.3f}",
            f"{result['recall']:.3f}",
            f"{result['weighted_f1']:.3f}",
            f"{result['average_importance']:.3f}",
            f"{result['maximum_importance']:.3f}",
            f"{result['proportion_high_importance']:.3f}"
        ])

    # Print the table
    table_data.sort(key=lambda x: x[3],reverse=True)

    print(tabulate(table_data, headers=headers, tablefmt="fancy_grid"))
    return

# Code to create summary video

#### Variables

In [21]:
annotation_path='datasets/ydata-tvsum50-v1_1/data/ydata-tvsum50-anno.tsv'
info_path='datasets/ydata-tvsum50-v1_1/data/ydata-tvsum50-info.tsv'

In [22]:
video_path='datasets/ydata-tvsum50-v1_1/video/'
summary_video_path='datasets/summary_videos/'

In [23]:
ground_truth_path='datasets/ydata-tvsum50-v1_1/ground_truth/ydata-tvsum50.mat'

##### Get the list of the videos in the folder

In [24]:
video_list = [video for video in os.listdir(video_path) if video.endswith('.mp4')]  # List comprehension

#### Function for videoSummarizion

In [42]:
def videoSummarize(annotation_path=None, info_path=None, video_path=None, summary_video_path=None,video_list=None):
    for video in video_list:  
        frames,labels = integrate_and_cluster_features(video_path+video, annotation_path, info_path)
        
        # Select Representative Frames
        summary_frames,summary_indices = frame_selection(frames, labels)  # Select frames from the most populous cluster
        
        # Create Summary Video
        create_video_from_frames(summary_frames, summary_video_path+video,30) # Create a video from the selected frames
        
        # video: -esJrBWj2d8.mp4
        kmeans_f1score(ground_truth_path,summary_indices,video.split('.')[0])
        break
        

#### Code for dynamic annotation and video summarization

In [43]:
videoSummarize(annotation_path, info_path, video_path, summary_video_path, video_list)

Number of clusters: 9
Performing clustering...
Number of frames selected for summary: 3234, Cluster: 0
Cluster counts: Counter({0: 3234, 2: 628, 4: 588, 5: 548, 8: 466, 6: 453, 3: 419, 1: 364, 7: 212})
<KeysViewHDF5 ['category', 'gt_score', 'length', 'nframes', 'title', 'user_anno', 'video']>
╒═════════════╤═════════════╤══════════╤═══════════════╤══════════════════╤══════════════════╤══════════════════════════════╕
│   Annotator │   Precision │   Recall │   Weighted F1 │   Avg Importance │   Max Importance │   Proportion High Importance │
╞═════════════╪═════════════╪══════════╪═══════════════╪══════════════════╪══════════════════╪══════════════════════════════╡
│           3 │       1     │    0.468 │         0.637 │            2.009 │                5 │                        0.036 │
├─────────────┼─────────────┼──────────┼───────────────┼──────────────────┼──────────────────┼──────────────────────────────┤
│           4 │       1     │    0.468 │         0.637 │            1.972 │ 

## Using Only visual_features

### All results was based on frame extraction with frame_rate=25

#### Results using sqrt(N) = 16 Clusters

| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
| 2         | 0.941     | 0.063  | 0.345       | 3.338          | 5              | 0.131                      |
| 8         | 1.0       | 0.065  | 0.339       | 4.0            | 4              | 1.0                        |
| 12        | 0.766     | 0.052  | 0.337       | 2.111          | 3              | 0.131                      |
| 10        | 1.0       | 0.065  | 0.332       | 3.507          | 4              | 0.507                      |
| 0         | 0.469     | 0.031  | 0.292       | 1.735          | 3              | 0.131                      |
| 1         | 0.473     | 0.03   | 0.274       | 1.394          | 2              | 0.394                      |
| 9         | 0.0       | 0.0    | 0.222       | 1.0            | 1              | 1.0                        |
| 3         | 1.0       | 0.037  | 0.071       | 2.511          | 4              | 0.019                      |
| 4         | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 5         | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |
| 6         | 1.0       | 0.037  | 0.071       | 1.393          | 4              | 0.131                      |
| 7         | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 11        | 1.0       | 0.037  | 0.071       | 3.617          | 5              | 0.131                      |
| 13        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 14        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 15        | 1.0       | 0.037  | 0.071       | 4.0            | 4              | 1.0                        |
| 16        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 17        | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |
| 18        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 19        | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |


#### Results using 11 Clusters

| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
| 2         | 0.941     | 0.063  | 0.345       | 3.315          | 5              | 0.117                      |
| 8         | 1.0       | 0.065  | 0.339       | 4.0            | 4              | 1.0                        |
| 12        | 0.766     | 0.052  | 0.337       | 2.099          | 3              | 0.117                      |
| 10        | 1.0       | 0.065  | 0.332       | 3.499          | 4              | 0.499                      |
| 0         | 0.469     | 0.031  | 0.292       | 1.718          | 3              | 0.117                      |
| 1         | 0.473     | 0.03   | 0.274       | 1.396          | 2              | 0.396                      |
| 9         | 0.0       | 0.0    | 0.222       | 1.0            | 1              | 1.0                        |
| 3         | 1.0       | 0.037  | 0.071       | 2.516          | 4              | 0.018                      |
| 4         | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 5         | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |
| 6         | 1.0       | 0.037  | 0.071       | 1.355          | 4              | 0.117                      |
| 7         | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 11        | 1.0       | 0.037  | 0.071       | 3.595          | 5              | 0.117                      |
| 13        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 14        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 15        | 1.0       | 0.037  | 0.071       | 4.0            | 4              | 1.0                        |
| 16        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 17        | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |
| 18        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 19        | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |


#### Results using 7 Clusters

| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
| 2         | 0.941     | 0.063  | 0.345       | 3.351          | 5              | 0.134                      |
| 8         | 1.0       | 0.065  | 0.339       | 4.0            | 4              | 1.0                        |
| 12        | 0.766     | 0.052  | 0.337       | 2.115          | 3              | 0.134                      |
| 10        | 1.0       | 0.065  | 0.332       | 3.504          | 4              | 0.504                      |
| 0         | 0.469     | 0.031  | 0.292       | 1.744          | 3              | 0.134                      |
| 1         | 0.473     | 0.03   | 0.274       | 1.386          | 2              | 0.386                      |
| 9         | 0.0       | 0.0    | 0.222       | 1.0            | 1              | 1.0                        |
| 3         | 1.0       | 0.037  | 0.071       | 2.511          | 4              | 0.019                      |
| 4         | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 5         | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |
| 6         | 1.0       | 0.037  | 0.071       | 1.405          | 4              | 0.134                      |
| 7         | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 11        | 1.0       | 0.037  | 0.071       | 3.615          | 5              | 0.134                      |
| 13        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 14        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 15        | 1.0       | 0.037  | 0.071       | 4.0            | 4              | 1.0                        |
| 16        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 17        | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |
| 18        | 1.0       | 0.037  | 0.071       | 1.0            | 1              | 1.0                        |
| 19        | 1.0       | 0.037  | 0.071       | 2.0            | 2              | 1.0                        |

### All results was based on frame extraction with frame_rate=1

#### Using sqrt(N) = 83 clusters
| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
|     12    |   0.587   |  0.053 |    0.332    |     2.27       |       5        |          0.091             |
|     2     |   0.622   |  0.055 |    0.327    |     1.839      |       4        |          0.109             |
|     8     |   0.434   |  0.038 |    0.294    |     1.801      |       5        |          0.041             |
|     10    |   0.463   |  0.04  |    0.29     |     1.894      |       4        |          0.111             |
|     1     |   0.522   |  0.044 |    0.289    |     1.604      |       5        |          0.021             |
|     0     |   0.246   |  0.022 |    0.273    |     1.378      |       3        |          0.132             |
|     9     |   0.384   |  0.032 |    0.263    |     1.575      |       5        |          0.026             |
|     3     |   1       |  0.049 |    0.094    |     2.279      |       5        |          0.114             |
|     4     |   1       |  0.049 |    0.094    |     2.282      |       5        |          0.114             |
|     5     |   1       |  0.049 |    0.094    |     1.267      |       2        |          0.267             |
|     6     |   1       |  0.049 |    0.094    |     2.525      |       4        |          0.282             |
|     7     |   1       |  0.049 |    0.094    |     2.428      |       5        |          0.111             |
|     11    |   1       |  0.049 |    0.094    |     2.144      |       4        |          0.138             |
|     13    |   1       |  0.049 |    0.094    |     1.358      |       3        |          0.111             |
|     14    |   1       |  0.049 |    0.094    |     1.443      |       4        |          0.091             |
|     15    |   1       |  0.049 |    0.094    |     1.56       |       5        |          0.021             |
|     16    |   1       |  0.049 |    0.094    |     1.117      |       2        |          0.117             |
|     17    |   1       |  0.049 |    0.094    |     1.299      |       4        |          0.091             |
|     18    |   1       |  0.049 |    0.094    |     2.563      |       5        |          0.111             |
|     19    |   1       |  0.049 |    0.094    |     1.472      |       4        |          0.111             |



#### Using 10 clusters
| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
|     8     |   0.856   | 0.583  |    0.706    |     2.486      |       5        |          0.009             |
|     3     |   1       | 0.385  |    0.556    |     2.039      |       5        |          0.017             |
|     4     |   1       | 0.385  |    0.556    |     2.124      |       5        |          0.017             |
|     5     |   1       | 0.385  |    0.556    |     1.505      |       3        |          0.003             |
|     6     |   1       | 0.385  |    0.556    |     2.135      |       5        |          0.027             |
|     7     |   1       | 0.385  |    0.556    |     1.85       |       5        |          0.014             |
|     11    |   1       | 0.385  |    0.556    |     2.12       |       5        |          0.023             |
|     13    |   1       | 0.385  |    0.556    |     1.337      |       4        |          0.008             |
|     14    |   1       | 0.385  |    0.556    |     1.738      |       5        |          0.084             |
|     15    |   1       | 0.385  |    0.556    |     1.756      |       5        |          0.003             |
|     16    |   1       | 0.385  |    0.556    |     1.363      |       4        |          0.007             |
|     17    |   1       | 0.385  |    0.556    |     1.571      |       4        |          0.03              |
|     18    |   1       | 0.385  |    0.556    |     2.139      |       5        |          0.014             |
|     19    |   1       | 0.385  |    0.556    |     1.479      |       4        |          0.037             |
|     12    |   0.624   | 0.439  |    0.543    |     2.105      |       5        |          0.039             |
|     2     |   0.538   | 0.373  |    0.468    |     1.975      |       5        |          0.045             |
|     10    |   0.389   | 0.262  |    0.337    |     1.718      |       5        |          0.008             |
|     1     |   0.391   | 0.259  |    0.33     |     1.784      |       5        |          0.051             |
|     0     |   0.331   | 0.226  |    0.299    |     1.646      |       5        |          0.084             |
|     9     |   0.302   | 0.197  |    0.253    |     1.72       |       5        |          0.132             |



#### Using 9 clusters 1.1m

| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
|     8     |   0.837   |  0.487 |    0.648    |     2.706      |       5        |          0.158             |
|     3     |     1     |  0.33  |    0.496    |     1.934      |       5        |          0.024             |
|     4     |     1     |  0.33  |    0.496    |     1.922      |       5        |          0.024             |
|     5     |     1     |  0.33  |    0.496    |     1.313      |       2        |          0.313             |
|     6     |     1     |  0.33  |    0.496    |     2.235      |       5        |          0.057             |
|     7     |     1     |  0.33  |    0.496    |     1.859      |       5        |          0.017             |
|     11    |     1     |  0.33  |    0.496    |     1.998      |       4        |          0.113             |
|     13    |     1     |  0.33  |    0.496    |     1.187      |       3        |          0.025             |
|     14    |     1     |  0.33  |    0.496    |     1.324      |       4        |          0.026             |
|     15    |     1     |  0.33  |    0.496    |     1.602      |       5        |          0.026             |
|     16    |     1     |  0.33  |    0.496    |     1.106      |       4        |          0.008             |
|     17    |     1     |  0.33  |    0.496    |     1.293      |       4        |          0.026             |
|     18    |     1     |  0.33  |    0.496    |     2.089      |       5        |          0.017             |
|     19    |     1     |  0.33  |    0.496    |     1.64       |       5        |          0.026             |
|     12    |   0.536   |  0.323 |    0.462    |     1.965      |       5        |          0.026             |
|     2     |   0.416   |  0.247 |    0.373    |     1.655      |       5        |          0.026             |
|     1     |   0.388   |  0.22  |    0.33     |     1.812      |       5        |          0.053             |
|     10    |   0.344   |  0.198 |    0.308    |     1.519      |       4        |          0.017             |
|     0     |   0.249   |  0.145 |    0.252    |     1.433      |       3        |          0.184             |
|     9     |   0.283   |  0.158 |    0.252    |     1.442      |       5        |          0.045             |


#### Using 7 clusters
| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
|     8     |   0.77    |  0.68  |    0.706    |     2.657      |       5        |          0.104             |
|     3     |   1       |  0.5   |    0.666    |     2.002      |       5        |          0.017             |
|     4     |   1       |  0.5   |    0.666    |     1.876      |       5        |          0.017             |
|     5     |   1       |  0.5   |    0.666    |     1.444      |       3        |          0.002             |
|     6     |   1       |  0.5   |    0.666    |     1.954      |       5        |          0.028             |
|     7     |   1       |  0.5   |    0.666    |     1.887      |       5        |          0.017             |
|     11    |   1       |  0.5   |    0.666    |     1.991      |       5        |          0.017             |
|     13    |   1       |  0.5   |    0.666    |     1.518      |       3        |          0.155             |
|     14    |   1       |  0.5   |    0.666    |     1.767      |       5        |          0.065             |
|     15    |   1       |  0.5   |    0.666    |     2.054      |       5        |          0.033             |
|     16    |   1       |  0.5   |    0.666    |     1.605      |       4        |          0.12              |
|     17    |   1       |  0.5   |    0.666    |     1.603      |       4        |          0.017             |
|     18    |   1       |  0.5   |    0.666    |     1.88       |       5        |          0.017             |
|     19    |   1       |  0.5   |    0.666    |     1.865      |       5        |          0.017             |
|     12    |   0.623   |  0.569 |    0.577    |     2.119      |       5        |          0.03              |
|     2     |   0.468   |  0.421 |    0.415    |     1.871      |       5        |          0.039             |
|     10    |   0.436   |  0.38  |    0.367    |     1.82       |       4        |          0.07              |
|     1     |   0.432   |  0.371 |    0.355    |     1.893      |       5        |          0.069             |
|     9     |   0.435   |  0.369 |    0.351    |     1.839      |       5        |          0.089             |
|     0     |   0.333   |  0.295 |    0.272    |     1.666      |       5        |          0.065             |


## Combined Features

### All results was based on frame extraction with frame_rate=25

#### Using 16 clusters


#### Using 10 clusters


#### Using             7 clusters



### All results was based on frame extraction with frame_rate=1

#### Using sqrt(N) = 83 clusters

| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
|     10    |     1     |  0.107 |    0.378    |     2.123      |       3        |          0.123             |
|     9     |     1     |  0.104 |    0.36     |     2          |       2        |          1                 |
|     8     |   0.825   |  0.089 |    0.359    |     2.65       |       3        |          0.825             |
|     0     |   0.683   |  0.074 |    0.339    |     1.683      |       2        |          0.683             |
|     1     |   0.757   |  0.08  |    0.332    |     2.506      |       4        |          0.135             |
|     12    |   0.123   |  0.014 |    0.273    |     1.123      |       2        |          0.123             |
|     2     |   0.123   |  0.014 |    0.265    |     1.123      |       2        |          0.123             |
|     3     |     1     |  0.061 |    0.115    |     1.752      |       3        |          0.243             |
|     4     |     1     |  0.061 |    0.115    |     1          |       1        |          1                 |
|     5     |     1     |  0.061 |    0.115    |     1          |       1        |          1                 |
|     6     |     1     |  0.061 |    0.115    |     1.369      |       4        |          0.123             |
|     7     |     1     |  0.061 |    0.115    |     3          |       3        |          1                 |
|     11    |     1     |  0.061 |    0.115    |     1.733      |       3        |          0.366             |
|     13    |     1     |  0.061 |    0.115    |     1          |       1        |          1                 |
|     14    |     1     |  0.061 |    0.115    |     1          |       1        |          1                 |
|     15    |     1     |  0.061 |    0.115    |     1          |       1        |          1                 |
|     16    |     1     |  0.061 |    0.115    |     3          |       3        |          1                 |
|     17    |     1     |  0.061 |    0.115    |     2          |       2        |          1                 |
|     18    |     1     |  0.061 |    0.115    |     1          |       1        |          1                 |
|     19    |     1     |  0.061 |    0.115    |     1          |       1        |          1                 |


#### Using 10 clusters
| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
|     3     |     1     |  0.528 |    0.691    |     1.994      |       5        |          0.015             |
|     4     |     1     |  0.528 |    0.691    |     1.925      |       5        |          0.015             |
|     5     |     1     |  0.528 |    0.691    |     1.844      |       4        |          0.115             |
|     6     |     1     |  0.528 |    0.691    |     2.298      |       5        |          0.069             |
|     7     |     1     |  0.528 |    0.691    |     1.762      |       5        |          0.01              |
|     11    |     1     |  0.528 |    0.691    |     1.872      |       5        |          0.016             |
|     13    |     1     |  0.528 |    0.691    |     1.839      |       5        |          0.033             |
|     14    |     1     |  0.528 |    0.691    |     2.005      |       5        |          0.066             |
|     15    |     1     |  0.528 |    0.691    |     2.012      |       5        |          0.016             |
|     16    |     1     |  0.528 |    0.691    |     1.626      |       4        |          0.005             |
|     17    |     1     |  0.528 |    0.691    |     1.941      |       4        |          0.088             |
|     18    |     1     |  0.528 |    0.691    |     2.012      |       5        |          0.01              |
|     19    |     1     |  0.528 |    0.691    |     2.122      |       5        |          0.049             |
|     12    |   0.668   |  0.645 |    0.631    |     2.282      |       5        |          0.066             |
|     8     |   0.683   |  0.638 |    0.629    |     2.319      |       5        |          0.099             |
|     2     |   0.662   |  0.629 |    0.616    |     2.35       |       5        |          0.062             |
|     0     |   0.569   |  0.533 |    0.51     |     2.046      |       5        |          0.066             |
|     9     |   0.558   |  0.499 |    0.475    |     1.978      |       5        |          0.048             |
|     1     |   0.509   |  0.462 |    0.432    |     2.093      |       5        |          0.066             |
|     10    |   0.454   |  0.418 |    0.381    |     2.05       |       5        |          0.088             |


#### Using 9 clusters 2m video

| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
|     3     |     1     |  0.543 |    0.704    |     2.018      |       5        |          0.032             |
|     4     |     1     |  0.543 |    0.704    |     1.799      |       5        |          0.016             |
|     5     |     1     |  0.543 |    0.704    |     1.828      |       4        |          0.112             |
|     6     |     1     |  0.543 |    0.704    |     2.05       |       5        |          0.049             |
|     7     |     1     |  0.543 |    0.704    |     1.755      |       5        |          0.016             |
|     11    |     1     |  0.543 |    0.704    |     1.982      |       5        |          0.016             |
|     13    |     1     |  0.543 |    0.704    |     1.738      |       4        |          0.048             |
|     14    |     1     |  0.543 |    0.704    |     1.829      |       5        |          0.06              |
|     15    |     1     |  0.543 |    0.704    |     1.899      |       5        |          0.027             |
|     16    |     1     |  0.543 |    0.704    |     1.797      |       4        |          0.111             |
|     17    |     1     |  0.543 |    0.704    |     1.762      |       4        |          0.016             |
|     18    |     1     |  0.543 |    0.704    |     1.954      |       5        |          0.016             |
|     19    |     1     |  0.543 |    0.704    |     1.865      |       5        |          0.014             |
|     12    |   0.619   |  0.615 |    0.582    |     2.158      |       5        |          0.044             |
|     8     |   0.612   |  0.588 |    0.557    |     2.189      |       5        |          0.032             |
|     2     |   0.547   |  0.534 |    0.496    |     1.914      |       5        |          0.03              |
|     10    |   0.554   |  0.525 |    0.487    |     2.226      |       5        |          0.064             |
|     9     |   0.466   |  0.429 |    0.377    |     1.986      |       5        |          0.035             |
|     1     |   0.447   |  0.417 |    0.363    |     1.815      |       5        |          0.051             |
|     0     |   0.372   |  0.358 |    0.298    |     1.76       |       5        |          0.06              |


#### Using 7 clusters 2.4m video

| Annotator | Precision | Recall | Weighted F1 | Avg Importance | Max Importance | Proportion High Importance |
|-----------|-----------|--------|-------------|----------------|----------------|----------------------------|
|     3     |     1     |  0.7   |    0.823    |     1.966      |       5        |          0.025             |
|     4     |     1     |  0.7   |    0.823    |     1.782      |       5        |          0.012             |
|     5     |     1     |  0.7   |    0.823    |     1.692      |       4        |          0.087             |
|     6     |     1     |  0.7   |    0.823    |     2.047      |       5        |          0.052             |
|     7     |     1     |  0.7   |    0.823    |     2.026      |       5        |          0.012             |
|     11    |     1     |  0.7   |    0.823    |     1.771      |       4        |          0.087             |
|     13    |     1     |  0.7   |    0.823    |     1.833      |       5        |          0.025             |
|     14    |     1     |  0.7   |    0.823    |     1.932      |       5        |          0.05              |
|     15    |     1     |  0.7   |    0.823    |     1.78       |       5        |          0.024             |
|     16    |     1     |  0.7   |    0.823    |     1.981      |       4        |          0.086             |
|     17    |     1     |  0.7   |    0.823    |     1.925      |       4        |          0.062             |
|     18    |     1     |  0.7   |    0.823    |     1.852      |       5        |          0.012             |
|     19    |     1     |  0.7   |    0.823    |     1.977      |       5        |          0.037             |
|     9     |   0.648   |  0.767 |    0.603    |     2.051      |       5        |          0.027             |
|     8     |   0.612   |  0.757 |    0.575    |     2.153      |       5        |          0.074             |
|     2     |   0.528   |  0.665 |    0.463    |     1.977      |       5        |          0.025             |
|     10    |   0.521   |  0.636 |    0.437    |     1.993      |       5        |          0.062             |
|     0     |   0.5     |  0.619 |    0.414    |     1.886      |       5        |          0.05              |
|     1     |   0.497   |  0.598 |    0.393    |     2.028      |       5        |          0.062             |
