In [9]:
!pip install opencv-python

Defaulting to user installation because normal site-packages is not writeable
Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86


### some resources
#### https://shimat.github.io/opencvsharp_docs/html/6121915d-1174-7345-bdca-789ee1373642.htm
#### https://www.geeksforgeeks.org/opencv-python-tutorial/?ref=shm 
#### https://www.geeksforgeeks.org/opencv-the-gunnar-farneback-optical-flow/

### Imports

In [3]:
import cv2
import os

### Splitting Clips Up From The Video + CLIP EXTRACTION

In [7]:
""" purpose of this function is to crop the borders equally on both sides. Tested 
different border_ratio param values and found that 0.343 best suited for 
the the compilations videos in the dataset."""
def crop_borders(frame, border_ratio = 0.343):
    height, width, _ = frame.shape
    crop_width = int(width * border_ratio)
    cropped_frame = frame[:, crop_width:-crop_width]  
    return cropped_frame

""" turns compilation into frames and compares the pixel intensities for frames after
turning it to grayscale in consecutive order and if it is greater than a certain threshold, 
func will classify it as a scene change.  
"""
def detect_scenes(video_path, threshold):
    # initialization
    cap = cv2.VideoCapture(video_path)
    scene_changes = []
    last_frame = None
    frame_index = 0

    # handling error if vid can't be opened
    if not cap.isOpened():
        print(f"Can't open video at {video_path}")
        return

    while True:
        success, frame = cap.read()
        if not success:
            break

        cropped_frame = crop_borders(frame)

        # convert to BGR, less compute + easy processing pixels
        gray_frame = cv2.cvtColor(cropped_frame, cv2.COLOR_BGR2GRAY)

        # compares pixel intensity btwn frames if last frame is present, retrieves diff in values
        if last_frame is not None:
            diff = cv2.absdiff(gray_frame, last_frame)
            mean_diff = diff.mean()

            if mean_diff > threshold:
                scene_changes.append(frame_index)

        last_frame = gray_frame
        frame_index += 1

    # avoid cv error
    cap.release()
    return scene_changes

""" takes in previous helper functions: crop_borders & detect_scenes. extracts clips with the side
borders cropped out instead of having to manually go into the vid to clip them out. """
def extract_clips(video_path, scene_changes, output_dir, border_ratio = 0.343):
    cap = cv2.VideoCapture(video_path)

    # handling error if vid can't be opened
    if not cap.isOpened():
        print(f"Can't open video at {video_path}")
        return

    # encode vid + retrieve dims
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    crop_width = int(width * border_ratio)
       
    output_width = width - 2 * crop_width
    output_height = height
    
    os.makedirs(output_dir, exist_ok=True)

    # append last frame to ensure the final clip is extracted bc detect_scene doesn't account for it
    scene_changes.append(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))

    print("Starting Clip Extraction:")
    for i in range(len(scene_changes) - 1):
        # initialize start and end times based on detect_scenes)
        start_frame = scene_changes[i]
        end_frame = scene_changes[i + 1]

        output_path = os.path.join(output_dir, f"clip_{i + 1}.mp4")
        out = cv2.VideoWriter(output_path, fourcc, fps, (output_width, output_height))

        #record
        cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        for frame_index in range(start_frame, end_frame):
            success, frame = cap.read()
            if not success:
                break

            cropped_frame = crop_borders(frame, border_ratio)
            out.write(cropped_frame)

        out.release()
        print(f"Clip {i+1} has been saved to {output_dir}.")

    cap.release()

if __name__ == "__main__":
    video_path = "data/YouTube.mp4"  
    output_dir = "data/clips"  

    #tested these values and found that these params were
    # best suited for threshold and border_ratio
    # most likely threshold can be altered depending on the compilation
    # video we're working with but highly unlikely, border_ratio is set          
    threshold = 51                 
    border_ratio = 0.343              

    scene_changes = detect_scenes(video_path, threshold)
    print(scene_changes)
    extract_clips(video_path, scene_changes, output_dir, border_ratio)

[463, 915, 1386, 1777, 1784, 1786, 1790, 1792, 1796, 1799, 1847, 1849, 1860, 1861, 1872, 1924, 1926, 1927, 1929, 1930, 1931, 1932, 1934, 1935, 1962, 2039, 2141, 2142, 2205, 2207, 2224, 2677, 2833, 2834, 3131, 3583, 4037, 4490, 4889, 4932, 4957, 4984, 5010, 5037, 5062, 5087, 5138, 5164, 5191, 5217, 5242, 5269, 5294, 5320, 5342, 5796, 6311, 6764, 6765, 7280, 7733, 8186, 8556, 9009, 9462, 9469, 9473, 9577, 9587, 9592, 9595, 9604, 9607, 9610, 9611, 9614, 9615, 9618, 9619, 9621, 9680, 9710, 9711, 9713, 9715, 9718, 9813, 9814, 9815, 9822, 9823, 9836, 10289, 10742, 11195, 11710, 12146, 12600, 13052, 13136, 13151, 13206, 13252, 13278, 13312, 13509, 13510, 13511]
Starting Clip Extraction:
Clip 1 has been saved to data/clips.
Clip 2 has been saved to data/clips.
Clip 3 has been saved to data/clips.
Clip 4 has been saved to data/clips.
Clip 5 has been saved to data/clips.
Clip 6 has been saved to data/clips.
Clip 7 has been saved to data/clips.
Clip 8 has been saved to data/clips.
Clip 9 has been

### Scrapping Clips Outside of Avg Clip Length

In [8]:
import cv2
import os

""" Purpose of function is to filter out clips within a time (seconds) deviation based on the average
clip length within the folder. This will remove clips whose pixel intensities are so high that the previous
helper functions continually clip it frame by frame and also the ending credits clip at the end of the video. 
Function will remove clip lenths that are less than the min_length (ARBITRARILY SET TO 5 SECONDS) because some clips have
constantly changing pixel intensities that surpass the threshold so these chopped clips will be deleted. Then the remaining
clips will have their average time length  calculated and a (arbitrary) deviation will be applied to the mean time length
and delete videos outside of that frame. 
FEEL FREE TO ALTER DEVIATION + MIN LENGTH IF IT DOESN'T SUIT CLIPS AFTER MANUALLY INSPECTING """
# i.e. arbitrary val of 2 based on the "leave dance" tiktok dance
def filtering_clips(output_dir, fps, deviation=3, min_length=5):
    clip_durations = []
    clip_paths = []

    for clip_name in os.listdir(output_dir):
        clip_path = os.path.join(output_dir, clip_name)
        
        cap = cv2.VideoCapture(clip_path)

        # handling error if vid can't be opened 
        if not cap.isOpened():
            print(f"Could not open {clip_path}, skipping.")
            continue
        
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration = frame_count / fps  
        cap.release()

        # FIRST FILTER: REMOVING CLIPS LESS THAN 5 SECONDS LONG
        if duration < min_length:
            os.remove(clip_path)
            print(f"Deleted {clip_path} (Duration: {duration:.2f} sec) - below minimum length of {min_length} sec")
        else:
            # only add clips that meet minimum length req
            clip_durations.append(duration)
            clip_paths.append(clip_path)

    if len(clip_durations) == 0:
        print("No valid clips found after applying the minimum length filter.")
        return

    average_duration = sum(clip_durations) / len(clip_durations)

    min_duration = max(0, average_duration - deviation)
    max_duration = average_duration + deviation
    print(f"Average clip duration: {average_duration:.2f} seconds")
    print(f"Removing clips shorter than {min_duration:.2f} seconds")

    for clip_path, duration in zip(clip_paths, clip_durations):
        # removes those outside of the deviation range of the avg
        # i.e. within 2 seconds of the average
        if duration < min_duration or duration > max_duration:
            os.remove(clip_path)
            print(f"Deleted {clip_path} (Duration: {duration:.2f} sec) - outside deviation")

if __name__ == "__main__":
    video_path = "data/YouTube.mp4"  
    output_dir = "data/clips"            

    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    filtering_clips(output_dir, fps)

Deleted data/clips/clip_12.mp4 (Duration: 0.37 sec) - below minimum length of 5 sec
Deleted data/clips/clip_13.mp4 (Duration: 0.03 sec) - below minimum length of 5 sec
Deleted data/clips/clip_39.mp4 (Duration: 1.43 sec) - below minimum length of 5 sec
Deleted data/clips/clip_11.mp4 (Duration: 0.07 sec) - below minimum length of 5 sec
Deleted data/clips/clip_10.mp4 (Duration: 1.60 sec) - below minimum length of 5 sec
Deleted data/clips/clip_14.mp4 (Duration: 0.37 sec) - below minimum length of 5 sec
Deleted data/clips/clip_28.mp4 (Duration: 2.10 sec) - below minimum length of 5 sec
Deleted data/clips/clip_29.mp4 (Duration: 0.07 sec) - below minimum length of 5 sec
Deleted data/clips/clip_15.mp4 (Duration: 1.73 sec) - below minimum length of 5 sec
Deleted data/clips/clip_17.mp4 (Duration: 0.03 sec) - below minimum length of 5 sec
Deleted data/clips/clip_16.mp4 (Duration: 0.07 sec) - below minimum length of 5 sec
Deleted data/clips/clip_8.mp4 (Duration: 0.13 sec) - below minimum length of

### creating data split

In [9]:
import os
import shutil
import random
"""Purpose of function is to divide the clips into three sets: training, validation, and testing. 
params for the ratios are set to 60:20:20 but FEEL FREE TO CHANGE THE VALS. 
Also, split is done randomly as welln. 
"""
def split_dataset(input_dir, output_dir, train_ratio, val_ratio, test_ratio):
    all_entries = os.listdir(input_dir)
    clips = []
    for clip in all_entries:
        file_path = os.path.join(input_dir, clip)
        
        if os.path.isfile(file_path):
            clips.append(clip)
        
    # RANDOMDMDMDMDMMDMMM
    # random.seed(42)
    random.shuffle(clips)
    
    total = len(clips)

    train_count = int(total * train_ratio)
    val_count = int(total * val_ratio)
    
    train_files = clips[:train_count]
    val_files = clips[train_count:train_count + val_count]
    test_files = clips[train_count + val_count:]
    
    train_dir = os.path.join(output_dir, "train")
    val_dir = os.path.join(output_dir, "val")
    test_dir = os.path.join(output_dir, "test")
    
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)
    
    # trainfiles
    for clip in train_files:
        src_path = os.path.join(input_dir, clip)
        dst_path = os.path.join(train_dir, clip)
        shutil.copy2(src_path, dst_path)
    
    # val files
    for clip in val_files:
        src_path = os.path.join(input_dir, clip)
        dst_path = os.path.join(val_dir, clip)
        shutil.copy2(src_path, dst_path)
    
    # test files
    for clip in test_files:
        src_path = os.path.join(input_dir, clip)
        dst_path = os.path.join(test_dir, clip)
        shutil.copy2(src_path, dst_path)

    #deeleting files in clip folder
    shutil.rmtree(input_dir)
    
if __name__ == "__main__":
    input_dir = "data/clips"  
    output_dir = "data/"  
    split_dataset(input_dir, output_dir, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2)
    print("DONE")


DONE


### potentially densepose