In [9]:
!pip install opencv-python

Defaulting to user installation because normal site-packages is not writeable
Collecting opencv-python
  Downloading opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl.metadata (20 kB)
Downloading opencv_python-4.11.0.86-cp37-abi3-macosx_13_0_arm64.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.11.0.86


### Some resources
- OpenCvSharp documentation page: https://shimat.github.io/opencvsharp_docs/html/6121915d-1174-7345-bdca-789ee1373642.htm
- GeeksforGeeks OpenCV Python tutorial: https://www.geeksforgeeks.org/opencv-python-tutorial/?ref=shm
- GeeksforGeeks, the Gunnar Farneback optical flow in OpenCV: https://www.geeksforgeeks.org/opencv-the-gunnar-farneback-optical-flow/

### Imports

In [6]:
import cv2
import os

### Splitting Clips Up From The Video + CLIP EXTRACTION

In [7]:
def crop_borders(frame, border_ratio = 0.343):
    """Crop an equally wide strip off the left and right edges of a frame.

    Args:
        frame: image array indexed as [row, column, ...]; colour (3-D) and
            grayscale (2-D) frames are both accepted.
        border_ratio: fraction of the frame width removed from EACH side.
            0.343 was found by trial to suit the compilation videos in the
            dataset.

    Returns:
        A slice of ``frame`` with ``int(width * border_ratio)`` columns removed
        from both sides. Rows are untouched. A ratio of 0.5 or more leaves no
        columns.
    """
    width = frame.shape[1]
    crop_width = int(width * border_ratio)
    # Use an explicit right edge: a `-crop_width` stop becomes `-0` when nothing
    # is cropped, and `frame[:, 0:-0]` selects zero columns instead of all of them.
    return frame[:, crop_width:width - crop_width]

def detect_scenes(video_path, threshold, border_ratio = 0.343):
    """Find the frames at which a compilation video cuts to a new scene.

    Every frame is border-cropped, converted to grayscale and compared with the
    previous frame. When the mean absolute pixel difference exceeds
    ``threshold`` the current frame is recorded as a scene change.

    Args:
        video_path: path of the video to scan.
        threshold: mean absolute grayscale difference above which two
            consecutive frames are treated as belonging to different scenes.
        border_ratio: fraction of the width cropped from each side before
            comparing. Pass the same value that is later given to
            ``extract_clips`` so detection and extraction see the same region.

    Returns:
        List of 0-based frame indices, each the first frame of a new scene, or
        ``None`` (after printing a message) when the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # handling error if vid can't be opened
        if not cap.isOpened():
            print(f"Can't open video at {video_path}")
            return None

        scene_changes = []
        last_frame = None
        frame_index = 0

        while True:
            success, frame = cap.read()
            if not success:
                break

            # BGR -> grayscale: a single channel is cheaper to diff
            gray_frame = cv2.cvtColor(crop_borders(frame, border_ratio), cv2.COLOR_BGR2GRAY)

            # the first frame has no predecessor, so comparison starts at frame 1
            if last_frame is not None:
                mean_diff = cv2.absdiff(gray_frame, last_frame).mean()
                if mean_diff > threshold:
                    scene_changes.append(frame_index)

            last_frame = gray_frame
            frame_index += 1

        return scene_changes
    finally:
        # release the capture even if cropping or colour conversion raised
        cap.release()

def extract_clips(video_path, scene_changes, output_dir, border_ratio = 0.343):
    """Write one border-cropped clip per pair of consecutive scene changes.

    Clips are saved as ``clip_1.mp4``, ``clip_2.mp4``, ... in ``output_dir``
    (created if missing), encoded with the ``mp4v`` fourcc at the source frame
    rate. The total frame count is used as a final boundary so the last scene
    is also written.

    Args:
        video_path: path of the source compilation video.
        scene_changes: frame indices returned by ``detect_scenes``. The list is
            not modified.
        output_dir: directory that receives the clips.
        border_ratio: fraction of the width cropped from each side of every frame.

    Returns:
        None. Prints a message and returns early if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # handling error if vid can't be opened
        if not cap.isOpened():
            print(f"Can't open video at {video_path}")
            return

        # encode vid + retrieve dims
        fps = cap.get(cv2.CAP_PROP_FPS)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        crop_width = int(width * border_ratio)
        frame_size = (width - 2 * crop_width, height)

        os.makedirs(output_dir, exist_ok=True)

        # Work on a copy so the caller's list is left untouched; the frame count
        # closes the final clip because detect_scenes never reports the end of the video.
        boundaries = list(scene_changes or [])
        boundaries.append(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))
        # NOTE(review): frames before the first detected change (0 .. boundaries[0])
        # are never written - confirm that dropping the opening segment is intended.

        print("Starting Clip Extraction:")
        for i in range(len(boundaries) - 1):
            start_frame = boundaries[i]
            end_frame = boundaries[i + 1]

            output_path = os.path.join(output_dir, f"clip_{i + 1}.mp4")
            out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)
            try:
                if not out.isOpened():
                    print(f"Can't write clip to {output_path}, skipping.")
                    continue

                # seek to the start of the scene, then copy frames up to the next boundary
                cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
                for _ in range(start_frame, end_frame):
                    success, frame = cap.read()
                    if not success:
                        break
                    out.write(crop_borders(frame, border_ratio))
            finally:
                out.release()

            print(f"Clip {i+1} has been saved to {output_dir}.")
    finally:
        cap.release()

### Change the two paths below to match the video being processed.
if __name__ == '__main__':
    # Both values were chosen by trial. threshold may need adjusting for a
    # different compilation video (though that is unlikely); border_ratio is fixed.
    border_ratio = 0.343
    threshold = 51

    video_path = "mp4/ratdance.mp4"
    output_dir = "ratdance/clips"

    # Detect the cut points, show them, then write one clip per scene.
    scene_changes = detect_scenes(video_path, threshold)
    print(scene_changes)
    extract_clips(video_path, scene_changes, output_dir, border_ratio)

[447, 894, 1028, 1095, 1162, 1227, 1585, 1590, 1591, 1593, 1594, 1722, 2067, 2076, 2077, 2210, 2658, 2681, 2698, 2716, 3559, 4007, 4790, 5238, 6006, 6295, 6742, 7189, 7637, 8335, 8783, 9202, 9649, 9672, 9693, 9705, 9910, 10324, 10729, 10855, 11063, 11137, 11367, 11712, 12126, 12573, 13142, 13556, 14003, 14289, 14360, 14632, 15080, 15527, 15666, 15727, 15795, 15885, 16267, 16281, 16672, 17264, 17314, 17420, 17712, 18179, 18508, 18941, 19388, 19408, 19424, 19441, 19589, 19621, 19818, 20266, 20714, 21161, 21608, 22055, 22124, 22190, 22319, 22385, 22831, 23281, 23345, 23671, 23729, 24176]
Starting Clip Extraction:
Clip 1 has been saved to ratdance/clips.
Clip 2 has been saved to ratdance/clips.
Clip 3 has been saved to ratdance/clips.
Clip 4 has been saved to ratdance/clips.
Clip 5 has been saved to ratdance/clips.
Clip 6 has been saved to ratdance/clips.
Clip 7 has been saved to ratdance/clips.
Clip 8 has been saved to ratdance/clips.
Clip 9 has been saved to ratdance/clips.
Clip 10 has b

### Scrapping Clips Outside of Avg Clip Length

In [None]:
import cv2
import os

def filtering_clips(output_dir, fps, deviation=3, min_length=5):
    """Delete clips in ``output_dir`` whose length is far from the typical clip length.

    Filtering is destructive and happens in two passes:

    1. Clips shorter than ``min_length`` seconds are deleted. These are mostly
       fragments produced when pixel intensities change so quickly that scene
       detection cuts frame by frame.
    2. The mean duration of the surviving clips is computed, and every clip
       outside ``mean - deviation .. mean + deviation`` seconds is deleted
       (for example the ending-credits clip).

    ``deviation`` and ``min_length`` are arbitrary defaults; adjust them if the
    result does not suit the clips after manual inspection. Re-running on an
    already filtered folder recomputes the mean and may delete further clips.

    Args:
        output_dir: folder containing the extracted ``.mp4`` clips.
        fps: frame rate used to convert frame counts to seconds. If it is not
            positive, each clip's own reported frame rate is used instead.
        deviation: allowed distance from the mean duration, in seconds.
        min_length: minimum clip duration in seconds.

    Returns:
        None.
    """
    clip_durations = []
    clip_paths = []

    for clip_name in os.listdir(output_dir):
        # Only consider video clips: this function deletes files, so other entries
        # that happen to live in the folder must never be candidates.
        if not clip_name.lower().endswith('.mp4'):
            continue
        clip_path = os.path.join(output_dir, clip_name)

        cap = cv2.VideoCapture(clip_path)
        try:
            # handling error if vid can't be opened
            if not cap.isOpened():
                print(f"Could not open {clip_path}, skipping.")
                continue
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            clip_fps = fps if fps and fps > 0 else cap.get(cv2.CAP_PROP_FPS)
        finally:
            # released before any os.remove below
            cap.release()

        if not clip_fps or clip_fps <= 0:
            print(f"Could not determine frame rate for {clip_path}, skipping.")
            continue
        duration = frame_count / clip_fps

        # FIRST FILTER: remove clips shorter than min_length seconds
        if duration < min_length:
            os.remove(clip_path)
            print(f"Deleted {clip_path} (Duration: {duration:.2f} sec) - below minimum length of {min_length} sec")
        else:
            # only clips that meet the minimum length contribute to the average
            clip_durations.append(duration)
            clip_paths.append(clip_path)

    if len(clip_durations) == 0:
        print("No valid clips found after applying the minimum length filter.")
        return

    average_duration = sum(clip_durations) / len(clip_durations)

    min_duration = max(0, average_duration - deviation)
    max_duration = average_duration + deviation
    print(f"Average clip duration: {average_duration:.2f} seconds")
    print(f"Removing clips shorter than {min_duration:.2f} seconds or longer than {max_duration:.2f} seconds")

    # SECOND FILTER: remove clips outside mean +/- deviation
    for clip_path, duration in zip(clip_paths, clip_durations):
        if duration < min_duration or duration > max_duration:
            os.remove(clip_path)
            print(f"Deleted {clip_path} (Duration: {duration:.2f} sec) - outside deviation")

### ALTER DIRECTORIES TO WHERE THE VIDEO IS LOCATED FOR EACH VID
if __name__ == "__main__":
    video_path = "mp4/ratdance.mp4"
    output_dir = "ratdance/clips"

    # The clips are timed with the frame rate of the source compilation.
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()

    # A missing or unreadable source video reports 0 fps, and durations cannot
    # be computed from that, so skip filtering rather than crash.
    if fps > 0:
        filtering_clips(output_dir, fps)
    else:
        print(f"Can't read frame rate from {video_path}; clips were not filtered.")

Deleted ratdance/clips/clip_13.mp4 (Duration: 0.30 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_39.mp4 (Duration: 4.20 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_11.mp4 (Duration: 4.27 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_10.mp4 (Duration: 0.03 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_14.mp4 (Duration: 0.03 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_15.mp4 (Duration: 4.44 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_17.mp4 (Duration: 0.77 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_8.mp4 (Duration: 0.03 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_59.mp4 (Duration: 0.47 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_71.mp4 (Duration: 0.57 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_70.mp4 (Duration: 0.53 sec) - below minimum length of 5 sec
Deleted ratdance/clips/clip_9.mp4

### Creating Data Split

In [9]:
import os
import shutil
import random
def split_dataset(input_dir, output_dir, train_ratio, val_ratio, test_ratio, seed=None):
    """Randomly split the files of ``input_dir`` into train / val / test folders.

    Every regular file directly inside ``input_dir`` is shuffled and copied into
    ``output_dir/train``, ``output_dir/val`` or ``output_dir/test``. The train and
    val sizes are ``int(total * ratio)``; the test split receives the remainder.
    After copying, ``input_dir`` is removed entirely, including any
    sub-directories, which are not copied.

    Args:
        input_dir: folder holding the clips to split. Deleted on success.
        output_dir: folder that receives the ``train``, ``val`` and ``test``
            sub-folders. Must not be ``input_dir`` or lie inside it.
        train_ratio, val_ratio, test_ratio: non-negative fractions that sum to 1
            (for example 0.6 / 0.2 / 0.2).
        seed: optional seed for a reproducible shuffle. With ``None`` the
            module-level random generator is used.

    Raises:
        ValueError: if a ratio is negative, the ratios do not sum to 1, or
            ``output_dir`` is ``input_dir`` or a folder inside it.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(r < 0 for r in ratios) or abs(sum(ratios) - 1.0) > 1e-6:
        raise ValueError(f"train/val/test ratios must be non-negative and sum to 1, got {ratios}")

    # input_dir is deleted at the end; if the splits were created inside it they
    # would be deleted together with the originals.
    src_root = os.path.normcase(os.path.abspath(input_dir))
    dst_root = os.path.normcase(os.path.abspath(output_dir))
    try:
        output_inside_input = os.path.commonpath([src_root, dst_root]) == src_root
    except ValueError:
        # raised on Windows when the two paths are on different drives
        output_inside_input = False
    if output_inside_input:
        raise ValueError(f"output_dir ({output_dir}) must not be input_dir ({input_dir}) or a folder inside it")

    # sorted first so that a given seed always produces the same split
    clips = sorted(
        name for name in os.listdir(input_dir)
        if os.path.isfile(os.path.join(input_dir, name))
    )
    if seed is None:
        random.shuffle(clips)
    else:
        random.Random(seed).shuffle(clips)

    total = len(clips)
    train_count = int(total * train_ratio)
    val_count = int(total * val_ratio)

    splits = {
        "train": clips[:train_count],
        "val": clips[train_count:train_count + val_count],
        "test": clips[train_count + val_count:],
    }

    for split_name, split_files in splits.items():
        split_dir = os.path.join(output_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)
        for clip in split_files:
            shutil.copy2(os.path.join(input_dir, clip), os.path.join(split_dir, clip))

    # the originals are removed only after every copy has succeeded
    shutil.rmtree(input_dir)
    
### Point input_dir at the clip folder to split and output_dir at the folder that receives train/val/test.
if __name__ == '__main__':
    input_dir, output_dir = "ratdance/clips", "ratdance/"

    # 60 / 20 / 20 split
    split_dataset(
        input_dir,
        output_dir,
        train_ratio=0.6,
        val_ratio=0.2,
        test_ratio=0.2,
    )
    print("DONE")


DONE


### Extracting Negative Control

In [3]:
import cv2
import os
import shutil
import random
import math

def crop_borders(frame, border_ratio = 0.343):
    """Crop an equally wide strip off the left and right edges of a frame.

    Args:
        frame: image array indexed as [row, column, ...]; colour (3-D) and
            grayscale (2-D) frames are both accepted.
        border_ratio: fraction of the frame width removed from EACH side.
            0.343 was found by trial to suit the compilation videos in the
            dataset.

    Returns:
        A slice of ``frame`` with ``int(width * border_ratio)`` columns removed
        from both sides. Rows are untouched. A ratio of 0.5 or more leaves no
        columns.
    """
    width = frame.shape[1]
    crop_width = int(width * border_ratio)
    # Use an explicit right edge: a `-crop_width` stop becomes `-0` when nothing
    # is cropped, and `frame[:, 0:-0]` selects zero columns instead of all of them.
    return frame[:, crop_width:width - crop_width]

def detect_scenes(video_path, threshold, border_ratio = 0.343):
    """Find the frames at which a compilation video cuts to a new scene.

    Every frame is border-cropped, converted to grayscale and compared with the
    previous frame. When the mean absolute pixel difference exceeds
    ``threshold`` the current frame is recorded as a scene change.

    Args:
        video_path: path of the video to scan.
        threshold: mean absolute grayscale difference above which two
            consecutive frames are treated as belonging to different scenes.
        border_ratio: fraction of the width cropped from each side before
            comparing.

    Returns:
        List of 0-based frame indices, each the first frame of a new scene, or
        ``None`` (after printing a message) when the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # handling error if vid can't be opened
        if not cap.isOpened():
            print(f"Can't open video at {video_path}")
            return None

        scene_changes = []
        last_frame = None
        frame_index = 0

        while True:
            success, frame = cap.read()
            if not success:
                break

            # BGR -> grayscale: a single channel is cheaper to diff
            gray_frame = cv2.cvtColor(crop_borders(frame, border_ratio), cv2.COLOR_BGR2GRAY)

            # the first frame has no predecessor, so comparison starts at frame 1
            if last_frame is not None:
                mean_diff = cv2.absdiff(gray_frame, last_frame).mean()
                if mean_diff > threshold:
                    scene_changes.append(frame_index)

            last_frame = gray_frame
            frame_index += 1

        return scene_changes
    finally:
        # release the capture even if cropping or colour conversion raised
        cap.release()

def extract_clips(video_path, scene_changes, output_dir, num_clips, border_ratio = 0.343, min_length = 5):
    """Write up to ``num_clips`` randomly chosen, border-cropped scenes from a video.

    Consecutive scene changes (plus the total frame count as a final boundary)
    define candidate scenes. Scenes shorter than ``min_length`` seconds are
    discarded, because rapid intensity changes make detection cut almost every
    frame. Up to ``num_clips`` of the rest are sampled at random and saved as
    ``<video name>_clip_<n>.mp4`` in ``output_dir`` (created if missing), encoded
    with the ``mp4v`` fourcc at the source frame rate.

    Args:
        video_path: path of the source compilation video.
        scene_changes: frame indices returned by ``detect_scenes``. The list is
            not modified.
        output_dir: directory that receives the clips.
        num_clips: maximum number of clips to write for this video.
        border_ratio: fraction of the width cropped from each side of every frame.
        min_length: minimum scene duration in seconds (arbitrary default of 5).

    Returns:
        None. Prints a message and returns early if the video cannot be opened
        or reports no usable frame rate.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        # handling error if vid can't be opened
        if not cap.isOpened():
            print(f"Can't open video at {video_path}")
            return

        # encode vid + retrieve dims
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps <= 0:
            # without a frame rate neither min_length nor durations can be computed
            print(f"Can't read frame rate for {video_path}")
            return
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        crop_width = int(width * border_ratio)
        frame_size = (width - 2 * crop_width, height)

        os.makedirs(output_dir, exist_ok=True)

        # Work on a copy so the caller's list is left untouched; the frame count
        # closes the final scene because detect_scenes never reports the end of the video.
        boundaries = list(scene_changes or [])
        boundaries.append(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))
        # NOTE(review): frames before the first detected change (0 .. boundaries[0])
        # are never candidates - confirm that dropping the opening segment is intended.

        # keep only scenes that last at least min_length seconds
        min_frames = int(math.ceil(fps * min_length))
        clip_pairs = [
            (start_frame, end_frame)
            for start_frame, end_frame in zip(boundaries, boundaries[1:])
            if end_frame - start_frame >= min_frames
        ]
        if not clip_pairs:
            return

        # RANDOMLY SELECT UP TO NUM_CLIPS PAIRS
        selected_pairs = random.sample(clip_pairs, min(num_clips, len(clip_pairs)))
        print(f"Extracting {len(selected_pairs)} clips from {video_path}")

        video_name = os.path.splitext(os.path.basename(video_path))[0]
        for i, (start_frame, end_frame) in enumerate(selected_pairs):
            clip_filename = f"{video_name}_clip_{i+1}.mp4"
            output_path = os.path.join(output_dir, clip_filename)
            out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)
            frames_written = 0
            try:
                if not out.isOpened():
                    print(f"Can't write clip to {output_path}, skipping.")
                    continue

                # seek to the start of the scene, then copy frames up to the next boundary
                cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
                for _ in range(start_frame, end_frame):
                    success, frame = cap.read()
                    if not success:
                        break
                    out.write(crop_borders(frame, border_ratio))
                    frames_written += 1
            finally:
                out.release()

            # report what was actually written, which is shorter than the nominal
            # span if reading stopped early
            actual_duration = frames_written / fps
            print(f"Saved: {output_path} (Duration: {actual_duration:.2f} sec)")
    finally:
        cap.release()

def filtering_clips(output_dir, fps, deviation=3, min_length=5):
    """Delete clips whose length is far from the typical length for their source video.

    ``.mp4`` files in ``output_dir`` are grouped by the part of the file name
    before ``_clip_`` (the original video / dance name), and each group is
    filtered on its own in two destructive passes:

    1. Clips shorter than ``min_length`` seconds are deleted. These are mostly
       fragments produced when pixel intensities change so quickly that scene
       detection cuts frame by frame.
    2. The mean duration of the group's surviving clips is computed, and every
       clip outside ``mean - deviation .. mean + deviation`` seconds is deleted
       (for example the ending-credits clip).

    ``deviation`` and ``min_length`` are arbitrary defaults; adjust them if the
    result does not suit the clips after manual inspection. Progress is printed
    for every group and every deleted clip.

    Args:
        output_dir: folder containing the extracted clips.
        fps: frame rate used to convert frame counts to seconds. If it is not
            positive, each clip's own reported frame rate is used instead.
        deviation: allowed distance from the group's mean duration, in seconds.
        min_length: minimum clip duration in seconds.

    Returns:
        None.
    """
    # group clips by the original video name (the part before '_clip_')
    groups = {}
    for clip_name in os.listdir(output_dir):
        if clip_name.endswith('.mp4'):
            group_key = clip_name.split("_clip_")[0]
            groups.setdefault(group_key, []).append(clip_name)

    # process each group/dance separately
    for group_key, clip_names in groups.items():
        clip_durations = []
        clip_paths = []

        # measure durations for all clips in the group
        for clip_name in clip_names:
            clip_path = os.path.join(output_dir, clip_name)
            cap = cv2.VideoCapture(clip_path)
            try:
                # handling error if vid can't be opened
                if not cap.isOpened():
                    print(f"Could not open {clip_path}, skipping.")
                    continue
                frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                clip_fps = fps if fps and fps > 0 else cap.get(cv2.CAP_PROP_FPS)
            finally:
                # released before any os.remove below
                cap.release()

            if not clip_fps or clip_fps <= 0:
                print(f"Could not determine frame rate for {clip_path}, skipping.")
                continue
            duration = frame_count / clip_fps

            # FIRST FILTER: remove clips shorter than min_length seconds
            if duration < min_length:
                os.remove(clip_path)
                print(f"Deleted {clip_path} (Duration: {duration:.2f} sec), below {min_length}s")
            else:
                clip_durations.append(duration)
                clip_paths.append(clip_path)

        if len(clip_durations) == 0:
            print(f"No valid clips for {group_key} after min_length filter")
            continue

        avg_duration = sum(clip_durations) / len(clip_durations)
        min_duration = max(0, avg_duration - deviation)
        max_duration = avg_duration + deviation
        print(f"{group_key}")
        print(f"Avg Clip Duration = {avg_duration:.2f}s")
        print(f"Removing clips outside dev ({min_duration:.2f}s - {max_duration:.2f}s)")

        # SECOND FILTER: remove clips outside the deviation range for this group
        for clip_path, duration in zip(clip_paths, clip_durations):
            if duration < min_duration or duration > max_duration:
                os.remove(clip_path)
                print(f"Deleted {clip_path} (Duration: {duration:.2f}s) - outside deviation")

def split_dataset(input_dir, output_dir, train_ratio, val_ratio, test_ratio, seed=None):
    """Randomly split the files of ``input_dir`` into train / val / test folders.

    Every regular file directly inside ``input_dir`` (of any extension) is
    shuffled and copied into ``output_dir/train``, ``output_dir/val`` or
    ``output_dir/test``. The train and val sizes are ``int(total * ratio)``; the
    test split receives the remainder. After copying, ``input_dir`` is removed
    entirely, including any sub-directories, which are not copied.

    Args:
        input_dir: folder holding the clips to split. Deleted on success.
        output_dir: folder that receives the ``train``, ``val`` and ``test``
            sub-folders. Must not be ``input_dir`` or lie inside it.
        train_ratio, val_ratio, test_ratio: non-negative fractions that sum to 1
            (for example 0.6 / 0.2 / 0.2).
        seed: optional seed for a reproducible shuffle. With ``None`` the
            module-level random generator is used.

    Raises:
        ValueError: if a ratio is negative, the ratios do not sum to 1, or
            ``output_dir`` is ``input_dir`` or a folder inside it.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(r < 0 for r in ratios) or abs(sum(ratios) - 1.0) > 1e-6:
        raise ValueError(f"train/val/test ratios must be non-negative and sum to 1, got {ratios}")

    # input_dir is deleted at the end; if the splits were created inside it they
    # would be deleted together with the originals.
    src_root = os.path.normcase(os.path.abspath(input_dir))
    dst_root = os.path.normcase(os.path.abspath(output_dir))
    try:
        output_inside_input = os.path.commonpath([src_root, dst_root]) == src_root
    except ValueError:
        # raised on Windows when the two paths are on different drives
        output_inside_input = False
    if output_inside_input:
        raise ValueError(f"output_dir ({output_dir}) must not be input_dir ({input_dir}) or a folder inside it")

    # sorted first so that a given seed always produces the same split
    clips = sorted(
        name for name in os.listdir(input_dir)
        if os.path.isfile(os.path.join(input_dir, name))
    )
    if seed is None:
        random.shuffle(clips)
    else:
        random.Random(seed).shuffle(clips)

    total = len(clips)
    train_count = int(total * train_ratio)
    val_count = int(total * val_ratio)

    splits = {
        "train": clips[:train_count],
        "val": clips[train_count:train_count + val_count],
        "test": clips[train_count + val_count:],
    }

    for split_name, split_files in splits.items():
        split_dir = os.path.join(output_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)
        for clip in split_files:
            shutil.copy2(os.path.join(input_dir, clip), os.path.join(split_dir, clip))

    # the originals are removed only after every copy has succeeded
    shutil.rmtree(input_dir)

if __name__ == "__main__":
    # detection / cropping settings
    threshold = 51
    border_ratio = 0.343

    # ALTER VAL FOR MORE CLIPS PER VID
    num_clips_per_video = 5

    input_folder = "mp4"
    output_folder = "negative_control"

    # creating the clips folder also creates output_folder itself
    clips_dir = os.path.join(output_folder, "clips")
    os.makedirs(clips_dir, exist_ok=True)

    # pull a few random scenes out of every mp4 in the input folder
    for filename in os.listdir(input_folder):
        if not filename.endswith('.mp4'):
            continue

        video_path = os.path.join(input_folder, filename)

        # NOTE(review): fps is read here but not used by anything below in this cell
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        cap.release()

        scene_changes = detect_scenes(video_path, threshold, border_ratio)
        if not scene_changes:
            continue
        extract_clips(video_path, scene_changes, clips_dir, num_clips_per_video, border_ratio, min_length=5)

    # 60 / 20 / 20 split of everything that was extracted
    split_dataset(clips_dir, output_folder, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2)
    print("negative control dataset DONE!")


Extracting 5 clips from mp4/savagelove.mp4
Saved: negative_control/clips/savagelove_clip_1.mp4 (Duration: 15.13 sec)
Saved: negative_control/clips/savagelove_clip_2.mp4 (Duration: 15.07 sec)
Saved: negative_control/clips/savagelove_clip_3.mp4 (Duration: 15.03 sec)
Saved: negative_control/clips/savagelove_clip_4.mp4 (Duration: 15.13 sec)
Saved: negative_control/clips/savagelove_clip_5.mp4 (Duration: 15.13 sec)
Extracting 5 clips from mp4/fendi&prada.mp4
Saved: negative_control/clips/fendi&prada_clip_1.mp4 (Duration: 14.55 sec)
Saved: negative_control/clips/fendi&prada_clip_2.mp4 (Duration: 14.58 sec)
Saved: negative_control/clips/fendi&prada_clip_3.mp4 (Duration: 14.55 sec)
Saved: negative_control/clips/fendi&prada_clip_4.mp4 (Duration: 14.61 sec)
Saved: negative_control/clips/fendi&prada_clip_5.mp4 (Duration: 14.58 sec)
Extracting 5 clips from mp4/facelikeamodel.mp4
Saved: negative_control/clips/facelikeamodel_clip_1.mp4 (Duration: 15.07 sec)
Saved: negative_control/clips/facelikeamode