In [8]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
from scenedetect import SceneManager, open_video, ContentDetector
import scenedetect
import datetime
import json

In [9]:
# Input media: the audio track and the video file that are analysed together.
audio_path = './Data/Ready_Player_One_rgb/InputAudio.wav'
video_path = './Data/Ready_Player_One_rgb/InputVideo.mp4'

# Container for every result produced below. Start times are used as the keys.
dict_of_start_times = dict()

In [10]:
# --- Scene-level detection settings ---
# threshold (float): the average change in pixel intensity that must be
# exceeded to trigger a scene cut.
scene_threshold = 50
# Once a cut is detected, this many frames must pass before a new one can be
# added to the scene list.
min_scene_length = 60

# --- Shot-level detection settings ---
# threshold (float): the average change in pixel intensity that must be
# exceeded to trigger a shot cut.
shot_threshold = 35
# Once a cut is detected, this many frames must pass before a new shot can be
# added to the shot list.
min_shot_length = 50

In [11]:
def find_scenes(video_path, threshold=None, min_scene_len=None):
    """Run content-based scene detection over a whole video.

    Args:
        video_path: Path of the video file handed to ``open_video``.
        threshold: Value forwarded to ``ContentDetector(threshold=...)``.
            When ``None`` (the default) the notebook-level ``scene_threshold``
            is used, which is what the function always did before.
        min_scene_len: Value forwarded to ``ContentDetector(min_scene_len=...)``.
            When ``None`` (the default) the notebook-level ``min_scene_length``
            is used.

    Returns:
        The result of ``SceneManager.get_scene_list()``: a list of start/end
        timecode pairs, one per detected scene.
    """
    # Resolve the defaults at call time (not at definition time) so that
    # re-running the settings cell is picked up without redefining this function.
    if threshold is None:
        threshold = scene_threshold
    if min_scene_len is None:
        min_scene_len = min_scene_length

    video = open_video(video_path)
    scene_manager = SceneManager()
    scene_manager.add_detector(
        ContentDetector(threshold=threshold, min_scene_len=min_scene_len))
    # Detect all scenes in the video from the current position to the end.
    scene_manager.detect_scenes(video)
    return scene_manager.get_scene_list()

In [12]:
# Coarse pass: split the whole video into scenes.
scenes = find_scenes(video_path)

# Detector for the finer, per-scene shot pass. It is built once here and the
# same instance is handed to every later `scenedetect.detect` call.
# NOTE(review): presumably the detector keeps state between calls - confirm
# that sharing one instance across scenes is intended.
detector = scenedetect.detectors.ContentDetector(
    threshold=shot_threshold,
    min_scene_len=min_shot_length,
)

In [104]:
def _timecode_to_timedelta(timecode):
    """Convert an ``HH:MM:SS.fff`` timecode string into a ``datetime.timedelta``."""
    zero = datetime.datetime.strptime("00:00:00.000", "%H:%M:%S.%f")
    return datetime.datetime.strptime(timecode, "%H:%M:%S.%f") - zero


def _audio_subshot_bounds(waveform, sample_rate, offset_seconds, rms_factor=2.5, min_duration=1):
    """Split one shot's audio into sub-shots at large sample-to-sample jumps.

    Args:
        waveform: 1-D array of audio samples for the shot.
        sample_rate: Samples per second of ``waveform``.
        offset_seconds: Start of the shot within the full audio file; added to
            every returned time so results are relative to the file, not the shot.
        rms_factor: A jump counts as a change when it exceeds ``rms_factor`` times
            the RMS of the waveform. RMS is used because the typical change
            between samples is not known without manual inspection.
        min_duration: A sub-shot must be longer than this many seconds, so the
            segmentation does not change rapidly.

    Returns:
        A list of ``(start_seconds, stop_seconds)`` tuples, possibly empty.
    """
    if len(waveform) == 0:
        return []

    # Absolute difference between successive samples.
    jumps = np.abs(np.diff(waveform))
    delta_threshold = np.sqrt(np.mean(np.square(waveform))) * rms_factor

    # Only samples whose jump exceeds the threshold can end a sub-shot, so look
    # those up in one vectorised pass instead of visiting every sample in Python.
    # Index 0 can never close a segment (zero length), so including it changes
    # nothing. `.tolist()` gives plain Python ints, so the times stay plain floats.
    candidates = np.flatnonzero(jumps > delta_threshold).tolist()

    bounds = []
    start = 0
    for stop in candidates:
        if (stop - start) / sample_rate > min_duration:
            bounds.append((start / sample_rate + offset_seconds,
                           stop / sample_rate + offset_seconds))
            start = stop

    # Close the trailing segment if it is long enough.
    stop = len(waveform)
    if (stop - start) / sample_rate > min_duration:
        bounds.append((start / sample_rate + offset_seconds,
                       stop / sample_rate + offset_seconds))
    return bounds


# For every scene: detect its shots, then split each shot into audio sub-shots.
# Result layout: {scene start timecode: {shot start timecode: [sub-shot start seconds, ...]}}
for scene in scenes:
    scene_start = scene[0].get_timecode()
    scene_end = scene[1].get_timecode()
    # Run scene detection again on the original video, restricted to this
    # scene's time range, to obtain the shots inside it.
    # NOTE(review): `detector` is the same instance for every scene; presumably
    # it carries state over from the previous call - confirm this is intended.
    shots = scenedetect.detect(video_path, detector=detector, start_time=scene_start, end_time=scene_end)
    # Keep only shots whose start and stop timecodes differ.
    filtered_shots = [shot for shot in shots if shot[0].get_timecode() != shot[1].get_timecode()]
    dict_of_start_times[scene_start] = {}
    for shot in filtered_shots:
        shot_start = shot[0].get_timecode()
        shot_end = shot[1].get_timecode()
        start_delta = _timecode_to_timedelta(shot_start)
        end_delta = _timecode_to_timedelta(shot_end)
        duration = (end_delta - start_delta).total_seconds()
        # Load only the audio between the shot's start and stop times.
        waveform, sample_rate = librosa.load(audio_path, sr=None, offset=start_delta.total_seconds(), duration=duration)

        subshot_bounds = _audio_subshot_bounds(waveform, sample_rate, start_delta.total_seconds())

        # A shot with no sub-shot longer than the minimum duration is not
        # recorded at all (unchanged from the previous behaviour).
        # NOTE(review): confirm that dropping such short shots is desired.
        if subshot_bounds:
            dict_of_start_times[scene_start][shot_start] = [begin for begin, _ in subshot_bounds]
dict_of_start_times


{'00:00:00.000': {'00:00:00.000': [0.0,
   14.256394557823128,
   15.401972789115646,
   17.77392290249433,
   20.260997732426304,
   21.783242630385487,
   24.66780045351474,
   31.200226757369613],
  '00:00:36.000': [36.0],
  '00:00:37.700': [37.7, 91.47480725623583, 98.6436507936508],
  '00:01:48.133': [108.133],
  '00:01:50.067': [110.067]},
 '00:01:58.200': {'00:01:58.200': [118.2,
   131.9855328798186,
   134.27607709750566,
   136.5936961451247],
  '00:02:24.867': [144.867],
  '00:02:41.467': [161.467,
   162.75112698412698,
   163.82938095238097,
   171.15396145124717,
   183.339947845805],
  '00:03:06.633': [186.633]},
 '00:03:24.667': {'00:03:24.667': [204.667, 206.7152993197279],
  '00:03:30.067': [210.067]},
 '00:03:48.533': {'00:03:48.533': [228.533], '00:03:54.900': [234.9]},
 '00:04:13.033': {'00:04:13.033': [253.033, 254.4427052154195]},
 '00:04:15.633': {'00:04:15.633': [255.633], '00:04:21.167': [261.167]},
 '00:04:29.333': {'00:04:29.333': [269.333], '00:04:35.633': 

In [109]:
# Print the nested start-time mapping as JSON with a one-space indent.
print(json.dumps(dict_of_start_times, indent=1))

{
 "00:00:00.000": {
  "00:00:00.000": [
   0.0,
   14.256394557823128,
   15.401972789115646,
   17.77392290249433,
   20.260997732426304,
   21.783242630385487,
   24.66780045351474,
   31.200226757369613
  ],
  "00:00:36.000": [
   36.0
  ],
  "00:00:37.700": [
   37.7,
   91.47480725623583,
   98.6436507936508
  ],
  "00:01:48.133": [
   108.133
  ],
  "00:01:50.067": [
   110.067
  ]
 },
 "00:01:58.200": {
  "00:01:58.200": [
   118.2,
   131.9855328798186,
   134.27607709750566,
   136.5936961451247
  ],
  "00:02:24.867": [
   144.867
  ],
  "00:02:41.467": [
   161.467,
   162.75112698412698,
   163.82938095238097,
   171.15396145124717,
   183.339947845805
  ],
  "00:03:06.633": [
   186.633
  ]
 },
 "00:03:24.667": {
  "00:03:24.667": [
   204.667,
   206.7152993197279
  ],
  "00:03:30.067": [
   210.067
  ]
 },
 "00:03:48.533": {
  "00:03:48.533": [
   228.533
  ],
  "00:03:54.900": [
   234.9
  ]
 },
 "00:04:13.033": {
  "00:04:13.033": [
   253.033,
   254.4427052154195
  ]