In [64]:
import numpy as np
import librosa
import matplotlib.pyplot as plt

In [65]:
# Relative path to the input audio track (WAV) analysed in this notebook
audio_path = "./Data/Ready_Player_One_rgb/InputAudio.wav"

In [66]:
# Experiment 1: check whether the detection method works on the entire
# recording, i.e. without restricting the load to a start/stop window.

# Load the full audio waveform; sr=None keeps the file's native sample rate
waveform, sample_rate = librosa.load(audio_path, sr=None)

# Absolute difference between successive samples:
# diff[i] = |waveform[i + 1] - waveform[i]|
diff = np.abs(np.diff(waveform))

# A scene change is flagged when a sample-to-sample jump exceeds a multiple of
# the waveform's RMS level.
threshold_scale = 2
delta_threshold = np.sqrt(np.mean(np.square(waveform))) * threshold_scale

# Minimum segment length in seconds, so the classification does not change rapidly
min_segment_seconds = 1

# Locate every above-threshold jump in one vectorized pass instead of testing
# each sample in a Python-level loop. Index 0 is left out, exactly as in a scan
# that starts at index 1, and .tolist() yields plain Python ints so the times
# computed below are not numpy scalars converted from the index array.
change_indices = (np.flatnonzero(diff[1:] > delta_threshold) + 1).tolist()

# Collect the (start, stop) times, in seconds, of each detected segment
start_stop_times = []
start = 0
for i in change_indices:
    stop = i / sample_rate
    # Only cut here if the current segment is already longer than the minimum
    if stop - start > min_segment_seconds:
        start_stop_times.append((start, stop))
        start = stop

# Close the final segment at the end of the recording.
# NOTE: this tail is appended even when it is shorter than min_segment_seconds.
if start < len(waveform) / sample_rate:
    start_stop_times.append((start, len(waveform) / sample_rate))

# Print the start and stop times list
print(start_stop_times)

[(0, 15.410430839002268), (15.410430839002268, 17.774104308390022), (17.774104308390022, 20.276394557823128), (20.276394557823128, 21.783968253968254), (21.783968253968254, 31.214036281179137), (31.214036281179137, 37.41743764172335), (37.41743764172335, 38.42569160997733), (38.42569160997733, 91.47480725623583), (91.47480725623583, 98.6391156462585), (98.6391156462585, 108.70587301587301), (108.70587301587301, 109.81174603174603), (109.81174603174603, 124.30213151927438), (124.30213151927438, 128.03045351473924), (128.03045351473924, 131.31315192743764), (131.31315192743764, 132.34945578231293), (132.34945578231293, 133.3499319727891), (133.3499319727891, 134.35009070294785), (134.35009070294785, 136.2178231292517), (136.2178231292517, 137.37401360544217), (137.37401360544217, 148.97195011337868), (148.97195011337868, 153.7806575963719), (153.7806575963719, 161.7739909297052), (161.7739909297052, 163.45249433106576), (163.45249433106576, 183.34367346938777), (183.34367346938777, 187.1

In [67]:
# This part of the experiment checks whether the method also works when the audio is restricted to a start/stop time window.

In [68]:
# Analysis window within the recording, in seconds: (start, stop)
start_time, stop_time = 5, 20

In [69]:
# Load only the part of the recording between start_time and stop_time;
# sr=None keeps the file's native sample rate
waveform, sample_rate = librosa.load(
    audio_path, sr=None, offset=start_time, duration=stop_time - start_time
)

# Absolute difference between successive samples:
# diff[i] = |waveform[i + 1] - waveform[i]|
diff = np.abs(np.diff(waveform))

# Delta threshold above which a shot change is detected. It is derived from the
# RMS of the loaded waveform because the average sample-to-sample change is not
# known without manual inspection, and the clip to be analysed will be chosen
# at random by the professor anyway.
threshold_scale = 2
delta_threshold = np.sqrt(np.mean(np.square(waveform))) * threshold_scale

# Minimum segment length in seconds, so the classification does not change rapidly
min_segment_seconds = 1

# Locate every above-threshold jump in one vectorized pass instead of testing
# each sample in a Python-level loop. Index 0 is left out, exactly as in a scan
# that starts at index 1, and .tolist() yields plain Python ints so the times
# computed below are not numpy scalars converted from the index array.
change_indices = (np.flatnonzero(diff[1:] > delta_threshold) + 1).tolist()

# Collect the (start, stop) times of each detected segment. `start` and `stop`
# are sample indices inside the loaded window; they are converted to seconds
# and shifted by start_time when a segment is recorded.
start_stop_times = []
start = 0
for i in change_indices:
    # The sample index where the threshold is exceeded is the candidate stop
    stop = i
    if (stop - start) / sample_rate > min_segment_seconds:
        start_stop_times.append(
            (start / sample_rate + start_time, stop / sample_rate + start_time)
        )
        start = i

# Close the final segment at the end of the loaded window.
# NOTE: a tail that is not longer than min_segment_seconds is dropped here,
# so the listed segments may end before stop_time.
if start < len(waveform):
    stop = len(waveform)
    if (stop - start) / sample_rate > min_segment_seconds:
        start_stop_times.append(
            (start / sample_rate + start_time, stop / sample_rate + start_time)
        )

# Print the start and stop times list
print(start_stop_times)

[(5.0, 11.72015873015873), (11.72015873015873, 14.16907029478458), (14.16907029478458, 15.357913832199547), (15.357913832199547, 17.77390022675737), (17.77390022675737, 18.78716553287982), (18.78716553287982, 20.0)]
