# dont run this

In [None]:
# with lambda function - causes a lambda error
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from keras.layers import Dropout, Lambda
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

def split_audio_into_segments(audio_file, segment_length=10):
    """Splits audio file into segments of specified length."""
    y, sr = librosa.load(audio_file, sr=None)
    segment_samples = segment_length * sr
    num_segments = len(y) // segment_samples
    segments = [y[i * segment_samples:(i + 1) * segment_samples] for i in range(num_segments)]
    return segments, sr

def extract_features(file, sr):
    audio, _ = librosa.load(file, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    return mfcc, mel, chroma, spectral_contrast

def process_movies_and_trailers(movie_files, trailer_files, sr):
    all_mfcc_movie = []
    all_mel_movie = []
    all_chroma_movie = []
    all_spectral_contrast_movie = []

    all_mfcc_trailer = []
    all_mel_trailer = []
    all_chroma_trailer = []
    all_spectral_contrast_trailer = []

    for file in movie_files:
        mfcc, mel, chroma, spectral_contrast = extract_features(file, sr)
        all_mfcc_movie.append(mfcc.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_mel_movie.append(mel.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_chroma_movie.append(chroma.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_spectral_contrast_movie.append(spectral_contrast.reshape(-1, 1))  # Reshape to (n_features, 1)

    for file in trailer_files:
        mfcc, mel, chroma, spectral_contrast = extract_features(file, sr)
        all_mfcc_trailer.append(mfcc.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_mel_trailer.append(mel.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_chroma_trailer.append(chroma.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_spectral_contrast_trailer.append(spectral_contrast.reshape(-1, 1))  # Reshape to (n_features, 1)

    return all_mfcc_movie, all_mel_movie, all_chroma_movie, all_spectral_contrast_movie, \
           all_mfcc_trailer, all_mel_trailer, all_chroma_trailer, all_spectral_contrast_trailer

def prepare_triplet_data(all_mfcc_movie, all_mel_movie, all_chroma_movie, all_spectral_contrast_movie,
                         all_mfcc_trailer, all_mel_trailer, all_chroma_trailer, all_spectral_contrast_trailer):
    """Prepares triplet data for Siamese LSTM model."""
    anchors, positives, labels = [], [], []
    num_movies = len(all_mfcc_movie)
    num_trailers = len(all_mfcc_trailer)

    for i in range(num_movies):
        min_len = min(len(all_mfcc_movie[i]), len(all_mel_movie[i]), len(all_chroma_movie[i]), len(all_spectral_contrast_movie[i]))
        for j in range(min_len):
            combined_anchor = np.concatenate([all_mfcc_movie[i][j], all_mel_movie[i][j], all_chroma_movie[i][j], all_spectral_contrast_movie[i][j]], axis=0)
            trailer_idx = j % num_trailers  # Use modulo operator to wrap around
            min_len_trailer = min(len(all_mfcc_trailer[trailer_idx]), len(all_mel_trailer[trailer_idx]), len(all_chroma_trailer[trailer_idx]), len(all_spectral_contrast_trailer[trailer_idx]))
            combined_positive = np.concatenate([all_mfcc_trailer[trailer_idx][j % min_len_trailer],
                                                all_mel_trailer[trailer_idx][j % min_len_trailer],
                                                all_chroma_trailer[trailer_idx][j % min_len_trailer],
                                                all_spectral_contrast_trailer[trailer_idx][j % min_len_trailer]], axis=0)
            anchors.append(combined_anchor)
            positives.append(combined_positive)
            labels.append(1)  # Positive pair

    for i in range(num_trailers):
        min_len = min(len(all_mfcc_trailer[i]), len(all_mel_trailer[i]), len(all_chroma_trailer[i]), len(all_spectral_contrast_trailer[i]))
        for j in range(min_len):
            combined_anchor = np.concatenate([all_mfcc_trailer[i][j], all_mel_trailer[i][j], all_chroma_trailer[i][j], all_spectral_contrast_trailer[i][j]], axis=0)
            movie_idx = j % num_movies  # Use modulo operator to wrap around
            min_len_movie = min(len(all_mfcc_movie[movie_idx]), len(all_mel_movie[movie_idx]), len(all_chroma_movie[movie_idx]), len(all_spectral_contrast_movie[movie_idx]))
            combined_positive = np.concatenate([all_mfcc_movie[movie_idx][j % min_len_movie],
                                                all_mel_movie[movie_idx][j % min_len_movie],
                                                all_chroma_movie[movie_idx][j % min_len_movie],
                                                all_spectral_contrast_movie[movie_idx][j % min_len_movie]], axis=0)
            anchors.append(combined_anchor)
            positives.append(combined_positive)
            labels.append(0)  # Negative pair

    anchors = np.array(anchors)
    positives = np.array(positives)
    labels = np.array(labels)

    # Standardize features for better training performance
    scaler = StandardScaler()
    anchors = scaler.fit_transform(anchors)
    positives = scaler.transform(positives)

    return anchors, positives, labels

def create_siamese_lstm_model(input_shape):
    """Defines the Siamese LSTM model architecture."""
    import tensorflow as tf
    if len(input_shape) == 1:  # If input_shape has only one element
        timesteps = 10  # Default number of time steps
        features = input_shape[0]
    else:
        timesteps, features = input_shape

    input_a = Input(shape=(timesteps, features))  # Use the correct input shape
    input_b = Input(shape=(timesteps, features))  # Use the correct input shape

    # Shared LSTM layers
    lstm = LSTM(128, return_sequences=True)
    lstm_a = lstm(input_a)
    lstm_a = LSTM(64)(lstm_a)
    dense_a = Dense(32, activation='relu')(lstm_a)

    lstm_b = lstm(input_b)
    lstm_b = LSTM(64)(lstm_b)
    dense_b = Dense(32, activation='relu')(lstm_b)

    # Calculate distance between outputs
    distance = Lambda(lambda tensors: tf.sqrt(tf.reduce_sum(tf.square(tensors[0] - tensors[1]), axis=-1, keepdims=True)),
                     output_shape=(1,))(dense_a, dense_b)

    # Define the model
    model = Model(inputs=[input_a, input_b], outputs=distance)
    return model

def contrastive_loss(margin=1):
    def loss(y_true, y_pred):
        # Add a small value to y_pred to prevent division by zero
        y_pred = tf.maximum(y_pred, 1e-8)

        # Calculate the loss
        loss = tf.reduce_mean((1 - y_true) * tf.square(y_pred) + y_true * tf.square(tf.maximum(margin - y_pred, 0)))

        # Check for NaN values and replace them with zeros
        loss = tf.where(tf.math.is_nan(loss), tf.zeros_like(loss), loss)

        return loss
    return loss

# Example usage
trailer_files = [
        '/content/Stucco_Trailer.wav',
        '/content/SushiNoh_Trailer.wav',
        '/content/THECHAIR_Trailer.wav',
        '/content/TheCouch_Trailer.wav',
        '/content/TheElevator_Trailer.wav'
]

movie_files = [
        '/content/Stucco _Movie.wav',
        '/content/SushiNoh_Movie.wav',
        '/content/THECHAIR_Movie.wav',
        '/content/TheCouch_Movie.wav',
        '/content/TheElevator_Movie.wav'
]

segments, sr = split_audio_into_segments(trailer_files[0])
all_mfcc_movie, all_mel_movie, all_chroma_movie, all_spectral_contrast_movie, \
all_mfcc_trailer, all_mel_trailer, all_chroma_trailer, all_spectral_contrast_trailer = process_movies_and_trailers(movie_files, trailer_files, sr)

anchors, positives, labels = prepare_triplet_data(all_mfcc_movie, all_mel_movie, all_chroma_movie, all_spectral_contrast_movie,
                                                 all_mfcc_trailer, all_mel_trailer, all_chroma_trailer, all_spectral_contrast_trailer)
# Check the original shape of the anchors array
print("Original shape of anchors:", anchors.shape)

# Calculate the number of samples (batch size) and time steps
batch_size = anchors.shape[0]
time_steps = anchors.shape[1] // 4  # assuming 4 features

# Reshape the anchors and positives arrays
anchors = anchors.reshape(batch_size, time_steps, 4)
positives = positives.reshape(batch_size, time_steps, 4)

input_shape = anchors.shape[1:]
model = create_siamese_lstm_model(input_shape)
# Add dropout to prevent overfitting
x = model.output
x = Dropout(0.2)(x)
model = Model(inputs=model.input, outputs=x)

optimizer = Adam(learning_rate=0.0001, clipvalue=0.5)
model.compile(optimizer=optimizer, loss=contrastive_loss(margin=1))

# Define callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.001)

# Train the model
history = model.fit([anchors, positives], labels, epochs=10, batch_size=64,
                    validation_split=0.2, callbacks=[reduce_lr, early_stopping])



Original shape of anchors: (3766595, 4)
Epoch 1/10




[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m453s[0m 10ms/step - loss: 0.2512 - val_loss: 7.4723 - learning_rate: 1.0000e-04
Epoch 2/10
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 10ms/step - loss: 0.2008 - val_loss: 12.2373 - learning_rate: 1.0000e-04
Epoch 3/10
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 9ms/step - loss: 0.1996 - val_loss: 17.7669 - learning_rate: 1.0000e-04
Epoch 4/10
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 9ms/step - loss: 0.2034 - val_loss: 21.4143 - learning_rate: 1.0000e-04
Epoch 5/10
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m432s[0m 9ms/step - loss: 0.1960 - val_loss: 26.4838 - learning_rate: 1.0000e-04
Epoch 6/10
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m446s[0m 9ms/step - loss: 0.1995 - val_loss: 27.5801 - learning_rate: 1.0000e-04
Epoch 7/10
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[

In [None]:
# Save the trained model in the native Keras format
model.save('siamese_lstm_model.h5')
print("Model saved as 'siamese_lstm_model.keras'")



Model saved as 'siamese_lstm_model.keras'


In [None]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
from pydub import AudioSegment
sound = AudioSegment.from_mp3("/content/ignoreitmovieaudio.mp3")
sound.export("/content/ignoreitmovieaudio.wav", format="wav")

<_io.BufferedRandom name='/content/ignoreitmovieaudio.wav'>

In [None]:
import numpy as np
import librosa
from tensorflow.keras.models import load_model
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

def extract_features(audio, sr):
    """Extracts audio features: MFCC, Mel, Chroma, Spectral Contrast."""
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=13)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr, n_chroma=13)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)

    # Ensure all features have the same number of frames
    max_frames = max(mfcc.shape[1], mel.shape[1], chroma.shape[1], spectral_contrast.shape[1])

    # Pad or truncate features to have the same number of frames
    mfcc = librosa.util.fix_length(mfcc, size=max_frames, axis=1)
    mel = librosa.util.fix_length(mel, size=max_frames, axis=1)
    chroma = librosa.util.fix_length(chroma, size=max_frames, axis=1)
    spectral_contrast = librosa.util.fix_length(spectral_contrast, size=max_frames, axis=1)

    # Combine features into a single array and take mean across time
    features = np.vstack([mfcc.mean(axis=0), mel.mean(axis=0), chroma.mean(axis=0), spectral_contrast.mean(axis=0)])
    return features

def pad_sequences(sequences, max_length):
    """Pads sequences to the same length."""
    padded_sequences = []
    for sequence in sequences:
        length = sequence.shape[1]  # Get the length of the sequence along the time dimension
        if length < max_length:
            pad_width = max_length - length
            padded_sequence = np.pad(sequence, ((0, 0), (0, pad_width)), mode='constant')
        else:
            padded_sequence = sequence[:, :max_length]
        padded_sequences.append(padded_sequence)
    return np.array(padded_sequences)

def get_top_trailer_worthy_segments(labels, segment_duration=5, top_n=50, movie_audio_len=0, sr=22050):
    """Get the top `top_n` non-overlapping trailer-worthy segments."""
    trailer_segments = []
    used_indices = set()

    # Calculate the total movie duration in seconds
    total_audio_duration = movie_audio_len / sr  # Duration in seconds

    # Calculate distances of each segment from the cluster centroid to rank them
    unique_labels = np.unique(labels)
    distances = []

    for cluster in unique_labels:
        cluster_indices = np.where(labels == cluster)[0]
        for idx in cluster_indices:
            # Calculate the start and end time in seconds
            start_time = (idx * segment_duration)  # Correct scaling based on segment duration
            end_time = start_time + segment_duration

            # Ensure the segment stays within the bounds of the movie duration
            if end_time > total_audio_duration:
                continue  # Skip segments that go beyond the movie's duration

            # Calculate distance to centroid (optional, depending on your ranking method)
            distance_to_centroid = np.linalg.norm(labels[idx] - np.mean(labels[cluster_indices], axis=0))  # Distance to centroid

            distances.append((start_time, end_time, distance_to_centroid, idx))  # store distance, time, and index

    # Sort by distance (smallest to largest) - we want the closest to centroids first
    distances.sort(key=lambda x: x[2])  # Sort by distance

    # Select top_n segments and ensure non-overlapping
    for start_time, end_time, _, idx in distances[:top_n]:
        # Check for overlap with previously selected segments
        if not any(start < end_time and end > start_time for start, end in trailer_segments):
            trailer_segments.append((start_time, end_time))
            used_indices.add(idx)

    return trailer_segments

def predict_timestamps(model, movie_file, trailer_files, sr):
    """Predicts the timestamps for the trailer-worthy segments."""
    # Load and extract features from the movie
    movie_audio, _ = librosa.load(movie_file, sr=sr)
    movie_features = extract_features(movie_audio, sr)

    # Load and extract features from each trailer
    trailer_features = []
    for trailer_file in trailer_files:
        trailer_audio, _ = librosa.load(trailer_file, sr=sr)
        features = extract_features(trailer_audio, sr)
        trailer_features.append(features)

    # Pad segments to ensure uniform shape
    max_length = max(movie_features.shape[1], max(t.shape[1] for t in trailer_features))
    padded_movie_features = pad_sequences([movie_features], max_length)[0]
    padded_trailer_features = pad_sequences(trailer_features, max_length)

    # Standardize features
    scaler = StandardScaler()
    padded_movie_features = scaler.fit_transform(padded_movie_features.T).T
    padded_trailer_features = np.array([scaler.transform(trailer.T).T for trailer in padded_trailer_features])

    # Reshape features to fit model's expected input shape
    def reshape_features(features):
        """Reshape the feature array to the shape that the model expects."""
        num_frames = features.shape[1]
        if features.shape[0] != 4:
            raise ValueError("Features should have a shape of (4, num_frames)")

        # Reshape the features to (num_frames, 1, 4) to match the model input shape
        reshaped_features = features.reshape((num_frames, 1, 4))  # (num_frames, 1, 4)
        return reshaped_features

    # Reshape movie features and trailer features to fit the model input
    movie_segments = reshape_features(padded_movie_features)

    # For trailers, compute the mean across trailers and reshape
    mean_trailer_features = np.mean(padded_trailer_features, axis=0)  # Take the mean across all trailers
    trailer_segments = reshape_features(mean_trailer_features)  # Reshape

    # Prepare inputs as a list of tensors
    inputs = [tf.convert_to_tensor(movie_segments), tf.convert_to_tensor(trailer_segments)]

    # Debugging: Print shapes and types of inputs
    print("Input shapes:")
    print("Movie segments shape:", movie_segments.shape)
    print("Trailer segments shape:", trailer_segments.shape)
    print("Inputs type:", type(inputs), "Inputs content:", inputs)

    # Predict using the model
    predictions = model.predict(inputs)  # Pass the list of inputs

    # Flatten the predicted features for clustering
    predicted_embeddings = predictions.reshape(predictions.shape[0], -1)

    # Perform K-means clustering to find trailer-worthy segments
    num_clusters = 4  # Set this based on your criteria
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(predicted_embeddings)

    # Get cluster labels
    labels = kmeans.labels_

    # Extract top trailer-worthy segments
    movie_audio_len = len(movie_audio)  # Length of the movie audio in samples
    trailer_segments = get_top_trailer_worthy_segments(labels, segment_duration=5, top_n=30, movie_audio_len=movie_audio_len, sr=sr)

    return trailer_segments

# Load the pre-trained Siamese LSTM model with safe_mode set to False
model = load_model('/content/siamese_lstm_model.keras', custom_objects={'contrastive_loss': contrastive_loss}, compile=False, safe_mode=False)
model.summary()

# Example paths (update these with your actual paths)
movie_file = "/content/ignoreitmovieaudio.wav"
trailer_files = [
    "/content/Stucco_Trailer.wav",
    "/content/SushiNoh_Trailer.wav",
    "/content/THECHAIR_Trailer.wav",
    "/content/TheElevator_Trailer.wav",
    "/content/TheCouch_Trailer.wav"
]

# Set the sample rate
sr = 22050  # Sample rate

# Predict trailer-worthy segments
trailer_segments = predict_timestamps(model, movie_file, trailer_files, sr)

# Output the results
print("Top 50 non-overlapping trailer-worthy segments (start time, end time) in seconds:")
for segment in trailer_segments:
    print(f"Start: {segment[0]}s, End: {segment[1]}s")

# Optionally save the results to a CSV file
df = pd.DataFrame(trailer_segments, columns=['Start Time (s)', 'End Time (s)'])
df.to_csv('top_trailer_worthy_segments.csv', index=False)
print("Top 30 trailer-worthy segments have been saved to 'top_trailer_worthy_segments.csv'.")


Input shapes:
Movie segments shape: (16837, 1, 4)
Trailer segments shape: (16837, 1, 4)
Inputs type: <class 'list'> Inputs content: [<tf.Tensor: shape=(16837, 1, 4), dtype=float64, numpy=
array([[[-3.14680743, -2.14150866, -1.77313273, -1.60382616]],

       [[-1.50869514, -1.70686195, -1.49884773, -1.45279835]],

       [[-1.50186953, -1.48460835, -1.38992496, -1.12190749]],

       ...,

       [[-1.80887846, -5.14206966, -6.69890218, -4.4876505 ]],

       [[-4.19442842, -2.978377  , -2.68481344, -4.92212311]],

       [[-1.89791841, -2.4179231 , -7.56592033, -9.46191723]]])>, <tf.Tensor: shape=(16837, 1, 4), dtype=float64, numpy=
array([[[ -4.41496924,  -4.26898948,  -4.40948542,  -4.94617964]],

       [[ -4.94672546,  -4.94672546,  -4.94672546,  -4.94672546]],

       [[ -4.94672546,  -4.94672546,  -4.56481188,  -4.27240532]],

       ...,

       [[-10.98854859, -10.98854859, -10.98854859, -10.98854859]],

       [[-10.98854859, -10.98854859, -10.98854859, -10.98854859]],

     

# Run for all movies !!

In [None]:
import numpy as np
import librosa
from tensorflow.keras.models import load_model
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

def extract_features(audio, sr):
    """Extracts audio features: MFCC, Mel, Chroma, Spectral Contrast."""
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=13)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr, n_chroma=13)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)

    # Ensure all features have the same number of frames
    max_frames = max(mfcc.shape[1], mel.shape[1], chroma.shape[1], spectral_contrast.shape[1])

    # Pad or truncate features to have the same number of frames
    mfcc = librosa.util.fix_length(mfcc, size=max_frames, axis=1)
    mel = librosa.util.fix_length(mel, size=max_frames, axis=1)
    chroma = librosa.util.fix_length(chroma, size=max_frames, axis=1)
    spectral_contrast = librosa.util.fix_length(spectral_contrast, size=max_frames, axis=1)

    # Combine features into a single array and take mean across time
    features = np.vstack([mfcc.mean(axis=0), mel.mean(axis=0), chroma.mean(axis=0),
                          spectral_contrast.mean(axis=0)])
    return features

def pad_sequences(sequences, max_length):
    """Pads sequences to the same length."""
    padded_sequences = []
    for sequence in sequences:
        length = sequence.shape[1]  # Get the length of the sequence along the time dimension
        if length < max_length:
            pad_width = max_length - length
            padded_sequence = np.pad(sequence, ((0, 0), (0, pad_width)), mode='constant')
        else:
            padded_sequence = sequence[:, :max_length]
        padded_sequences.append(padded_sequence)
    return np.array(padded_sequences)

def get_key_audio_segments(labels, features, segment_duration=5, movie_audio_len=0,
                           sr=22050, threshold_distance=0.1):
    """Filters and returns key audio moments from the clustered segments."""
    key_segments = []
    total_audio_duration = movie_audio_len / sr  # Duration in seconds

    # Iterate over the labels to extract all segments and their distances to the centroid
    unique_labels = np.unique(labels)
    for cluster in unique_labels:
        cluster_indices = np.where(labels == cluster)[0]
        cluster_features = features[cluster_indices]

        # Calculate the centroid of the cluster
        centroid = np.mean(cluster_features, axis=0)

        for idx in cluster_indices:
            # Calculate the start and end time of the segment
            start_time = idx * segment_duration
            end_time = start_time + segment_duration

            # Ensure the segment stays within the bounds of the movie duration
            if end_time > total_audio_duration:
                continue  # Skip segments that go beyond the movie's duration

            # Calculate the distance of the segment's features to the cluster centroid
            distance_to_centroid = np.linalg.norm(cluster_features[idx] - centroid)

            # Filter based on distance (closer to centroid = more likely to be a key moment)
            if distance_to_centroid < threshold_distance:
                key_segments.append((start_time, end_time))

    return key_segments

def predict_timestamps(model, movie_file, trailer_files, sr, threshold_distance=0.1):
    """Predicts the timestamps for the key audio moments."""
    # Load and extract features from the movie
    movie_audio, _ = librosa.load(movie_file, sr=sr)
    movie_features = extract_features(movie_audio, sr)

    # Load and extract features from each trailer
    trailer_features = []
    for trailer_file in trailer_files:
        trailer_audio, _ = librosa.load(trailer_file, sr=sr)
        features = extract_features(trailer_audio, sr)
        trailer_features.append(features)

    # Pad segments to ensure uniform shape
    max_length = max(movie_features.shape[1], max(t.shape[1] for t in trailer_features))
    padded_movie_features = pad_sequences([movie_features], max_length)[0]
    padded_trailer_features = pad_sequences(trailer_features, max_length)

    # Standardize features
    scaler = StandardScaler()
    padded_movie_features = scaler.fit_transform(padded_movie_features.T).T
    padded_trailer_features = np.array([scaler.transform(trailer.T).T for trailer in
                                        padded_trailer_features])

    # Reshape features to fit model's expected input shape
    def reshape_features(features):
        """Reshape the feature array to the shape that the model expects."""
        num_frames = features.shape[1]
        if features.shape[0] != 4:
            raise ValueError("Features should have a shape of (4, num_frames)")

        # Reshape the features to (num_frames, 1, 4) to match the model input shape
        reshaped_features = features.reshape((num_frames, 1, 4))  # (num_frames, 1, 4)
        return reshaped_features

    # Reshape movie features and trailer features to fit the model input
    movie_segments = reshape_features(padded_movie_features)

    # For trailers, compute the mean across trailers and reshape
    mean_trailer_features = np.mean(padded_trailer_features, axis=0)  # Take the mean across all trailers
    trailer_segments = reshape_features(mean_trailer_features)  # Reshape

    # Prepare inputs as a list of tensors
    inputs = [tf.convert_to_tensor(movie_segments), tf.convert_to_tensor(trailer_segments)]

    # Predict using the model
    predictions = model.predict(inputs)  # Pass the list of inputs

    # Flatten the predicted features for clustering
    predicted_embeddings = predictions.reshape(predictions.shape[0], -1)

    # Perform K-means clustering to find trailer-worthy segments
    num_clusters = 4  # Set this based on your criteria
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(predicted_embeddings)

    # Get cluster labels
    labels = kmeans.labels_

    # Extract key audio segments (those closest to cluster centroids)
    movie_audio_len = len(movie_audio)  # Length of the movie audio in samples
    key_segments = get_key_audio_segments(labels, predicted_embeddings, segment_duration=5,
                    movie_audio_len=movie_audio_len, sr=sr, threshold_distance=threshold_distance)

    return key_segments

# Load the pre-trained Siamese LSTM model with safe_mode set to False
model = load_model('/content/siamese_lstm_model.keras',
                   custom_objects={'contrastive_loss': contrastive_loss},
                   compile=False, safe_mode=False)
model.summary()


# Set the sample rate
sr = 22050  # Sample rate

# Predict key audio moments (segments)
key_segments = predict_timestamps(model, movie_file, trailer_files, sr, threshold_distance=0.1)


# Optionally save the results to a CSV file
df = pd.DataFrame(key_segments, columns=['Start Time (s)', 'End Time (s)'])
df.to_csv('key_audio_moments.csv', index=False)
print("Key audio moments have been saved to 'key_audio_moments.csv'.")


[1m527/527[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step
Key audio moments (start time, end time) in seconds:
Start: 100s, End: 105s
Start: 115s, End: 120s
Start: 130s, End: 135s
Start: 140s, End: 145s
Start: 150s, End: 155s
Start: 155s, End: 160s
Start: 270s, End: 275s
Start: 275s, End: 280s
Start: 285s, End: 290s
Start: 290s, End: 295s
Start: 295s, End: 300s
Start: 300s, End: 305s
Start: 310s, End: 315s
Start: 315s, End: 320s
Start: 320s, End: 325s
Start: 325s, End: 330s
Start: 345s, End: 350s
Start: 350s, End: 355s
Start: 360s, End: 365s
Start: 370s, End: 375s
Start: 375s, End: 380s
Start: 380s, End: 385s
Start: 385s, End: 390s
Key audio moments have been saved to 'key_audio_moments.csv'.


# Dont run this

In [None]:
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Lambda, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Function to split audio into segments
def split_audio_into_segments(audio_file, segment_length=10):
    """Splits audio file into segments of specified length."""
    y, sr = librosa.load(audio_file, sr=None)
    segment_samples = segment_length * sr
    num_segments = len(y) // segment_samples
    segments = [y[i * segment_samples:(i + 1) * segment_samples] for i in range(num_segments)]
    return segments, sr

# Function to extract features from an audio file
def extract_features(file, sr):
    audio, _ = librosa.load(file, sr=sr)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)
    mel = librosa.feature.melspectrogram(y=audio, sr=sr)
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio, sr=sr)
    return mfcc, mel, chroma, spectral_contrast

# Process movies and trailers and extract features
def process_movies_and_trailers(movie_files, trailer_files, sr):
    all_mfcc_movie = []
    all_mel_movie = []
    all_chroma_movie = []
    all_spectral_contrast_movie = []

    all_mfcc_trailer = []
    all_mel_trailer = []
    all_chroma_trailer = []
    all_spectral_contrast_trailer = []

    for file in movie_files:
        mfcc, mel, chroma, spectral_contrast = extract_features(file, sr)
        all_mfcc_movie.append(mfcc.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_mel_movie.append(mel.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_chroma_movie.append(chroma.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_spectral_contrast_movie.append(spectral_contrast.reshape(-1, 1))  # Reshape to (n_features, 1)

    for file in trailer_files:
        mfcc, mel, chroma, spectral_contrast = extract_features(file, sr)
        all_mfcc_trailer.append(mfcc.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_mel_trailer.append(mel.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_chroma_trailer.append(chroma.reshape(-1, 1))  # Reshape to (n_features, 1)
        all_spectral_contrast_trailer.append(spectral_contrast.reshape(-1, 1))  # Reshape to (n_features, 1)

    return all_mfcc_movie, all_mel_movie, all_chroma_movie, all_spectral_contrast_movie, \
           all_mfcc_trailer, all_mel_trailer, all_chroma_trailer, all_spectral_contrast_trailer

# Prepare triplet data for Siamese LSTM
def prepare_triplet_data(all_mfcc_movie, all_mel_movie, all_chroma_movie, all_spectral_contrast_movie,
                         all_mfcc_trailer, all_mel_trailer, all_chroma_trailer, all_spectral_contrast_trailer):
    """Prepares triplet data for Siamese LSTM model."""
    anchors, positives, labels = [], [], []
    num_movies = len(all_mfcc_movie)
    num_trailers = len(all_mfcc_trailer)

    for i in range(num_movies):
        min_len = min(len(all_mfcc_movie[i]), len(all_mel_movie[i]), len(all_chroma_movie[i]),
                      len(all_spectral_contrast_movie[i]))
        for j in range(min_len):
            combined_anchor = np.concatenate([all_mfcc_movie[i][j], all_mel_movie[i][j],
                                              all_chroma_movie[i][j], all_spectral_contrast_movie[i][j]], axis=0)
            trailer_idx = j % num_trailers  # Use modulo operator to wrap around
            min_len_trailer = min(len(all_mfcc_trailer[trailer_idx]), len(all_mel_trailer[trailer_idx]),
                                  len(all_chroma_trailer[trailer_idx]), len(all_spectral_contrast_trailer[trailer_idx]))
            combined_positive = np.concatenate([all_mfcc_trailer[trailer_idx][j % min_len_trailer],
                                                all_mel_trailer[trailer_idx][j % min_len_trailer],
                                                all_chroma_trailer[trailer_idx][j % min_len_trailer],
                                                all_spectral_contrast_trailer[trailer_idx][j % min_len_trailer]], axis=0)
            anchors.append(combined_anchor)
            positives.append(combined_positive)
            labels.append(1)  # Positive pair

    for i in range(num_trailers):
        min_len = min(len(all_mfcc_trailer[i]), len(all_mel_trailer[i]), len(all_chroma_trailer[i]),
                      len(all_spectral_contrast_trailer[i]))
        for j in range(min_len):
            combined_anchor = np.concatenate([all_mfcc_trailer[i][j], all_mel_trailer[i][j], all_chroma_trailer[i][j],
                                              all_spectral_contrast_trailer[i][j]], axis=0)
            movie_idx = j % num_movies  # Use modulo operator to wrap around
            min_len_movie = min(len(all_mfcc_movie[movie_idx]), len(all_mel_movie[movie_idx]), len(all_chroma_movie[movie_idx]),
                                len(all_spectral_contrast_movie[movie_idx]))
            combined_positive = np.concatenate([all_mfcc_movie[movie_idx][j % min_len_movie],
                                                all_mel_movie[movie_idx][j % min_len_movie],
                                                all_chroma_movie[movie_idx][j % min_len_movie],
                                                all_spectral_contrast_movie[movie_idx][j % min_len_movie]], axis=0)
            anchors.append(combined_anchor)
            positives.append(combined_positive)
            labels.append(0)  # Negative pair

    anchors = np.array(anchors)
    positives = np.array(positives)
    labels = np.array(labels)

    # Standardize features for better training performance
    scaler = StandardScaler()
    anchors = scaler.fit_transform(anchors)
    positives = scaler.transform(positives)

    return anchors, positives, labels

@tf.keras.utils.register_keras_serializable()
def compute_distance(tensors):
    # Compute the squared differences
    squared_diff = tf.square(tensors[0] - tensors[1])
    # Sum across the feature dimension (axis=-1)
    sum_squared_diff = tf.reduce_sum(squared_diff, axis=-1, keepdims=True)
    # Take the square root
    return tf.sqrt(sum_squared_diff)

# Define the Siamese LSTM model architecture
def create_siamese_lstm_model(input_shape):
    """Defines the Siamese LSTM model architecture."""
    timesteps, features = input_shape

    input_a = Input(shape=(timesteps, features))  # Shape should be (timesteps, features)
    input_b = Input(shape=(timesteps, features))  # Shape should be (timesteps, features)

    # Shared LSTM layers
    lstm = LSTM(128, return_sequences=True)
    lstm_a = lstm(input_a)
    lstm_a = LSTM(64)(lstm_a)  # LSTM for first input
    dense_a = Dense(32, activation='relu')(lstm_a)

    lstm_b = lstm(input_b)
    lstm_b = LSTM(64)(lstm_b)  # LSTM for second input
    dense_b = Dense(32, activation='relu')(lstm_b)

    # Specify output shape correctly
    distance = Lambda(compute_distance, output_shape=(1,))([dense_a, dense_b])


    # Define the model
    model = Model(inputs=[input_a, input_b], outputs=distance)
    return model

# Contrastive loss function
def contrastive_loss(margin=1):
    def loss(y_true, y_pred):
        # Add a small value to y_pred to prevent division by zero
        y_pred = tf.maximum(y_pred, 1e-8)

        # Calculate the loss
        loss = tf.reduce_mean((1 - y_true) * tf.square(y_pred) + y_true * tf.square(tf.maximum(margin - y_pred, 0)))

        # Check for NaN values and replace them with zeros
        loss = tf.where(tf.math.is_nan(loss), tf.zeros_like(loss), loss)

        return loss
    return loss

# Example usage with movie and trailer files
trailer_files = [
        '/content/Stucco_Trailer.wav',
        '/content/SushiNoh_Trailer.wav',
        '/content/THECHAIR_Trailer.wav',
        '/content/TheCouch_Trailer.wav',
        '/content/TheElevator_Trailer.wav'
]

movie_files = [
        '/content/Stucco _Movie.wav',
        '/content/SushiNoh_Movie.wav',
        '/content/THECHAIR_Movie.wav',
        '/content/TheCouch_Movie.wav',
        '/content/TheElevator_Movie.wav'
]

# Split audio into segments and process features
segments, sr = split_audio_into_segments(trailer_files[0])
all_mfcc_movie, all_mel_movie, all_chroma_movie, all_spectral_contrast_movie, \
all_mfcc_trailer, all_mel_trailer, all_chroma_trailer, all_spectral_contrast_trailer =
                              process_movies_and_trailers(movie_files, trailer_files, sr)

# Prepare triplet data
anchors, positives, labels = prepare_triplet_data(all_mfcc_movie, all_mel_movie, all_chroma_movie,
                                                all_spectral_contrast_movie, all_mfcc_trailer,
                                                all_mel_trailer, all_chroma_trailer,
                                                  all_spectral_contrast_trailer)

# Reshape the anchors and positives for the LSTM input (batch_size, timesteps, features)
batch_size = anchors.shape[0]
time_steps = anchors.shape[1] // 4  # assuming 4 features
anchors = anchors.reshape(batch_size, time_steps, 4)
positives = positives.reshape(batch_size, time_steps, 4)

# Create the Siamese LSTM model
input_shape = anchors.shape[1:]  # (timesteps, 4)
model = create_siamese_lstm_model(input_shape)

# Add dropout to prevent overfitting
x = model.output
x = Dropout(0.2)(x)
model = Model(inputs=model.input, outputs=x)

# Compile the model
optimizer = Adam(learning_rate=0.0001, clipvalue=0.5)
model.compile(optimizer=optimizer, loss=contrastive_loss(margin=1))

# Define callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.001)

# Train the model
history = model.fit([anchors, positives], labels, epochs=30, batch_size=64,
                    validation_split=0.2, callbacks=[reduce_lr, early_stopping])


Epoch 1/3
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m534s[0m 11ms/step - loss: 0.2163 - val_loss: 0.9036 - learning_rate: 1.0000e-04
Epoch 2/3
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m490s[0m 10ms/step - loss: 0.1997 - val_loss: 0.9036 - learning_rate: 1.0000e-04
Epoch 3/3
[1m47083/47083[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m489s[0m 10ms/step - loss: 0.1997 - val_loss: 0.9036 - learning_rate: 1.0000e-04


## NEW CODE