<a href="https://colab.research.google.com/github/Shreshth-112/Video-summarization-using-keyframe-extraction/blob/main/keyframe_vae_corrected.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Loading ground-truth frames

In [None]:
import scipy.io
import cv2

# Load the MAT file.
# The path is a raw string: it contains Windows backslashes (\C, \8, \P, ...)
# that must not be parsed as escape sequences.
mat_file = scipy.io.loadmat(r"D:\College etc\College\8th sem\Project\SumMe\GT\Cooking.mat")

# Extract the segments information from the MAT file
# (assumes 'segments' is stored as a cell array whose entries hold
# [start, end] frame indices -- TODO confirm against the annotation layout)
segments = mat_file['segments'][0]

# Load the video file and fail early if it cannot be opened; otherwise every
# read() below would return False and `keyframes` would silently stay empty.
video = cv2.VideoCapture("Cooking.mp4")
if not video.isOpened():
    raise OSError("Could not open video file: Cooking.mp4")

# Collect every frame that falls inside a ground-truth segment
keyframes = []
try:
    for segment in segments:
        start_frame = int(segment[0][0])  # Convert start frame index to integer
        end_frame = int(segment[0][1])  # Convert end frame index to integer

        # Seek once to the start of the segment, then decode sequentially;
        # re-seeking before every single frame is needlessly slow.
        video.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
        for _ in range(start_frame, end_frame + 1):
            ret, frame = video.read()
            if not ret:
                # No more decodable frames in this segment
                break
            keyframes.append(frame)
finally:
    # Release the video file even if reading raised
    video.release()

# `keyframes` now holds the ground-truth frames; save them as individual image
# files or perform further processing here.

## Keyframe extraction for video summarisation

In [None]:
# Import the required libraries
import cv2
import numpy as np
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, UpSampling2D

# Load the video file and fail early if it cannot be opened; an unopened
# capture yields no frames and only fails later with an obscure error.
video = cv2.VideoCapture("Cooking.mp4")
if not video.isOpened():
    raise OSError("Could not open video file: Cooking.mp4")

# Hyper-parameters read by vae_model()
input_shape = (640, 640, 3)  # network input: height, width, channels
latent_dim = 256  # size of the Dense latent vector produced by the encoder

def vae_model(shape=None, bottleneck=None):
    """Build a convolutional autoencoder and an encoder that shares its layers.

    Args:
        shape: Input image shape as (height, width, channels). Defaults to the
            module-level ``input_shape``.
        bottleneck: Number of filters of the last encoder convolution and
            number of units of the Dense latent layer. Defaults to the
            module-level ``latent_dim``.

    Returns:
        A tuple ``(autoencoder, encoder)`` of Keras models. ``autoencoder``
        maps an image to its reconstruction; ``encoder`` maps an image to a
        ``bottleneck``-dimensional vector. Neither model is compiled here.

    Notes:
        * Despite the name, no sampling layer or KL term is built: this is a
          deterministic autoencoder, not a variational one.
        * NOTE(review): the decoder consumes the last convolutional feature
          map, not ``encoded``. The Dense layer that produces ``encoded`` is
          therefore outside the ``autoencoder`` graph and is not updated when
          ``autoencoder`` is trained.
        * The encoder pools four times with ``padding='same'`` and the decoder
          upsamples four times, so the reconstruction matches the input size
          only when height and width are multiples of 16.
    """
    shape = input_shape if shape is None else shape
    bottleneck = latent_dim if bottleneck is None else bottleneck

    input_img = Input(shape=shape)

    # Encoder: four Conv -> MaxPool stages, each halving the spatial size
    x = input_img
    for filters in (32, 64, 128, 256):
        x = Conv2D(filters, (3, 3), activation='relu', padding='same')(x)
        x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(bottleneck, (3, 3), activation='relu', padding='same')(x)

    # Latent representation
    flat = tf.keras.layers.Flatten()(x)
    encoded = tf.keras.layers.Dense(bottleneck, activation='relu')(flat)
    encoder = Model(input_img, encoded)

    # Decoder: four Conv -> UpSample stages, each doubling the spatial size
    for filters in (256, 128, 64, 32):
        x = Conv2D(filters, (3, 3), activation='relu', padding='same')(x)
        x = UpSampling2D((2, 2))(x)
    # Reconstruct as many channels as the input has (3 for the default shape)
    decoded = Conv2D(shape[-1], (3, 3), activation='sigmoid', padding='same')(x)

    # Autoencoder model
    autoencoder = Model(input_img, decoded)
    return autoencoder, encoder

# Create a KMeans clustering object
n_clusters = 30
kmeans = KMeans(n_clusters=n_clusters)

# Build and compile the autoencoder.
# NOTE(review): fit() is never called, so the encoder below runs with its
# initial weights -- add a training step before relying on these features.
autoencoder, encoder = vae_model()
autoencoder.compile(optimizer='adam', loss='mse')

# Extract frames from the video, resized to the network input size
# (cv2.resize expects (width, height), input_shape is (height, width, channels))
frames = []
try:
    while True:
        ret, frame = video.read()
        if not ret:
            break
        frame = cv2.resize(frame, (input_shape[1], input_shape[0]))
        frames.append(frame)
finally:
    # Release the video file even if decoding raised
    video.release()
frames = np.array(frames)

if len(frames) < n_clusters:
    raise ValueError(
        f"Need at least {n_clusters} frames to form {n_clusters} clusters, "
        f"got {len(frames)}"
    )

# Extract the latent representation of the frames
encoded_frames = encoder.predict(frames)

# Cluster the latent representations using KMeans
kmeans.fit(encoded_frames)

# distances[j, c] is the distance from frame j to the centre of cluster c
distances = kmeans.transform(encoded_frames)

# Extract the keyframes: for every cluster pick the member frame closest to
# that cluster's OWN centre (column i), not to the farthest of all centres.
keyframe_indices = []
for i in range(n_clusters):
    cluster_indices = np.where(kmeans.labels_ == i)[0]
    if cluster_indices.size == 0:
        # Nothing was assigned to this cluster, so there is no frame to pick
        continue
    closest = np.argmin(distances[cluster_indices, i])
    keyframe_indices.append(cluster_indices[closest])

# Cluster labels are arbitrary; order the summary chronologically
keyframe_indices.sort()
keyframes = [frames[index] for index in keyframe_indices]

# Save the keyframes as an MP4 video
height, width = keyframes[0].shape[:2]
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video_writer = cv2.VideoWriter('keyframes_cook.mp4', fourcc, 30, (width, height))
if not video_writer.isOpened():
    raise OSError("Could not open keyframes_cook.mp4 for writing")
try:
    for frame in keyframes:
        video_writer.write(frame)
finally:
    video_writer.release()



## Calculating SSIM

In [None]:
import cv2
from skimage.metrics import structural_similarity as ssim

import scipy.io

# Load the extracted keyframes from the summary video
extracted_keyframes = []

video = cv2.VideoCapture('keyframes_cook.mp4')
try:
    while True:
        ret, frame = video.read()
        if not ret:
            break
        extracted_keyframes.append(frame)
finally:
    video.release()

if not extracted_keyframes:
    raise RuntimeError("No frames could be read from keyframes_cook.mp4")

# Load the ground-truth frames from the annotation file and the source video.
# They are deliberately NOT taken from the notebook-global `keyframes`: the
# extraction cell rebinds that name to its own output, which would make this
# cell compare the extracted keyframes with themselves.
mat_file = scipy.io.loadmat(r"D:\College etc\College\8th sem\Project\SumMe\GT\Cooking.mat")
segments = mat_file['segments'][0]

source_video = cv2.VideoCapture("Cooking.mp4")
if not source_video.isOpened():
    raise OSError("Could not open video file: Cooking.mp4")

# zip() below stops at the shorter sequence, so only this many ground-truth
# frames are ever compared; loading more would just waste memory.
frames_needed = len(extracted_keyframes)
ground_truth_keyframes = []
try:
    for segment in segments:
        start_frame = int(segment[0][0])
        end_frame = int(segment[0][1])
        for frame_index in range(start_frame, end_frame + 1):
            if len(ground_truth_keyframes) == frames_needed:
                break
            source_video.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = source_video.read()
            if ret:
                ground_truth_keyframes.append(frame)
        if len(ground_truth_keyframes) == frames_needed:
            break
finally:
    source_video.release()

if not ground_truth_keyframes:
    raise RuntimeError("No ground-truth frames could be read from Cooking.mp4")

# Compare each keyframe with the ground-truth frame at the same position.
# NOTE(review): positional pairing matches the i-th extracted keyframe with
# the i-th ground-truth frame -- confirm this is the intended metric.
for extracted_frame, ground_truth_frame in zip(extracted_keyframes, ground_truth_keyframes):
    # SSIM requires equally sized images; bring the ground-truth frame to the
    # resolution of the extracted keyframe when they differ.
    if ground_truth_frame.shape[:2] != extracted_frame.shape[:2]:
        target_height, target_width = extracted_frame.shape[:2]
        ground_truth_frame = cv2.resize(ground_truth_frame, (target_width, target_height))

    # Convert frames to grayscale for SSIM comparison
    extracted_frame_gray = cv2.cvtColor(extracted_frame, cv2.COLOR_BGR2GRAY)
    ground_truth_frame_gray = cv2.cvtColor(ground_truth_frame, cv2.COLOR_BGR2GRAY)

    # Calculate SSIM between keyframes
    similarity = ssim(extracted_frame_gray, ground_truth_frame_gray)

    # Print the similarity value
    print(f"SSIM: {similarity}")

SSIM: 0.9771203464670698
SSIM: 0.981167755327745
SSIM: 0.9766605533964451
SSIM: 0.9828939196569944
SSIM: 0.9843180626916304
SSIM: 0.9805899616481492
SSIM: 0.9797184550289161
SSIM: 0.9796338490729566
SSIM: 0.9783998935257417
SSIM: 0.9795186360366318
SSIM: 0.9812017031879224
SSIM: 0.9808075843688635
SSIM: 0.9846464992743256
SSIM: 0.9807443122321227
SSIM: 0.9816415996355486
SSIM: 0.9810423921162751
SSIM: 0.9824884913287779
SSIM: 0.9816452474669005
SSIM: 0.9804515196059042
SSIM: 0.9818186394673101
SSIM: 0.9798226403418732
SSIM: 0.9827327171383569
SSIM: 0.9808836308997407
SSIM: 0.9809050682591994
SSIM: 0.9811758232375432
SSIM: 0.9769646041582646
SSIM: 0.9813404210694728
SSIM: 0.978958284637155
SSIM: 0.9794789723234867
SSIM: 0.9787712802436588
