In [1]:
import os
import cv2
import numpy as np
import kagglehub
import matplotlib.pyplot as plt
import torch
from torchmetrics import StructuralSimilarityIndexMeasure as SSIM
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
from torchvision import transforms
from PIL import Image

In [2]:


def print_counts(path):
    """Chart how many ``.avi`` files each sub-folder of ``path`` contains.

    Two figures are shown: a vertical bar chart covering every sub-folder
    (ordered alphabetically) and a horizontal bar chart restricted to the
    five sub-folders holding the most ``.avi`` files.

    Args:
        path: Directory whose immediate sub-directories are scanned.
    """
    # Map every immediate sub-directory to its number of .avi files.
    folder_avi_counts = {}
    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)
        if not os.path.isdir(entry_path):
            continue
        folder_avi_counts[entry] = sum(
            1 for name in os.listdir(entry_path) if name.endswith('.avi')
        )

    # Overview chart: alphabetical order keeps the x-axis easy to scan.
    alphabetical = sorted(folder_avi_counts.items())
    all_names = [name for name, _ in alphabetical]
    all_totals = [total for _, total in alphabetical]

    plt.figure(figsize=(12, 6))
    plt.bar(all_names, all_totals, color='skyblue')
    plt.xlabel('Folder Names', fontsize=12)
    plt.ylabel('Count of .avi Files', fontsize=12)
    plt.title('Count of .avi Files in Each Folder', fontsize=14)
    plt.xticks(rotation=90, fontsize=10)  # vertical labels stay readable
    plt.tight_layout()
    plt.show()

    # Ranking chart: the five largest folders, biggest first.
    ranked = sorted(folder_avi_counts.items(), key=lambda pair: pair[1], reverse=True)
    top_names = [name for name, _ in ranked[:5]]
    top_totals = [total for _, total in ranked[:5]]

    plt.figure(figsize=(10, 6))
    plt.barh(top_names, top_totals, color='coral')
    plt.xlabel('Count of .avi Files', fontsize=12)
    plt.ylabel('Folder Names', fontsize=12)
    plt.title('Top 5 Folders with Highest Count of .avi Files', fontsize=14)
    plt.gca().invert_yaxis()  # largest count ends up at the top
    plt.tight_layout()
    plt.show()

In [3]:
# Raw UCF101 split folders. preprocess_videos joins each of these with a class
# name, so every split is expected to contain one sub-folder per action class.
train_path = "/kaggle/input/ucf101-action-recognition/train"
val_path = "/kaggle/input/ucf101-action-recognition/val"
test_path = "/kaggle/input/ucf101-action-recognition/test"

In [4]:
# Class folder names that preprocess_videos iterates over (read there as a
# notebook-level global); folders of any other class are never touched.
selected_classes = ["HorseRiding", "PlayingDhol", "PushUps", "BenchPress", "PlayingGuitar"]

In [5]:
output_path = "/kaggle/working/processed_ucf101_train"  # Destination for the processed train split

# Parameters for preprocessing (read as notebook-level globals inside preprocess_videos)
frame_size = (64, 64)  # Target size for frames; passed as the size argument of cv2.resize
# True converts every frame to single-channel grayscale; False keeps the three
# colour channels as decoded by OpenCV (BGR order — NOTE(review): confirm
# whether any consumer relies on RGB order).
convert_to_grayscale = False

# Function to preprocess videos
def preprocess_videos(output_path, file_path):
    """Decode, resize and cache the videos of every selected class as ``.npy`` files.

    For each class name in the notebook-level ``selected_classes`` list, every
    ``.avi`` file found in ``file_path/<class>`` is read frame by frame,
    resized to ``frame_size`` (and converted to grayscale when
    ``convert_to_grayscale`` is true), then stored as a single array at
    ``output_path/<class>/<video name>.npy``.

    Videos from which no frame could be decoded are reported and skipped
    instead of being saved as empty arrays.

    Args:
        output_path: Directory receiving the processed arrays (created if missing).
        file_path: Directory containing one sub-folder of ``.avi`` files per class.

    Raises:
        FileNotFoundError: If the folder of a selected class does not exist
            under ``file_path`` (raised by ``os.listdir``).
    """
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(output_path, exist_ok=True)

    for class_name in selected_classes:
        class_path = os.path.join(file_path, class_name)
        processed_class_path = os.path.join(output_path, class_name)
        os.makedirs(processed_class_path, exist_ok=True)

        # Loop through each video in the class folder
        for video_file in os.listdir(class_path):
            if not video_file.endswith(".avi"):
                continue

            video_path = os.path.join(class_path, video_file)
            video_capture = cv2.VideoCapture(video_path)

            frames = []
            try:
                while True:
                    ret, frame = video_capture.read()
                    if not ret:
                        break

                    # Resize frame
                    frame = cv2.resize(frame, frame_size)

                    # Convert to grayscale if needed
                    if convert_to_grayscale:
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

                    frames.append(frame)
            finally:
                # Always free the decoder handle, even if a frame fails to convert.
                video_capture.release()

            if not frames:
                # An empty array would only be discarded later by the dataset.
                print(f"Skipping video with no readable frames: {video_path}")
                continue

            # splitext swaps only the real extension; str.replace would also
            # rewrite an ".avi" occurring in the middle of the file name.
            video_stem = os.path.splitext(video_file)[0]
            save_path = os.path.join(processed_class_path, video_stem + ".npy")

            # Shape: (num_frames, 64, 64, 3) for colour or (num_frames, 64, 64)
            # for grayscale with the default 64x64 frame_size.
            np.save(save_path, np.array(frames))

                #print(f"Processed and saved: {save_path}")

# Run preprocessing
# Destination folders for the validation and test splits (train uses output_path).
val_output_path = "/kaggle/working/processed_ucf101_val"
test_output_path = "/kaggle/working/processed_ucf101_test"

# Process the three splits in the same order: train, validation, test.
for split_destination, split_source in (
    (output_path, train_path),
    (val_output_path, val_path),
    (test_output_path, test_path),
):
    preprocess_videos(split_destination, split_source)
print("Processed and saved all videos")

Processed and saved all videos


In [6]:
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, Model
import os

# Custom pixel-level accuracy metric
def pixel_accuracy(y_true, y_pred, threshold=0.5):
    """Return the fraction of pixels predicted within ``threshold`` of the target.

    Both tensors are cast to float32 before comparison. With targets scaled to
    [0, 1] the default tolerance of 0.5 is very permissive, so high values of
    this metric do not by themselves indicate sharp predictions.

    Args:
        y_true: Ground-truth tensor.
        y_pred: Predicted tensor of the same shape.
        threshold: Maximum absolute error for a pixel to count as correct.

    Returns:
        Scalar float32 tensor: mean of the per-pixel "within tolerance" mask.
    """
    absolute_error = tf.abs(tf.cast(y_true, tf.float32) - tf.cast(y_pred, tf.float32))
    within_tolerance = tf.cast(absolute_error < threshold, tf.float32)
    return tf.reduce_mean(within_tolerance)

# Dataset Class with Noise Augmentation
class VideoFrameDataset(tf.keras.utils.Sequence):
    """Keras ``Sequence`` yielding (noisy input frames, clean future frames) batches.

    Every ``.npy`` file found in the class sub-folders of ``data_path`` is one
    video. For each video in a batch a random window of
    ``input_frames + future_frames`` consecutive frames is cut out, converted
    to single-channel grayscale and scaled to [0, 1]. Gaussian noise is added
    to the input half only; the target half stays clean.

    Videos shorter than the window are skipped, so a batch can hold fewer than
    ``batch_size`` samples (and would be empty if all of its videos are too
    short).
    """

    def __init__(self, data_path, input_frames=5, future_frames=5, batch_size=8, **kwargs):
        """Index the ``.npy`` videos stored under ``data_path``.

        Args:
            data_path: Directory with one sub-folder of ``.npy`` videos per class.
            input_frames: Number of observed frames per sample.
            future_frames: Number of frames to predict per sample.
            batch_size: Number of videos drawn per batch.
            **kwargs: Forwarded unchanged to the Keras base-class constructor.
        """
        # Keras warns ("_warn_if_super_not_called") when the base class is not
        # initialised, so call it before setting any attribute.
        super().__init__(**kwargs)
        self.data_path = data_path
        self.input_frames = input_frames
        self.future_frames = future_frames
        self.batch_size = batch_size
        self.video_files = []

        # Sorted traversal makes the file order independent of the filesystem.
        for class_name in sorted(os.listdir(data_path)):
            class_path = os.path.join(data_path, class_name)
            if not os.path.isdir(class_path):
                continue
            for video_file in sorted(os.listdir(class_path)):
                if video_file.endswith('.npy'):
                    self.video_files.append(os.path.join(class_path, video_file))

    def __len__(self):
        """Number of full batches; a trailing partial batch is dropped."""
        return len(self.video_files) // self.batch_size

    def __getitem__(self, idx):
        """Build batch ``idx``.

        Returns:
            Tuple ``(X, y)`` of float32 arrays with shapes
            ``(n, input_frames, H, W, 1)`` and ``(n, future_frames, H, W, 1)``,
            where ``n`` is the number of sufficiently long videos in the batch.
        """
        batch_files = self.video_files[idx * self.batch_size:(idx + 1) * self.batch_size]
        window = self.input_frames + self.future_frames
        X, y = [], []

        for file_path in batch_files:
            video = np.load(file_path)  # (frames, H, W, 3) or (frames, H, W) if saved as grayscale
            if len(video) < window:
                continue

            start_idx = np.random.randint(0, len(video) - window + 1)
            # Slice first so the grayscale conversion touches only the frames used.
            clip = video[start_idx:start_idx + window]

            if clip.ndim == 4:
                # Colour video: average the channels into one grayscale channel.
                clip = np.mean(clip, axis=-1, keepdims=True)
            else:
                # Already grayscale: only append the channel axis.
                clip = clip[..., np.newaxis]

            clip = clip.astype(np.float32) / 255.0

            X.append(self.add_noise(clip[:self.input_frames]))
            y.append(clip[self.input_frames:])

        return np.array(X), np.array(y)

    @staticmethod
    def add_noise(video, noise_factor=0.05):
        """Add zero-mean Gaussian noise scaled by ``noise_factor`` and clip to [0, 1].

        Args:
            video: Array of values expected in [0, 1].
            noise_factor: Multiplier applied to standard-normal noise.

        Returns:
            float32 array with the same shape as ``video``.
        """
        noise = np.random.normal(loc=0.0, scale=1.0, size=video.shape)
        noisy_video = np.clip(video + noise_factor * noise, 0.0, 1.0)
        # np.random.normal yields float64; cast back so inputs and targets
        # share the float32 dtype.
        return noisy_video.astype(np.float32)


# Patch Embedding Layer
class PatchEmbedding(layers.Layer):
    """Split every frame into non-overlapping square patches and embed them.

    A ``Conv3D`` whose kernel and stride are both ``(1, patch_size, patch_size)``
    maps each patch of each frame to an ``embed_dim``-sized vector. The
    temporal kernel size of 1 means frames are not mixed by this layer.
    """

    def __init__(self, patch_size, embed_dim):
        """Create the patch projection.

        Args:
            patch_size: Side length, in pixels, of each square patch.
            embed_dim: Number of convolution filters, i.e. the embedding size.
        """
        super().__init__()
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        # Kernel == stride with "valid" padding: patches do not overlap and any
        # border remainder smaller than a patch is dropped.
        self.projection = layers.Conv3D(
            filters=embed_dim,
            kernel_size=(1, patch_size, patch_size),
            strides=(1, patch_size, patch_size),
            padding="valid"
        )

    def call(self, inputs):
        """Embed ``inputs`` and flatten the spatial patch grid.

        Args:
            inputs: Video tensor; assumed to be laid out as
                (batch, frames, height, width, channels) — TODO confirm
                against callers.

        Returns:
            Tensor reshaped to (batch, frames, h * w, embed_dim), where h and w
            are the spatial sizes of the convolution output.
        """
        x = self.projection(inputs)
        # Dynamic shape so batch size and frame count may vary between calls.
        shape = tf.shape(x)
        batch_size, frames, h, w = shape[0], shape[1], shape[2], shape[3]
        x = tf.reshape(x, [batch_size, frames, h * w, self.embed_dim])
        return x


# Transformer Block with Residual Connections
class TransformerBlock(layers.Layer):
    """Self-attention + feed-forward block with an additional outer skip connection.

    The block applies attention and a two-layer GELU feed-forward network,
    each followed by dropout, a residual addition and layer normalisation,
    and finally adds the block input to the normalised result once more.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        """Build the sub-layers.

        Args:
            embed_dim: Size of the last (feature) axis of the block input/output.
            num_heads: Number of attention heads.
            ff_dim: Hidden width of the feed-forward network.
            dropout: Dropout rate applied after attention and after the FFN.
        """
        super().__init__()
        # NOTE(review): key_dim is set to embed_dim; in Keras key_dim is the
        # size of each individual head — confirm this width is intended rather
        # than embed_dim // num_heads.
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            layers.Dense(ff_dim, activation="gelu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, inputs, training=False):
        """Run the block.

        Args:
            inputs: Tensor whose last axis has size ``embed_dim``.
            training: Forwarded to the two dropout layers.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        # Query and value are the same tensor, i.e. self-attention.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        # The raw block input is added again on top of the normalised output.
        return inputs + self.layernorm2(out1 + ffn_output)  # Double residual connection


# Reconstruction Layer
class FrameReconstruction(layers.Layer):
    """Decode a grid of patch embeddings into a single-channel frame.

    Three 3x3 stride-1 convolutions refine the embedding grid, then a
    transposed convolution whose kernel and stride equal ``patch_size``
    upsamples it to one output channel with a sigmoid activation.
    """

    def __init__(self, filters=64, embed_dim=256, patch_size=4):
        """Build the decoder layers.

        Args:
            filters: Channel count of the first two convolutions.
            embed_dim: Channel count of the third convolution.
            patch_size: Kernel size and stride of the transposed convolution,
                i.e. the spatial upsampling factor.
        """
        super().__init__()
        self.conv1 = layers.Conv2D(filters, kernel_size=3, strides=1, padding="same", activation="relu")
        self.conv2 = layers.Conv2D(filters, kernel_size=3, strides=1, padding="same", activation="relu")
        self.conv3 = layers.Conv2D(embed_dim, kernel_size=3, strides=1, padding="same", activation="relu")
        # Sigmoid keeps the output in (0, 1); one filter yields a single channel.
        self.conv_transpose = layers.Conv2DTranspose(
            filters=1,
            kernel_size=patch_size,
            strides=patch_size,
            padding="same",
            activation="sigmoid"
        )

    def call(self, x):
        """Apply the three convolutions followed by the upsampling layer.

        Args:
            x: Embedding grid; assumed (batch, grid_h, grid_w, channels) —
                TODO confirm against callers.

        Returns:
            Output of the transposed convolution (one channel, sigmoid-activated).
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv_transpose(x)
        return x


# ViViT Model
class ViViT(Model):
    """Patch-based video transformer that rolls out future grayscale frames.

    The input clip is split into patch embeddings, a learned positional
    embedding is added, and the result passes through a stack of
    ``TransformerBlock`` layers. The patch embeddings of the last input frame
    seed an autoregressive rollout: each step decodes a frame with
    ``FrameReconstruction`` and re-embeds that frame with the shared
    ``PatchEmbedding`` to obtain the embeddings for the next step. The
    transformer blocks are not re-applied during the rollout.
    """

    def __init__(
        self, input_shape, patch_size=4, embed_dim=256, num_heads=12,
        ff_dim=512, num_transformer_layers=10, dropout=0.1, future_frames=5
    ):
        """Build the model.

        Args:
            input_shape: ``(frames, height, width, channels)`` of one input clip.
            patch_size: Side length of each square patch.
            embed_dim: Patch embedding size.
            num_heads: Attention heads per transformer block.
            ff_dim: Hidden width of each block's feed-forward network.
            num_transformer_layers: Number of stacked transformer blocks.
            dropout: Dropout rate inside the transformer blocks.
            future_frames: Number of frames produced by ``call``.
        """
        super().__init__()
        self.patch_embed = PatchEmbedding(patch_size, embed_dim)
        frames, h, w, c = input_shape
        # Keep the grid height and width separately: deriving the grid side as
        # sqrt(num_patches) is only correct for square frames.
        self.grid_h = h // patch_size
        self.grid_w = w // patch_size
        self.num_patches = self.grid_h * self.grid_w
        self.embed_dim = embed_dim

        # One learned positional vector per (frame, patch) position.
        self.pos_embed = self.add_weight(
            name="pos_embed",
            shape=[1, frames, self.num_patches, embed_dim],
            initializer=tf.keras.initializers.RandomNormal(stddev=0.02),
            trainable=True
        )

        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_transformer_layers)
        ]
        self.future_frames = future_frames
        self.reconstruction = FrameReconstruction(embed_dim=embed_dim, patch_size=patch_size)

    def call(self, inputs, training=False):
        """Predict ``future_frames`` frames following the input clip.

        Args:
            inputs: Clip tensor of shape (batch, frames, height, width, channels).
            training: Forwarded to the transformer blocks (controls dropout).

        Returns:
            Tensor of predicted frames stacked along axis 1:
            (batch, future_frames, height, width, 1).
        """
        x = self.patch_embed(inputs)
        # Slice the positional table so clips shorter than the configured
        # frame count can still be processed.
        x += self.pos_embed[:, :tf.shape(x)[1], :, :]

        for block in self.transformer_blocks:
            x = block(x, training=training)

        future_frames = []
        # Only the last frame's patch embeddings seed the rollout.
        current_embedding = x[:, -1]

        for _ in range(self.future_frames):
            # Restore the 2-D patch grid expected by the convolutional decoder.
            current_embedding_reshaped = tf.reshape(
                current_embedding,
                [-1, self.grid_h, self.grid_w, self.embed_dim]
            )
            reconstructed_frame = self.reconstruction(current_embedding_reshaped)
            future_frames.append(reconstructed_frame)

            # Re-embed the predicted frame as a one-frame clip; the mean over
            # that length-1 frame axis simply removes it.
            reconstructed_frame_expanded = tf.expand_dims(reconstructed_frame, axis=1)
            reconstructed_frame_patches = self.patch_embed(reconstructed_frame_expanded)
            current_embedding = tf.reduce_mean(reconstructed_frame_patches, axis=1)

        return tf.stack(future_frames, axis=1)


# Loss Functions
def ssim_loss(y_true, y_pred):
    """Structural-dissimilarity loss: one minus the mean SSIM.

    Args:
        y_true: Target images with values in [0, 1].
        y_pred: Predicted images of the same shape.

    Returns:
        Scalar tensor; 0 when every image pair has an SSIM of 1.
    """
    mean_ssim = tf.reduce_mean(tf.image.ssim(y_true, y_pred, max_val=1.0))
    return 1 - mean_ssim

def combined_loss(y_true, y_pred):
    """Weighted training loss: 70 % mean squared error plus 30 % SSIM loss.

    Args:
        y_true: Target frames.
        y_pred: Predicted frames of the same shape.

    Returns:
        Scalar tensor ``0.7 * MSE + 0.3 * ssim_loss``.
    """
    mse_term = tf.keras.losses.MeanSquaredError()(y_true, y_pred)
    ssim_term = ssim_loss(y_true, y_pred)
    return 0.7 * mse_term + 0.3 * ssim_term


# Training Function
def train_model(model, train_dataset, val_dataset, epochs=50):
    """Compile ``model`` and fit it with early stopping and best-model checkpointing.

    The model is compiled with Adam (learning rate 1e-4), ``combined_loss`` and
    the metrics MSE, ``ssim_loss`` and ``pixel_accuracy``. Training stops after
    10 epochs without ``val_loss`` improvement (best weights are restored), and
    the best model so far is written to ``vivit_grayscale_sharp_model.keras``.

    Args:
        model: Keras model to train.
        train_dataset: Training data accepted by ``model.fit``.
        val_dataset: Validation data accepted by ``model.fit``.
        epochs: Maximum number of epochs.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    stop_when_stalled = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", patience=10, restore_best_weights=True
    )
    keep_best_model = tf.keras.callbacks.ModelCheckpoint(
        "vivit_grayscale_sharp_model.keras", save_best_only=True
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss=combined_loss,
        metrics=["mse", ssim_loss, pixel_accuracy],
    )

    return model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=epochs,
        callbacks=[stop_when_stalled, keep_best_model],
    )



In [None]:
# Folders holding the preprocessed .npy clips. They get their own names so the
# raw-dataset variables train_path / val_path keep pointing at the original
# videos and the preprocessing cell remains safe to re-run.
processed_train_path = "/kaggle/working/processed_ucf101_train"
processed_val_path = "/kaggle/working/processed_ucf101_val"

# Each sample: 10 observed frames -> 10 future frames, 4 clips per batch.
train_dataset = VideoFrameDataset(processed_train_path, input_frames=10, future_frames=10, batch_size=4)
val_dataset = VideoFrameDataset(processed_val_path, input_frames=10, future_frames=10, batch_size=4)

# The model's input shape and horizon must match the dataset settings above.
model = ViViT(
    input_shape=(10, 64, 64, 1),  # 10 frames, 64x64, 1 channel (grayscale)
    patch_size=8,                 # Patch size
    embed_dim=256,                # Embedding dimension
    num_heads=16,                 # Number of attention heads
    ff_dim=512,                   # Feedforward dimension
    num_transformer_layers=16,    # Number of transformer layers
    dropout=0.1,                  # Dropout rate
    future_frames=10              # Predict 10 future frames
)

# Train the model
history = train_model(model, train_dataset, val_dataset, epochs=60)

Epoch 1/60


  self._warn_if_super_not_called()
I0000 00:00:1733150923.129570    3096 service.cc:145] XLA service 0x7b4918022c90 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733150923.129628    3096 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
W0000 00:00:1733150927.551258    3096 assert_op.cc:38] Ignoring Assert operator compile_loss/combined_loss/SSIM/Assert/Assert
W0000 00:00:1733150927.552496    3096 assert_op.cc:38] Ignoring Assert operator compile_loss/combined_loss/SSIM/Assert_1/Assert
W0000 00:00:1733150927.553970    3096 assert_op.cc:38] Ignoring Assert operator compile_loss/combined_loss/SSIM/Assert_2/Assert
W0000 00:00:1733150927.555043    3096 assert_op.cc:38] Ignoring Assert operator compile_loss/combined_loss/SSIM/Assert_3/Assert
W0000 00:00:1733150927.555245    3096 assert_op.cc:38] Ignoring Assert operator SSIM/Assert/Assert
W0000 00:00:1733150927.555428    3096 assert_op.cc:38] Igno

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 440ms/step - loss: 0.3135 - mse: 0.0886 - pixel_accuracy: 0.9839 - ssim_loss: 0.8383

W0000 00:00:1733151090.397270    3098 assert_op.cc:38] Ignoring Assert operator compile_loss/combined_loss/SSIM/Assert/Assert
W0000 00:00:1733151090.398462    3098 assert_op.cc:38] Ignoring Assert operator compile_loss/combined_loss/SSIM/Assert_1/Assert
W0000 00:00:1733151090.399481    3098 assert_op.cc:38] Ignoring Assert operator compile_loss/combined_loss/SSIM/Assert_2/Assert
W0000 00:00:1733151090.400118    3098 assert_op.cc:38] Ignoring Assert operator compile_loss/combined_loss/SSIM/Assert_3/Assert
W0000 00:00:1733151090.400216    3098 assert_op.cc:38] Ignoring Assert operator SSIM/Assert/Assert
W0000 00:00:1733151090.400373    3098 assert_op.cc:38] Ignoring Assert operator SSIM/Assert_1/Assert
W0000 00:00:1733151090.401431    3098 assert_op.cc:38] Ignoring Assert operator SSIM/Assert_2/Assert
W0000 00:00:1733151090.402118    3098 assert_op.cc:38] Ignoring Assert operator SSIM/Assert_3/Assert


[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 536ms/step - loss: 0.3134 - mse: 0.0886 - pixel_accuracy: 0.9839 - ssim_loss: 0.8382 - val_loss: 0.3028 - val_mse: 0.0777 - val_pixel_accuracy: 0.9632 - val_ssim_loss: 0.8281
Epoch 2/60
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 490ms/step - loss: 0.2964 - mse: 0.0736 - pixel_accuracy: 0.9614 - ssim_loss: 0.8163 - val_loss: 0.2977 - val_mse: 0.0742 - val_pixel_accuracy: 0.9526 - val_ssim_loss: 0.8193
Epoch 3/60
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 491ms/step - loss: 0.2915 - mse: 0.0701 - pixel_accuracy: 0.9566 - ssim_loss: 0.8083 - val_loss: 0.2970 - val_mse: 0.0724 - val_pixel_accuracy: 0.9509 - val_ssim_loss: 0.8212
Epoch 4/60
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 490ms/step - loss: 0.2918 - mse: 0.0703 - pixel_accuracy: 0.9529 - ssim_loss: 0.8085 - val_loss: 0.2958 - val_mse: 0.0718 - val_pixel_accuracy: 0.9507 - val_ssim_loss: 0.818

In [None]:
# Save the complete model.
model_save_path = "/kaggle/working/transformers_model.h5"
model.save(model_save_path)
print(f"Model saved successfully at {model_save_path}")

# Save the weights on their own; the file must actually be written before the
# confirmation message is printed.
model_weights_save_path = "/kaggle/working/transformers_model.weights.h5"
model.save_weights(model_weights_save_path)
print(f"Model weights saved successfully at {model_weights_save_path}")


In [None]:
def plot_training_history(history):
    """Plot training and validation curves for the loss and the MSE metric.

    One figure with two side-by-side panels is shown: the combined loss on the
    left and the mean squared error on the right. SSIM is not plotted.

    Args:
        history: The history object returned by `model.fit`; its ``history``
            dict must contain 'loss', 'val_loss', 'mse' and 'val_mse'.
    """
    # (history key, panel title, y-axis label, train legend, validation legend)
    panels = (
        ('loss', 'Loss (MSE + SSIM)', 'Loss', 'Train Loss', 'Val Loss'),
        ('mse', 'Mean Squared Error (MSE)', 'MSE', 'Train MSE', 'Val MSE'),
    )

    plt.figure(figsize=(12, 6))

    for position, (key, panel_title, y_label, train_label, val_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[key], label=train_label)
        plt.plot(history.history['val_' + key], label=val_label)
        plt.title(panel_title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
def predict_future_frames(model, input_sequence):
    """
    Predict future frames using the trained model.

    Args:
        model: Trained ViViT model (any object exposing ``predict``).
        input_sequence: Input frames with pixel values in [0, 255], either a
            single clip of shape (input_frames, height, width, channels) or a
            batch of shape (batch_size, input_frames, height, width, channels).
    Returns:
        uint8 array of predicted future frames of shape
        (batch_size, future_frames, height, width, channels); batch_size is 1
        when a single clip was passed.
    """
    input_sequence = np.asarray(input_sequence)
    # Add the batch axis only when it is missing; an already batched 5-D input
    # is passed through unchanged instead of becoming 6-D.
    if input_sequence.ndim != 5:
        input_sequence = np.expand_dims(input_sequence, axis=0)
    input_sequence = input_sequence.astype(np.float32) / 255.0  # Normalize to [0, 1]

    predicted_frames = model.predict(input_sequence)  # Predict future frames
    predicted_frames = np.clip(predicted_frames, 0, 1)  # Ensure values are in valid range
    predicted_frames = (predicted_frames * 255).astype(np.uint8)  # Convert back to [0, 255]

    return predicted_frames





In [None]:
import os
from PIL import Image
import numpy as np

def save_frames_to_directory(frames, output_directory):
    """
    Save frames as individual image files in the specified directory.

    Files are named frame_0000.png, frame_0001.png, ... in iteration order.
    Grayscale frames may be given as (height, width) or (height, width, 1);
    the trailing singleton channel is dropped before the image is built.

    Args:
        frames: Iterable of frames, e.g. an array of shape
            (num_frames, height, width, channels) or a list of 2-D arrays.
        output_directory: Path to save the frames.
    """
    os.makedirs(output_directory, exist_ok=True)  # Create the directory if it doesn't exist

    for i, frame in enumerate(frames):
        frame = np.asarray(frame)
        # PIL cannot build an image from an (H, W, 1) array, so collapse the
        # singleton channel into a plain 2-D grayscale frame.
        if frame.ndim == 3 and frame.shape[-1] == 1:
            frame = frame[..., 0]
        frame_path = os.path.join(output_directory, f"frame_{i:04d}.png")
        Image.fromarray(frame).save(frame_path)

# Example usage
#frames_to_save = predicted_frames[0]  # Assuming predicted_frames[0] contains the frames
#frames_to_save = [np.squeeze(frame) for frame in frames_to_save]  # Adjust if grayscale

# Save frames in kaggle/working/predictions
#save_frames_to_directory(frames_to_save, "/kaggle/working/predictions")


In [None]:
import matplotlib.pyplot as plt

def visualize_predictions(input_frames, predicted_frames):
    """
    Visualize the input and predicted future frames.

    The figure has two rows: input frames on top, predicted frames below.
    When the two sequences differ in length, the unused panels are left blank.

    Args:
        input_frames: Array of input frames of shape (input_frames, height, width, channels).
        predicted_frames: Array of predicted frames of shape (future_frames, height, width, channels).
    """
    num_input_frames = input_frames.shape[0]
    num_predicted_frames = predicted_frames.shape[0]
    num_columns = max(num_input_frames, num_predicted_frames)

    # squeeze=False keeps `axes` two-dimensional even with a single column, so
    # the axes[row, col] indexing below always works.
    fig, axes = plt.subplots(2, num_columns, figsize=(15, 5), squeeze=False)

    # Switch every panel off up front so panels without a frame show no ticks.
    for ax in axes.ravel():
        ax.axis("off")

    # Plot input frames
    for i in range(num_input_frames):
        axes[0, i].imshow(input_frames[i, :, :, 0], cmap="gray")
        axes[0, i].set_title(f"Input Frame {i+1}")

    # Plot predicted frames
    for i in range(num_predicted_frames):
        axes[1, i].imshow(predicted_frames[i, :, :, 0], cmap="gray")
        axes[1, i].set_title(f"Predicted Frame {i+1}")

    plt.tight_layout()
    plt.show()


In [None]:
# Example input video sequence (any preprocessed .npy clip can be used here)
test_video_path = "/kaggle/working/processed_ucf101_test/BenchPress/v_BenchPress_g01_c02.npy"
test_video = np.load(test_video_path)  # Shape: (frames, height, width, 3) when saved in colour

# Convert to grayscale by averaging the colour channels
test_video = np.mean(test_video, axis=-1, keepdims=True)  # Shape: (frames, height, width, 1)

# Select input frames
input_frames = test_video[:10]  # First 10 frames as input

# Predict future frames (predict_future_frames adds the batch axis and rescales to uint8)
predicted_frames = predict_future_frames(model, input_frames)
predicted_frames_squeezed = np.squeeze(predicted_frames, axis=0)  # Drop the batch axis; presumably (10, 64, 64, 1) for the model built above

print("Shape of the array:", predicted_frames.shape)

frames_to_save = predicted_frames[0]  # Frames of the first (only) clip in the batch
frames_to_save = [np.squeeze(frame) for frame in frames_to_save]  # Drop the singleton channel so each frame is 2-D

# Save frames in /kaggle/working/predictions
save_frames_to_directory(frames_to_save, "/kaggle/working/predictions")

def create_gif_from_directory(input_directory, output_gif_path, duration=500):
    """
    Create a GIF from image frames stored in a directory.

    Every ``.png`` file in ``input_directory`` becomes one GIF frame, ordered
    by file name. The GIF loops forever.

    Args:
        input_directory: Path to the directory containing image frames.
        output_gif_path: Path to save the generated GIF.
        duration: Duration of each frame in milliseconds.

    Raises:
        ValueError: If ``input_directory`` contains no ``.png`` files.
    """
    # Get all image file paths sorted by name
    frame_files = sorted(
        os.path.join(input_directory, f)
        for f in os.listdir(input_directory)
        if f.endswith('.png')
    )
    if not frame_files:
        # Fail with a clear message instead of an IndexError on frames[0].
        raise ValueError(f"No .png frames found in {input_directory!r}")

    frames = []
    try:
        # Load images
        for frame_file in frame_files:
            frames.append(Image.open(frame_file))

        # Create and save the GIF
        frames[0].save(
            output_gif_path,
            save_all=True,
            append_images=frames[1:],
            duration=duration,
            loop=0
        )
    finally:
        # Release the file handles held by the opened images.
        for frame in frames:
            frame.close()

# Build a looping GIF from the PNG frames written to the predictions folder.
input_directory = "/kaggle/working/predictions"
output_gif_path = "/kaggle/working/predicted_frames.gif"
create_gif_from_directory(input_directory, output_gif_path, duration=500)




# Visualize results
visualize_predictions(input_frames, predicted_frames_squeezed)  # Input frames (top row) vs. predicted frames (bottom row)


In [None]:
# Example input video sequence (any preprocessed .npy clip can be used here)
test_video_path = "/kaggle/working/processed_ucf101_test/PushUps/v_PushUps_g09_c03.npy"
test_video = np.load(test_video_path)  # Shape: (frames, height, width, 3) when saved in colour

# Convert to grayscale by averaging the colour channels
test_video = np.mean(test_video, axis=-1, keepdims=True)  # Shape: (frames, height, width, 1)

# Select input frames
input_frames = test_video[:10]  # First 10 frames as input

# Predict future frames (predict_future_frames adds the batch axis and rescales to uint8)
predicted_frames = predict_future_frames(model, input_frames)
predicted_frames_squeezed = np.squeeze(predicted_frames, axis=0)  # Drop the batch axis; presumably (10, 64, 64, 1) for the model built above

# Visualize results
visualize_predictions(input_frames, predicted_frames_squeezed)  # Input frames (top row) vs. predicted frames (bottom row)


In [None]:
# Write only the model weights (no architecture) to the working directory.
model_weights_save_path = "/kaggle/working/transformers_model.weights.h5"
model.save_weights(model_weights_save_path)

print(f"Model weights saved successfully at {model_weights_save_path}")

In [None]:
from IPython.display import FileLink

# Path of the weights file to offer for download
file_path = '/kaggle/working/transformers_model.weights.h5'

# Create a downloadable link; being the last expression of the cell, it is rendered as the cell output.
# NOTE(review): FileLink is given an absolute path here — confirm the rendered link resolves in the
# notebook front end, or switch to a path relative to the working directory.
FileLink(file_path)
