In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:

%cd "/content/drive/MyDrive/VideoSeal-main"

/content/drive/.shortcut-targets-by-id/1vs99Ljy_X3Wg-u5WtsXQkNH4AV1wpXk5/VideoSeal-main


In [5]:
!pip3 install -r requirements.txt



In [6]:
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import logging
logging.getLogger("matplotlib.image").setLevel(logging.ERROR)
from IPython.display import HTML, display

import pandas as pd
from tqdm import tqdm

import torch
import torchvision
import torchvision.transforms.functional as F

# import videoseal
# from videoseal.augmentation import H264
# from videoseal.evals.metrics import bit_accuracy
from videoseal.evals.metrics import bit_accuracy
from videoseal.models import Videoseal
from videoseal.utils.cfg import setup_model_from_model_card

import argparse
import cv2
import numpy as np

In [7]:
def save_torch_video(video_w, out_path, fps):
    numpy_vid = torch.permute(video_w, (0, 2, 3, 1)).detach().numpy()
    numpy_vid = (numpy_vid * 255).astype(np.uint8)

    out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (numpy_vid.shape[2], numpy_vid.shape[1]))
    cv2.imwrite("test_frame.png", numpy_vid[0])
    for i in range(numpy_vid.shape[0]):
        numpy_vid[i] = cv2.cvtColor(numpy_vid[i], cv2.COLOR_BGR2RGB)
        out.write(numpy_vid[i])

    out.release()

In [8]:
def pgd_attack(model, img, target_labels, alpha=0.007, eps=0.03, num_iter=20):
    bce_loss = torch.nn.BCEWithLogitsLoss()
    img_original = img.clone()
    for i in range(num_iter):
        img.retain_grad()
        detection = model.detect(torch.unsqueeze(img, dim=0), is_video=False)['preds']
        total_ce_loss = bce_loss(torch.squeeze(detection)[1:], torch.squeeze(target_labels.float()))
        total_ce_loss.backward(retain_graph=True)
        with torch.no_grad():
            grad_sign = torch.sign(img.grad)
            img -= alpha * grad_sign
            img = torch.clamp(img, min=img_original - eps, max=img_original + eps)
            img = torch.clamp(img, min=0, max=1)
        img.requires_grad = True
        img.grad = None
        model.grad = None
        total_ce_loss = 0
        torch.cuda.empty_cache()
    #print((detection > 0).int())
    return img.detach().cpu()

In [9]:
def final_message_extractor(msg_path, video_path, model):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    gt_msgs = torch.tensor(np.loadtxt(msg_path))

    video_w, _, _ = torchvision.io.read_video(video_path, output_format="TCHW")
    video_w = video_w.to(device)
    video_w = video_w / 255.0
    video_w = video_w[:50]
    with torch.no_grad():
        msg_extracted = model.extract_message(video_w, None).cpu().detach()
        bit_accuracy_ = bit_accuracy(msg_extracted, gt_msgs).nanmean().item()
        print(f"Bit Accuracy: {bit_accuracy_:.3f}")
        print("GT:", gt_msgs[0])
        print("Extracted", msg_extracted.int())

In [16]:
def videoseal_eval(video_name, model):
  # Read the video and convert to tensor format
  video, _, _ = torchvision.io.read_video(video_name + ".mp4", output_format="TCHW", pts_unit='sec', end_pts=6)

  # Normalize the video frames to the range [0, 1] and trim to 1 second
  video = video.float() / 255.0
  #video = video[:, :, :500, :500]
  video = video.to(device)
  # Perform watermark embedding
  gt_msgs = torch.zeros(1, 96)
  with torch.no_grad():
      outputs = model.embed(video, is_video=True, msgs=gt_msgs.to(device))
  # Extract the results
  video_w = outputs["imgs_w"].cpu().detach()  # Watermarked video frames

  save_torch_video(video_w, video_name + '_w.mp4', fps)

  np.savetxt('./assets/videos/1_msgs.txt', gt_msgs, fmt='%d')
  # Delete variables to preserve GPU memory
  del outputs
  del video_w
  del video
  torch.cuda.empty_cache()
  video_w, _, _ = torchvision.io.read_video(video_name + '_w.mp4', output_format="TCHW")
  video_w = video_w / 255.0
  #video_w = video_w[:, :, :400, :400]
  video_w = video_w.to(device)
  video_w.requires_grad = True

  # Get PGD attacked video
  video_pgd = []
  for i in range(video_w.shape[0]):
      #gt_labels = (torch.rand_like(gt_msgs) > 0).int()
      gt_labels = 1 - gt_msgs.to(device)
      video_pgd.append(pgd_attack(model, video_w[i], target_labels=gt_labels))
      torch.cuda.empty_cache()


  video_pgd = torch.stack(video_pgd, axis=0)
  save_torch_video(video_pgd, video_name + '_w_pwd.mp4', fps)

  with torch.no_grad():
      final_message_extractor('./assets/videos/1_msgs.txt', video_name + '_w.mp4', model)
      final_message_extractor('./assets/videos/1_msgs.txt', video_name + '_w_pwd.mp4', model)

In [33]:
videoseal_eval('./assets/videos/static_short', model)

Bit Accuracy: 0.775
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 0, 0,  ..., 0, 1, 0],
         [0, 0, 0,  ..., 0, 1, 0],
         [0, 0, 0,  ..., 0, 1, 0],
         ...,
         [0, 0, 0,  ..., 0, 1, 0],
         [0, 0, 0,  ..., 0, 1, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.int32)
Bit Accuracy: 0.102
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 1, 1,  ..., 1, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]], dtype=torch.int32)


In [34]:
videoseal_eval('./assets/videos/static_short_2', model)

Bit Accuracy: 0.890
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 0, 0,  ..., 0, 1, 0],
         [0, 0, 0,  ..., 0, 1, 0],
         [0, 0, 0,  ..., 0, 1, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 1, 0]]], dtype=torch.int32)
Bit Accuracy: 0.107
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 1, 1,  ..., 0, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]], dtype=torch.int32)


In [35]:
videoseal_eval('./assets/videos/static_short_3', model)

Bit Accuracy: 0.904
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         ...,
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.int32)
Bit Accuracy: 0.127
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1]]], dtype=torch.int32)


In [18]:
videoseal_eval('./assets/videos/moving_short', model)

Bit Accuracy: 0.872
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         [1, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.int32)
Bit Accuracy: 0.113
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]], dtype=torch.int32)


In [37]:
videoseal_eval('./assets/videos/moving_short_2', model)

Bit Accuracy: 0.889
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 0, 0,  ..., 0, 1, 1],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]], dtype=torch.int32)
Bit Accuracy: 0.117
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 1, 1,  ..., 1, 1, 1],
         [0, 0, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [0, 1, 1,  ..., 0, 1, 1],
         [0, 0, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]], dtype=torch.int32)


In [17]:
videoseal_eval('./assets/videos/moving_short_3', model)

Bit Accuracy: 0.875
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 1, 0,  ..., 0, 1, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 1, 0]]], dtype=torch.int32)
Bit Accuracy: 0.119
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[[1, 1, 1,  ..., 1, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1],
         ...,
         [0, 1, 1,  ..., 1, 1, 1],
         [0, 1, 1,  ..., 1, 1, 1],
         [0, 0, 1,  ..., 1, 1, 1]]], dtype=torch.int32)


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
fps = 24

# Load the VideoSeal model
model = setup_model_from_model_card("videoseal")
model.chunk_size=2

# Set the model to evaluation mode and move it to the selected device
model = model.eval()
model = model.to(device)

# Path to the input video
# video_name = './assets/videos/static_short'
# # Read the video and convert to tensor format
# video, _, _ = torchvision.io.read_video(video_name + ".mp4", output_format="TCHW", pts_unit='sec', end_pts=5)

# # Normalize the video frames to the range [0, 1] and trim to 1 second
# video = video.float() / 255.0
# video = video[:30, :, :500, :500]
# video = video.to(device)

cuda
Model loaded successfully from /root/.cache/huggingface/hub/models--facebook--video_seal/snapshots/8037ef59ba2b2ec8fb8b55298ff37b8ccddd078d/checkpoint.pth with message: <All keys matched successfully>


AttributeError: module 'av' has no attribute 'AVError'

In [11]:

# Perform watermark embedding
gt_msgs = torch.zeros(1, 96)
with torch.no_grad():
    outputs = model.embed(video, is_video=True, msgs=gt_msgs.to(device))
# Extract the results
video_w = outputs["imgs_w"].cpu().detach()  # Watermarked video frames

save_torch_video(video_w, video_name + '_w.mp4', fps)

np.savetxt('./assets/videos/1_msgs.txt', gt_msgs, fmt='%d')
# Delete variables to preserve GPU memory
del outputs
del video_w
del video
torch.cuda.empty_cache()



In [12]:
video_w, _, _ = torchvision.io.read_video(video_name + '_w.mp4', output_format="TCHW")
video_w = video_w / 255.0
#video_w = video_w[:, :, :400, :400]
video_w = video_w.to(device)
video_w.requires_grad = True

# Get PGD attacked video
video_pgd = []
for i in range(video_w.shape[0]):
    #gt_labels = (torch.rand_like(gt_msgs) > 0).int()
    gt_labels = 1 - gt_msgs.to(device)
    video_pgd.append(pgd_attack(model, video_w[i], target_labels=gt_labels))
    torch.cuda.empty_cache()


video_pgd = torch.stack(video_pgd, axis=0)
save_torch_video(video_pgd, video_name + '_w_pwd.mp4', fps)

with torch.no_grad():
    final_message_extractor('./assets/videos/1_msgs.txt', video_name + '_w.mp4', model)
    final_message_extractor('./assets/videos/1_msgs.txt', video_name + '_w_pwd.mp4', model)



Bit Accuracy: 0.625
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
         0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
         1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
         0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]],
       dtype=torch.int32)
Bit Accuracy: 0.062
GT: tensor(0., dtype=torch.float64)
Extracted tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       dtype=torch.int32)
