In [14]:
!pip install opendatasets



In [15]:
import opendatasets as od
import pandas

In [16]:
!rm -rf /content/realfake-video-dataset

In [17]:
od.download("https://www.kaggle.com/datasets/mohammadsarfrazalam/realfake-video-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: mohammadsarfrazalam
Your Kaggle Key: ··········
Downloading realfake-video-dataset.zip to ./realfake-video-dataset


100%|██████████| 240M/240M [00:01<00:00, 143MB/s]





In [18]:
# Continue with regular imports
import matplotlib.pyplot as plt
import torch
import torchvision

from torch import nn
from torchvision import transforms

# Try to get torchinfo, install it if it doesn't work
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

# Try to import the going_modular directory, download it from GitHub if it doesn't work
try:
    from going_modular.going_modular import data_setup, engine
    from helper_functions import download_data, set_seeds, plot_loss_curves
except:
    # Get the going_modular scripts
    print("[INFO] Couldn't find going_modular or helper_functions scripts... downloading them from GitHub.")
    !git clone https://github.com/mrdbourke/pytorch-deep-learning
    !mv pytorch-deep-learning/going_modular .
    !mv pytorch-deep-learning/helper_functions.py . # get the helper_functions.py script
    !rm -rf pytorch-deep-learning
    from going_modular.going_modular import data_setup, engine
    from helper_functions import download_data, set_seeds, plot_loss_curves

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [20]:
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import Dataset
from torch import nn

In [21]:
input_shape = (224,224)

## Building the DataFrame

In [None]:
import pandas as pd
from glob import glob
import numpy as np

In [None]:
real_paths = glob("/content/realfake-video-dataset/real/*/*")
fake_paths = glob("/content/realfake-video-dataset/fake/*/*")


In [None]:
len(fake_paths), len(real_paths)

(472, 400)

In [None]:
paths = np.concatenate((real_paths, fake_paths))
fake = np.concatenate((np.zeros(len(real_paths)),np.ones(len(fake_paths))))

In [None]:
import pandas as pd
from glob import glob
import numpy as np

import torch
import torchvision
from torchvision import transforms

from torch import nn

## Building the Dataset

In [22]:
import os

In [23]:
from torch.utils.data import Dataset

In [24]:
class VideoDetectionDatasetV1(Dataset):
  def __init__(self, paths, labels):
    super()
    self.paths = paths
    self.labels = labels
    pass

  def __len__(self):
    return len(self.paths)

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()

    frames = glob(os.path.join(self.paths[idx],"*"))
    label = self.labels[idx]

    sample = {
        "frames": frames,
        "fake": label,
        "path": self.paths[idx]
    }

    return sample


In [25]:
video_detection_dataset_v1 = VideoDetectionDatasetV1(paths,fake)

In [26]:
video_detection_dataset_v1[0]

{'frames': ['/content/realfake-video-dataset/real/MSVD/68_0/frame3.jpg',
  '/content/realfake-video-dataset/real/MSVD/68_0/frame0.jpg',
  '/content/realfake-video-dataset/real/MSVD/68_0/frame2.jpg',
  '/content/realfake-video-dataset/real/MSVD/68_0/frame1.jpg'],
 'fake': 0.0,
 'path': '/content/realfake-video-dataset/real/MSVD/68_0'}

In [None]:
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-wmmnv3at
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-wmmnv3at
  Resolved https://github.com/openai/CLIP.git to commit a1d071733d7111c9c014f024669f959182114e33
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.2.0-py3-none-any.whl (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m931.6 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->clip==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->clip==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-p

In [None]:
import clip
model, preprocess = clip.load("ViT-B/32", device=device)

# Setting up ClipVIT

In [None]:
from PIL import Image

In [None]:
image = preprocess(Image.open("/content/realfake-video-dataset/real/MSVD/0_0/frame0.jpg")).unsqueeze(0).to(device)

In [None]:
with torch.no_grad():
  image_features = model.encode_image(image)

In [None]:
from tqdm import tqdm

In [None]:
resize_transform = transforms.Resize(input_shape)

In [None]:
def getFeatures(path, transform=resize_transform):
  img = Image.open(path)
  transformed_img=img
  if transform is not None:
    transformed_img = transform(img)

  image = preprocess(transformed_img).unsqueeze(0).to(device)
  image_features = model.encode_image(image)

  return image_features

In [None]:
getFeatures("/content/realfake-video-dataset/real/MSVD/0_0/frame0.jpg",resize_transform).shape

torch.Size([1, 512])

In [None]:
features_data = []
features_label = []

for i in tqdm(range(len(video_detection_dataset_v1))):
  sample = video_detection_dataset_v1[i]
  feature_set = []
  for i in range(0,4):
    frame = os.path.join(sample["path"], f'frame{i}.jpg')
    frame_featureset = getFeatures(frame, resize_transform)
    feature_set.append(frame_featureset.detach().numpy())

  if sample["fake"] == 0.0:
    features_label.append(0)
  else:
    features_label.append(1)

  features_data.append(np.array(feature_set))
  pass

100%|██████████| 872/872 [14:57<00:00,  1.03s/it]


In [None]:
features_label

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [None]:
len(features_data), len(video_detection_dataset_v1)

(872, 872)

In [None]:
features_data[180].shape

(4, 1, 512)

In [None]:
features_data[80]

array([[[ 0.38766155, -0.2891936 , -0.32155976, ...,  0.85192263,
          0.18196571,  0.50799996]],

       [[ 0.07789263, -0.02788828, -0.12115043, ...,  0.96208364,
          0.32585344,  0.5601748 ]],

       [[ 0.10716557, -0.16418365, -0.17438841, ...,  0.9506273 ,
          0.30708146,  0.56472373]],

       [[ 0.16094775, -0.1009776 , -0.05352733, ...,  1.0295435 ,
          0.2516189 ,  0.63440883]]], dtype=float32)

In [None]:
np.array(features_data)

array([[[[ 0.05263811,  0.04423955,  0.27214235, ...,  0.36872342,
          -0.3586424 ,  0.00132785]],

        [[ 0.01881401,  0.09063845,  0.2776182 , ...,  0.5014169 ,
          -0.43635702,  0.04243718]],

        [[ 0.02163584,  0.09365203,  0.26532605, ...,  0.4888641 ,
          -0.42907768,  0.02978035]],

        [[ 0.13789478,  0.11604232,  0.31128398, ...,  0.32942954,
          -0.51528937,  0.10649417]]],


       [[[ 0.04719633,  0.02175133,  0.08219218, ...,  0.37099028,
           0.29979274,  0.47452345]],

        [[-0.18049693,  0.10077631,  0.17697635, ...,  0.403755  ,
           0.45333195,  0.5330764 ]],

        [[-0.13986011,  0.14379954,  0.09819879, ...,  0.5878541 ,
           0.30821723,  0.55616   ]],

        [[-0.11029769,  0.12295825,  0.08543553, ...,  0.72305256,
           0.19747138,  0.37365502]]],


       [[[ 0.12570408,  0.17865135, -0.17236936, ...,  0.5303531 ,
          -0.10975075,  0.05779109]],

        [[ 0.12570408,  0.17865135, -0.172

In [None]:
file = open("clip_input", "wb")

np.save(file, features_data)

In [None]:
file = open("clip_output", "wb")

np.save(file, features_label)

In [None]:
features_data[0].shape

(4, 1, 512)

# Building the model

## Get Input and Labels

In [None]:
input = np.load("clip_input")
labels = np.load("clip_output")

In [None]:
input.shape

(872, 4, 1, 512)

In [None]:
input = input.reshape((len(input),1, 2048))
input.shape

(872, 1, 2048)

In [None]:
labels.shape

(872,)

## Building the Dataset and Dataloader

In [None]:
class VideoDetectionDatasetV2(Dataset):
  def __init__(self, input, labels):
    super()
    self.inputs = input
    self.labels = labels
    pass

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()


    return self.inputs[idx], self.labels[idx]

In [None]:
video_detection_dataset_v2 = VideoDetectionDatasetV2(input, labels)

In [None]:
from torch.utils.data import DataLoader

In [None]:
len(video_detection_dataset_v2)

872

In [None]:
train_size = int(0.8 * len(video_detection_dataset_v2))
test_size = len(video_detection_dataset_v2) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(video_detection_dataset_v2, [train_size, test_size])

In [None]:
len(train_dataset)

697

In [None]:
encoder_layer = nn.TransformerEncoderLayer(d_model=2048, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(4, 1, 512).flatten().reshape((1,1,2048))
out = transformer_encoder(src)

out.shape



torch.Size([1, 1, 2048])

In [None]:
src.flatten().shape

torch.Size([2048])

In [None]:
  classifier = nn.Sequential(
        nn.LayerNorm(normalized_shape=2048),
        nn.Linear(
            in_features=2048,
            out_features=2
        )
    )


In [None]:
out[:,0].shape

torch.Size([1, 2048])

In [None]:
classifier(out)

tensor([[[ 0.0393, -0.8499]]], grad_fn=<ViewBackward0>)

In [None]:
class ViT(nn.Module):
  def __init__(
      self,
      d_model:int = 2048,
      num_heads:int = 12,
      num_classes:int=1000
  ):
    super().__init__()

    self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=num_heads)

    self.classifier = nn.Sequential(
        nn.LayerNorm(normalized_shape=d_model),
        nn.Linear(
            in_features=d_model,
            out_features=num_classes
        )
    )

  def forward(self, x):
    x = self.encoder_layer(x)
    x = self.classifier(x[:,0])

    return x

In [None]:
modelV1 = ViT(
    d_model=2048,
    num_heads=8,
    num_classes=2
)

In [None]:
summary(model=modelV1,
        input_size=(4,1,2048),
        col_names=["input_size", "output_size", "num_params","trainable"],
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                       Input Shape          Output Shape         Param #              Trainable
ViT (ViT)                                     [4, 1, 2048]         [4, 2]               --                   True
├─TransformerEncoderLayer (encoder_layer)     [4, 1, 2048]         [4, 1, 2048]         --                   True
│    └─MultiheadAttention (self_attn)         [4, 1, 2048]         [4, 1, 2048]         16,785,408           True
│    └─Dropout (dropout1)                     [4, 1, 2048]         [4, 1, 2048]         --                   --
│    └─LayerNorm (norm1)                      [4, 1, 2048]         [4, 1, 2048]         4,096                True
│    └─Linear (linear1)                       [4, 1, 2048]         [4, 1, 2048]         4,196,352            True
│    └─Dropout (dropout)                      [4, 1, 2048]         [4, 1, 2048]         --                   --
│    └─Linear (linear2)                       [4, 1, 2048]         [4, 1, 2048]        

In [None]:
optimizer = torch.optim.Adam(
    params=modelV1.parameters(),
    lr=1e-3,
    betas=(0.9,0.999),
    weight_decay=0.1
)

loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=32,
                        shuffle=True, num_workers=0)

test_dataloader = DataLoader(test_dataset, batch_size=32,
                        shuffle=True, num_workers=0)

In [None]:
for batch,(x,y) in enumerate(train_dataloader):
  print(batch, (x.shape,y.shape))

0 (torch.Size([32, 1, 2048]), torch.Size([32]))
1 (torch.Size([32, 1, 2048]), torch.Size([32]))
2 (torch.Size([32, 1, 2048]), torch.Size([32]))
3 (torch.Size([32, 1, 2048]), torch.Size([32]))
4 (torch.Size([32, 1, 2048]), torch.Size([32]))
5 (torch.Size([32, 1, 2048]), torch.Size([32]))
6 (torch.Size([32, 1, 2048]), torch.Size([32]))
7 (torch.Size([32, 1, 2048]), torch.Size([32]))
8 (torch.Size([32, 1, 2048]), torch.Size([32]))
9 (torch.Size([32, 1, 2048]), torch.Size([32]))
10 (torch.Size([32, 1, 2048]), torch.Size([32]))
11 (torch.Size([32, 1, 2048]), torch.Size([32]))
12 (torch.Size([32, 1, 2048]), torch.Size([32]))
13 (torch.Size([32, 1, 2048]), torch.Size([32]))
14 (torch.Size([32, 1, 2048]), torch.Size([32]))
15 (torch.Size([32, 1, 2048]), torch.Size([32]))
16 (torch.Size([32, 1, 2048]), torch.Size([32]))
17 (torch.Size([32, 1, 2048]), torch.Size([32]))
18 (torch.Size([32, 1, 2048]), torch.Size([32]))
19 (torch.Size([32, 1, 2048]), torch.Size([32]))
20 (torch.Size([32, 1, 2048]),

In [None]:
from going_modular.going_modular import engine

set_seeds()

results = engine.train(
    model=modelV1,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10,
    device=device
)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 1.0987 | train_acc: 0.8295 | test_loss: 0.2433 | test_acc: 0.9108
Epoch: 2 | train_loss: 0.0764 | train_acc: 0.9783 | test_loss: 0.1061 | test_acc: 0.9622
Epoch: 3 | train_loss: 0.0798 | train_acc: 0.9811 | test_loss: 0.0925 | test_acc: 0.9628
Epoch: 4 | train_loss: 0.0623 | train_acc: 0.9868 | test_loss: 0.0765 | test_acc: 0.9792
Epoch: 5 | train_loss: 0.0609 | train_acc: 0.9897 | test_loss: 0.0773 | test_acc: 0.9688
Epoch: 6 | train_loss: 0.0503 | train_acc: 0.9943 | test_loss: 0.0815 | test_acc: 0.9688
Epoch: 7 | train_loss: 0.0482 | train_acc: 0.9957 | test_loss: 0.0755 | test_acc: 0.9688
Epoch: 8 | train_loss: 0.0439 | train_acc: 0.9957 | test_loss: 0.0765 | test_acc: 0.9688
Epoch: 9 | train_loss: 0.0408 | train_acc: 0.9986 | test_loss: 0.0643 | test_acc: 0.9792
Epoch: 10 | train_loss: 0.0402 | train_acc: 0.9972 | test_loss: 0.0698 | test_acc: 0.9792


## Testing Models

In [None]:
!rm -rf test

In [None]:
!unzip test.zip

Archive:  test.zip
   creating: kaggle/working/test/
   creating: kaggle/working/test/sora/
   creating: kaggle/working/test/sora/tokyo-walk_0/
  inflating: kaggle/working/test/sora/tokyo-walk_0/frame1.jpg  
  inflating: kaggle/working/test/sora/tokyo-walk_0/frame0.jpg  
  inflating: kaggle/working/test/sora/tokyo-walk_0/frame2.jpg  
  inflating: kaggle/working/test/sora/tokyo-walk_0/frame3.jpg  
   creating: kaggle/working/test/sora/tokyo-walk_1/
  inflating: kaggle/working/test/sora/tokyo-walk_1/frame1.jpg  
  inflating: kaggle/working/test/sora/tokyo-walk_1/frame0.jpg  
  inflating: kaggle/working/test/sora/tokyo-walk_1/frame2.jpg  
  inflating: kaggle/working/test/sora/tokyo-walk_1/frame3.jpg  


In [None]:
!ls

clip_input   going_modular	  kaggle       realfake-video-dataset  test.zip
clip_output  helper_functions.py  __pycache__  sample_data


In [None]:
!rm -rf test.zip

In [None]:
path = "/content/kaggle/working/test/sora/tokyo-walk_1"

In [None]:
test_data = np.array([
    getFeatures(f"{path}/frame0.jpg").cpu().detach().numpy(),
    getFeatures(f"{path}/frame1.jpg").cpu().detach().numpy(),
    getFeatures(f"{path}/frame2.jpg").cpu().detach().numpy(),
    getFeatures(f"{path}/frame3.jpg").cpu().detach().numpy(),
])

In [None]:
test_data.shape

(4, 1, 512)

In [None]:
test_data = test_data.reshape(1,1,2048)

In [None]:
modelV1.eval()

data = modelV1(torch.tensor(test_data).to(device).to(torch.float32))

In [None]:
data

tensor([[-3.9418,  2.7476]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
np.argmax(data.cpu().detach().numpy())

1