In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import cv2
import PIL
import os
from tqdm import tqdm
import gc

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
frames_per_video = 16

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Dataset

In [3]:
class VideoDataset(Dataset):
    
    def __init__(self, video_paths, labels, transform=None, frames_per_video=frames_per_video):
        self.video_paths = video_paths
        self.labels = labels
        self.transform = transform
        self.frames_per_video=frames_per_video

      
    def extract_frames(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)

        if not cap.isOpened():
            return []

        current_frame = 0
        
        while current_frame < self.frames_per_video:
            ret, frame = cap.read()
            
            if not ret:
                break

            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(frame)
            current_frame += 1

        if len(frames) > 0:
            while len(frames) < (self.frames_per_video):
                frames.append(frames[-1])
        
        cap.release()
        return frames
                
    
    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        
        frames = self.extract_frames(video_path)
        
        if self.transform:
            frames = [self.transform(frame) for frame in frames]

        else:
            frames = [torch.tensor(frame).permute(2,0,1).float() / 255.0 for frame in frames]

        frames_tensor = torch.stack(frames)

        return (frames_tensor, torch.tensor(label, dtype=torch.long))
    

    def __len__(self):
        return len(self.video_paths)

In [4]:
data_path = "/kaggle/input/faceforensics/FF++"

def load_dataset(data_path):
    
    real_path = os.path.join(data_path, "real")
    fake_path = os.path.join(data_path, "fake")

    video_paths = []
    labels = []

    if os.path.exists(real_path):
        real_videos = [f for f in os.listdir(real_path)]

        for video in real_videos:
            video_paths.append(os.path.join(real_path, video))
            labels.append(0)

    if os.path.exists(fake_path):
        fake_videos = [f for f in os.listdir(fake_path)]

        for video in fake_videos:
            video_paths.append(os.path.join(fake_path, video))
            labels.append(1)

    return video_paths, labels

In [5]:
file_path = "/kaggle/input/faceforensics/FF++"

video_paths, labels = load_dataset(file_path)

print(len(video_paths))
print(len(labels))
print(list(set(labels)))
print(video_paths[:5])

400
400
[0, 1]
['/kaggle/input/faceforensics/FF++/real/08__talking_against_wall.mp4', '/kaggle/input/faceforensics/FF++/real/14__walking_down_indoor_hall_disgust.mp4', '/kaggle/input/faceforensics/FF++/real/08__walking_down_street_outside_angry.mp4', '/kaggle/input/faceforensics/FF++/real/05__outside_talking_still_laughing.mp4', '/kaggle/input/faceforensics/FF++/real/14__exit_phone_room.mp4']


In [6]:
from torchvision import transforms

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [7]:
train_paths, temp_paths, train_labels, temp_labels = train_test_split(video_paths, labels, train_size=0.7, 
                                                              random_state=42, stratify=labels, shuffle=True)

val_paths, test_paths, val_labels, test_labels = train_test_split(temp_paths, temp_labels, train_size=0.7, 
                                                                  random_state=42, stratify=temp_labels, shuffle=True)

In [8]:
from collections import Counter

print(Counter(train_labels))
print(Counter(val_labels))
print(Counter(test_labels))

Counter({1: 140, 0: 140})
Counter({0: 42, 1: 42})
Counter({0: 18, 1: 18})


In [9]:
train_dataset = VideoDataset(video_paths=train_paths, labels=train_labels, transform=transform, frames_per_video=frames_per_video)
val_dataset = VideoDataset(video_paths=val_paths, labels=val_labels, transform=None, frames_per_video=frames_per_video)
test_dataset = VideoDataset(video_paths=test_paths, labels=test_labels, transform=None, frames_per_video=frames_per_video)

In [10]:
batch_size = 4

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Helper Functions

In [11]:
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()


def train_model_one_epoch(model, train_loader, optimizer, loss_fn, scheduler=None):

    total_train_loss = 0
    all_preds = []
    all_labels = []
    all_probs = []
        
    model.train()
    train_tqdm = tqdm(train_loader, desc="Training: ")

    for batch_idx, (frames, labels) in enumerate(train_tqdm):
        
        frames, labels = frames.to(device), labels.to(device)

        B, T, C, H, W = frames.shape
        frames = frames.view(B*T, C, H, W)
        
        optimizer.zero_grad()
        
        with autocast():
            output = model(frames) 
            output = output.view(B, T, -1).mean(dim=1)
            
            loss = loss_fn(output, labels)
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_train_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        probs = torch.softmax(output, dim=1)[:,1]

        all_preds.extend(predicted.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())
        all_probs.extend(probs.detach().cpu().numpy())
        
        train_tqdm.set_postfix({"Loss": f"{loss.item():.4f}"})

    
    train_loss = total_train_loss / len(train_loader)

    train_accuracy = accuracy_score(all_labels, all_preds)
    train_precision = precision_score(all_labels, all_preds, average="binary")
    train_recall = recall_score(all_labels, all_preds, average="binary")
    train_f1 = f1_score(all_labels, all_preds, average="binary")    
    

    return (
        train_loss,
        train_accuracy,
        train_precision,
        train_recall,
        train_f1,
        all_probs
    )

  scaler = GradScaler()


In [12]:
def val_model_one_epoch(model, val_loader, loss_fn):
    
    total_val_loss = 0
    all_preds = []
    all_labels = []
    all_probs = []

    best_model_state = None
    model.eval()

    with torch.no_grad():
            
        val_tqdm = tqdm(val_loader, desc="Validation: ")
        
        for frames, labels in val_tqdm:
            frames, labels = frames.to(device), labels.to(device)

            B, T, C, H, W = frames.shape
            frames = frames.view(B*T, C, H, W)
                    
            output = model(frames)
            output = output.view(B, T, -1).mean(dim=1)
            loss = loss_fn(output, labels)
    
            total_val_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            probs = torch.softmax(output, dim=1)[:,1]
                    
            all_preds.extend(predicted.detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())
            all_probs.extend(probs.detach().cpu().numpy())

            val_tqdm.set_postfix(loss=loss.item())

    
    val_accuracy = accuracy_score(all_labels, all_preds)
    val_precision = precision_score(all_labels, all_preds, average="binary")
    val_recall = recall_score(all_labels, all_preds, average="binary")
    val_f1 = f1_score(all_labels, all_preds, average="binary")
    val_loss = total_val_loss / len(val_loader)

    print(classification_report(all_labels, all_preds))

    return (
        val_loss,
        val_accuracy,
        val_precision,
        val_recall,
        val_f1,
        all_probs
    )

# Modeling

## 1. XceptionNet

In [13]:
class XceptionNet(nn.Module):
    
    def __init__(self, num_classes=2, num_frames=frames_per_video):
        super(XceptionNet, self).__init__()
        self.num_classes = num_classes

        self.conv1 = nn.Conv2d(3, 32, 3, 2, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        
        self.conv2 = nn.Conv2d(32, 64, 3, bias=False, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        
        self.separable_conv1 = self._make_separable_conv(64, 128, 2)
        self.separable_conv2 = self._make_separable_conv(128, 256, 2)
        self.separable_conv3 = self._make_separable_conv(256, 512, 2)
        
        self.middle_blocks = nn.ModuleList([
            self._make_separable_conv(512, 512, 1) for _ in range(8)
        ])
        
        self.separable_conv4 = self._make_separable_conv(512, 1024, 2)
        
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))
        
        self.temporal_conv = nn.Conv1d(1024, 512, 1)
        self.temporal_pool = nn.AdaptiveAvgPool1d(1)
        
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )
        
    def _make_separable_conv(self, in_channels, out_channels, stride):
        return nn.Sequential(
            # Depthwise conv
            nn.Conv2d(in_channels, in_channels, 3, stride, 1, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
            nn.ReLU(inplace=True),
            # Pointwise conv
            nn.Conv2d(in_channels, out_channels, 1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )
    
    def forward(self, x):
        batch_size, num_frames = x.size(0), x.size(1)
        
        x = x.view(-1, x.size(2), x.size(3), x.size(4))
        
        x = self.conv1(x)
        x = self.bn1(x)
        x = nn.functional.relu(x, inplace=True)
        
        x = self.conv2(x)
        x = self.bn2(x)
        x = nn.functional.relu(x, inplace=True)
        
        x = self.separable_conv1(x)
        x = self.separable_conv2(x)
        x = self.separable_conv3(x)
        
        for block in self.middle_blocks:
            x = block(x) + x  # Residual connection
        
        x = self.separable_conv4(x)
        
        x = self.global_avg_pool(x)
        x = x.view(x.size(0), -1)
        
        x = x.view(batch_size, num_frames, -1)
        
        x = x.permute(0, 2, 1)  # (batch, features, frames)
        x = self.temporal_conv(x)
        x = self.temporal_pool(x)
        x = x.squeeze(-1)
        
        x = self.classifier(x)
        return x

In [14]:
import timm

model = timm.create_model("xception", pretrained=True, num_classes=2)

for param in model.parameters():
    param.requires_grad = False

in_features = model.get_classifier().in_features
model.fc = nn.Linear(in_features, 2)

  model = create_fn(
Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/xception-43020ad28.pth" to /root/.cache/torch/hub/checkpoints/xception-43020ad28.pth


### Hyperparameters

In [15]:
device = ("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)
#model = XceptionNet(num_classes=2, num_frames=frames_per_video).to(device)

optimizer = AdamW(
    model.parameters(),
    lr = 0.001,
    weight_decay = 1e-4
)

scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0 = 8,
    T_mult = 2,
    eta_min = 1e-6
)

loss_fn = nn.CrossEntropyLoss()

epochs = 20

In [16]:
best_val_f1 = 0
best_val_loss = float("inf")
patience = 4
counter = 0

In [17]:
for epoch in range(epochs):
    
    print(f"\nEpoch: {epoch+1}/{epochs}")

    train_loss, train_accuracy, train_precision, train_recall, train_f1, train_probs = train_model_one_epoch(model = model, 
                        train_loader = train_loader, optimizer = optimizer, loss_fn = loss_fn)

    val_loss, val_accuracy, val_precision,val_recall, val_f1, val_probs = val_model_one_epoch(model = model, 
                        val_loader = val_loader, loss_fn = loss_fn)

    scheduler.step()

    print(f"\nTRAINING METRICS: ")
    print(f"Loss: {train_loss}  Accuracy: {train_accuracy}  Precision: {train_precision}  Recall: {train_recall}  F1 Score: {train_f1}")

    torch.cuda.empty_cache()
    gc.collect()

    print(f"\nValiation METRICS: ")
    print(f"Loss: {val_loss}  Accuracy: {val_accuracy}  Precision: {val_precision}  Recall: {val_recall}  F1 Score: {val_f1}")

    
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_model_state = copy.deepcopy(model.state_dict())


    if best_val_loss > val_loss:
        best_val_loss = val_loss
        counter = 0

    else:
        counter += 1

    if counter >= patience:
        print("Early Stopping Triggered")
        break


Epoch: 1/20


  with autocast():
Training: 100%|██████████| 70/70 [01:42<00:00,  1.46s/it, Loss=0.6865]
Validation:   0%|          | 0/21 [00:04<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.84 GiB. GPU 0 has a total capacity of 14.74 GiB of which 5.14 GiB is free. Process 4862 has 9.59 GiB memory in use. Of the allocated memory 9.42 GiB is allocated by PyTorch, and 39.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
if best_model_state:
    model.load_state_dict(best_model_state)