In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import cv2
import PIL
import os
from tqdm import tqdm
import gc
import copy 

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
frames_per_video = 32
device = ("cuda" if torch.cuda.is_available() else "cpu")

import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Dataset

In [3]:
class VideoDataset(Dataset):
    
    def __init__(self, video_paths, labels, transform=None, frames_per_video=frames_per_video):
        self.video_paths = video_paths
        self.labels = labels
        self.transform = transform
        self.frames_per_video=frames_per_video

      
    def extract_frames(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)

        if not cap.isOpened():
            frames = [np.zeros((224, 224, 3), dtype=np.uint8)] * self.frames_per_video
            return frames

        count_frame = 0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        
        if(total_frames >= self.frames_per_video):           
            frame_indices = np.linspace(0, total_frames-1, self.frames_per_video, dtype=int)
            
            while count_frame < self.frames_per_video:
                cap.set(cv2.CAP_PROP_POS_FRAMES, frame_indices[count_frame])
                
                ret, frame = cap.read()
                if not ret:
                    continue
                    
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
                count_frame += 1

        else:
            while True:        
                ret, frame = cap.read()

                if not ret:
                    break

                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(frame)
        
            while len(frames) < (self.frames_per_video):
                frames.append(frames[-1])
        
        cap.release()
        return frames
                
    
    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        
        frames = self.extract_frames(video_path)
        
        if self.transform:
            frames = [self.transform(frame) for frame in frames]

        else:
            frames = [torch.tensor(frame).permute(2,0,1).float() / 255.0 for frame in frames]

        frames_tensor = torch.stack(frames)

        return (frames_tensor, torch.tensor(label, dtype=torch.long))
    

    def __len__(self):
        return len(self.video_paths)

In [4]:
data_path = "/kaggle/input/faceforensics/FF++"

def load_dataset(data_path):
    
    real_path = os.path.join(data_path, "real")
    fake_path = os.path.join(data_path, "fake")

    video_paths = []
    labels = []

    if os.path.exists(real_path):
        real_videos = [f for f in os.listdir(real_path)]

        for video in real_videos:
            video_paths.append(os.path.join(real_path, video))
            labels.append(0)

    if os.path.exists(fake_path):
        fake_videos = [f for f in os.listdir(fake_path)]

        for video in fake_videos:
            video_paths.append(os.path.join(fake_path, video))
            labels.append(1)

    return video_paths, labels

In [5]:
file_path = "/kaggle/input/faceforensics/FF++"

video_paths, labels = load_dataset(file_path)

print(len(video_paths))
print(len(labels))
print(list(set(labels)))
print(video_paths[:5])

0
0
[]
[]


In [6]:
from torchvision import transforms

train_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [7]:
train_paths, temp_paths, train_labels, temp_labels = train_test_split(video_paths, labels, train_size=0.7, 
                                                              random_state=42, stratify=labels, shuffle=True)

val_paths, test_paths, val_labels, test_labels = train_test_split(temp_paths, temp_labels, train_size=0.7, 
                                                                  random_state=42, stratify=temp_labels, shuffle=True)

ValueError: With n_samples=0, test_size=None and train_size=0.7, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
from collections import Counter

print(Counter(train_labels))
print(Counter(val_labels))
print(Counter(test_labels))

Counter({1: 140, 0: 140})
Counter({0: 42, 1: 42})
Counter({0: 18, 1: 18})


In [None]:
train_dataset = VideoDataset(video_paths=train_paths, labels=train_labels, transform=train_transform, frames_per_video=frames_per_video)
val_dataset = VideoDataset(video_paths=val_paths, labels=val_labels, transform=val_transform, frames_per_video=frames_per_video)
test_dataset = VideoDataset(video_paths=test_paths, labels=test_labels, transform=val_transform, frames_per_video=frames_per_video)

In [None]:
batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Helper Functions

In [None]:
def train_model_one_epoch(model, train_loader, optimizer, loss_fn, scheduler=None):

    total_train_loss = 0
    all_preds = []
    all_labels = []
    all_probs = []
        
    model.train()
    train_tqdm = tqdm(train_loader, desc="Training: ")

    for batch_idx, (frames, labels) in enumerate(train_tqdm):
        
        frames, labels = frames.to(device), labels.to(device)
        frames = frames.permute(0, 2, 1, 3, 4)
        
        optimizer.zero_grad()
        
        output = model(frames)             
        loss = loss_fn(output, labels)
        
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        probs = torch.softmax(output, dim=1)[:,1]

        all_preds.extend(predicted.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())
        all_probs.extend(probs.detach().cpu().numpy())
        
        train_tqdm.set_postfix({"Loss": f"{loss.item():.4f}"})

    
    train_loss = total_train_loss / len(train_loader)

    train_accuracy = accuracy_score(all_labels, all_preds)
    train_precision = precision_score(all_labels, all_preds, average="binary")
    train_recall = recall_score(all_labels, all_preds, average="binary")
    train_f1 = f1_score(all_labels, all_preds, average="binary")    
    

    return (
        train_loss,
        train_accuracy,
        train_precision,
        train_recall,
        train_f1,
        all_probs
    )

In [None]:
def val_model_one_epoch(model, val_loader, loss_fn):
    
    total_val_loss = 0
    all_preds = []
    all_labels = []
    all_probs = []

    best_model_state = None
    model.eval()

    with torch.no_grad():
            
        val_tqdm = tqdm(val_loader, desc="Validation: ")
        
        for frames, labels in val_tqdm:
            frames, labels = frames.to(device), labels.to(device)
            frames = frames.permute(0, 2, 1, 3, 4)
                    
            output = model(frames)
            loss = loss_fn(output, labels)
    
            total_val_loss += loss.item()
            _, predicted = torch.max(output.data, 1)
            probs = torch.softmax(output, dim=1)[:,1]
                    
            all_preds.extend(predicted.detach().cpu().numpy())
            all_labels.extend(labels.detach().cpu().numpy())
            all_probs.extend(probs.detach().cpu().numpy())

            val_tqdm.set_postfix(loss=loss.item())

    
    val_accuracy = accuracy_score(all_labels, all_preds)
    val_precision = precision_score(all_labels, all_preds, average="binary")
    val_recall = recall_score(all_labels, all_preds, average="binary")
    val_f1 = f1_score(all_labels, all_preds, average="binary")
    val_loss = total_val_loss / len(val_loader)

    print(classification_report(all_labels, all_preds))

    return (
        val_loss,
        val_accuracy,
        val_precision,
        val_recall,
        val_f1,
        all_probs
    )

# Modeling

## 1. EfficientNet v2 small

In [None]:
from torchvision.models.video import r3d_18

model = r3d_18(pretrained=True).to(device)

for param in model.parameters():
    param.requires_grad = False

model.fc = nn.Linear(model.fc.in_features, 2)



### Hyperparameters

In [None]:
model = model.to(device)
#model = XceptionNet(num_classes=2, num_frames=frames_per_video).to(device)

optimizer = AdamW(
    model.parameters(),
    lr = 0.0001,
    weight_decay = 0.01
)

scheduler = CosineAnnealingWarmRestarts(
    optimizer,
    T_0 = 8,
    T_mult = 2,
    eta_min = 1e-6
)

loss_fn = nn.CrossEntropyLoss()

epochs = 20

In [None]:
best_val_f1 = 0
best_val_loss = float("inf")
patience = 4
counter = 0

In [None]:
print(next(model.parameters()).device)
print(device)

cuda:0
cuda


In [None]:
for epoch in range(epochs):
    
    print(f"\nEpoch: {epoch+1}/{epochs}")

    train_loss, train_accuracy, train_precision, train_recall, train_f1, train_probs = train_model_one_epoch(model = model, 
                        train_loader = train_loader, optimizer = optimizer, loss_fn = loss_fn)

    val_loss, val_accuracy, val_precision,val_recall, val_f1, val_probs = val_model_one_epoch(model = model, 
                        val_loader = val_loader, loss_fn = loss_fn)

    scheduler.step()

    print(f"\nTRAINING METRICS: ")
    print(f"Loss: {train_loss}  Accuracy: {train_accuracy}  Precision: {train_precision}  Recall: {train_recall}  F1 Score: {train_f1}")

    torch.cuda.empty_cache()
    gc.collect()

    print(f"\nValiation METRICS: ")
    print(f"Loss: {val_loss}  Accuracy: {val_accuracy}  Precision: {val_precision}  Recall: {val_recall}  F1 Score: {val_f1}")

    
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_model_state = copy.deepcopy(model.state_dict())
        print("Best Model Updated")


    if best_val_loss > val_loss:
        best_val_loss = val_loss
        counter = 0

    else:
        counter += 1

    if counter >= patience:
        print("Early Stopping Triggered")
        break


Epoch: 1/20


Training:   3%|▎         | 1/35 [01:42<58:00, 102.38s/it, Loss=0.7272]

In [None]:
if best_model_state:
    model.load_state_dict(best_model_state)

In [None]:
model_path = "/kaggle/working/deepfake_trial.pth"

torch.save(model.state_dict(), model_path)

In [None]:
test_loss, test_accuracy, test_precision,test_recall, test_f1, test_probs = val_model_one_epoch(model = model, 
                        val_loader = test_loader, loss_fn = loss_fn)

In [None]:
print(roc_auc_score(train_labels, train_probs))
print(roc_auc_score(val_labels, val_probs))
print(roc_auc_score(test_labels, test_probs))

In [None]:
model