In [5]:
!pip install face_alignment

In [12]:
import cv2
import torch
import torchvision.transforms as T
import numpy as np
import torch.nn.functional as F
from torchvision import models
from torch import nn
import os
import glob
import pandas as pd
import face_alignment
from tqdm import tqdm
from torchvision.models import efficientnet_b0, EfficientNet_B0_Weights
import timm

In [4]:
# Pick the compute device once for the whole notebook.
# Preference order: Apple-silicon MPS, then CUDA, then plain CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [3]:
# Mount Google Drive to access files in Google Colab.
# The checkpoint, prediction spreadsheet and dataset paths used further down
# all live under /content/drive/MyDrive, so this cell has to run before them.
# NOTE: `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [13]:
# Model Structure
class Model(nn.Module):
    """Frame-sequence classifier: CNN backbone -> LSTM -> two linear heads.

    The backbone turns every frame into a feature map, global average pooling
    reduces it to a ``latent_dim`` vector, an LSTM runs over the pooled vectors
    and two linear layers produce the binary logits and the method logits.

    Args:
        num_binary_classes: output size of the binary head.
        num_method_classes: output size of the method head.
        model_name: backbone selector; one of ``"resnext50_32x4d"``,
            ``"xception"`` or ``"EfficientNet-b0"``.
        lstm_layers: number of stacked LSTM layers.
        hidden_dim: LSTM hidden size (also the input size of both heads).
        bidirectional: forwarded positionally to ``nn.LSTM``; see the review
            note in ``__init__`` about which LSTM argument it actually reaches.

    Raises:
        ValueError: if ``model_name`` is not one of the supported backbones.
    """

    def __init__(self, num_binary_classes=2, num_method_classes=7,model_name="resnext50_32x4d", lstm_layers=1 , hidden_dim = 2048, bidirectional = False):
        super().__init__()
        self.model_name = model_name

        # Select the backbone (ResNeXt, Xception, or EfficientNet) and strip its
        # classification head so that it outputs a spatial feature map.
        if self.model_name == "resnext50_32x4d":
            # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the
            # weight set that flag used to select.
            backbone = models.resnext50_32x4d(
                weights=models.ResNeXt50_32X4D_Weights.IMAGENET1K_V1
            )
            self.model = nn.Sequential(*list(backbone.children())[:-2])
            self.latent_dim = 2048
        elif self.model_name == "xception":
            self.latent_dim = 2048
            backbone = timm.create_model('xception', pretrained=True, features_only=False)
            self.model = nn.Sequential(*list(backbone.children())[:-2])
        elif self.model_name == "EfficientNet-b0":
            self.latent_dim = 1280
            weights = EfficientNet_B0_Weights.DEFAULT
            backbone = efficientnet_b0(weights=weights)
            self.model = nn.Sequential(*list(backbone.features))
        else:
            # Previously an unknown name fell through and failed later with an
            # AttributeError on `latent_dim`; fail early with a clear message.
            raise ValueError(
                f"Unsupported model_name {model_name!r}; expected one of "
                "'resnext50_32x4d', 'xception', 'EfficientNet-b0'"
            )

        print("latet_dim: ",self.latent_dim)

        # NOTE(review): nn.LSTM's positional order is (input_size, hidden_size,
        # num_layers, bias, batch_first, ...), so `bidirectional` below lands in
        # the `bias` slot (default False -> an LSTM without bias terms) and the
        # LSTM is never actually bidirectional. It is also built with
        # batch_first=False although forward() feeds it (batch, seq, feature).
        # Both are deliberately left untouched: changing them alters the
        # parameter set / computation, so checkpoints trained with this
        # definition would most likely stop loading or behave differently.
        self.lstm = nn.LSTM(self.latent_dim,hidden_dim, lstm_layers,  bidirectional)
        self.relu = nn.LeakyReLU()
        self.dp = nn.Dropout(0.5)
        self.avgpool = nn.AdaptiveAvgPool2d(1)

        # Two outputs: binary classification and method classification
        self.binary_classifier = nn.Linear(hidden_dim, num_binary_classes)
        self.method_classifier = nn.Linear(hidden_dim, num_method_classes)

    def forward(self, x):
        """Run the network on a 5-D batch of frame sequences.

        Args:
            x: tensor unpacked as ``(batch_size, seq_length, c, h, w)``.

        Returns:
            ``(fmap, binary_logits, method_logits)`` where ``fmap`` is the raw
            backbone output for all ``batch_size * seq_length`` frames.
        """
        batch_size,seq_length, c, h, w = x.shape
        # Fold the sequence axis into the batch axis so the 2-D backbone sees frames.
        x = x.view(batch_size * seq_length, c, h, w)
        fmap = self.model(x)
        x = self.avgpool(fmap)
        # latent_dim is 2048 for resnext50_32x4d / xception and 1280 for EfficientNet-b0.
        x = x.view(batch_size,seq_length,self.latent_dim)
        x_lstm,_ = self.lstm(x,None)
        pooled = torch.mean(x_lstm, dim=1)
        # Dropout is applied separately per head (independent masks in training mode).
        return fmap, self.binary_classifier(self.dp(pooled)), self.method_classifier(self.dp(pooled))

## Grad-CAM

In [14]:
# ✅ Grad-CAM computation for binary classification
def compute_gradcam_binary(model, input_tensor, target_class=0):
    """Compute a Grad-CAM heat map for one frame from the binary (real/fake) head.

    Args:
        model: network that exposes ``model.model`` as an indexable stack of
            layers (the last one is used as the Grad-CAM layer) and whose
            forward pass returns ``(feature_map, binary_logits, method_logits)``.
        input_tensor: one preprocessed frame. A batch axis and a sequence axis
            are prepended here, so a ``(C, H, W)`` tensor is expected.
        target_class: index of the binary logit that is explained and whose
            softmax probability is returned. Defaults to 0, the class this
            notebook treats as "fake".

    Returns:
        ``(cam, prob)``: ``cam`` is a 2-D NumPy array of size ``(H, W)`` scaled
        to ``[0, 1]`` (all zeros when the frame is predicted real + original),
        ``prob`` is the softmax probability of ``target_class``.
    """
    captured = {}

    def _save_activation(module, inputs, output):
        # Keep the live tensor (not a detached copy) so it can be differentiated.
        captured["fmap"] = output

    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable being defined and in sync.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        input_tensor = input_tensor.to(first_param.device)
    input_tensor = input_tensor.unsqueeze(0).unsqueeze(0).requires_grad_(True)
    height, width = input_tensor.shape[-2], input_tensor.shape[-1]

    hook = model.model[-1].register_forward_hook(_save_activation)
    try:
        _, binary_output, method_output = model(input_tensor)
    finally:
        # Always detach the hook, even if the forward pass fails; otherwise
        # hooks pile up on the layer across calls.
        hook.remove()

    # Probability of the requested class
    prob = F.softmax(binary_output, dim=1)[0, target_class].item()

    # Predicted classes: binary 0 = fake, 1 = real; method 0 = original,
    # any other index = a manipulation method.
    binary_pred = torch.argmax(binary_output, dim=1).item()
    method_pred = torch.argmax(method_output, dim=1).item()

    # Condition 1: predicted real (1) and original (0) -> no CAM is computed.
    if binary_pred == 1 and method_pred == 0:
        return np.zeros((height, width)), prob

    # Gradient of the explained logit w.r.t. the last feature map. This used to
    # go through the deprecated module-level `register_backward_hook` plus a
    # full `backward()`; asking autograd for exactly this gradient yields the
    # same tensor without filling `.grad` on every parameter. The hard-coded
    # `target_class = 0` override was dropped so the CAM and `prob` always
    # refer to the same class (identical for the default argument).
    fmap = captured["fmap"]
    (grad,) = torch.autograd.grad(binary_output[0, target_class], fmap)

    # Grad-CAM: channel weights are the spatially averaged gradients.
    weights = grad.mean(dim=[2, 3], keepdim=True)
    cam = (weights * fmap.detach()).sum(dim=1, keepdim=True)
    cam = F.relu(cam).squeeze().cpu().numpy()

    # Condition 2: predicted fake (0) with a non-original method -> boost CAM.
    # NOTE(review): the min-max normalization right below is scale-invariant,
    # so this factor has (almost) no effect on the returned map. Kept to
    # preserve current outputs; confirm the intended behavior.
    if binary_pred == 0 and method_pred != 0:
        cam *= 1.5

    # Normalize to [0, 1] and resize to the input resolution.
    cam = (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)
    cam = cv2.resize(cam, (width, height))

    return cam, prob

In [15]:
# ✅ Function to process video, compute Grad-CAM, and save frames
def process_video_and_save_frames(input_video_path, output_video_path, model, frame_dir):
    """Render Grad-CAM overlays for every ``*.mp4`` in ``input_video_path``.

    For each video that has no output yet, this writes:
      * an overlay video into ``output_video_path``,
      * one overlay JPG per frame into ``frame_dir``,
      * the 10 frames with the highest returned probability into
        ``output_video_path/Top_jpg``.

    Output names are prefixed with ``(<label[0]><Prediction[0]><Predicted_method>)``
    looked up by file name in the spreadsheet at the notebook-level variable
    ``predictions_file_path`` (columns ``Filename``, ``label``, ``Prediction``,
    ``Predicted_method``). Videos without a row in that sheet are skipped.

    Args:
        input_video_path: folder containing the source ``.mp4`` files.
        output_video_path: folder for the overlay videos (created if missing).
        model: network passed through to ``compute_gradcam_binary``.
        frame_dir: folder for the per-frame overlay JPGs (created if missing).
    """
    top_k = 10  # number of best-scoring frames exported per video

    # Check the device (MPS, CUDA, or CPU)
    device = torch.device("mps") if torch.backends.mps.is_available() else (
        torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    )
    print(f"Using device: {device}")

    # Report how many videos exist and how many are already processed
    video_files = glob.glob(f'{input_video_path}/*.mp4')
    print(len(video_files))
    already_present_count = glob.glob(output_video_path+ '/*.mp4')
    print("No of videos already present ", len(already_present_count))

    # Create output folders if they do not exist
    os.makedirs(frame_dir, exist_ok=True)
    os.makedirs(output_video_path, exist_ok=True)
    top_jpg_dir = os.path.join(output_video_path, "Top_jpg")
    os.makedirs(top_jpg_dir, exist_ok=True)

    # Load REAL/FAKE labels and predictions
    df = pd.read_excel(predictions_file_path)

    # Preprocessing is identical for every video, so build it once.
    transform = T.Compose([
        T.ToPILImage(),
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406],
                    [0.229, 0.224, 0.225])
    ])

    for video_file in tqdm(video_files):
        file_name = os.path.basename(video_file)

        # Filter the table once instead of once per column.
        rows = df[df['Filename'] == file_name]
        if rows.empty:
            # A missing row used to abort the whole batch with an IndexError.
            print("No prediction entry found, skipping: ", file_name)
            continue
        result = (
            str(rows['label'].iloc[0])[0]
            + str(rows['Prediction'].iloc[0])[0]
            + str(rows['Predicted_method'].iloc[0])
        )

        f_name = f'({result})_'+file_name
        out_path = os.path.join(output_video_path, f_name)

        if glob.glob(out_path + "*"):  # Skip if the video already exists
            print("File Already exists: " , out_path)
            continue

        name = os.path.splitext(file_name)[0]
        frame_count = 0
        # At most `top_k` entries of (score, frame_index, overlay). Keeping only
        # the current best frames avoids holding every overlay of the video in RAM.
        top_frames = []

        cap = cv2.VideoCapture(video_file)
        out = None
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc('M','J','P','G'), fps, (w, h))

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                img = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                img = transform(img).to(device)

                # Grad-CAM for the binary head plus the frame's score
                cam, score = compute_gradcam_binary(model, img)

                # Blend the colour-mapped CAM over the original frame
                heatmap = cv2.applyColorMap(np.uint8(255 * cam), cv2.COLORMAP_JET)
                heatmap = cv2.resize(heatmap, (frame.shape[1], frame.shape[0]))
                overlay = 0.4 * heatmap + 0.6 * frame
                overlay = np.clip(overlay, 0, 255).astype(np.uint8)

                out.write(overlay)  # Grad-CAM overlay video

                # NOTE(review): `{score:10f}` is a *width* of 10, so the file
                # name is space-padded; `.10f` may have been intended. Left
                # unchanged so existing output names stay stable.
                frame_path = os.path.join(frame_dir, f"({result})_{name}_{frame_count:04d}_{score:10f}.jpg")
                cv2.imwrite(frame_path, overlay)

                # Maintain the running top-k. On equal scores the earlier frame
                # wins, which matches a stable descending sort over all frames.
                top_frames.append((score, frame_count, overlay))
                if len(top_frames) > top_k:
                    weakest = min(
                        range(len(top_frames)),
                        key=lambda i: (top_frames[i][0], -top_frames[i][1]),
                    )
                    del top_frames[weakest]

                frame_count += 1
        finally:
            # Release the handles even when a frame fails mid-video.
            cap.release()
            if out is not None:
                out.release()

        # Save the top frames, best score first
        top_frames.sort(key=lambda item: (-item[0], item[1]))
        for rank, (score, _, overlay) in enumerate(top_frames):
            top_frame_path = os.path.join(top_jpg_dir, f"{name}_TOP{rank + 1}_Score{score:10f}.jpg")
            cv2.imwrite(top_frame_path, overlay)

        print(f"✅ Grad-CAM video saved: {output_video_path}")
        print(f"✅ {frame_count} frame images saved: {frame_dir}/frame_XXXX.jpg")


In [1]:
# Define checkpoint and model selection
checkpoint_name="checkpoint_v35"
selected_model="EfficientNet-b0"


# Set the checkpoint path and predictions file path.
# `predictions_file_path` is read as a global by process_video_and_save_frames,
# so it must be defined before that function is called.
checkpoint_path=f'/content/drive/MyDrive/Capstone/checkpoints/{checkpoint_name}'
predictions_file_path = f'{checkpoint_path}/(test)_{checkpoint_name}_predictions.xlsx' # metadata produced after prediction


# Define the model structure and load the trained weights.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
model = Model(num_binary_classes=2, num_method_classes=7, model_name=selected_model).to(device)
model.load_state_dict(torch.load(f'{checkpoint_path}/{checkpoint_name}.pt', map_location=device))
model.eval()
# NOTE(review): the LSTM is put back into training mode after eval() --
# presumably because the backward pass needed for Grad-CAM is not supported by
# cuDNN RNNs in eval mode; confirm.
model.lstm.train()
# print("LSTM mode:", model.lstm.training)  # ✅ this should be True


# One sub-folder per source category; each is processed independently.
folder_list=[ "Deepfakes", "original","Face2Face", "FaceShifter", "FaceSwap", "NeuralTextures"]

for folder_name in folder_list:
  input_video_path=f'/content/drive/MyDrive/Capstone/Dataset/ff++(grad-cam_v2)/before/{folder_name}'    # input
  output_video_path=f'/content/drive/MyDrive/Capstone/Dataset/ff++(grad-cam_v2)/after/{folder_name}/video'   # where the output videos are saved
  frame_path=f'/content/drive/MyDrive/Capstone/Dataset/ff++(grad-cam_v2)/after/{folder_name}/frame'      # where the output JPGs are saved

  # Call the function to process videos, compute Grad-CAM, and save frames
  process_video_and_save_frames(
      input_video_path,
      output_video_path,
      model=model,
      frame_dir=frame_path
  )


## ROI Activation

In [63]:
def get_bbox(pts):
    """Return the axis-aligned box ``(x_min, y_min, x_max, y_max)`` around ``pts``.

    ``pts`` is indexed as ``pts[:, 0]`` (x) and ``pts[:, 1]`` (y); every bound
    is converted with ``int()``.
    """
    xs = pts[:, 0]
    ys = pts[:, 1]
    left, top = xs.min(), ys.min()
    right, bottom = xs.max(), ys.max()
    return int(left), int(top), int(right), int(bottom)

def roi_activation(cam, bbox):
    """Mean CAM value inside a bounding box.

    Args:
        cam: 2-D activation map indexed as ``cam[y, x]``.
        bbox: ``(x1, y1, x2, y2)`` in the pixel coordinates of ``cam``.

    Returns:
        The mean of the covered pixels as a float, or ``-1`` when the box
        covers no pixel of the map (or the mean is NaN).
    """
    x1, y1, x2, y2 = bbox
    height, width = cam.shape[:2]

    # Clamp to the map. Landmarks can fall outside the frame, and a negative
    # bound would otherwise be read by NumPy as "count from the end", silently
    # averaging the wrong pixels.
    x1, x2 = max(0, x1), min(width, x2)
    y1, y2 = max(0, y1), min(height, y2)
    if x2 <= x1 or y2 <= y1:
        return -1

    mean_val = float(cam[y1:y2, x1:x2].mean())
    if np.isnan(mean_val):
        return -1
    return mean_val

def analyze_roi_activation(input_dir, output_dir_box, base_path, folder_name, model):
    """Measure which facial regions the Grad-CAM map activates, frame by frame.

    For every ``*.mp4`` in ``input_dir`` and every frame in which landmarks are
    found, the mean CAM value inside seven landmark-derived boxes is computed.
    The two strongest regions are drawn on the frame and on the CAM overlay,
    both images are saved to ``output_dir_box``, and one record per frame is
    collected. Per-video statistics are printed; all records are written to
    ``{base_path}/{folder_name}_roi.xlsx`` and ``..._roi.json``.

    Args:
        input_dir: folder with the source videos.
        output_dir_box: folder for the annotated JPGs (created if missing).
        base_path: folder that receives the ``_roi.xlsx`` / ``_roi.json`` files.
        folder_name: prefix of those two output files.
        model: network passed through to ``compute_gradcam_binary``.
    """
    fa = face_alignment.FaceAlignment(
        face_alignment.LandmarksType.TWO_D,
        device=str(device)
    )

    os.makedirs(output_dir_box, exist_ok=True)

    # NOTE(review): this pipeline resizes the tensor (ToTensor -> Resize), while
    # process_video_and_save_frames resizes a PIL image (ToPILImage -> Resize ->
    # ToTensor); the two can interpolate slightly differently. Confirm this is intended.
    transform = T.Compose([
        T.ToTensor(),
        T.Resize((224, 224)),
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225])
    ])

    facial_region = ['jawline', 'left_eye', 'right_eye', 'left_eye_brow', 'right_eye_brow', 'nose', 'mouth', 'None']
    result_columns = [
        'file_name', 'cam_score',
        'first_activate_region', 'second_activate_region',
        'f_x1', 'f_y1', 'f_x2', 'f_y2',
        's_x1', 's_y1', 's_x2', 's_y2',
        'jawline', 'left_eye', 'right_eye',
        'left_eye_brow', 'right_eye_brow',
        'nose', 'mouth',
    ]

    def _percent(part, whole):
        # Share of `whole` in percent; 0.0 when there is nothing to divide by.
        return round((part / whole) * 100, 2) if whole else 0.0

    result = []

    video_paths = glob.glob(os.path.join(input_dir, '*.mp4'))

    for video_path in tqdm(video_paths):
        first_detection_count = {key: 0 for key in facial_region}
        second_detection_count = {key: 0 for key in facial_region}
        detection_probabillity = {key: 0.0 for key in facial_region}

        frame_idx = 0
        video_name = os.path.splitext(os.path.basename(video_path))[0]

        cap = cv2.VideoCapture(video_path)
        try:
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break

                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                landmarks = fa.get_landmarks(rgb)
                if not landmarks:
                    # No face found: the frame is counted but not analyzed.
                    frame_idx += 1
                    continue
                lm = landmarks[0]  # only the first detected face is used

                # One bounding box per region, from fixed landmark index ranges
                bbox_map = {
                    'jawline': get_bbox(lm[0:17]),
                    'left_eye': get_bbox(lm[36:42]),
                    'right_eye': get_bbox(lm[42:48]),
                    'left_eye_brow': get_bbox(lm[17:22]),
                    'right_eye_brow': get_bbox(lm[22:27]),
                    'nose': get_bbox(lm[27:36]),
                    'mouth': get_bbox(lm[48:68]),
                }

                # Grad-CAM, resized to the frame so it shares the landmark coordinates
                img = transform(rgb).to(device)
                cam, cam_score = compute_gradcam_binary(model, img)
                cam = cv2.resize(cam, (frame.shape[1], frame.shape[0]))

                scores = {region: roi_activation(cam, box) for region, box in bbox_map.items()}
                sorted_scores = sorted(scores.items(), key=lambda x: x[1], reverse=True)
                first_activated_region = sorted_scores[0][0]
                second_activated_region = sorted_scores[1][0]

                f_x1, f_y1, f_x2, f_y2 = bbox_map[first_activated_region]
                s_x1, s_y1, s_x2, s_y2 = bbox_map[second_activated_region]

                # 1) Grad-CAM heat-map overlay
                heatmap = cv2.applyColorMap(np.uint8(255 * cam), cv2.COLORMAP_JET)
                overlay = cv2.addWeighted(frame, 0.6, heatmap, 0.4, 0)

                # 2) Copies of the original frame and the overlay to draw boxes on
                frame_with_box = frame.copy()
                overlay_with_box = overlay.copy()

                # 3) Draw a box only when the region has positive activation;
                #    otherwise the region is reported as "None".
                if scores[first_activated_region] > 0:
                    cv2.rectangle(frame_with_box, (f_x1, f_y1), (f_x2, f_y2), (0, 255, 0), 2)
                    cv2.rectangle(overlay_with_box, (f_x1, f_y1), (f_x2, f_y2), (0, 255, 0), 2)
                else:
                    first_activated_region = "None"

                if scores[second_activated_region] > 0:
                    cv2.rectangle(frame_with_box, (s_x1, s_y1), (s_x2, s_y2), (255, 0, 0), 2)
                    cv2.rectangle(overlay_with_box, (s_x1, s_y1), (s_x2, s_y2), (255, 0, 0), 2)
                else:
                    second_activated_region = "None"

                # 4) Save the annotated images
                file_id = f"{video_name}_frame{frame_idx:04d}"
                cv2.imwrite(os.path.join(output_dir_box, f"{file_id}_roi.jpg"), frame_with_box)
                cv2.imwrite(os.path.join(output_dir_box, f"{file_id}_gradcam.jpg"), overlay_with_box)

                first_detection_count[first_activated_region] += 1
                second_detection_count[second_activated_region] += 1
                for key in facial_region:
                    # -1 is roi_activation's "no usable patch" sentinel
                    if key != "None" and scores[key] != -1:
                        detection_probabillity[key] += cam_score * scores[key]

                result.append({
                    'file_name': file_id,
                    'cam_score': cam_score,
                    'first_activate_region': first_activated_region,
                    'second_activate_region': second_activated_region,
                    'f_x1': f_x1, 'f_y1': f_y1, 'f_x2': f_x2, 'f_y2': f_y2,
                    's_x1': s_x1, 's_y1': s_y1, 's_x2': s_x2, 's_y2': s_y2,
                    **scores,
                })

                frame_idx += 1
        finally:
            # The capture was never released before, leaking one handle per video.
            cap.release()

        # Per-video statistics. Rates are relative to all frames read,
        # including frames without a detected face.
        first_detection_rate = {key: _percent(value, frame_idx) for key, value in first_detection_count.items()}
        second_detection_rate = {key: _percent(value, frame_idx) for key, value in second_detection_count.items()}

        # Note: a high proportion of 'None' may inflate the relative contribution (%)
        # and should be interpreted with caution.
        raw_detection_probabillity = {key: round(value, 4) for key, value in detection_probabillity.items()}
        # The total is 0 when every CAM was all-zero (e.g. all frames predicted
        # real + original) or no frame was analyzed; _percent guards that division.
        probabillity_total = sum(detection_probabillity.values())
        detection_probabillity = {key: _percent(value, probabillity_total) for key, value in detection_probabillity.items()}

        print("Video name:", video_name)
        print("Facial Region:", facial_region)
        print("First Detection Count:", first_detection_count)
        print("Second Detection Count:", second_detection_count)
        print("First Detection Rate:", first_detection_rate)
        print("Second Detection Rate:", second_detection_rate)
        print("Raw_Detection Probability:", raw_detection_probabillity)
        print("Detection Probability:", detection_probabillity)

    # Save. Passing `columns=` fixes the column order and still yields a valid
    # (empty) table when no frame produced a record.
    df = pd.DataFrame(result, columns=result_columns)

    df.to_excel(f"{base_path}/{folder_name}_roi.xlsx", index=False)
    df.to_json(f"{base_path}/{folder_name}_roi.json", orient='records', force_ascii=False)


In [2]:

# Checkpoint and backbone selection (same values as the Grad-CAM video section)
checkpoint_name="checkpoint_v35"
selected_model="EfficientNet-b0"
checkpoint_path=f'/content/drive/MyDrive/Capstone/checkpoints/{checkpoint_name}'

# Define the model structure and load the trained weights.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
model = Model(num_binary_classes=2, num_method_classes=7, model_name=selected_model).to(device)
model.load_state_dict(torch.load(f'{checkpoint_path}/{checkpoint_name}.pt', map_location=device))
model.eval()
# NOTE(review): the LSTM is put back into training mode after eval() --
# presumably because the backward pass needed for Grad-CAM is not supported by
# cuDNN RNNs in eval mode; confirm.
model.lstm.train()
# print("LSTM mode:", model.lstm.training)  # ✅ this should be True

# One sub-folder per source category; each produces its own <folder>_roi.xlsx/.json
folder_list=[ "Deepfakes", "original","Face2Face", "FaceShifter", "FaceSwap", "NeuralTextures"]

for folder_name in folder_list:
  input_dir=f'/content/drive/MyDrive/Capstone/Dataset/ff++(grad-cam_v2)/before/{folder_name}'    # input
  output_dir_box=f'/content/drive/MyDrive/Capstone/Dataset/ff++(grad-cam_v2)/after/{folder_name}/roi_frame'      # where the output JPGs are saved
  base_path='/content/drive/MyDrive/Capstone/Dataset/ff++(grad-cam_v2)/after'

  analyze_roi_activation(
      input_dir,
      output_dir_box,
      base_path,
      folder_name,
      model=model
  )