# Visualization

In [None]:
import os
import glob

# Root folder of the competition data. The training images are searched for
# in one sub-folder per class below 'train_images'.
PATH_DATASET = "/kaggle/input/recodai-luc-scientific-image-forgery-detection"


def _glob_train_pngs(class_folder):
    """Return every '*.png' path glob finds in train_images/<class_folder>."""
    pattern = os.path.join(PATH_DATASET, 'train_images', class_folder, '*.png')
    return glob.glob(pattern)


authentic_images = _glob_train_pngs('authentic')
forged_images = _glob_train_pngs('forged')

# Report how many files each class folder yielded.
print(f"Found {len(authentic_images)} authentic images.")
print(f"Found {len(forged_images)} forged images.")

## Authentic cases

In [None]:
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Show a random sample of authentic images in a fixed grid.
num_rows, num_cols = 3, 4
num_images_to_show = num_rows * num_cols  # one image per grid cell
# Never ask random.sample for more items than the list contains.
random_authentic_images = random.sample(
    authentic_images, min(num_images_to_show, len(authentic_images))
)

# Build the grid and flatten it so the cells can be walked in order.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, 8))
axes = axes.flatten()

# zip stops at the shorter sequence, so a sample smaller than the grid is fine.
for ax, img_path in zip(axes, random_authentic_images):
    ax.imshow(mpimg.imread(img_path))
    ax.axis('off')  # Hide axes
    ax.set_title(os.path.basename(img_path), fontsize=8)  # Filename as title

# Blank out the cells that received no image. Slicing by the sample size,
# instead of reusing the loop index, also works when no image was found:
# in that case the loop above never runs and a loop index would be undefined.
for ax in axes[len(random_authentic_images):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

## Forged cases with annotations

In [None]:
import glob

# The masks are looked up in the 'train_masks' folder below PATH_DATASET.
mask_dir = os.path.join(PATH_DATASET, 'train_masks')

# Map every mask's file stem (basename without extension) to its full path.
mask_files_dict = {
    os.path.splitext(os.path.basename(npy_path))[0]: npy_path
    for npy_path in glob.glob(os.path.join(mask_dir, '*.npy'))
}

print(f"Found {len(mask_files_dict)} mask files and stored in a dictionary with keys as filenames without extensions.")

# Preview the first five "stem: path" entries.
mask_dict = [f"{k}: {v}" for k, v in mask_files_dict.items()]
print("\n".join(mask_dict[:5]))

In [None]:
from tqdm.auto import tqdm
import numpy as np

# Load every mask file and keep the number of dimensions of its array.
# Despite the "shapes" wording, only the length of each shape tuple is
# stored, so the printed set contains dimension counts.
all_mask_shapes = [
    len(np.load(npy_path).shape)
    for npy_path in tqdm(mask_files_dict.values())
]

print(f"Loaded shapes for {len(all_mask_shapes)} masks.")
print("Mask shapes:", set(all_mask_shapes))

In [None]:
def load_mask(mask_path: str) -> np.ndarray:
    """Load a mask file and collapse its channels into one label image.

    The array stored at ``mask_path`` is read with ``np.load``. Every pixel
    that is greater than zero in channel ``c`` (0-based, first axis) is given
    the label ``c + 1``; pixels that are zero in all channels keep the label 0.
    Channels are applied in order, so where several channels overlap the
    highest channel index wins.

    Args:
        mask_path: Path to a ``.npy`` file holding either a stack of channels
            with the channel axis first, or a single 2-D mask.

    Returns:
        A 2-D ``uint8`` array with the spatial shape of the input.
    """
    mask_raw = np.load(mask_path)
    if mask_raw.ndim == 2:
        # Treat a plain 2-D mask as a stack with exactly one channel instead
        # of failing on the channel indexing below.
        mask_raw = mask_raw[np.newaxis, ...]

    # Allocating from the spatial shape (rather than from the first channel)
    # also works for a stack that contains no channels at all.
    mask = np.zeros(mask_raw.shape[1:], dtype=np.uint8)
    for label, layer in enumerate(mask_raw, start=1):
        mask[layer > 0] = label
    return mask

In [None]:
# Step 1: Match images and masks
# A forged image and its mask share the same base filename (without
# extension), which is also the key used in mask_files_dict.
image_mask_pairs = []

for image_path in forged_images:
    filename_without_extension = os.path.splitext(os.path.basename(image_path))[0]
    mask_path = mask_files_dict.get(filename_without_extension)
    if mask_path is None:
        # A plain dictionary lookup would abort the whole cell with a KeyError
        # on the first forged image that has no mask; report it and move on.
        print(f"Warning: No mask found for forged image with filename (without extension): {filename_without_extension}")
        continue
    image_mask_pairs.append((image_path, mask_path))

print(f"Found {len(image_mask_pairs)} image-mask pairs.")

In [None]:
# Colours for the contour lines of the mask levels; the background (0) is
# not part of the list. Append more entries if more levels are expected.
mask_colors = [
    'red',
    'blue',
    'green',
    'purple',
    'orange',
    'brown',
    'pink',
    'gray',
    'olive',
    'cyan',
]

In [None]:
# Step 2: Pick a random selection of (forged image, mask) pairs.
num_pairs_to_show = 12  # upper bound on the number of figures drawn below
random_pairs = random.sample(image_mask_pairs, min(num_pairs_to_show, len(image_mask_pairs)))

# Step 3: One figure per pair, each with three panels side by side.
for image_path, mask_path in random_pairs:
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    ax_image, ax_contour, ax_mask = axes

    img = mpimg.imread(image_path)
    # Collapse the mask file into a single label image.
    mask = load_mask(mask_path)
    # One contour level half a step above every label value except the largest.
    levels = np.unique(mask)[:-1] + 0.5

    # Left panel: the image on its own.
    ax_image.imshow(img)
    ax_image.set_title(os.path.basename(image_path), fontsize=8)

    # Middle panel: the same image with the mask outlines drawn on top.
    ax_contour.imshow(img)
    ax_contour.contour(mask, levels=levels, colors=mask_colors, linewidths=1)
    ax_contour.set_title("Mask Contour", fontsize=8)

    # Right panel: the label image itself, without pixel interpolation.
    ax_mask.imshow(mask, cmap='viridis', interpolation='nearest')
    ax_mask.set_title(os.path.basename(mask_path), fontsize=8)

    # No panel needs axis ticks or frames.
    for panel in axes:
        panel.axis('off')
    fig.tight_layout()
plt.show()

## Overlap between authentic and forged cases

In [None]:
# Reduce every path to its bare file name (directory part removed).
authentic_filenames = list(map(os.path.basename, authentic_images))
forged_filenames = list(map(os.path.basename, forged_images))

# File names that occur in both the authentic and the forged folder.
authentic_name_set = set(authentic_filenames)
overlapping_filenames = list(authentic_name_set.intersection(forged_filenames))

print(f"Found {len(overlapping_filenames)} overlapping filenames in authentic and forged folders.")

In [None]:
def _paths_by_stem(paths):
    """Map each path's basename without extension to the path itself."""
    return {os.path.splitext(os.path.basename(p))[0]: p for p in paths}


# Stem -> path lookups for both image classes.
authentic_image_dict = _paths_by_stem(authentic_images)
forged_image_dict = _paths_by_stem(forged_images)

# Stems that are present in the authentic and in the forged lookup.
overlapping_filenames_without_extension = list(
    set(authentic_image_dict).intersection(forged_image_dict)
)

# Collect (authentic_path, forged_path, mask_path) for every shared stem
# that also has an entry in mask_files_dict (keyed by the same stems).
matching_pairs_with_mask = []
for filename_without_extension in overlapping_filenames_without_extension:
    if filename_without_extension not in mask_files_dict:
        print(f"Warning: No mask found for forged image with filename (without extension): {filename_without_extension}")
        continue
    matching_pairs_with_mask.append(
        (
            authentic_image_dict[filename_without_extension],
            forged_image_dict[filename_without_extension],
            mask_files_dict[filename_without_extension],
        )
    )

print(f"Found {len(matching_pairs_with_mask)} matching image-mask pairs with the same filename (without extension).")

In [None]:
# Show at most 16 of the matching triples.
num_pairs_to_show = min(16, len(matching_pairs_with_mask))

# Draw that many triples at random.
random_matching_pairs_with_mask = random.sample(matching_pairs_with_mask, num_pairs_to_show)

# One figure per triple: authentic image on the left, forged image on the
# right, both overlaid with the outlines of the same mask.
for auth_img_path, forged_img_path, mask_path in random_matching_pairs_with_mask:
    fig, axes = plt.subplots(1, 2, figsize=(14, 7))
    auth_img = mpimg.imread(auth_img_path)
    forged_img = mpimg.imread(forged_img_path)
    # Collapse the mask file into a single label image.
    mask = load_mask(mask_path)
    # One contour level half a step above every label value except the largest.
    levels = np.unique(mask)[:-1] + 0.5

    panels = (
        (axes[0], auth_img, f"Authentic + Mask: {os.path.basename(auth_img_path)}"),
        (axes[1], forged_img, f"Forged + Mask: {os.path.basename(forged_img_path)}"),
    )
    for ax, picture, title in panels:
        ax.imshow(picture)
        ax.contour(mask, levels=levels, colors=mask_colors, linewidths=1)
        ax.axis('off')
        ax.set_title(title, fontsize=8)
    fig.tight_layout()
plt.show()

# Analyses

In [None]:
# Record the size of the first axis of every mask array; the analysis below
# counts that value as the number of instances in the mask.
# NOTE(review): this reads the first axis as the instance axis, which only
# holds for masks stored as a stack of layers - confirm all masks are 3-D.
all_mask_instances = []
for mask_path in tqdm(mask_files_dict.values()):
    mask = np.load(mask_path)
    all_mask_instances.append(mask.shape[0])

# The messages name what is actually collected (instance counts, not shapes).
print(f"Loaded instance counts for {len(all_mask_instances)} masks.")
print("Instance counts per mask:", set(all_mask_instances))

## Discover how many instances there are per image

In [None]:
import collections
import seaborn as sns

# Count how many masks have each number of instances, ordered by that number
# so the bars appear in ascending order along the x axis.
instance_counts = collections.Counter(all_mask_instances)
sorted_instance_counts = dict(sorted(instance_counts.items()))

# Bar plot: x = number of instances in a mask, y = how many masks have it.
plt.figure(figsize=(8, 3))
sns.barplot(x=list(sorted_instance_counts.keys()), y=list(sorted_instance_counts.values()))
plt.title("Mask Instance Counts")
plt.xlabel("Number of Instances in Mask")
plt.ylabel("Occurrences")  # spelling of the axis label corrected
plt.grid(axis='y', alpha=0.75)
plt.show()

## Explore the ratio of object area to image area

In [None]:
# Receives one area ratio per mask layer, accumulated over all mask files.
all_area_ratios = []

for mask_path in tqdm(mask_files_dict.values()):
    # Work on the raw array instead of load_mask, because every layer is
    # measured on its own.
    mask_raw = np.load(mask_path)
    # The unpacking requires exactly three axes; the last two are used as
    # the height and width (the mask is assumed to match the image size).
    _num_layers, height, width = mask_raw.shape
    total_image_area = height * width

    # Ratio per layer: number of pixels above zero divided by the full area.
    all_area_ratios.extend(
        [np.sum(layer > 0) / total_image_area for layer in mask_raw]
    )

print(f"Calculated area ratios for {len(all_area_ratios)} instances across all masks.")

In [None]:
# Histogram (50 bins, with a KDE curve) of the per-instance area ratios.
plt.figure(figsize=(10, 4))
sns.histplot(all_area_ratios, bins=50, kde=True)
plt.title("Distribution of Segmented Area Ratios")
plt.xlabel("Area Ratio (Segmented Area / Total Image Area)")
plt.ylabel("Frequency")
plt.grid(axis='y', alpha=0.75)
plt.show()

# Summary statistics, each printed with four decimal places.
ratio_values = np.asarray(all_area_ratios)
print("\nBasic statistics for area ratios:")
print(f"Mean: {np.mean(ratio_values):.4f}")
print(f"Median: {np.median(ratio_values):.4f}")
print(f"Standard Deviation: {np.std(ratio_values):.4f}")
print(f"Min: {np.min(ratio_values):.4f}")
print(f"Max: {np.max(ratio_values):.4f}")