Published on October 23, 2025. By Prata, Marília (mpwolke)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from matplotlib.gridspec import GridSpec

#Two lines Required to Plot Plotly
import plotly.io as pio
pio.renderers.default = 'iframe'

import plotly.graph_objs as go
import plotly.offline as py
import plotly.express as px

#Ignore warnings
import warnings
warnings.filterwarnings('ignore')

from glob import glob
from tqdm.notebook import tqdm
import os
import shutil

import tensorflow as tf

import cv2
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## CMF (Copy Move Forgery) Kaggle Birds

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ2v_-Q4tpBmlU4W6YcLAHZ3hLmYWe9mYgSVQ&s)

Image source: https://jivp-eurasipjournals.springeropen.com/articles/10.1186/s13640-019-0469-9

## Competition Citation

@misc{recodai-luc-scientific-image-forgery-detection,
    author = {João Phillipe Cardenuto and Daniel Moreira and Anderson Rocha and Sohier Dane and Addison Howard and Ashley Oldacre},
    
    title = {Recod.ai/LUC - Scientific Image Forgery Detection},
    year = {2025},
    
    howpublished = {\url{https://kaggle.com/competitions/recodai-luc-scientific-image-forgery-detection}},
    note = {Kaggle}
}

## About Competition

"Scientific images are central to published research, but not all of them are honest. Help protect science from fraudulent image manipulation by building models that can detect and segment copy-move forgeries in biomedical images."

https://www.kaggle.com/competitions/recodai-luc-scientific-image-forgery-detection

### Copy-move forgery detection technique

 A robust copy-move forgery detection technique based on discrete cosine transform and cellular automata

Authors: Gulnawaz Gani, Fasel Qadir

"**Copy Move Forgery (CMF)** is a type of digital image forgery in which an image region is copied and pasted to another location within the same image with malicious intent to misrepresent its meaning. To prevent misinterpretation of an image content, several **Copy Move Forgery Detection (CMFD)** methods have been proposed in the past. However, the existing methods show limited robustness on images altered with post-processing attacks such as noise addition, compression, blurring etc."

"In this paper, the authors proposed a robust method for detecting copy-move forgeries under different post-processing attacks. They used **Discrete Cosine Transform (DCT)** to extract features from each block. Next, Cellular Automata is employed to construct feature vectors based on the sign information of the DCT coefficients."

"Finally, feature vectors are matched using the kd-tree based nearest-neighbor searching method to find the duplicated areas in the image. Experimental results show that the proposed method performs exceptionally well relative to the other state-of-the-art methods from the literature even when an image is heavily affected by the post-processing attacks, in particular, JPEG compression and additive white Gaussian noise."

"Furthermore, experiments confirm the robustness of the proposed method against the range of combined attacks."

https://www.sciencedirect.com/science/article/abs/pii/S2214212619307343

## Load sample_submission

In [None]:
# Read the competition's sample submission file into a DataFrame.
sub = pd.read_csv('/kaggle/input/recodai-luc-scientific-image-forgery-detection/sample_submission.csv')
# Last expression of the cell: shows the final rows of the frame.
sub.tail()

## Test images (only one image)

In [None]:
from PIL import Image

# Folder with the competition's test images (path is relative to the working directory).
imgs_dir = '../input/recodai-luc-scientific-image-forgery-detection/test_images/'
# Open test image 45.png; as the cell's last expression the PIL image is rendered inline.
Image.open(imgs_dir + '45.png')

In [None]:
import cv2
import glob

## Authentic train images 

Each image is resized to 512×512 pixels before it is displayed.

In [None]:
def plotImages(tools, directory, size=(512, 512)):
    """Display up to 25 images matched by a glob pattern on a 5x5 grid.

    Args:
        tools: Caption printed before the grid is drawn.
        directory: Glob pattern handed to ``glob.glob`` to find the files.
        size: ``(width, height)`` each image is resized to before display.
            Defaults to 512x512, the size this cell originally hard-coded.

    Entries that OpenCV cannot decode (``cv2.imread`` returns ``None``, e.g.
    for a sub-directory matched by the pattern) are skipped instead of
    crashing inside ``cv2.resize``.
    """
    print(tools)
    image_files = glob.glob(directory)
    plt.rcParams['figure.figsize'] = (8, 8)
    plt.subplots_adjust(wspace=0, hspace=0)
    tile = 0  # number of grid cells filled so far
    for image_file in image_files[:25]:  # a 5x5 grid holds at most 25 tiles
        im = cv2.imread(image_file)
        if im is None:
            continue
        im = cv2.resize(im, size)
        tile += 1
        plt.subplot(5, 5, tile)
        # OpenCV decodes to BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
        plt.axis('off')

plotImages("Authentic train images","../input/recodai-luc-scientific-image-forgery-detection/train_images/authentic/**")

## Forged train_images

In [None]:
def plotImages(tools, directory, size=(256, 256)):
    """Display up to 25 images matched by a glob pattern on a 5x5 grid.

    Args:
        tools: Caption printed before the grid is drawn.
        directory: Glob pattern handed to ``glob.glob`` to find the files.
        size: ``(width, height)`` each image is resized to before display.
            Defaults to 256x256, the size this cell originally hard-coded.

    Entries that OpenCV cannot decode (``cv2.imread`` returns ``None``, e.g.
    for a sub-directory matched by the pattern) are skipped instead of
    crashing inside ``cv2.resize``.
    """
    print(tools)
    image_files = glob.glob(directory)
    plt.rcParams['figure.figsize'] = (8, 8)
    plt.subplots_adjust(wspace=0, hspace=0)
    tile = 0  # number of grid cells filled so far
    for image_file in image_files[:25]:  # a 5x5 grid holds at most 25 tiles
        im = cv2.imread(image_file)
        if im is None:
            continue
        im = cv2.resize(im, size)
        tile += 1
        plt.subplot(5, 5, tile)
        # OpenCV decodes to BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB))
        plt.axis('off')

plotImages("Forged train images","../input/recodai-luc-scientific-image-forgery-detection/train_images/forged/**")

In [None]:
# Directories of the forged training images and of the training masks.
# Both end with "/" because load_img builds paths by plain string concatenation.
train_img_dir = "/kaggle/input/recodai-luc-scientific-image-forgery-detection/train_images/forged/"
train_mask_dir = "/kaggle/input/recodai-luc-scientific-image-forgery-detection/train_masks/"

# File names in each directory, sorted independently of each other.
# NOTE(review): pairing images and masks by list index assumes both listings
# sort into the same order and have the same length — TODO confirm.
train_img_list = sorted(os.listdir(train_img_dir))
train_mask_list =  sorted(os.listdir(train_mask_dir))

In [None]:
# Number of image/mask pairs per batch; passed to imageLoader below.
# Note: this name is reassigned to 16 in a later cell.
batch_size = 2

In [None]:
def load_img(img_dir, img_list):
    """Load the ``.npy`` and ``.png`` files named in ``img_list`` from ``img_dir``.

    The loader is chosen from the file extension: ``.npy`` files are read with
    ``np.load`` and ``.png`` files are decoded with OpenCV and converted from
    BGR to RGB. Files with any other extension are skipped.

    The original condition ``(ext == 'npy', 'png')`` was a non-empty tuple and
    therefore always true, so PNG files were handed to ``np.load``, which
    fails with "Cannot load file containing pickled data".

    Args:
        img_dir: Directory containing the files (with or without a trailing
            path separator).
        img_list: Iterable of file names inside ``img_dir``.

    Returns:
        A NumPy array with one entry per loaded file. When all loaded arrays
        share a shape they are stacked along a new first axis; otherwise a
        1-D ``object`` array holding the individual arrays is returned.

    Raises:
        ValueError: If a ``.png`` file cannot be decoded.
    """
    images = []
    for image_name in img_list:
        # splitext copes with names containing several dots or none at all.
        extension = os.path.splitext(image_name)[1].lower()
        image_path = os.path.join(img_dir, image_name)

        if extension == '.npy':
            image = np.load(image_path)
        elif extension == '.png':
            image = cv2.imread(image_path)
            if image is None:
                # cv2.imread signals failure by returning None, not by raising.
                raise ValueError(f"Could not read image file: {image_path}")
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        else:
            continue
        images.append(image)

    if len({image.shape for image in images}) > 1:
        # Arrays of different shapes cannot be stacked; keep them side by
        # side in an object array instead of failing inside np.array.
        batch = np.empty(len(images), dtype=object)
        for index, image in enumerate(images):
            batch[index] = image
        return batch

    return np.array(images)

In [None]:
#I forget who made that since I tried many Notebooks.

def imageLoader(img_dir , img_list , mask_dir , mask_list , batch_size):
    """Endlessly yield ``(images, masks)`` batches loaded with ``load_img``.

    The file lists are traversed in order, ``batch_size`` names at a time
    (the final batch of a pass may be shorter), and the traversal restarts
    from the beginning once the lists are exhausted.

    Args:
        img_dir: Directory of the image files.
        img_list: Image file names; its length defines one full pass.
        mask_dir: Directory of the mask files.
        mask_list: Mask file names, sliced with the same indices as
            ``img_list``.
        batch_size: Number of file names per batch; must be positive.

    Yields:
        Tuple ``(X, Y)`` with the results of ``load_img`` for the image and
        mask slices of the current batch.

    Raises:
        ValueError: On the first ``next()`` call if ``img_list`` is empty or
            ``batch_size`` is not positive. Previously an empty list made the
            generator spin forever without ever yielding.
    """
    total = len(img_list)
    if total == 0:
        raise ValueError("img_list is empty; there is nothing to load")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    while True:
        for batch_start in range(0, total, batch_size):
            batch_end = min(batch_start + batch_size, total)

            X = load_img(img_dir , img_list[batch_start:batch_end]) # load image
            Y = load_img(mask_dir , mask_list[batch_start:batch_end]) # load mask

            yield (X, Y)

In [None]:
# Create the batch generator over the forged training images and the training masks.
# Nothing is read from disk until the generator is advanced.
train_img_datagen = imageLoader(train_img_dir, train_img_list,  train_mask_dir, train_mask_list, batch_size)

### Note: `np.load` raises "Cannot load file containing pickled data when allow_pickle=False" when it is given a file that is not a NumPy array (for example a PNG)

In [None]:
# Pull the first (images, masks) batch out of the generator.
img, msk = next(train_img_datagen)

In [None]:
# Shape of the image batch returned by the generator.
img.shape

In [None]:
# Shape of the mask batch returned by the generator.
msk.shape

In [None]:
import random

# Pick a random index into the first axis of the image batch and take that sample.
img_num = random.randint(0,img.shape[0]-1)
test_img=img[img_num]
# The matching mask lookup is currently disabled.
#test_mask=msk[img_num]
#test_mask=np.argmax(test_mask, axis=3)

In [None]:
!pip install imgaug

In [None]:
# By Jocelyn Dumlao
# the numpy bool deprecation warning in imgaug
import numpy as np
# Re-create the ``np.bool`` name as an alias of the built-in ``bool``.
# NOTE(review): presumably needed because imgaug still refers to ``np.bool``,
# which recent NumPy releases no longer provide — confirm against the installed versions.
np.bool = bool

# Define image dimensions
# Used as the default target width/height of preprocess_image.
IMG_WIDTH = 256
IMG_HEIGHT = 256

In [None]:
#By Jocelyn Dumlao https://www.kaggle.com/code/jocelyndumlao/crosswalk-segmentation-u-net-model/notebook

def preprocess_image(image_path, mask_path, img_width=IMG_WIDTH, img_height=IMG_HEIGHT):
    """Load an image and its mask, resize both and scale them to [0, 1].

    Args:
        image_path: Path of an image file readable by ``cv2.imread``.
        mask_path: Path of a mask stored as an image file; it is read as a
            single-channel grayscale image.
        img_width: Target width in pixels.
        img_height: Target height in pixels.

    Returns:
        Tuple ``(img, mask)``: ``img`` is an RGB float array of shape
        ``(img_height, img_width, 3)`` and ``mask`` a float array of shape
        ``(img_height, img_width, 1)``, both divided by 255.

    Raises:
        ValueError: If either file is missing or cannot be decoded as an
            image. ``cv2.imread`` reports that by returning ``None``, which
            used to surface later as an opaque OpenCV assertion error.
    """
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Image is missing or not a decodable image file: {image_path}")
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        raise ValueError(f"Mask is missing or not a decodable image file: {mask_path}")

    # OpenCV decodes colour images as BGR.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # cv2.resize takes the target size as (width, height).
    target_size = (img_width, img_height)
    img = cv2.resize(img, target_size)
    # Nearest-neighbour keeps the mask values discrete.
    mask = cv2.resize(mask, target_size, interpolation=cv2.INTER_NEAREST)

    # Scale 8-bit pixel values into [0, 1].
    img = img / 255.0
    mask = mask / 255.0

    # Add a trailing channel axis: (img_height, img_width) -> (img_height, img_width, 1).
    mask = np.expand_dims(mask, axis=-1)
    return img, mask

In [None]:
#By Jocelyn Dumlao https://www.kaggle.com/code/jocelyndumlao/crosswalk-segmentation-u-net-model/notebook

# data augmentation using imgaug
try:
    import imgaug.augmenters as iaa
    from imgaug.augmentables.segmaps import SegmentationMapsOnImage
    HAS_IMGAUG = True  # Flag to indicate imgaug is installed
except ImportError:
    print("imgaug is not installed. Data augmentation will be limited.")
    HAS_IMGAUG = False

def augment_data(image, mask):
    """Apply one random augmentation to an image and the same geometry to its mask.

    The image and mask are passed through the pipeline in a single call, with
    the mask wrapped as an imgaug segmentation map. imgaug then applies the
    identical flip/affine transform to both and leaves the mask untouched by
    the pixel-value augmenters (blur, noise, brightness, contrast, grayscale).
    Previously the two were augmented in separate calls, each drawing its own
    random parameters, so the mask no longer lined up with the image.

    Args:
        image: Float RGB image with values in [0, 1], shape ``(H, W, 3)``.
        mask: Float mask with values in [0, 1], shape ``(H, W, 1)`` or
            ``(H, W)``. It is binarised at 0.5 before augmentation.

    Returns:
        Tuple ``(augmented_image, augmented_mask)``: a float image in [0, 1]
        and a float mask of shape ``(H, W, 1)`` containing only 0.0 and 1.0.
        When imgaug is not installed the inputs are returned unchanged.
    """
    if not HAS_IMGAUG:
        # If imgaug is not installed, return the original image and mask
        return image, mask

    # Define a sequence of augmentations
    seq = iaa.Sequential([
        iaa.Fliplr(0.5), # horizontal flips
        iaa.Sometimes(0.5,
            iaa.GaussianBlur(sigma=(0, 0.5))
        ),
        iaa.Sometimes(0.5,
            iaa.Affine(
                scale={"x": (0.8, 1.2), "y": (0.8, 1.2)},
                translate_percent={"x": (-0.1, 0.1), "y": (-0.1, 0.1)},
                rotate=(-25, 25),
                shear=(-8, 8)
            )
        ),
        iaa.Sometimes(0.5,
            iaa.SomeOf((0, 5), [
                iaa.AdditiveGaussianNoise(scale=0.05*255),
                iaa.Add((-10, 10)),
                iaa.Multiply((0.5, 1.5)),
                iaa.ContrastNormalization((0.5, 2.0)),
                iaa.Grayscale(alpha=(0.0, 1.0))
            ], random_order=True)
        )
    ])

    # imgaug works on uint8 images: scale back to 0-255.
    image_u8 = np.uint8(image * 255)

    # Segmentation maps are 2-D integer label arrays: drop the channel axis
    # and binarise the [0, 1] mask.
    mask_2d = mask[..., 0] if mask.ndim == 3 else mask
    segmap = SegmentationMapsOnImage(
        (mask_2d > 0.5).astype(np.int32), shape=image_u8.shape
    )

    # One call, so both inputs share the same sampled transformation.
    augmented_image, augmented_segmap = seq(image=image_u8, segmentation_maps=segmap)

    # Convert back to the float formats the caller handed in.
    augmented_image = augmented_image / 255.0
    augmented_mask = augmented_segmap.get_arr().astype(np.float64)
    augmented_mask = np.expand_dims(augmented_mask, axis=-1)

    return augmented_image, augmented_mask

In [None]:
#By Jocelyn Dumlao https://www.kaggle.com/code/jocelyndumlao/crosswalk-segmentation-u-net-model/notebook

# Create a function to generate the dataset
def data_generator(image_paths, mask_paths, batch_size=32, augment=True):
    """
    Infinite generator of (images, masks) float32 batches.

    At the start of every pass the image/mask path pairs are shuffled
    together. Each pair is run through ``preprocess_image`` and, when
    ``augment`` is true, through ``augment_data`` before being batched.
    """
    def _load_pair(image_file, mask_file):
        # Preprocess one sample and optionally augment it.
        sample, target = preprocess_image(image_file, mask_file)
        if augment:
            sample, target = augment_data(sample, target)
        return sample, target

    num_samples = len(image_paths)
    while True:
        # Reshuffle the pairs in lock-step, then split them apart again.
        pairs = list(zip(image_paths, mask_paths))
        random.shuffle(pairs)
        image_paths, mask_paths = zip(*pairs)

        for start in range(0, num_samples, batch_size):
            stop = start + batch_size
            loaded = [
                _load_pair(image_file, mask_file)
                for image_file, mask_file in zip(image_paths[start:stop], mask_paths[start:stop])
            ]

            # Stack the samples; float32 for both images and masks.
            batch_images = np.array([sample for sample, _ in loaded], dtype=np.float32)
            batch_masks = np.array([target for _, target in loaded], dtype=np.float32)

            yield batch_images, batch_masks

In [None]:
#By Jocelyn Dumlao https://www.kaggle.com/code/jocelyndumlao/crosswalk-segmentation-u-net-model/notebook

# Define the base path to the dataset
base_path = '/kaggle/input/recodai-luc-scientific-image-forgery-detection/'

# The image categories; each name is used below as a sub-folder of train_images/
conditions = ['authentic', 'forged']

In [None]:
#By Jocelyn Dumlao https://www.kaggle.com/code/jocelyndumlao/crosswalk-segmentation-u-net-model/notebook

# Sorted PNG paths of the training images, keyed by condition.
image_paths = {
    condition: sorted(glob.glob(os.path.join(base_path, 'train_images', condition, '*.png')))
    for condition in conditions
}

# The mask folder is not split by condition, so it is listed once and kept as
# a single sorted list (it used to start out as a dict and was then replaced
# by this same list on every loop iteration).
mask_paths = sorted(glob.glob(os.path.join(base_path,'train_masks', '*.npy')))

# Verify the number of images for each condition; the mask count printed is
# the total for the whole mask folder, not a per-condition figure.
for condition in conditions:
    print(f"Condition: {condition}")
    print(f"  Number of images: {len(image_paths[condition])}")
    print(f"  Number of masks: {len(mask_paths)}")

## Last attempt to display the npy masks.

In [None]:
# Folder of forged training images and folder of the .npy training masks,
# both relative to the notebook's working directory.
input_dir = "../input/recodai-luc-scientific-image-forgery-detection/train_images/forged/"
mask_dir = "../input/recodai-luc-scientific-image-forgery-detection/train_masks"
# Settings carried over from the referenced U-Net notebook; currently disabled.
#img_size = (160, 160)
#num_classes = 3
# Note: this rebinds the batch_size that an earlier cell set to 2.
batch_size = 16

In [None]:
# By Ammar Alhaj Ali https://www.kaggle.com/ammarnassanalhajali/image-segmentation-with-a-u-net-and-keras

# Sorted full paths of the forged training images (.png).
input_img_paths = sorted(
    os.path.join(input_dir, fname)
    for fname in os.listdir(input_dir)
    if fname.endswith(".png")
)
# Sorted full paths of the training masks (.npy), ignoring hidden files.
# The paths are joined onto mask_dir, the directory that is being listed;
# the previous code joined onto `target_dir`, a name that is never defined.
mask_img_paths = sorted(
    os.path.join(mask_dir, fname)
    for fname in os.listdir(mask_dir)
    if fname.endswith(".npy") and not fname.startswith(".")
)

print("Number of samples:", len(input_img_paths))

In [None]:
#By Ammar Alhaj Ali https://www.kaggle.com/ammarnassanalhajali/image-segmentation-with-a-u-net-and-keras

import matplotlib.image as mpimg

#Display sample of Image Dataset
i = 4
figure, ax = plt.subplots(nrows=1,ncols=2,figsize=(8,8))
image_ax, mask_ax = ax.ravel()

image_ax.imshow(mpimg.imread(input_img_paths[i]))
image_ax.set_title("Original image")
image_ax.set_axis_off()

# NOTE(review): the mask file is hard-coded and is not derived from
# input_img_paths[i]; confirm that it belongs to the image shown on the left.
image_mask = np.load('../input/recodai-luc-scientific-image-forgery-detection/train_masks/10070.npy')

# imshow needs a 2-D array (or an RGB/RGBA one), so drop length-1 axes first.
mask_2d = np.squeeze(image_mask)
if mask_2d.ndim == 3:
    # NOTE(review): assumes the shortest axis enumerates separate mask layers
    # and merges them with a max — confirm against the mask format.
    mask_2d = mask_2d.max(axis=int(np.argmin(mask_2d.shape)))

# The mask was loaded before but never drawn, which left this panel empty.
mask_ax.imshow(mask_2d, cmap='gray')
mask_ax.set_title("Mask")
mask_ax.set_axis_off()
plt.tight_layout()

## After 3 h 18 min, not a single npy mask had been displayed.

## Acknowledgements

Jocelyn Dumlao https://www.kaggle.com/code/jocelyndumlao/crosswalk-segmentation-u-net-model/notebook

Ammar Alhaj Ali https://www.kaggle.com/ammarnassanalhajali/image-segmentation-with-a-u-net-and-keras