

* **SIFD:** **S**cientific **I**mage **F**orgery **D**etection (The problem domain).
* **U-NET:** The **Deep Learning Architecture** used for pixel-level segmentation.
* **Tversky:** The **Specialized Loss Function** used to combat extreme class imbalance.
* **CTF:** **Continuous Fine-Tuning** (The optimization strategy used for iterative error correction).
* **Final:** Signifies that this file contains the **best, optimized weights** from the completed pipeline.

In [None]:
!nvidia-smi

In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf

# Step 1: expose no CUDA device to this process ("-1" hides every GPU).
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Step 2: additionally ask TensorFlow to hide any GPU it still reports.
# Both calls sit in one try-block so that a failure in either only produces
# a warning instead of aborting the notebook.
try:
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        # An empty visible-device list leaves TensorFlow with the CPU only.
        tf.config.set_visible_devices([], 'GPU')
        print("âœ… Successfully disabled physical GPUs for submission.")
except Exception as err:
    # Best effort: report the problem and carry on.
    print(f"Warning: Could not configure GPU visibility. Error: {err}")

In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
from warnings import filterwarnings

# Silence every Python warning from here on (this also hides deprecation notices).
filterwarnings('ignore') # Suppress warnings

# --- CONFIGURATION (from the original notebook) ---
# Side length, in pixels, that images are resized to for the ELA / RGB statistics below.
TARGET_SIZE = 256
# Directory tree walked for training images; only folders whose path contains "forged" are used.
TRAIN_ROOT = "/kaggle/input/recodai-luc-scientific-image-forgery-detection/train_images"
# Directory expected to hold one "<case_id>.npy" mask per forged training image.
MASK_ROOT = "/kaggle/input/recodai-luc-scientific-image-forgery-detection/train_masks"


def compute_ela(img_path, quality=95, scale=10):
    """Return an Error Level Analysis (ELA) map for the image at *img_path*.

    The image is resized to TARGET_SIZE x TARGET_SIZE, re-compressed as JPEG
    at the given *quality*, and the absolute per-pixel difference between the
    resized image and its re-compressed version is averaged over the colour
    channels and multiplied by *scale*.

    The JPEG round trip is done in memory (cv2.imencode / cv2.imdecode), so
    no temporary file is written and calls for files sharing a basename, or
    calls running concurrently, cannot interfere with each other.

    Args:
        img_path: Path to an image readable by cv2.imread, or to a .npy
            array (a 3-D array is treated as RGB, a 2-D array as grayscale).
        quality: JPEG quality used for the re-compression.
        scale: Multiplier applied to the mean absolute error.

    Returns:
        A float32 array of shape (TARGET_SIZE, TARGET_SIZE). An all-zero map
        is returned when the image cannot be loaded or re-compressed.
    """
    def _blank():
        return np.zeros((TARGET_SIZE, TARGET_SIZE), dtype=np.float32)

    img = cv2.imread(img_path)
    if img is None or img.size == 0:
        # cv2.imread could not decode the file: fall back to a NumPy array.
        try:
            img_data = np.load(img_path)
            if img_data.ndim == 3:
                img = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
            elif img_data.ndim == 2:
                img = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
        except Exception:
            # Best effort: an unreadable input yields a neutral feature map.
            return _blank()

    if img is None or img.size == 0:
        return _blank()

    img_resized = cv2.resize(img, (TARGET_SIZE, TARGET_SIZE))

    # In-memory JPEG round trip instead of writing/reading a file in /tmp.
    encoded_ok, jpeg_buffer = cv2.imencode(
        '.jpg', img_resized, [cv2.IMWRITE_JPEG_QUALITY, quality]
    )
    if not encoded_ok:
        return _blank()
    compressed_img = cv2.imdecode(jpeg_buffer, cv2.IMREAD_COLOR)
    if compressed_img is None:
        return _blank()

    error = np.abs(img_resized.astype(np.float32) - compressed_img.astype(np.float32))
    # The map is already TARGET_SIZE x TARGET_SIZE, so no further resize is needed.
    return (np.mean(error, axis=2) * scale).astype(np.float32)

# Collect every forged training image that has a matching "<case_id>.npy" mask.
valid_extensions = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.npy')
data_list = []
for root, _, files in os.walk(TRAIN_ROOT):
    # Only directories whose path mentions "forged" contribute samples.
    if 'forged' not in root.lower():
        continue
    for filename in files:
        if not filename.lower().endswith(valid_extensions):
            continue
        # The mask file is named after the image stem.
        stem, _ext = os.path.splitext(filename)
        candidate_mask = os.path.join(MASK_ROOT, f"{stem}.npy")
        if not os.path.exists(candidate_mask):
            continue
        data_list.append({
            'img_path': os.path.join(root, filename),
            'mask_path': candidate_mask,
        })
eda_df = pd.DataFrame(data_list)

print("--- Starting Advanced EDA (Imbalance & Feature Check) ---")

if eda_df.empty:
    print("ðŸ›‘ EDA Skipped: Data frame is empty.")
else:
    total_pixels = 0
    forgery_pixels = 0
    # Per-image arrays are collected and concatenated once at the end; this
    # avoids turning millions of pixels into individual Python objects.
    ela_chunks, rgb_chunks = [], []
    skipped_samples = 0

    # Process only the first 50 images to speed up ELA computation for EDA
    sample_df = eda_df.head(50)
    for _, row in tqdm(sample_df.iterrows(), total=len(sample_df), desc="Processing samples"):
        try:
            # 1. Image and Mask Load
            # Check the read result BEFORE converting: cv2.cvtColor raises on
            # None, so a check after the conversion can never trigger.
            bgr_image = cv2.imread(row['img_path'])
            if bgr_image is None or bgr_image.size == 0:
                skipped_samples += 1
                continue
            rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)

            mask = np.load(row['mask_path'])
            # NOTE(review): this keeps index 0 of the LAST axis, i.e. it assumes
            # an (H, W, C) layout. If masks are stored as (instances, H, W) this
            # picks a single column instead of a plane - confirm the mask layout.
            if mask.ndim > 2: mask = mask[:, :, 0]

            # 2. Imbalance Check (Use original sizes for best estimate)
            h, w = rgb_image.shape[:2]
            total_pixels += h * w
            forgery_pixels += np.sum(mask > 0)

            # 3. ELA Feature Check (Use 256x256 resized data)
            ela_chunks.append(compute_ela(row['img_path']).ravel())

            # RGB feature check (resize/normalize similar to training)
            rgb_resized = cv2.resize(rgb_image, (TARGET_SIZE, TARGET_SIZE)) / 255.0
            rgb_chunks.append(rgb_resized.mean(axis=2).ravel())

        except Exception as e:
            # Best effort per sample, but do not hide the failure.
            skipped_samples += 1
            tqdm.write(f"Warning: Could not process {row['img_path']}: {e}")
            continue

    if skipped_samples:
        print(f"Skipped {skipped_samples} of {len(sample_df)} sample(s) that could not be processed.")

    # --- Analysis 1: Imbalance Ratio ---
    if total_pixels > 0:
        imbalance_ratio = (forgery_pixels / total_pixels) * 100
        print(f"\n--- Imbalance Ratio (Forged Pixels) ---")
        print(f"Total Pixels Sampled: {total_pixels:,}")
        print(f"Forged Pixels Sampled: {forgery_pixels:,}")
        print(f"Forgery Imbalance Ratio: **{imbalance_ratio:.2f}%** (Positive Class)")

    # --- Analysis 2: ELA Feature Distribution vs. RGB ---
    if ela_chunks:
        ela_values = np.concatenate(ela_chunks)
        # rgb_chunks can be empty if every RGB resize failed after its ELA step.
        rgb_means = np.concatenate(rgb_chunks) if rgb_chunks else np.array([])

        print(f"\n--- ELA Feature Distribution (Scaled by 10) ---")
        print(f"ELA Feature Mean: {np.mean(ela_values):.4f}")
        print(f"ELA Feature Std Dev: {np.std(ela_values):.4f}")
        print(f"RGB Mean (Normalized): {np.mean(rgb_means):.4f}")

        # The histogram shows whether ELA values are predominantly zero or clustered.
        plt.figure(figsize=(12, 5))
        plt.hist(ela_values, bins=50, alpha=0.6, label='ELA Feature (Scaled)', color='red')
        plt.title('Distribution of ELA Feature Values')
        plt.xlabel('ELA Value (0 to ~2550)')
        plt.ylabel('Frequency')
        plt.legend()
        plt.show()

print("\n--- Advanced EDA Complete ---")

In [None]:
import matplotlib.pyplot as plt
import cv2
import os

# Location of the single test image to preview.
TEST_IMAGE_PATH = "/kaggle/input/recodai-luc-scientific-image-forgery-detection/test_images/45.png"

print(f"Attempting to load image: {TEST_IMAGE_PATH}")

if os.path.exists(TEST_IMAGE_PATH):
    # OpenCV returns the pixels in BGR channel order (or None on failure).
    img = cv2.imread(TEST_IMAGE_PATH)

    if img is not None:
        # Matplotlib expects RGB, so swap the channel order before plotting.
        img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        rows, cols = img.shape[0], img.shape[1]

        plt.figure(figsize=(10, 8))
        plt.imshow(img_rgb)
        plt.title(f"Test Image 45 (Dimensions: {rows}x{cols})")
        plt.axis('off')  # no axes: cleaner image view
        plt.show()
    else:
        print("ðŸ›‘ ERROR: Could not read the image file.")
else:
    print("ðŸ›‘ ERROR: The file path was not found. Please ensure the Kaggle competition data is mounted correctly.")

In [None]:
import numpy as np
import pandas as pd
import os
import csv
import warnings
import gc
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K
from tensorflow.keras.utils import get_custom_objects
import cv2
import logging
import sys
from tqdm.auto import tqdm

# --- FINAL SUBMISSION CONFIGURATION ---
# Side length (pixels) of the square model input; images are resized to this before prediction.
IMAGE_SIZE = 256
# 3 RGB channels + 3 copies of the blur-residual feature, concatenated at inference time.
MODEL_INPUT_CHANNELS = 6
# Assuming the CTF7 file is accessible via this path
FINAL_MODEL_PATH = "/kaggle/input/rluc-sfic-st/model_ctf7_output_Final_Balance.weights.h5"
# Name of the CSV written by the final execution block.
OUTPUT_FILENAME = "submission.csv"
# Sigmoid outputs strictly above this value are labelled as forged pixels.
FIXED_THRESHOLD = 0.35
# Connected components smaller than this many pixels (counted at IMAGE_SIZE resolution) are discarded.
MIN_FORGERY_AREA = 64
# Tversky weights: beta multiplies the false-negative term, alpha the false-positive term.
Tversky_BETA = 0.55  # default `beta` of tversky_loss
alpha = 0.45  # default `alpha` of tversky_loss

# Kaggle Paths
KAGGLEHUB_PATH = "/kaggle/input/recodai-luc-scientific-image-forgery-detection"
TEST_IMAGE_ROOT = os.path.join(KAGGLEHUB_PATH, "test_images")
# Used both to list the expected case ids and as the row template for the final submission.
SAMPLE_SUBMISSION_FILE = os.path.join(KAGGLEHUB_PATH, "sample_submission.csv")

# --- CORE FUNCTIONS ---
def tversky_loss(y_true, y_pred, alpha=alpha, beta=Tversky_BETA, smooth=1e-7):
    """Return 1 minus the Tversky index of *y_pred* against *y_true*.

    Both tensors are flattened, so the index is computed over every element
    passed in at once. *alpha* weights the false-positive mass, *beta* the
    false-negative mass, and *smooth* is added to numerator and denominator
    to keep the ratio defined when both are zero.
    """
    truth = K.flatten(y_true)
    prediction = K.flatten(y_pred)

    # Soft confusion-matrix terms (predictions are used as given, not thresholded).
    true_pos = K.sum(truth * prediction)
    false_pos = K.sum((1 - truth) * prediction)
    false_neg = K.sum(truth * (1 - prediction))

    numerator = true_pos + smooth
    denominator = true_pos + alpha * false_pos + beta * false_neg + smooth
    return 1 - numerator / denominator

def get_forgery_features_from_data(img_grayscale_data):
    """Build a 3-channel high-frequency residual feature from a grayscale image.

    The input is cast to uint8, blurred with a 5x5 Gaussian kernel and
    subtracted from itself; the residual is resized to IMAGE_SIZE x IMAGE_SIZE,
    min-max normalised and replicated into three identical channels.

    Returns:
        A float32 array of shape (IMAGE_SIZE, IMAGE_SIZE, 3).
    """
    gray = img_grayscale_data.astype(np.uint8)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)

    # What the blur removed: original minus its low-pass version.
    residual = gray.astype(np.float32) - blurred.astype(np.float32)
    residual = cv2.resize(residual, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LINEAR)

    # Min-max normalisation; the epsilon guards against a constant residual.
    lowest, highest = residual.min(), residual.max()
    residual = (residual - lowest) / (highest - lowest + 1e-7)

    # Three copies of the same plane along a new trailing channel axis.
    return np.repeat(residual[..., np.newaxis], 3, axis=-1).astype(np.float32)

def build_single_stream_unet(input_shape=(IMAGE_SIZE, IMAGE_SIZE, MODEL_INPUT_CHANNELS)):
    """Build the single-input U-Net that outputs a per-pixel sigmoid map.

    Architecture: two encoder levels (32 and 64 filters, each followed by 2x2
    max-pooling), a 128-filter bridge, and two decoder levels that upsample
    with transposed convolutions and concatenate the matching encoder output
    as a skip connection. The final 1x1 convolution yields one sigmoid channel
    at the input resolution.

    Args:
        input_shape: (height, width, channels) of the combined input tensor.

    Returns:
        An uncompiled Keras Model mapping the combined input to the 1-channel map.

    NOTE(review): layers rely on auto-generated names, so weight files saved
    from this definition presumably depend on the layer creation order -
    confirm before reordering or refactoring the statements below.
    """
    input_combined = layers.Input(shape=input_shape, name='combined_input')
    # Encoder level 1 (full resolution, 32 filters).
    conv1 = layers.Conv2D(32, 3, activation='relu', padding='same')(input_combined); conv1 = layers.Conv2D(32, 3, activation='relu', padding='same')(conv1)
    pool1 = layers.MaxPooling2D((2, 2))(conv1)
    # Encoder level 2 (1/2 resolution, 64 filters).
    conv2 = layers.Conv2D(64, 3, activation='relu', padding='same')(pool1); conv2 = layers.Conv2D(64, 3, activation='relu', padding='same')(conv2)
    pool2 = layers.MaxPooling2D((2, 2))(conv2)
    # Bridge (1/4 resolution, 128 filters).
    bridge = layers.Conv2D(128, 3, activation='relu', padding='same')(pool2); bridge = layers.Conv2D(128, 3, activation='relu', padding='same')(bridge)
    # Decoder level 2: upsample x2 and fuse with the encoder level 2 output.
    up3 = layers.Conv2DTranspose(64, (2, 2), strides=(2, 2), padding='same')(bridge)
    merge3 = layers.concatenate([up3, conv2]); conv3 = layers.Conv2D(64, 3, activation='relu', padding='same')(merge3); conv3 = layers.Conv2D(64, 3, activation='relu', padding='same')(conv3)
    # Decoder level 1: upsample x2 and fuse with the encoder level 1 output.
    up4 = layers.Conv2DTranspose(32, (2, 2), strides=(2, 2), padding='same')(conv3)
    merge4 = layers.concatenate([up4, conv1]); conv4 = layers.Conv2D(32, 3, activation='relu', padding='same')(merge4); conv4 = layers.Conv2D(32, 3, activation='relu', padding='same')(conv4)
    # 1x1 convolution + sigmoid: one probability-like value per pixel.
    output = layers.Conv2D(1, 1, activation='sigmoid', padding='same')(conv4)
    model = models.Model(inputs=input_combined, outputs=output)
    return model

# Register the loss under the name 'tversky_loss' in Keras' global custom-object registry.
tf.keras.utils.get_custom_objects().update({'tversky_loss': tversky_loss})

def rle_encode(mask):
    """Run-length encode a 2-D mask as space-separated "start length" pairs.

    Pixels are scanned in column-major order (the mask is transposed before
    flattening) and starts are 1-indexed. Every value greater than zero counts
    as foreground, so masks holding 255 or label ids are encoded the same way
    as 0/1 masks instead of producing unpaired run boundaries.

    Args:
        mask: Array-like 2-D mask; values > 0 are foreground.

    Returns:
        "authentic" when the mask has no foreground pixel, otherwise a string
        such as "2 3 10 1" (start, length, start, length, ...).
    """
    foreground = np.asarray(mask) > 0
    if not foreground.any():
        return "authentic"

    # Pad with background on both ends so every run has a start and an end.
    pixels = np.concatenate([[0], foreground.T.ravel().astype(np.int8), [0]])
    # 1-indexed positions where the value flips: even slots open a run, odd slots close it.
    boundaries = np.flatnonzero(pixels[1:] != pixels[:-1]) + 1
    starts = boundaries[::2]
    lengths = boundaries[1::2] - starts

    pairs = np.column_stack([starts, lengths]).ravel()
    return ' '.join(map(str, pairs.tolist()))

def create_test_df_robust(test_image_root, sample_submission_path):
    """Pair each case id of the sample submission with its test image file.

    Walks *test_image_root* recursively and indexes every file that has a
    supported image extension and an all-digit stem. Case ids without a
    matching file are dropped.

    Args:
        test_image_root: Directory searched for test images; may not exist.
        sample_submission_path: CSV file containing a 'case_id' column.

    Returns:
        DataFrame with string columns 'case_id' and 'img_path', in the row
        order of the sample submission, re-indexed from zero.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.npy')

    master_df = pd.read_csv(sample_submission_path)
    master_df['case_id'] = master_df['case_id'].astype(str)

    # stem -> full path; when a stem occurs more than once the last file visited wins.
    present_files = {}
    if os.path.exists(test_image_root):
        for folder, _, filenames in os.walk(test_image_root):
            for filename in filenames:
                stem = os.path.splitext(filename)[0]
                if filename.lower().endswith(image_suffixes) and stem.isdigit():
                    present_files[stem] = os.path.join(folder, filename)

    # Unmatched ids receive a sentinel that is filtered out right below.
    master_df['img_path'] = master_df['case_id'].map(present_files).fillna('MISSING_FILE')
    has_file = master_df['img_path'] != 'MISSING_FILE'
    return master_df.loc[has_file, ['case_id', 'img_path']].reset_index(drop=True)

def _load_image_bgr(img_path):
    """Return the image at *img_path* as a BGR array, or None if unreadable.

    cv2.imread is tried first. When it yields nothing, the file is loaded with
    np.load: a 3-D array is converted as RGB, a 2-D array as grayscale.
    """
    img_bgr = cv2.imread(img_path)
    if img_bgr is not None and img_bgr.size > 0:
        return img_bgr
    try:
        img_data = np.load(img_path)
        if img_data.ndim == 3:
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_RGB2BGR)
        elif img_data.ndim == 2:
            img_bgr = cv2.cvtColor(img_data, cv2.COLOR_GRAY2BGR)
    except Exception:
        # Neither an image nor a usable array.
        return None
    if img_bgr is None or img_bgr.size == 0:
        return None
    return img_bgr


def _drop_small_components(binary_mask, min_area):
    """Return a uint8 0/1 mask keeping only 4-connected components of at least *min_area* pixels."""
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(binary_mask, 4, cv2.CV_32S)
    # Lookup table indexed by label id; label 0 is the background and stays 0.
    keep = np.zeros(num_labels, dtype=np.uint8)
    keep[1:] = stats[1:, cv2.CC_STAT_AREA] >= min_area
    return keep[labels]


def run_submission_inference(unet_model, test_df, fixed_threshold, min_forgery_area):
    """Predict a forgery annotation for every row of *test_df*.

    For each image: build the 6-channel input (RGB scaled to [0, 1] plus the
    3-channel residual feature), predict on the CPU, threshold the sigmoid map,
    remove small connected components at model resolution, resize the mask
    back to the original image size with nearest-neighbour interpolation and
    run-length encode it.

    Any case that cannot be loaded or processed is reported and falls back to
    the annotation 'authentic', so one bad file never aborts the whole run.

    Args:
        unet_model: Keras model returned by build_single_stream_unet with weights loaded.
        test_df: DataFrame with 'case_id' and 'img_path' columns.
        fixed_threshold: Probability above which a pixel is labelled forged.
        min_forgery_area: Minimum component area in pixels, at IMAGE_SIZE resolution.

    Returns:
        DataFrame with columns 'case_id' (str) and 'annotation', one row per input row.
    """
    results = []
    # Inference is pinned to the CPU unconditionally.
    with tf.device('/cpu:0'):
        for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Generating Submission"):
            case_id = str(row['case_id'])
            img_path = row['img_path']
            # Kept from the original pipeline (presumably to bound memory growth
            # across many predict() calls). NOTE(review): confirm it is still needed.
            gc.collect(); K.clear_session()

            annotation = 'authentic'
            try:
                img_bgr = _load_image_bgr(img_path)
                if img_bgr is None:
                    tqdm.write(f"Warning: could not read image for case {case_id} ({img_path}); defaulting to 'authentic'.")
                else:
                    img_rgb_orig = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
                    img_gray_orig = cv2.cvtColor(img_rgb_orig, cv2.COLOR_RGB2GRAY)
                    original_shape = img_rgb_orig.shape[:2]

                    X1_rgb = cv2.resize(img_rgb_orig, (IMAGE_SIZE, IMAGE_SIZE), interpolation=cv2.INTER_LINEAR) / 255.0
                    X2_features = get_forgery_features_from_data(img_gray_orig)
                    input_combined = np.expand_dims(np.concatenate([X1_rgb, X2_features], axis=-1), axis=0)

                    model_output = unet_model.predict(input_combined, verbose=0)
                    output_prob = model_output[0, :, :, 0]

                    thresholded = (output_prob > fixed_threshold).astype(np.uint8)
                    clean_mask_resized = _drop_small_components(thresholded, min_forgery_area)

                    # cv2.resize takes (width, height); nearest-neighbour keeps the mask binary.
                    final_mask = cv2.resize(
                        clean_mask_resized,
                        (original_shape[1], original_shape[0]),
                        interpolation=cv2.INTER_NEAREST,
                    )
                    annotation = rle_encode(final_mask)
            except Exception as e:
                # Deliberate best effort for the submission, but make the failure visible.
                annotation = 'authentic'
                tqdm.write(f"Warning: inference failed for case {case_id} ({img_path}): {e}; defaulting to 'authentic'.")

            results.append({'case_id': case_id, 'annotation': annotation})

    # Explicit columns so that an empty input still yields a mergeable frame.
    return pd.DataFrame(results, columns=['case_id', 'annotation'])

def format_submission_annotation(annotation):
    """Convert an internal annotation into the text written to the submission CSV.

    'authentic' (any letter case) is returned unchanged. A space-separated RLE
    string such as "442080 34 442384 40" becomes "[442080, 34, 442384, 40]".
    """
    if annotation.lower() == 'authentic':
        return annotation
    return "[" + ", ".join(annotation.split(' ')) + "]"


# --- 3. FINAL EXECUTION BLOCK FOR INFERENCE ---
if __name__ == "__main__":

    print("\n--- Starting FINAL SUBMISSION INFERENCE ---")

    # 1. Load Model (CTF7 Final Checkpoint)
    model = build_single_stream_unet((IMAGE_SIZE, IMAGE_SIZE, MODEL_INPUT_CHANNELS))
    tf.keras.utils.get_custom_objects().update({'tversky_loss': tversky_loss})
    model.compile(optimizer='adam', loss=tversky_loss, metrics=['accuracy'])

    try:
        if not os.path.exists(FINAL_MODEL_PATH):
            raise FileNotFoundError(f"Final model weights not found at: {FINAL_MODEL_PATH}.")

        # Suppress UserWarning on load_weights
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            model.load_weights(FINAL_MODEL_PATH)

        print(f"âœ… Loaded Final CTF Model weights: {FINAL_MODEL_PATH}")
    except FileNotFoundError as e:
        print(f"ðŸ›‘ FATAL Error: {e}. You must successfully run the CTF7 training step to generate the final weights.")
        sys.exit(1)
    except Exception as e:
        print(f"ðŸ›‘ FATAL Error loading final weights: {e}. Aborting submission.")
        sys.exit(1)

    # 2. Generate Submission File for Test Data
    print("\n--- Generating Kaggle Submission File ---")
    test_df = create_test_df_robust(TEST_IMAGE_ROOT, SAMPLE_SUBMISSION_FILE)

    # Always start from the sample submission so that every expected case id
    # gets a row, even when no test image could be located.
    submission_df = pd.read_csv(SAMPLE_SUBMISSION_FILE)[['case_id']].astype(str)

    if test_df.empty:
        print("Warning: no test images were found; every case defaults to 'authentic'.")
        submission_df['annotation'] = 'authentic'
    else:
        print(f"Processing {len(test_df)} test case(s)...\n")
        results_df = run_submission_inference(model, test_df, FIXED_THRESHOLD, MIN_FORGERY_AREA)
        submission_df = submission_df.merge(results_df, on='case_id', how='left')
        # Cases without a prediction (e.g. missing image file) default to 'authentic'.
        submission_df['annotation'] = submission_df['annotation'].fillna('authentic')

    # case_id is a string column here, so this ordering is lexicographic.
    submission_df = submission_df[['case_id', 'annotation']].sort_values('case_id').reset_index(drop=True)

    # 3. Write Final CSV (Correct RLE Formatting)
    with open(OUTPUT_FILENAME, "w", newline='', encoding="utf-8") as f:
        # QUOTE_MINIMAL adds the surrounding double quotes only to fields that
        # contain a comma, i.e. the bracketed RLE lists.
        writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['case_id', 'annotation'])
        writer.writerows(
            [case_id, format_submission_annotation(annotation)]
            for case_id, annotation in zip(submission_df['case_id'], submission_df['annotation'])
        )

    print(f"\nâœ… FINAL SUBMISSION CREATED: {OUTPUT_FILENAME} with {len(submission_df)} total rows. Please submit this file.")

In [None]:
import pandas as pd
import numpy as np

def validate_and_print_rle(submission_df):
    """Check the structure of the 'annotation' column and print a summary.

    An annotation counts as authentic when, after lower-casing, trimming and
    removing any literal "[]", it equals 'authentic'. Every other annotation
    is treated as an RLE string and is structurally valid when it contains an
    even number of comma/whitespace separated tokens (start/length pairs).
    Missing or non-string annotations (for example NaN from an empty CSV cell)
    are reported as invalid instead of raising.

    Args:
        submission_df: DataFrame with an 'annotation' column.

    Returns:
        The total number of (start, length) pairs across all RLE annotations,
        or 0 when at least one RLE annotation is structurally invalid.
    """
    print("\n--- RLE Output Validation Check ---")

    def _is_authentic(value):
        return isinstance(value, str) and value.lower().strip().replace('[]', '') == 'authentic'

    def _token_count(value):
        """Return the number of RLE tokens in *value*, or None when it is not a string."""
        if not isinstance(value, str):
            return None
        body = value.strip().strip('[]')
        if not body or body.lower() == 'authentic':
            return 0
        return len(body.replace(',', ' ').split())

    annotations = submission_df['annotation'].tolist()
    rle_annotations = [value for value in annotations if not _is_authentic(value)]
    authentic_count = len(annotations) - len(rle_annotations)

    print(f"Total Submissions: {len(submission_df)}")
    print(f"Authentic (No Forgery) Count: {authentic_count}")
    print(f"RLE Annotated (Forged) Count: {len(rle_annotations)}")

    # --- Calculate Total Segment Pairs ---
    total_rle_elements = 0
    bad_rle_count = 0
    for value in rle_annotations:
        count = _token_count(value)
        if count is None or count % 2 != 0:
            bad_rle_count += 1
        else:
            total_rle_elements += count

    if bad_rle_count == 0:
        total_pairs = total_rle_elements // 2
        print(f"âœ… RLE Structure: All {len(rle_annotations)} RLE strings are structurally valid.")
        print(f"Total Segment Pairs Detected: {total_pairs} (Indicates the complexity of the forgery patterns).")
    else:
        total_pairs = 0
        print(f"ðŸ›‘ RLE ERROR: Found {bad_rle_count} RLE strings with invalid structure.")

    return total_pairs

# --- Execution ---
# Read back the generated submission and run the structural RLE check on it.
try:
    submission_df = pd.read_csv("submission.csv")
except FileNotFoundError:
    print("Error: submission.csv not found. Please ensure the inference code was run successfully.")
else:
    # TS holds the total number of (start, length) pairs reported by the validator.
    TS = validate_and_print_rle(submission_df)