**The original "LEVIR-CD+" dataset from Kaggle contains only "train" and "test" directories, so we create a "val" folder by moving a random 20% of the samples from "train" into "val".**

In [1]:
import os
import shutil
import random
from pprint import pprint

# -------- CONFIG --------
input_base = "/kaggle/input/levir-cd/LEVIR-CD+"   # dataset as provided
working_base = "/kaggle/working/LEVIR-CD+"        # writable copy that gets re-split
split_ratio = 0.2   # fraction of train samples to move into val
# ------------------------

# 1) Show what the input folder contains; stop right away if it is absent.
print("Contents of input_base:")
if not os.path.exists(input_base):
    raise FileNotFoundError(f"Input base not found: {input_base}")
pprint(os.listdir(input_base))

# 2) Make a writable copy of the dataset, unless a working copy is already there.
if os.path.exists(working_base):
    print(f"\nWorking dir already exists: {working_base}")
else:
    print(f"\nCopying dataset to working dir: {working_base} (this may take a moment)...")
    shutil.copytree(input_base, working_base, dirs_exist_ok=True)

# 3) Find the train directory under working_base (handles nesting)
def find_dir_by_name(root, name):
    """Return the path of the first directory called `name` found under `root`.

    The tree is traversed with os.walk, and the first visited directory that
    has a sub-directory named `name` wins. Returns None when no such
    directory is found.
    """
    hits = (
        os.path.join(parent, name)
        for parent, subdirs, _files in os.walk(root)
        if name in subdirs
    )
    # The generator is lazy, so the walk stops as soon as the first hit appears.
    return next(hits, None)

# Search the working copy for a "train" folder; if the search finds nothing,
# fall back to checking the top-level path directly.
train_dir = find_dir_by_name(working_base, "train")
cand = os.path.join(working_base, "train")
if train_dir is None and os.path.exists(cand):
    train_dir = cand

if not (train_dir is not None and os.path.isdir(train_dir)):
    raise FileNotFoundError(f"Could not find a 'train' directory under {working_base}. Check dataset structure.")

print(f"\nUsing train directory: {train_dir}")
print("Train immediate subfolders:")
# Keep only entries of train_dir that are themselves directories.
train_subdirs = []
for entry in os.listdir(train_dir):
    if os.path.isdir(os.path.join(train_dir, entry)):
        train_subdirs.append(entry)
pprint(train_subdirs)

# 4) Work out which subfolders hold the two image sets and which holds the labels
available = set(train_subdirs)
if {"A", "B", "label"} <= available:
    A_name, B_name, label_name = "A", "B", "label"
elif {"T1", "T2", "label"} <= available:
    A_name, B_name, label_name = "T1", "T2", "label"
else:
    # Unknown layout: take the first folder whose name mentions "label" ...
    label_name = next((d for d in train_subdirs if 'label' in d.lower()), None)
    if label_name is None:
        # ... and if there is none, treat the last listed folder as the label folder
        if len(train_subdirs) < 3:
            raise Exception(f"Unexpected train subfolders: {train_subdirs}. Need at least two image folders and one label folder.")
        label_name = train_subdirs[-1]
    # the first two remaining folders are used as the image folders
    others = [d for d in train_subdirs if d != label_name]
    if len(others) < 2:
        raise Exception(f"Not enough image folders found in train: {train_subdirs}")
    A_name, B_name = others[:2]

print(f"\nDetected folders -> Image1: '{A_name}', Image2: '{B_name}', Label: '{label_name}'")

# 5) Prepare val folder in working_base
val_dir = os.path.join(working_base, "val")
for folder in [A_name, B_name, label_name]:
    os.makedirs(os.path.join(val_dir, folder), exist_ok=True)
print(f"\nVal folder created at: {val_dir}")

# 6) List images in train/A_name and choose which samples go to val
split_seed = 42  # fixed seed so a fresh run always selects the same samples

# A negative ratio would turn the slice below into "everything except the last
# N samples", so reject values outside [0, 1] up front.
if not 0 <= split_ratio <= 1:
    raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio}")

train_A_path = os.path.join(train_dir, A_name)
if not os.path.isdir(train_A_path):
    raise FileNotFoundError(f"Expected directory not found: {train_A_path}")

# os.listdir returns names in arbitrary order; sort first, otherwise even a
# seeded shuffle could select different samples from one run to the next.
train_A_images = sorted(
    f for f in os.listdir(train_A_path) if os.path.isfile(os.path.join(train_A_path, f))
)
if len(train_A_images) == 0:
    raise Exception(f"No files found in {train_A_path} - check dataset content.")

# Rerun guard: the working copy is reused when it already exists, so running
# this cell a second time would otherwise move another share of train into val.
already_in_val = os.listdir(os.path.join(val_dir, A_name))
if already_in_val:
    print(
        f"val/{A_name} already contains {len(already_in_val)} files; "
        "skipping the split so a rerun does not shrink train further."
    )
    val_count = 0
else:
    # A private Random instance keeps the global random state untouched.
    random.Random(split_seed).shuffle(train_A_images)
    val_count = int(split_ratio * len(train_A_images))
val_images = train_A_images[:val_count]

# 7) Move each selected sample (both images and the label) from train to val
moved = 0
for img_name in val_images:
    # (source, destination) for the sample in each of the three folders
    transfers = [
        (os.path.join(train_dir, sub, img_name), os.path.join(val_dir, sub, img_name))
        for sub in (A_name, B_name, label_name)
    ]

    # A sample is moved only when all three files exist; report the first
    # missing one and leave the sample in train otherwise.
    first_missing = next((src for src, _dst in transfers if not os.path.exists(src)), None)
    if first_missing is not None:
        print(f"Warning: {first_missing} missing, skipping this sample.")
        continue

    for src, dst in transfers:
        shutil.move(src, dst)
    moved += 1

# 8) Report the requested split and the resulting folder sizes
requested_pct = split_ratio * 100
train_A_remaining = len(os.listdir(os.path.join(train_dir, A_name)))
val_A_total = len(os.listdir(os.path.join(val_dir, A_name)))

print(f"\n✅ Done. Requested split: {requested_pct:.1f}%.")
print(f"Attempted to move: {val_count} samples. Successfully moved: {moved}.")
print(f"Remaining in train/{A_name}: {train_A_remaining}")
print(f"In val/{A_name}: {val_A_total}")


Contents of input_base:
['test', 'train']

Copying dataset to working dir: /kaggle/working/LEVIR-CD+ (this may take a moment)...

Using train directory: /kaggle/working/LEVIR-CD+/train
Train immediate subfolders:
['label', 'B', 'A']

Detected folders -> Image1: 'A', Image2: 'B', Label: 'label'

Val folder created at: /kaggle/working/LEVIR-CD+/val

✅ Done. Requested split: 20.0%.
Attempted to move: 127 samples. Successfully moved: 127.
Remaining in train/A: 510
In val/A: 127


**The original Kaggle dataset has 1024x1024 images, but we need 256x256, so we resize every image and label mask to 256x256. We also rename the image folders: A -> T1, B -> T2.**

In [2]:
"""
Resize LEVIR-CD+ images from 1024x1024 to 256x256 and rename A->T1, B->T2.

By default this writes everything into a new folder:
    /kaggle/working/LEVIR-CD+_256
so your original files are preserved.

Run in Kaggle notebook (no external deps beyond Pillow).
"""

import os
from PIL import Image
import shutil

# -------- CONFIG --------
base_dir = "/kaggle/working/LEVIR-CD+"          # dataset root to read from
out_dir = f"{base_dir.rstrip('/')}_256"         # output root, created below
new_size = (256, 256)
splits = ["train", "val", "test"]

# input subfolder name -> subfolder name used in the output
rename_map = {"A": "T1", "B": "T2", "label": "label"}

# file extensions treated as images (compared in lower case)
IMG_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".gif"}

# Take the resampling filters from Image.Resampling when this Pillow version
# has it, and from the Image module itself otherwise.
_resampling = getattr(Image, "Resampling", Image)
RESAMPLE_LANCZOS = _resampling.LANCZOS
RESAMPLE_NEAREST = _resampling.NEAREST

os.makedirs(out_dir, exist_ok=True)

# Resize every image of every split into out_dir, renaming the subfolders
# according to rename_map. Conversion is best-effort: a file that cannot be
# processed is reported and recorded, and the loop moves on to the next file.
total_processed = 0
failed = []  # (source path, error text) for each image that could not be converted

for split in splits:
    split_in = os.path.join(base_dir, split)
    if not os.path.isdir(split_in):
        print(f"Skipping missing split folder: {split_in}")
        continue

    for old_name, new_name in rename_map.items():
        folder_in = os.path.join(split_in, old_name)
        if not os.path.isdir(folder_in):
            # silently skip missing subfolders
            continue

        folder_out = os.path.join(out_dir, split, new_name)
        os.makedirs(folder_out, exist_ok=True)

        # Label masks get different treatment from the photos (see below).
        is_mask = old_name.lower() == "label"

        for fname in sorted(os.listdir(folder_in)):
            # skip hidden/system files
            if fname.startswith("."):
                continue
            # non-image files are skipped, not copied
            _, ext = os.path.splitext(fname)
            if ext.lower() not in IMG_EXTS:
                continue

            src_path = os.path.join(folder_in, fname)
            dst_path = os.path.join(folder_out, fname)

            try:
                with Image.open(src_path) as im:
                    if is_mask:
                        # Single-channel mask, resized with nearest neighbour so
                        # no interpolated in-between values are introduced.
                        resized = im.convert("L").resize(new_size, RESAMPLE_NEAREST)
                    else:
                        # Photos: anything that is not already RGB/RGBA is
                        # converted to RGB, then resized with Lanczos.
                        colour = im if im.mode in ("RGB", "RGBA") else im.convert("RGB")
                        resized = colour.resize(new_size, RESAMPLE_LANCZOS)

                    # Save to dst. PIL chooses format from filename extension.
                    resized.save(dst_path)
                total_processed += 1

            except Exception as e:
                # Keep going, but remember the failure so it shows up in the
                # final summary instead of being lost in the scrolling output.
                failed.append((src_path, str(e)))
                print(f"ERROR processing {src_path}: {e}")

print(f"\nDone. Processed ~{total_processed} images.")
if failed:
    # A failed image leaves its sample incomplete in the output dataset.
    print(f"WARNING: {len(failed)} image(s) could not be converted:")
    for failed_path, reason in failed:
        print(f"  {failed_path}: {reason}")
print(f"Output root: {out_dir}")
print("Folder layout under output root mirrors the input splits, with subfolders T1, T2, label.")



Done. Processed ~2955 images.
Output root: /kaggle/working/LEVIR-CD+_256
Folder layout under output root mirrors the input splits, with subfolders T1, T2, label.


In [3]:
import os
import zipfile

# -------- CONFIG --------
src_dir = "/kaggle/working/LEVIR-CD+_256"          # folder to compress
out_zip = "/kaggle/working/LEVIR-CD+ processed.zip"  # target zip file

# Refuse to build an archive from a folder that does not exist.
if not os.path.isdir(src_dir):
    raise FileNotFoundError(f"Source folder not found: {src_dir}")

# Entry names are taken relative to the parent of src_dir, so every path
# inside the zip starts with the name of the src_dir folder itself.
archive_root = os.path.dirname(src_dir)

with zipfile.ZipFile(out_zip, "w", zipfile.ZIP_DEFLATED) as archive:
    for folder, _subfolders, filenames in os.walk(src_dir):
        for filename in filenames:
            file_path = os.path.join(folder, filename)
            archive.write(file_path, arcname=os.path.relpath(file_path, archive_root))

zip_size = os.path.getsize(out_zip)
print("Created zip:", out_zip)
print("Size (bytes):", zip_size)


Created zip: /kaggle/working/LEVIR-CD+ processed.zip
Size (bytes): 282793585
