In [None]:

import os
import json
import shutil
import zipfile
import uuid
from collections import defaultdict
from google.colab import drive

# Mount Google Drive; the zip paths below point inside this mount.
drive.mount('/content/drive')

# Final location of the merged dataset on the Colab VM.
merged_dataset_dir = '/content/merged_opg_dataset'

# Scratch area holding one extraction folder per source archive.
base_dir = '/content/dataset_merging'
old_dataset_dir, new_data_dir = (
    os.path.join(base_dir, subdir) for subdir in ('old_dataset', 'new_data')
)

# Source archives: the existing (old) dataset and the new batch to merge into it.
old_zip_path = '/content/drive/MyDrive/Dental Dissertation/before_filtered_opg_dataset_for_LLM2.zip'
new_zip_path = '/content/drive/MyDrive/Dental Dissertation/rabia_new_data3.zip'

Mounted at /content/drive


In [None]:
# Cleanup and Unzip
def _unzip_to(archive_path, target_dir):
    """Extract every member of the zip at `archive_path` into `target_dir`."""
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)


# Start from a clean slate; ignore_errors makes this a no-op on a fresh VM
# where neither directory exists yet.
print("Cleaning up old directories...")
for stale_dir in (base_dir, merged_dataset_dir):
    shutil.rmtree(stale_dir, ignore_errors=True)

print(f"Unzipping OLD dataset to {old_dataset_dir}...")
_unzip_to(old_zip_path, old_dataset_dir)

print(f"Unzipping NEW dataset to {new_data_dir}...")
_unzip_to(new_zip_path, new_data_dir)

Cleaning up old directories...
Unzipping OLD dataset to /content/dataset_merging/old_dataset...
Unzipping NEW dataset to /content/dataset_merging/new_data...


In [None]:
# Prepare Merged Folder
# Locate the directory in the extracted old dataset that holds the
# 'train' and 'valid' folders, then copy that whole directory to
# merged_dataset_dir.

# Entries that must be present for a directory to count as the dataset root.
required_splits = {'train', 'valid'}

print(f"Locating old dataset contents in {old_dataset_dir}...")
unzipped_files = os.listdir(old_dataset_dir)
base_copy_dir = None

if required_splits.issubset(unzipped_files):
    # Layout 1: the archive was zipped without a wrapper folder.
    print("Found train/valid folders at the root. Copying from base.")
    base_copy_dir = old_dataset_dir
else:
    # Layout 2: exactly one wrapper directory that itself holds the splits.
    sole_entry_is_dir = (
        len(unzipped_files) == 1
        and os.path.isdir(os.path.join(old_dataset_dir, unzipped_files[0]))
    )
    if not sole_entry_is_dir:
        print(f"Error: Could not determine the structure of the old zip. Found: {unzipped_files}")
        raise Exception("Old dataset structure not recognized. Stopping.")

    potential_base = os.path.join(old_dataset_dir, unzipped_files[0])
    if not required_splits.issubset(os.listdir(potential_base)):
        print(f"Error: Single folder '{unzipped_files[0]}' found, but it doesn't contain train/valid.")
        raise Exception("Old dataset structure not recognized.")

    print(f"Found single top-level folder: {unzipped_files[0]}. Copying from there.")
    base_copy_dir = potential_base

print(f"Copying old dataset from {base_copy_dir} to {merged_dataset_dir}...")
# Copies everything under base_copy_dir, so merged_dataset_dir ends up with
# the same sub-folders (train, valid, and any others present).
shutil.copytree(base_copy_dir, merged_dataset_dir)

Locating old dataset contents in /content/dataset_merging/old_dataset...
Found single top-level folder: merged_opg_dataset. Copying from there.
Copying old dataset from /content/dataset_merging/old_dataset/merged_opg_dataset to /content/merged_opg_dataset...
Base copy complete. 'valid' and 'test' folders are now in place.


In [None]:
# Define Key Paths for Merging
# The master train split lives in the merged dataset directory; its COCO file
# is the one that gets extended (and later overwritten) by the merge.
master_train_folder = os.path.join(merged_dataset_dir, 'train')
master_coco_path = os.path.join(master_train_folder, '_annotations.coco.json')

# Find the train folder in the new dataset.
new_train_folder = os.path.join(new_data_dir, 'train')
# Handle the case where the new data *also* has a top-level wrapper folder.
if not os.path.exists(new_train_folder):
    # os.listdir() returns entries in arbitrary order and the archive may hold
    # stray entries (loose files, a '__MACOSX' folder, ...), so do not blindly
    # take the first one: keep only sub-directories that contain 'train'.
    wrapper_candidates = [
        os.path.join(new_data_dir, entry)
        for entry in sorted(os.listdir(new_data_dir))
        if os.path.isdir(os.path.join(new_data_dir, entry, 'train'))
    ]
    if not wrapper_candidates:
        raise Exception(f"No 'train' folder found in or directly below: {new_data_dir}")
    # Sorted order makes the choice deterministic if several candidates exist.
    new_data_root = wrapper_candidates[0]
    new_train_folder = os.path.join(new_data_root, 'train')

new_coco_path = os.path.join(new_train_folder, '_annotations.coco.json')

print(f"Master annotation file: {master_coco_path}")
print(f"New annotation file: {new_coco_path}")

# Fail early with a clear message rather than on the first open() downstream.
if not os.path.exists(master_coco_path):
    raise Exception(f"Master COCO file not found at: {master_coco_path}")
if not os.path.exists(new_coco_path):
    raise Exception(f"New COCO file not found at: {new_coco_path}")

Master annotation file: /content/merged_opg_dataset/train/_annotations.coco.json
New annotation file: /content/dataset_merging/new_data/train/_annotations.coco.json


In [None]:
# Load COCO Files
def _read_json(path):
    """Parse the JSON document stored at `path` and return the resulting object."""
    with open(path, 'r') as handle:
        return json.load(handle)


print("Loading COCO annotation files...")
master_coco = _read_json(master_coco_path)
new_coco = _read_json(new_coco_path)

# Quick size summary of both datasets before anything is merged.
print(
    f"Master dataset: {len(master_coco['images'])} images, "
    f"{len(master_coco['annotations'])} annotations, "
    f"{len(master_coco['categories'])} categories."
)
print(
    f"New dataset: {len(new_coco['images'])} images, "
    f"{len(new_coco['annotations'])} annotations, "
    f"{len(new_coco['categories'])} categories."
)

Loading COCO annotation files...
Master dataset: 1768 images, 60688 annotations, 63 categories.
New dataset: 105 images, 993 annotations, 46 categories.


In [None]:
# Merge Categories
# Categories are matched between the two datasets by name. Every category ID
# used by the new dataset is mapped onto a master category ID; names the
# master does not know yet are appended to it under a fresh ID.
print("Merging categories...")

# A lookup for master categories by name
master_cat_map = {cat['name']: cat for cat in master_coco['categories']}
# Highest category ID currently in use (0 when the master has no categories).
max_master_cat_id = max((cat['id'] for cat in master_coco['categories']), default=0)

# This map translates NEW_cat_id -> MASTER_cat_id
category_translation_map = {}

for new_cat in new_coco['categories']:
    new_name = new_cat['name']
    new_id = new_cat['id']

    if new_name not in master_cat_map:
        print(f"Adding new category: '{new_name}'")
        max_master_cat_id += 1
        # Copy the whole record and replace only its ID, so any additional
        # fields on the category (e.g. 'supercategory', if present) are kept
        # instead of being reduced to just id/name.
        new_master_entry = dict(new_cat, id=max_master_cat_id)

        master_coco['categories'].append(new_master_entry)
        master_cat_map[new_name] = new_master_entry  # Add to lookup

    # Whether pre-existing or just added, the name now resolves to a master ID.
    category_translation_map[new_id] = master_cat_map[new_name]['id']

print(f"Total categories after merge: {len(master_coco['categories'])}")

Merging categories...
Total categories after merge: 63


In [None]:
# Merge Images & Annotations
# Copies every image of the new train split into the master train folder and
# appends re-numbered copies of the image/annotation records to master_coco.
# The records inside new_coco are left untouched: their IDs are the keys of
# the translation maps, so rewriting them in place would make a re-run of this
# cell translate already-translated IDs.
print("Merging images and annotations...")

# Continue numbering after the highest IDs already used by the master dataset
# (0 when the respective list is empty).
max_img_id = max((img['id'] for img in master_coco['images']), default=0)
max_ann_id = max((ann['id'] for ann in master_coco['annotations']), default=0)

# translate NEW_image_id -> new MASTER_image_id
image_id_translation_map = {}

added_img_count = 0
for new_img in new_coco['images']:
    old_img_id = new_img['id']
    file_name = new_img['file_name']

    src_img_path = os.path.join(new_train_folder, file_name)
    dst_img_path = os.path.join(master_train_folder, file_name)

    if not os.path.exists(src_img_path):
        print(f"Warning: Image file not found, skipping: {src_img_path}")
        continue

    merged_file_name = file_name
    if os.path.exists(dst_img_path):
        # Same file name already present in the master folder: store the new
        # image under a unique name instead of overwriting the existing one.
        unique_id = uuid.uuid4().hex[:8]
        base, ext = os.path.splitext(file_name)
        merged_file_name = f"{base}_new_{unique_id}{ext}"
        print(f"File clash for '{file_name}'. Renaming new file to '{merged_file_name}'.")
        dst_img_path = os.path.join(master_train_folder, merged_file_name)

    shutil.copy(src_img_path, dst_img_path)

    # Re-ID the image and add a copy of its record to the master list
    max_img_id += 1
    new_master_img_id = max_img_id

    image_id_translation_map[old_img_id] = new_master_img_id
    master_coco['images'].append(
        dict(new_img, id=new_master_img_id, file_name=merged_file_name)
    )
    added_img_count += 1

# Process Annotations
added_ann_count = 0
skipped_ann_count = 0
for new_ann in new_coco['annotations']:
    old_img_id = new_ann['image_id']
    old_cat_id = new_ann['category_id']

    if old_img_id not in image_id_translation_map:
        # Image was skipped (e.g., file not found), so its annotations go too.
        skipped_ann_count += 1
        continue
    if old_cat_id not in category_translation_map:
        print(f"Warning: Skipping annotation {new_ann['id']} with unknown category {old_cat_id}")
        skipped_ann_count += 1
        continue  # Category was not in the new list or master list

    # Re-ID the annotation and translate its image/category references
    max_ann_id += 1
    master_coco['annotations'].append(
        dict(
            new_ann,
            id=max_ann_id,
            image_id=image_id_translation_map[old_img_id],
            category_id=category_translation_map[old_cat_id],
        )
    )
    added_ann_count += 1

# Make dropped annotations visible instead of losing them silently.
print(
    f"Added {added_img_count} images and {added_ann_count} annotations "
    f"({skipped_ann_count} annotations skipped)."
)

Merging images and annotations...


In [None]:
# Save the Merged COCO file
# Overwrites the master train annotation file with the merged content.
print("Saving merged annotation file...")
with open(master_coco_path, 'w') as out_file:
    json.dump(master_coco, out_file, indent=2)

# Final size report of the merged train split.
summary_lines = [
    "\n--- MERGE COMPLETE ---",
    f"Final merged dataset location: {merged_dataset_dir}",
    f"Total images in 'train': {len(master_coco['images'])}",
    f"Total annotations in 'train': {len(master_coco['annotations'])}",
    f"Total categories: {len(master_coco['categories'])}",
]
for summary_line in summary_lines:
    print(summary_line)

Saving merged annotation file...

--- MERGE COMPLETE ---
Final merged dataset location: /content/merged_opg_dataset
Total images in 'train': 1873
Total annotations in 'train': 61681
Total categories: 63


In [None]:
import time

folder_to_zip = './merged_opg_dataset'
zip_file_name = 'before_filtered_opg_dataset_for_LLM3.zip'
drive_destination_path = f'/content/drive/MyDrive/Dental Dissertation/{zip_file_name}'


print(f"Zipping the folder: {folder_to_zip}")
start_time = time.time()

!zip -r -q {zip_file_name} {folder_to_zip}

zip_time = time.time()
print(f"Zipping complete in {zip_time - start_time:.2f} seconds.")

# Copy the single zip file to Google Drive
print(f"Copying {zip_file_name} to your Google Drive...")
!cp {zip_file_name} {drive_destination_path}

copy_time = time.time()
print(f"Copying complete in {copy_time - zip_time:.2f} seconds.")

print("\n Success!")
print(f"New dataset is saved as a zip file at: {drive_destination_path}")

Attempting to zip the folder: ./merged_opg_dataset
Zipping complete in 40.58 seconds.
Copying before_filtered_opg_dataset_for_LLM3.zip to your Google Drive...
cp: target 'Dissertation/before_filtered_opg_dataset_for_LLM3.zip' is not a directory
Copying complete in 0.11 seconds.

✅ Success!
Your new dataset is saved as a zip file at: /content/drive/MyDrive/Dental Dissertation/before_filtered_opg_dataset_for_LLM3.zip


In [None]:
# Download the zipped dataset from the Colab VM through the browser.
from google.colab import files

# Archive file name, resolved relative to the current working directory.
zip_file_name = 'before_filtered_opg_dataset_for_LLM3.zip'

status_message = f"Preparing to download {zip_file_name}..."
print(status_message)
files.download(zip_file_name)

Preparing to download before_filtered_opg_dataset_for_LLM3.zip...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>