In [1]:
import os
import re
import csv
import shutil
from shutil import copy2
from collections import defaultdict
import pandas as pd

In [None]:
# base_dir = os.path.join(os.getcwd(), "../Dataset/Oral Dose Forms")


# ============= CONFIGURATION =============
input_dir = "../Dataset/Oral Dose Forms/"  # Current directory where images are stored
output_dir = "../Dataset/Labelled_Images_blisterPriority/"  # Output directory
csv_path = "./label_summary_blisterPriority.csv"
priority = "Blisters"  # Change to "Blisters" if needed

In [None]:
# Ensure output directory exists
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
else:
    # Clear output directory before starting
    for filename in os.listdir(output_dir):
        file_path = os.path.join(output_dir, filename)
        try:
            if os.path.isdir(file_path):
                shutil.rmtree(file_path)
            else:
                os.remove(file_path)
        except Exception as e:
            print(f"Error removing {file_path}: {e}")

# ============= REGEX PATTERNS =============
dosage_pattern = re.compile(r"(?xi)(?:[0-9]+(?:\.[0-9]+)?(?:mg|mcg|g|ml))|(?:[0-9]+\s*unit)")
version_pattern = re.compile(r"v(\d+)", re.IGNORECASE)
aug_pattern = re.compile(r"_aug_(\d+)", re.IGNORECASE)

# ============= HELPER FUNCTION: PACKAGING DETECTION =============
def determine_packaging(original_name: str, priority: str = "Box") -> str:
    """
    Determine packaging type based on keywords.
    - 'Bottle', 'btl', 'ml', 'sachet', and 'oral solution' are always classified as 'Bottle'.
    - 'Box' and 'Blisters' are the main classification groups. If both are found, follow priority setting.

    Args:
        original_name (str): The filename to classify.
        priority (str): Either "Box" or "Blisters". Determines precedence.

    Returns:
        str: The classified packaging type.
    """
    lower = original_name.lower()

    # Define keyword lists
    bottle_keywords = ["bottle", "btl", "ml", "sachet", "oral solution"]
    box_keywords = ["box", "pack"]
    blister_keywords = ["cap", "capsule", "blister"]

    # First, check if it's a "Bottle" (highest precedence)
    if any(keyword in lower for keyword in bottle_keywords):
        return "Bottle"

    # Otherwise, check for "Box" or "Blisters"
    found_box = any(keyword in lower for keyword in box_keywords)
    found_blister = any(keyword in lower for keyword in blister_keywords)

    if found_box and found_blister:
        return priority  # Assign based on priority setting

    if found_box:
        return "Box"
    if found_blister:
        return "Blisters"

    return "Blisters"  # Default category if nothing matches

# ============= MAIN SCRIPT =============
# Track statistics
label_counts = defaultdict(int)
packaging_counts = defaultdict(int)
dosage_counts = defaultdict(int)
medication_counts = defaultdict(int)
unique_labels = set()
csv_data = []

# Get all image files
valid_exts = (".png", ".jpg", ".jpeg", ".bmp", ".tiff")
all_images = [f for f in os.listdir(input_dir) if f.lower().endswith(valid_exts)]

for filename in all_images:
    old_path = os.path.join(input_dir, filename)
    name_wo_ext, ext = os.path.splitext(filename)

    # 1) Detect packaging, dosage, version, and augmentation
    packaging = determine_packaging(name_wo_ext, priority=priority)
    dosages_found = dosage_pattern.findall(name_wo_ext)
    dosage_str = "_".join(d.strip() for d in dosages_found) if dosages_found else "UnknownDosage"
    version_match = version_pattern.search(name_wo_ext)
    version_str = f"v{version_match.group(1)}" if version_match else "v1"
    aug_match = aug_pattern.search(name_wo_ext)
    aug_str = f"aug{aug_match.group(1)}" if aug_match else "aug0"

    # 2) Clean medication name
    med_name_cleaned = name_wo_ext
    for d in dosages_found:
        med_name_cleaned = med_name_cleaned.replace(d, "")
    if version_match:
        med_name_cleaned = med_name_cleaned.replace(version_match.group(0), "")
    if aug_match:
        med_name_cleaned = med_name_cleaned.replace(aug_match.group(0), "")
    med_name_cleaned = re.sub(r"\[.*?\]", "", med_name_cleaned)
    med_name_cleaned = re.sub(r"\(.*?\)", "", med_name_cleaned)
    med_name_cleaned = re.sub(r"\s+", "_", med_name_cleaned)
    med_name_cleaned = re.sub(r"_+", "_", med_name_cleaned)
    med_name_cleaned = re.sub(r"-+", "-", med_name_cleaned)
    med_name_cleaned = med_name_cleaned.strip("_- ")
    if not med_name_cleaned:
        med_name_cleaned = "UnknownMed"

    # 3) Construct the new filename (you may keep packaging in the name for reference)
    new_filename = f"{med_name_cleaned} - {dosage_str} - {version_str} - {aug_str} - {packaging}{ext}"

    # 4) Create the subdirectory for this class (packaging type) if it doesn't exist
    class_dir = os.path.join(output_dir, packaging)
    if not os.path.exists(class_dir):
        os.makedirs(class_dir)

    # 5) Copy the file to its respective class subfolder
    new_path = os.path.join(class_dir, new_filename)
    copy2(old_path, new_path)

    # 6) Update tracking and CSV data
    unique_labels.add((med_name_cleaned, dosage_str, version_str, packaging))
    label_counts[(med_name_cleaned, dosage_str, packaging)] += 1
    packaging_counts[packaging] += 1
    dosage_counts[dosage_str] += 1
    medication_counts[med_name_cleaned] += 1

    csv_data.append([filename, new_filename, med_name_cleaned, dosage_str, version_str, aug_str, packaging])


# ============= WRITE CSV =============
with open(csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["original_filename","new_filename","medication","dosage","version","augmentation","packaging"])
    writer.writerows(csv_data)

# ============= PRINT STATISTICS =============
print(f"Total unique labels (ignoring augmentation): {len(unique_labels)}")

print("\n=== Packaging Counts ===")
for packaging, count in packaging_counts.items():
    print(f"{packaging}: {count} images")

print("\n=== Dosage Counts (Top 10) ===")
for dosage, count in sorted(dosage_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{dosage}: {count}")

print("\n=== Top 10 Medications ===")
for med, count in sorted(medication_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{med}: {count}")

print("\nProcessing complete!")

Total unique labels (ignoring augmentation): 3827

=== Packaging Counts ===
Box: 8820 images
Blisters: 13620 images
Bottle: 3548 images

=== Dosage Counts (Top 10) ===
10mg: 2716
5mg: 1727
100mg: 1604
25mg: 1440
50mg: 1392
20mg: 1184
200mg: 1080
500mg: 1044
250mg: 876
UnknownDosage: 868

=== Top 10 Medications ===
Rosuvastatin_Tab_0: 192
Losartan_Tab_0: 140
Telmisartan_Tab_0: 128
Sildenafil_Tab_0: 124
Carvedilol_Tab_0: 116
Acitretin_Cap_0: 112
Rosuvastatin_Tab_1: 108
Quetiapine_Tab_0: 108
Enalapril_Tab_0: 108
Finasteride_Tab_0: 108

Processing complete!


In [4]:
# ============= CONFIG =============
dataset_path = "../Dataset/Oral Dose Forms/"         # Folder with images
output_folder = "../Dataset/Labelled_Images_Iter1/"  # Output for processed files
csv_name = "./keyword_label_summary.csv"

os.makedirs(output_folder, exist_ok=True)


if os.path.exists(output_folder):
    for filename in os.listdir(output_folder):
        file_path = os.path.join(output_folder, filename)
        try:
            if os.path.isdir(file_path):
                shutil.rmtree(file_path)
            else:
                os.remove(file_path)
        except Exception as e:
            print(f"Error removing {file_path}: {e}")
else:
    os.makedirs(output_dir)

In [5]:


# ============= KEYWORD CATEGORIES =============
categories = {
    "tabs":    ["tab", "mg", "cap"],
    "boxes":   ["box", "pack", "pak"],
    "bottles": ["bottle", "jar", "btl", "ml", "syrup", "powder", "g"]
}
fallback_category = "others"

# Track label counts
label_counts = {cat: 0 for cat in categories}
label_counts[fallback_category] = 0

# ============= HELPER: DETERMINE CATEGORY =============
def categorize_image(filename: str) -> str:
    lower_name = filename.lower()
    for cat, keywords in categories.items():
        if any(kw in lower_name for kw in keywords):
            return cat
    return fallback_category

# ============= AUGMENTATION PATTERN =============
# We'll search for something like "_aug_2" and convert it to "aug2"
aug_pattern = re.compile(r"_aug_(\d+)", re.IGNORECASE)

# ============= PARSING LOGIC & FILE PROCESSING =============
valid_exts = (".png", ".jpg", ".jpeg", ".bmp", ".tiff")
all_images = [f for f in os.listdir(dataset_path) if f.lower().endswith(valid_exts)]
summary_data = []

for image in all_images:
    category = categorize_image(image)
    label_counts[category] += 1

    base_name, ext = os.path.splitext(image)
    # Example base_name: "Amoxycillin_500mg_v1_aug_2_Box"

    # 1) Convert any `_aug_X` to `augX` in the base name
    #    This ensures a simpler name if you're relying on underscores for splitting.
    #    We'll do a find-and-replace on the entire base_name:
    updated_base_name = aug_pattern.sub(lambda m: f"aug{m.group(1)}", base_name)
    # e.g. "Amoxycillin_500mg_v1_aug_2_Box" -> "Amoxycillin_500mg_v1_aug2_Box"

    # 2) Now split by underscores
    parts = updated_base_name.split("_")
    if len(parts) >= 5:
        medication   = parts[0]
        dosage       = parts[1]
        version      = parts[2]
        augmentation = parts[3]
        packaging    = parts[4]
    else:
        # If not enough parts, fill placeholders
        medication   = parts[0] if len(parts) > 0 else "UnknownMed"
        dosage       = parts[1] if len(parts) > 1 else "UnknownDosage"
        version      = parts[2] if len(parts) > 2 else "UnknownVer"
        augmentation = parts[3] if len(parts) > 3 else "UnknownAug"
        packaging    = parts[4] if len(parts) > 4 else category

    # 3) Construct new filename
    #    e.g. "<Medication> - <Dosage> - <Version> - <Augmentation> - <Packaging>.ext"
    new_filename = f"{medication} - {dosage} - {version} - {augmentation} - {packaging}{ext}"

    # 4) Copy (or move) to output
    src_path = os.path.join(dataset_path, image)
    dst_path = os.path.join(output_folder, new_filename)
    copy2(src_path, dst_path)

    # 5) Prepare row for CSV
    summary_data.append({
        "original_filename": image,
        "new_filename": new_filename,
        "medication": medication,
        "dosage": dosage,
        "version": version,
        "augmentation": augmentation,
        "packaging": packaging,
    })

# ============= WRITE CSV & SUMMARY =============
df = pd.DataFrame(summary_data)
df.to_csv(csv_name, index=False)
print(f"Saved CSV to {csv_name}")

print("\nLabel Counts:")
for cat, ccount in label_counts.items():
    print(f"{cat}: {ccount} images")

print("\nProcessing complete!")


FileNotFoundError: [Errno 2] No such file or directory