In [1]:
# Cell 1: Import Libraries and Setup
import os
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("Medical Dataset Rebuilder")
print("=" * 40)
print("Libraries imported successfully!")





Medical Dataset Rebuilder
Libraries imported successfully!


In [2]:
# Cell 2: Define MedicalDatasetBuilder Class - Part 1
class MedicalDatasetBuilder:
    """
    Build medical image dataset from existing CSV files
    """
    
    def __init__(self, csv_directory="."):
        """
        Initialize with directory containing CSV files
        
        Args:
            csv_directory (str): Directory containing the CSV files
        """
        self.csv_directory = Path(csv_directory)
        self.master_df = None
        
    def load_existing_csvs(self):
        """
        Load all existing CSV files and identify available datasets
        """
        print("Scanning for existing CSV files...")
        
        # List of expected CSV files based on your original notebook
        expected_csvs = [
            "burns_1and2.csv",
            "burns_3rd.csv", 
            "burns_unknown_degree.csv",
            "wounds.csv",
            "urgent_care_images_master.csv",
            "urgent_care_images_master_no_wound_burns.csv",
            "urgent_care_images_master_plus_yasin.csv",
            "urgent_care_images_master_final.csv"
        ]
        
        found_csvs = []
        missing_csvs = []
        
        for csv_file in expected_csvs:
            csv_path = self.csv_directory / csv_file
            if csv_path.exists():
                found_csvs.append(csv_file)
                size = csv_path.stat().st_size / 1024  # KB
                print(f"  Found: {csv_file} ({size:.1f} KB)")
            else:
                missing_csvs.append(csv_file)
                print(f"  Missing: {csv_file}")
        
        if missing_csvs:
            print(f"\nMissing {len(missing_csvs)} CSV files")
        
        return found_csvs, missing_csvs

In [3]:
# Cell 3: Add Component Building Methods
    def build_from_components(self):
        """
        Build master dataset from individual component CSV files
        """
        print("\nBuilding dataset from component CSV files...")
        
        dataframes = []
        
        # Try to load burn datasets
        burn_files = ["burns_1and2.csv", "burns_3rd.csv", "burns_unknown_degree.csv"]
        for burn_file in burn_files:
            csv_path = self.csv_directory / burn_file
            if csv_path.exists():
                df = pd.read_csv(csv_path)
                print(f"  Loaded {burn_file}: {len(df)} images")
                dataframes.append(df)
            else:
                print(f"  Skipping missing {burn_file}")
        
        # Try to load wound dataset
        wounds_path = self.csv_directory / "wounds.csv"
        if wounds_path.exists():
            df_wounds = pd.read_csv(wounds_path)
            print(f"  Loaded wounds.csv: {len(df_wounds)} images")
            dataframes.append(df_wounds)
        else:
            print(f"  Skipping missing wounds.csv")
        
        if dataframes:
            # Combine all dataframes
            self.master_df = pd.concat(dataframes, ignore_index=True)
            print(f"\nCombined dataset: {len(self.master_df)} total images")
            return True
        else:
            print("No component CSV files found")
            return False

IndentationError: unexpected indent (3152934310.py, line 2)

In [None]:
# Cell 4: Add Master File Loading Method
    def load_existing_master(self):
        """
        Load existing master CSV file if available
        """
        print("\nLooking for existing master datasets...")
        
        # Priority order for master files (most processed first)
        master_files = [
            "urgent_care_images_master_final.csv",
            "urgent_care_images_master_plus_yasin.csv", 
            "urgent_care_images_master_no_wound_burns.csv",
            "urgent_care_images_master.csv"
        ]
        
        for master_file in master_files:
            csv_path = self.csv_directory / master_file
            if csv_path.exists():
                self.master_df = pd.read_csv(csv_path)
                print(f"  Loaded {master_file}: {len(self.master_df)} images")
                return True
        
        print("  No master CSV files found")
        return False

In [None]:
# Cell 5: Add File Path and Validation Methods
    def fix_file_paths(self, new_base_path=None):
        """
        Fix file paths in the dataset to point to correct locations
        
        Args:
            new_base_path (str): New base path for images, if different from CSV
        """
        if self.master_df is None:
            print("No dataset loaded")
            return
        
        print("\nAnalyzing file paths...")
        
        # Show sample of current paths
        print("Sample current file paths:")
        for i, path in enumerate(self.master_df['filepath'].head(3)):
            print(f"  {i+1}. {path}")
        
        if new_base_path:
            print(f"\nUpdating base path to: {new_base_path}")
            
            def update_path(old_path):
                # Extract just the filename from old path
                filename = Path(old_path).name
                return str(Path(new_base_path) / filename)
            
            self.master_df['filepath'] = self.master_df['filepath'].apply(update_path)
            print("File paths updated")
    
    def validate_files(self, sample_size=10):
        """
        Validate that files exist at specified paths
        
        Args:
            sample_size (int): Number of files to check
        """
        if self.master_df is None:
            print("No dataset loaded")
            return
        
        print(f"\nValidating file existence (checking {sample_size} samples)...")
        
        sample_df = self.master_df.sample(min(sample_size, len(self.master_df)))
        
        existing_count = 0
        missing_count = 0
        
        for _, row in sample_df.iterrows():
            if Path(row['filepath']).exists():
                existing_count += 1
            else:
                missing_count += 1
                print(f"  Missing: {row['filepath']}")
        
        print(f"\nValidation Results:")
        print(f"  Found: {existing_count}/{sample_size} files")
        print(f"  Missing: {missing_count}/{sample_size} files")
        
        if missing_count > 0:
            print("\nSuggestion: Update file paths using fix_file_paths() method")

In [None]:
# Cell 6: Add Data Cleaning Methods
    def clean_dataset(self, remove_classes=None, merge_classes=None):
        """
        Clean and standardize the dataset
        
        Args:
            remove_classes (list): List of class labels to remove
            merge_classes (dict): Dict mapping old labels to new labels
        """
        if self.master_df is None:
            print("No dataset loaded")
            return
        
        print("\nCleaning dataset...")
        
        original_size = len(self.master_df)
        
        # Remove unwanted classes
        if remove_classes:
            print(f"  Removing classes: {remove_classes}")
            self.master_df = self.master_df[~self.master_df['label'].isin(remove_classes)]
            print(f"    Removed {original_size - len(self.master_df)} images")
        
        # Merge/standardize classes
        if merge_classes:
            print(f"  Merging classes: {merge_classes}")
            self.master_df['label'] = self.master_df['label'].replace(merge_classes)
        
        # Standard cleaning from original notebook
        standard_merges = {
            "wound_laseration": "wound_laceration",  # Fix typo
            "wound_stab_wound": "wound_laceration",  # Merge similar classes
        }
        
        self.master_df['label'] = self.master_df['label'].replace(standard_merges)
        
        print(f"Cleaned dataset: {len(self.master_df)} images remaining")

In [None]:
# Cell 7: Add Information Display and Save Methods
    def show_dataset_info(self):
        """
        Display comprehensive dataset information
        """
        if self.master_df is None:
            print("No dataset loaded")
            return
        
        print("\nDataset Information")
        print("=" * 50)
        
        print(f"Total Images: {len(self.master_df):,}")
        print(f"Total Classes: {self.master_df['label'].nunique()}")
        
        print("\nClass Distribution:")
        class_counts = self.master_df['label'].value_counts()
        for label, count in class_counts.items():
            percentage = (count / len(self.master_df)) * 100
            print(f"  {label:25} {count:>6,} images ({percentage:5.1f}%)")
        
        # Create visualization
        plt.figure(figsize=(12, 6))
        class_counts.plot(kind='bar', color='steelblue')
        plt.title('Medical Image Dataset - Class Distribution')
        plt.xlabel('Class Label')
        plt.ylabel('Number of Images')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.show()
        
        return class_counts
    
    def save_dataset(self, filename="medical_dataset_fixed.csv"):
        """
        Save the cleaned dataset
        
        Args:
            filename (str): Output filename
        """
        if self.master_df is None:
            print("No dataset loaded")
            return
        
        output_path = self.csv_directory / filename
        self.master_df.to_csv(output_path, index=False)
        print(f"Saved dataset to: {output_path}")
        print(f"   Total images: {len(self.master_df):,}")
        print(f"   Total classes: {self.master_df['label'].nunique()}")

In [None]:
# Cell 8: Initialize and Scan for Files
# Initialize builder
builder = MedicalDatasetBuilder()

# Scan for existing files
found_csvs, missing_csvs = builder.load_existing_csvs()


In [None]:
# Cell 9: Load Dataset
# Try to load existing master file first
if not builder.load_existing_master():
    # If no master file, build from components
    if not builder.build_from_components():
        print("Failed to load any dataset files")
    else:
        print("Successfully built dataset from components")
else:
    print("Successfully loaded existing master dataset")

In [None]:
# Cell 10: Display Current Dataset Info
if builder.master_df is not None:
    builder.show_dataset_info()
else:
    print("No dataset available to display")

In [None]:
# Cell 11: Optional - Fix File Paths (uncomment and modify if needed)
# If your images are in a different location, uncomment and modify this:
# new_image_path = "/path/to/your/images"
# builder.fix_file_paths(new_image_path)


In [None]:
# Cell 12: Optional - Validate File Existence (uncomment to check)
# Uncomment to check if files exist at current paths:
# builder.validate_files(sample_size=5)


In [None]:
# Cell 13: Clean Dataset
# Remove problematic classes
remove_classes = [
    "wound_burns",  # Duplicate with burn classes
    "wound_ingrown_nails",  # Not relevant for urgent care
    "wound_normal",  # Not a medical condition
    "wound_surgical_wounds"  # Too specific/clinical
]

if builder.master_df is not None:
    builder.clean_dataset(remove_classes=remove_classes)
else:
    print("No dataset loaded for cleaning")

In [None]:
# Cell 14: Show Final Dataset Info
if builder.master_df is not None:
    print("\nFinal Dataset:")
    final_counts = builder.show_dataset_info()
else:
    print("No dataset available")


In [None]:




















# Cell 15: Save Rebuilt Dataset with Original Names
if builder.master_df is not None:
    # Save with the original expected filename that other notebooks use
    builder.save_dataset("urgent_care_images_master_final.csv")
    
    # Also save backup versions for safety
    builder.save_dataset("urgent_care_images_master.csv")
    builder.save_dataset("urgent_care_images_master_no_wound_burns.csv")
    
    print("\nDataset rebuilding complete!")
    print("Saved with original filenames that other notebooks expect:")
    print("- urgent_care_images_master_final.csv (main file)")
    print("- urgent_care_images_master.csv (backup)")
    print("- urgent_care_images_master_no_wound_burns.csv (backup)")
    print("\nNext steps:")
    print("1. Verify file paths point to correct image locations")
    print("2. Update paths using builder.fix_file_paths() if needed")
    print("3. Run builder.validate_files() to check file existence")
else:
    print("No dataset to save")

In [None]:


# Cell 16: Optional - Quick Dataset Exploration
# Uncomment to explore the dataset further:
# if builder.master_df is not None:
#     print("\nFirst few rows of the dataset:")
#     print(builder.master_df.head())
#     
#     print("\nDataset columns:")
#     print(builder.master_df.columns.tolist())
#     
#     print("\nDataset shape:")
#     print(builder.master_df.shape)