In [25]:
import pandas as pd
import numpy as np
from visualize import convert_dicom_to_png
from sklearn.model_selection import train_test_split
import os
from tqdm import tqdm
import pydicom
##run deidentification , split data, visualize then preprocessing pipeline
CSV_PATH = '/Users/shrutibalaji/Downloads/vindr-mammo-master 2/dicom_data/breast-level_annotations.csv'
IMAGE_DIR = '/Users/shrutibalaji/Downloads/vindr-mammo-master 2/dicom_data/images'
OUTPUT_DIR = '/Users/shrutibalaji/Downloads/vindr-mammo-master 2/preprocessed_data'

In [26]:

def create_file_mapping(image_dir):
    file_map = {}
    for study_folder in os.listdir(image_dir):
        if study_folder.startswith('.'):
            continue
        study_path = os.path.join(image_dir, study_folder)
        if os.path.isdir(study_path):
            for file in os.listdir(study_path):
                if file.endswith('.dicom'):
                    file_id = file.split('.')[0]
                    file_map[file_id] = os.path.join(study_path, file)
    return file_map


In [27]:

def load_and_preprocess_data(csv_file, file_map):
    df = pd.read_csv(csv_file)
    
    images = []
    labels = []
    processed_files = []
    error_files = []
    
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing files"):
        dicom_path = file_map.get(row['image_id'])
        if dicom_path:
            try:
                img = convert_dicom_to_png(dicom_path)
                if img is not None:
                    images.append(img)
                    labels.append(1 if row['breast_birads'] != 'BI-RADS 1' else 0)
                    processed_files.append(dicom_path)
                else:
                    print(f"Warning: Image conversion returned None for {dicom_path}")
                    error_files.append((dicom_path, "Conversion returned None"))
            except Exception as e:
                print(f"Error processing {dicom_path}: {str(e)}")
                error_files.append((dicom_path, str(e)))
    
    print(f"\nTotal files in directory: {len(file_map)}")
    print(f"Files successfully processed: {len(processed_files)}")
    print(f"Files with errors: {len(error_files)}")
    
    if error_files:
        print("\nFirst few error files:")
        for path, error in error_files[:5]:
            print(f"{path}: {error}")
    
    return images, np.array(labels)

In [5]:
#original
def split_and_save_data(images, labels, output_dir):
    if len(images) == 0:
        print("No images were successfully processed. Cannot split and save data.")
        return

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, stratify=labels, random_state=42)

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    # Save preprocessed data
    np.save(os.path.join(output_dir, 'y_train.npy'), y_train)
    np.save(os.path.join(output_dir, 'y_test.npy'), y_test)

    # Save images as separate files
    for i, img in enumerate(X_train):
        np.save(os.path.join(output_dir, f'X_train_{i}.npy'), img)
    for i, img in enumerate(X_test):
        np.save(os.path.join(output_dir, f'X_test_{i}.npy'), img)

    print(f"Preprocessing complete. Data saved to {output_dir}")

In [None]:
def split_and_save_data(images, labels, output_dir):
    if len(images) == 0:
        print("No images were successfully processed. Cannot split and save data.")
        return

    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(images, labels, test_size=0.2, stratify=labels, random_state=42)

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    # Save preprocessed data
    np.save(os.path.join(output_dir, 'y_train.npy'), y_train)
    np.save(os.path.join(output_dir, 'y_test.npy'), y_test)

    
    # Group annotations by image_id
    grouped = df.groupby('image_id')

# Display images with bounding boxes
for image_id, annotations in grouped:
    display_image_with_boxes(image_id, annotations)
    # Save images as separate files
    for i, img in enumerate(X_train):
        np.save(os.path.join(output_dir, f'X_train_{df.iloc[0]}.npy'), img)
    for i, img in enumerate(X_test):
        np.save(os.path.join(output_dir, f'X_test_{i}.npy'), img)

    print(f"Preprocessing complete. Data saved to {output_dir}")

In [22]:
# chatgpt generated - mapping code

import os
import numpy as np
from sklearn.model_selection import train_test_split
import json
df = pd.read_csv('/Users/shrutibalaji/Downloads/vindr-mammo-master 2/dicom_data/breast-level_annotations.csv')
def split_and_save_data(images, labels, output_dir, image_paths):
    if len(images) == 0:
        print("No images were successfully processed. Cannot split and save data.")
        return

    # Split data into train and test sets
    X_train, X_test, y_train, y_test, paths_train, paths_test = train_test_split(
        images, labels, image_paths, test_size=0.2, stratify=labels, random_state=42
    )

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    # Save preprocessed data
    np.save(os.path.join(output_dir, 'y_train.npy'), y_train)
    np.save(os.path.join(output_dir, 'y_test.npy'), y_test)

    # Create mapping dictionaries
    train_mapping = {}
    test_mapping = {}

    # Save images as separate files and create mappings
    for i, (img, path) in enumerate(zip(X_train, paths_train)):
        np.save(os.path.join(output_dir, f'X_train_{i}.npy'), img)
        train_mapping[f'X_train_{i}.npy'] = path

    for i, (img, path) in enumerate(zip(X_test, paths_test)):
        np.save(os.path.join(output_dir, f'X_test_{i}.npy'), img)
        test_mapping[f'X_test_{i}.npy'] = path

    # Save mapping dictionaries
    with open(os.path.join(output_dir, 'train_mapping.json'), 'w') as f:
        json.dump(train_mapping, f)
    with open(os.path.join(output_dir, 'test_mapping.json'), 'w') as f:
        json.dump(test_mapping, f)

    print(f"Preprocessing complete. Data and mappings saved to {output_dir}")

In [23]:
def main():
    if not os.path.exists(CSV_PATH):
        print(f"Error: CSV file not found at {CSV_PATH}")
        return
    
    if not os.path.exists(IMAGE_DIR):
        print(f"Error: Image directory not found at {IMAGE_DIR}")
        return

    # Create output directory if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Create file mapping
    print("Creating file mapping...")
    file_map = create_file_mapping(IMAGE_DIR)

    # Load and preprocess data
    print(f"\nLoading and preprocessing data from {CSV_PATH}")
    images, labels = load_and_preprocess_data(CSV_PATH, file_map)

    # Split and save data
    split_and_save_data(images, labels, OUTPUT_DIR)

if __name__ == "__main__":
    main()

Creating file mapping...

Loading and preprocessing data from /Users/shrutibalaji/Downloads/vindr-mammo-master 2/dicom_data/breast-level_annotations.csv


Processing files: 100%|██████████| 20000/20000 [00:02<00:00, 8197.03it/s]


Total files in directory: 40
Files successfully processed: 40
Files with errors: 0





TypeError: split_and_save_data() missing 1 required positional argument: 'image_paths'