In [7]:
import os
import pandas as pd
import numpy as np
import nrrd
import cv2
from scipy.ndimage import binary_dilation, binary_erosion
from skimage.util import random_noise
from skimage.segmentation import slic
from skimage.draw import polygon

# Load the metadata file
metadata_path = "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/all_nodules_metadata.csv"
df = pd.read_csv(metadata_path)

# Filter relevant masks (Consensus + Moderately/Highly Suspicious)
target_classes = ["Malignant"]
filtered_df = df[(df['Mask'].str.contains('consensus')) & (df['Malignancy'].isin(target_classes))]

# Target balance (aim for 1600 samples)
current_count = len(filtered_df)
target_count = 1600
augmentation_factor = target_count // current_count

# Function to perform augmentations
def augment_mask(mask):
    augmented_masks = []
    
    # Dilation with factor 0.15
    dilated = binary_dilation(mask, iterations=int(0.15 * mask.shape[0])).astype(mask.dtype)
    augmented_masks.append(dilated)
    
    # Erosion with factor -0.15
    eroded = binary_erosion(mask, iterations=int(abs(-0.15) * mask.shape[0])).astype(mask.dtype)
    augmented_masks.append(eroded)
    
    # Random Noise
    noisy = random_noise(mask, mode='s&p', amount=0.05)
    noisy = (noisy > 0.5).astype(mask.dtype)
    augmented_masks.append(noisy)
    
    # Contour Randomization using SLIC superpixels, repeated twice
    for _ in range(2):
        segments = slic(mask, n_segments=50, compactness=10, sigma=1)
        perturbed_mask = mask.copy()
        for segment in np.unique(segments):
            coords = np.column_stack(np.where(segments == segment))
            if np.random.rand() > 0.5:  # Randomly perturb some segments
                rr, cc = polygon(coords[:, 0], coords[:, 1], mask.shape)
                perturbed_mask[rr, cc] = np.random.choice([0, 1])
        augmented_masks.append(perturbed_mask)
    
    return augmented_masks

# Directory for augmented masks
output_dir = "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/Augmented"
os.makedirs(output_dir, exist_ok=True)

# Process each mask
new_entries = []
for _, row in filtered_df.iterrows():
    mask_path = row['Mask']
    image_path = row['Image']  # Get the corresponding CT volume
    patient_id, nodule_id = row['Patient_ID'], row['Nodule_ID']
    
    # Load NRRD mask
    data, header = nrrd.read(mask_path)
    
    # Apply augmentations
    augmented_masks = augment_mask(data)
    
    # Save augmented masks
    for i, aug_mask in enumerate(augmented_masks):
        new_filename = f"{patient_id}_nodule_{nodule_id}_aug_{i}.nrrd"
        save_path = os.path.join(output_dir, new_filename)
        nrrd.write(save_path, aug_mask, header)
        
        # Add to new metadata entries with the same image path
        new_entries.append([patient_id, nodule_id, image_path, save_path, row['Malignancy']])

# Convert new entries to DataFrame
augmented_df = pd.DataFrame(new_entries, columns=['Patient_ID', 'Nodule_ID', 'Image', 'Mask', 'Malignancy'])

# Merge with original metadata
df_final = pd.concat([df, augmented_df], ignore_index=True)

# Save updated metadata
updated_metadata_path = "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/all_nodules_metadata_augmented1.csv"
df_final = df_final[df_final['Malignancy'].isin(target_classes)]  # Ensure only target classes remain
df_final.to_csv(updated_metadata_path, index=False)

print(f"Augmentation complete. Saved {len(new_entries)} new masks with corresponding CT volumes.")


Augmentation complete. Saved 1470 new masks with corresponding CT volumes.


In [8]:
import pandas as pd
import os

# Load augmented metadata
metadata_path = "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/all_nodules_metadata_augmented1.csv"
df = pd.read_csv(metadata_path)

# Keep only augmented masks (those containing '_aug_' in filename)
df_augmented = df[df['Mask'].str.contains('_aug_')]

# Save a new metadata CSV for PyRadiomics
augmented_metadata_path = "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/all_nodules_metadata_augmented_only.csv"
df_augmented.to_csv(augmented_metadata_path, index=False)

print(f"Filtered {len(df_augmented)} augmented masks for feature extraction.")


Filtered 1470 augmented masks for feature extraction.


In [9]:
pyradiomics_command = f"""
pyradiomics "{augmented_metadata_path}" -o "{os.path.join(os.path.dirname(augmented_metadata_path), 'all_nodules_metadata_augmented_only.csv')}" -f csv --param "{os.path.join(os.path.dirname(augmented_metadata_path), 'CT.yaml')}"
"""

print("Run the following command in your terminal to extract radiomics features from augmented masks:")
print(pyradiomics_command)


Run the following command in your terminal to extract radiomics features from augmented masks:

pyradiomics "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/all_nodules_metadata_augmented_only.csv" -o "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output\all_nodules_metadata_augmented_only.csv" -f csv --param "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output\CT.yaml"



In [5]:
import os
print("Image exists:", os.path.exists(imageFilepath))
print("Mask exists:", os.path.exists(maskFilepath))


NameError: name 'imageFilepath' is not defined

In [6]:
import os
import pandas as pd
import numpy as np
import nrrd
from scipy.ndimage import binary_dilation, binary_erosion
from skimage.util import random_noise
from skimage.segmentation import slic
from skimage.draw import polygon

def augment_mask(mask):
    """Generate augmented versions of a binary mask while ensuring PyRadiomics compatibility"""
    augmented_masks = []
    
    # 1. Convert to strict binary (0 or 1) with uint8 type
    binary_mask = np.where(mask > 0.5, 1, 0).astype(np.uint8)
    
    # Skip if mask is empty
    if np.sum(binary_mask) == 0:
        return augmented_masks
    
    # 2. Basic augmentations (dilation/erosion/noise)
    # Dilation - always safe
    dilated = binary_dilation(binary_mask, iterations=max(1, int(0.15 * mask.shape[0])))
    augmented_masks.append(dilated.astype(np.uint8))
    
    # Erosion - only keep if substantial
    eroded = binary_erosion(binary_mask, iterations=max(1, int(0.15 * mask.shape[0])))
    if np.sum(eroded) > 5:  # Minimum 5 voxels
        augmented_masks.append(eroded.astype(np.uint8))
    
    # Noise - threshold to maintain binary
    noisy = (random_noise(binary_mask, mode='s&p', amount=0.05) > 0.5).astype(np.uint8)
    if np.sum(noisy) > 5:
        augmented_masks.append(noisy)
    
    # 3. Advanced augmentations (SLIC-based)
    for _ in range(2):  # Create 2 SLIC variants
        try:
            # Adaptive segmentation based on mask size
            n_segments = max(2, min(50, np.sum(binary_mask)//20))
            segments = slic(binary_mask, n_segments=n_segments, compactness=10, sigma=1)
            
            perturbed = binary_mask.copy()
            modified = False
            
            # Only modify segments containing mask pixels
            for seg_val in np.unique(segments[binary_mask > 0]):
                if np.random.rand() > 0.6:  # 40% modification probability
                    coords = np.where(segments == seg_val)
                    if len(coords[0]) > 0:
                        rr, cc = polygon(coords[0], coords[1], mask.shape)
                        perturbed[rr, cc] = np.random.choice([0, 1])
                        modified = True
            
            if modified and np.sum(perturbed) > 5:
                augmented_masks.append(perturbed.astype(np.uint8))
        except:
            continue
    
    return augmented_masks

def process_dataset():
    # Load metadata
    metadata_path = "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/all_nodules_metadata.csv"
    df = pd.read_csv(metadata_path)
    
    # Filter malignant nodules
    target_classes = ["Malignant"]
    filtered_df = df[(df['Mask'].str.contains('consensus')) & 
                    (df['Malignancy'].isin(target_classes))]
    
    # Prepare output
    output_dir = "C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/Augmented"
    os.makedirs(output_dir, exist_ok=True)
    
    new_entries = []
    for _, row in filtered_df.iterrows():
        try:
            # Load original mask
            mask, header = nrrd.read(row['Mask'])
            
            # Generate augmentations
            augmented_masks = augment_mask(mask)
            
            # Save valid augmentations
            for i, aug_mask in enumerate(augmented_masks):
                if np.sum(aug_mask) > 5:  # Final validation
                    new_path = os.path.join(output_dir, 
                                          f"{row['Patient_ID']}_nodule_{row['Nodule_ID']}_aug_{i}.nrrd")
                    nrrd.write(new_path, aug_mask, header)
                    new_entries.append([
                        row['Patient_ID'],
                        row['Nodule_ID'],
                        row['Image'],
                        new_path,
                        row['Malignancy']
                    ])
        except Exception as e:
            print(f"Error processing {row['Mask']}: {str(e)}")
    
    # Save updated metadata
    if new_entries:
        augmented_df = pd.DataFrame(new_entries, columns=df.columns)
        df_final = pd.concat([df, augmented_df], ignore_index=True)
        df_final.to_csv(metadata_path.replace('.csv', '_augmented.csv'), index=False)
        print(f"Created {len(new_entries)} valid augmented masks")
    else:
        print("Warning: No valid augmentations were created")

if __name__ == "__main__":
    process_dataset()

ValueError: 9 columns passed, passed data had 5 columns