In [None]:
import os
import nibabel as nib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def load_nifti(file_path):
    """Load a NIfTI file and return the data array."""
    nifti_data = nib.load(file_path)
    return nifti_data.get_fdata()

def binarize_mask(mask):
    """Convert the NIfTI data to a binary mask."""
    binary_mask = np.where(mask > 0, 1, 0)
    return binary_mask

# Load LEDD.csv file
ledd_path = '/Users/rwankhalifa/Downloads/LEDD.csv' 
ledd_data = pd.read_csv(ledd_path)

# Add a new column for LEDD value categorization
ledd_data['LEDD_category'] = ledd_data['Delta_LEDD'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Zero'))

# Base directory where patient data is stored
base_dir = '/Volumes/MORRISON/retro_clin'
all_patient_data = []

# Set a default shape for missing masks if needed (e.g., [1, 1, 1] if no prior knowledge)
default_shape = (394, 466, 378)  # Actual shape you expect from your NIfTI files

# Loop through each patient, load their stimulation files and binary masks
for patient_number in ledd_data['Patient Number']:
    # Remove the prefix 'PDa' and convert the rest to an integer
    patient_number_int = int(patient_number.replace('PDa', ''))
    
    patient_dir = f'PDa{patient_number_int:03d}'
    file_patient_number = str(patient_number_int)
    
    # Define stimulation file paths for both hemispheres
    stim_file_paths = {
        'right': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-R.nii',
        'left': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-L.nii'
    }
    
    # Initialize patient data
    patient_data = []
    
    # Load and binarize mask for each hemisphere
    for hemi, path in stim_file_paths.items():
        if os.path.exists(path):
            mask_data = load_nifti(path)
            binary_mask = binarize_mask(mask_data)
            patient_data.append(binary_mask.flatten())  # Flatten mask to a 1D array for clustering
        else:
            # Create a binary mask of zeros with the default shape when the file does not exist
            default_mask = np.zeros(default_shape)
            patient_data.append(default_mask.flatten())

    all_patient_data.append(np.concatenate(patient_data))

# Convert patient data to NumPy array for clustering
patient_data_matrix = np.array(all_patient_data)

# Standardize the data before clustering
scaler = StandardScaler()
scaled_data = scaler.fit_transform(patient_data_matrix)

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Add cluster labels to LEDD data
ledd_data['Cluster'] = clusters

# Save or display the updated DataFrame for analysis
ledd_data.to_csv('/Users/rwankhalifa/Downloads/clusters.csv', index=False)  # Update with desired output path
print(ledd_data)



In [1]:
import nibabel as nib

# Load the NIfTI file to determine the default shape
nifti_file_path = '/Volumes/MORRISON/retro_clin/PDa144/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd144/rsub-leads_sim-binary_model-simbio_hemi-L.nii'  # Update this path with the location of your NIfTI file
nifti_data = nib.load(nifti_file_path)
default_shape = nifti_data.shape

print("Default shape of the NIfTI file:", default_shape)

Default shape of the NIfTI file: (394, 466, 378)


In [None]:
import os
import nibabel as nib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def load_nifti(file_path):
    """Load a NIfTI file and return the data array."""
    print(f"Loading NIfTI file from {file_path}...")
    nifti_data = nib.load(file_path)
    print(f"NIfTI file loaded from {file_path}.")
    return nifti_data.get_fdata()

def binarize_mask(mask):
    """Convert the NIfTI data to a binary mask."""
    binary_mask = np.where(mask > 0, 1, 0)
    return binary_mask

# Load LEDD.csv file
ledd_path = '/Users/rwankhalifa/Downloads/LEDD.csv' 
print(f"Loading LEDD data from {ledd_path}...")
ledd_data = pd.read_csv(ledd_path)
print("LEDD data loaded.")

# Add a new column for LEDD value categorization
print("Categorizing LEDD values...")
ledd_data['LEDD_category'] = ledd_data['Delta_LEDD'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Zero'))
print("LEDD values categorized.")

# Base directory where patient data is stored
base_dir = '/Volumes/MORRISON/retro_clin'
all_patient_data = []

# Set a default shape for missing masks if needed
default_shape = (394, 466, 378)  # Replace with the actual shape of your NIfTI files

print("Starting to process each patient...")

# Loop through each patient, load their stimulation files and binary masks
for idx, patient_number in enumerate(ledd_data['Patient Number']):
    # Remove the prefix 'PDa' and convert the rest to an integer
    patient_number_int = int(patient_number.replace('PDa', ''))
    
    # Only process patients from 20 to 144
    if patient_number_int < 20:
        continue
    elif patient_number_int > 144:
        print(f"Reached patient number {patient_number}, stopping processing.")
        break

    print(f"Processing patient {idx + 1}/{len(ledd_data['Patient Number'])}: {patient_number}")

    patient_dir = f'PDa{patient_number_int:03d}'
    file_patient_number = str(patient_number_int)
    
    # Define stimulation file paths for both hemispheres
    stim_file_paths = {
        'right': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-R.nii',
        'left': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-L.nii'
    }
    
    # Initialize patient data
    patient_data = []
    
    # Load and binarize mask for each hemisphere
    for hemi, path in stim_file_paths.items():
        print(f"Checking file existence for {hemi} hemisphere: {path}")
        if os.path.exists(path):
            print(f"File found. Loading and processing {hemi} hemisphere...")
            mask_data = load_nifti(path)
            binary_mask = binarize_mask(mask_data)
            patient_data.append(binary_mask.flatten())  # Flatten mask to a 1D array for clustering
            print(f"{hemi.capitalize()} hemisphere processed.")
        else:
            print(f"File not found for {hemi} hemisphere. Using default mask.")
            default_mask = np.zeros(default_shape)
            patient_data.append(default_mask.flatten())

    print(f"Finished processing patient {patient_number}.")
    all_patient_data.append(np.concatenate(patient_data))

print("All patients processed. Preparing data for clustering...")

# Convert patient data to NumPy array for clustering
patient_data_matrix = np.array(all_patient_data)

# Standardize the data before clustering
print("Standardizing data...")
scaler = StandardScaler()
scaled_data = scaler.fit_transform(patient_data_matrix)
print("Data standardized.")

# Apply KMeans clustering
print("Applying KMeans clustering...")
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(scaled_data)
print("KMeans clustering applied.")

# Add cluster labels to LEDD data
ledd_data['Cluster'] = clusters

# Save or display the updated DataFrame for analysis
output_path = '/Users/rwankhalifa/Downloads/clusters.csv'
ledd_data.to_csv(output_path, index=False)
print(f"Cluster data saved to {output_path}.")
print(ledd_data)


Loading LEDD data from /Users/rwankhalifa/Downloads/LEDD.csv...
LEDD data loaded.
Categorizing LEDD values...
LEDD values categorized.
Starting to process each patient...
Processing patient 15/268: PDa020
Checking file existence for right hemisphere: /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii
File found. Loading and processing right hemisphere...
Loading NIfTI file from /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii...
NIfTI file loaded from /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii.
Right hemisphere processed.
Checking file existence for left hemisphere: /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-b

In [2]:
!pip install --upgrade pip
!pip install --upgrade nibabel numpy pandas scikit-learn


Collecting nibabel
  Downloading nibabel-5.2.1-py3-none-any.whl.metadata (8.8 kB)
Collecting numpy
  Downloading numpy-2.1.0-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (12 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading nibabel-5.2.1-py3-none-any.whl (3.3 MB)
[2K   [9

In [None]:
import os
import nibabel as nib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def load_nifti(file_path):
    """Load a NIfTI file with memory mapping to avoid loading the entire file into memory."""
    print(f"Loading NIfTI file from {file_path} with memory mapping...")
    nifti_data = nib.load(file_path, mmap=True)
    print(f"NIfTI file loaded from {file_path}.")
    return nifti_data.get_fdata()

def binarize_mask(mask):
    """Convert the NIfTI data to a binary mask."""
    binary_mask = np.where(mask > 0, 1, 0)
    return binary_mask

def process_patients(ledd_data, start_patient, end_patient):
    # Base directory where patient data is stored
    base_dir = '/Volumes/MORRISON/retro_clin'
    all_patient_data = []

    # Set a default shape for missing masks if needed
    default_shape = (394, 466, 378)  # Replace with the actual shape of your NIfTI files

    print("Starting to process each patient...")

    # Loop through each patient, load their stimulation files and binary masks
    for idx, patient_number in enumerate(ledd_data['Patient Number']):
        # Remove the prefix 'PDa' and convert the rest to an integer
        patient_number_int = int(patient_number.replace('PDa', ''))

        # Only process patients within the specified range
        if patient_number_int < start_patient:
            continue
        elif patient_number_int > end_patient:
            print(f"Reached patient number {patient_number}, stopping processing.")
            break

        print(f"Processing patient {idx + 1}/{len(ledd_data['Patient Number'])}: {patient_number}")

        patient_dir = f'PDa{patient_number_int:03d}'
        file_patient_number = str(patient_number_int)
        
        # Define stimulation file paths for both hemispheres
        stim_file_paths = {
            'right': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-R.nii',
            'left': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-L.nii'
        }
        
        # Initialize patient data
        patient_data = []
        
        # Load and binarize mask for each hemisphere
        for hemi, path in stim_file_paths.items():
            print(f"Checking file existence for {hemi} hemisphere: {path}")
            if os.path.exists(path):
                print(f"File found. Loading and processing {hemi} hemisphere...")
                mask_data = load_nifti(path)
                binary_mask = binarize_mask(mask_data)
                patient_data.append(binary_mask.flatten())  # Flatten mask to a 1D array for clustering
                print(f"{hemi.capitalize()} hemisphere processed.")
                # Free up memory by deleting large variables that are no longer needed
                del mask_data, binary_mask
            else:
                print(f"File not found for {hemi} hemisphere. Using default mask.")
                default_mask = np.zeros(default_shape)
                patient_data.append(default_mask.flatten())
                # Free up memory for the default mask as well
                del default_mask

        print(f"Finished processing patient {patient_number}.")
        all_patient_data.append(np.concatenate(patient_data))
        # Clear patient_data list to free up memory
        del patient_data

    print("All patients processed. Preparing data for clustering...")

    # Convert patient data to NumPy array for clustering
    patient_data_matrix = np.array(all_patient_data)
    # Clear all_patient_data to free up memory
    del all_patient_data

    # Standardize the data before clustering
    print("Standardizing data...")
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(patient_data_matrix)
    print("Data standardized.")
    # Clear patient_data_matrix to free up memory
    del patient_data_matrix

    # Apply KMeans clustering
    print("Applying KMeans clustering...")
    kmeans = KMeans(n_clusters=3, random_state=42)
    clusters = kmeans.fit_predict(scaled_data)
    print("KMeans clustering applied.")
    # Clear scaled_data to free up memory
    del scaled_data

    # Add cluster labels to LEDD data
    ledd_data['Cluster'] = clusters

    # Save or display the updated DataFrame for analysis
    output_path = '/Users/rwankhalifa/Downloads/clusters.csv'
    ledd_data.to_csv(output_path, index=False)
    print(f"Cluster data saved to {output_path}.")
    print(ledd_data)

# Load LEDD.csv file
ledd_path = '/Users/rwankhalifa/Downloads/LEDD.csv' 
print(f"Loading LEDD data from {ledd_path}...")
ledd_data = pd.read_csv(ledd_path)
print("LEDD data loaded.")

# Add a new column for LEDD value categorization
print("Categorizing LEDD values...")
ledd_data['LEDD_category'] = ledd_data['Delta_LEDD'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Zero'))
print("LEDD values categorized.")

# Process patients from 20 to 144 using the optimized function
process_patients(ledd_data, 20, 144)

Loading LEDD data from /Users/rwankhalifa/Downloads/LEDD.csv...
LEDD data loaded.
Categorizing LEDD values...
LEDD values categorized.
Starting to process each patient...
Processing patient 15/268: PDa020
Checking file existence for right hemisphere: /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii
File found. Loading and processing right hemisphere...
Loading NIfTI file from /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii with memory mapping...
NIfTI file loaded from /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii.
Right hemisphere processed.
Checking file existence for left hemisphere: /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/p

In [None]:
# Using Scipy.sparse for better memory 

import os
import nibabel as nib
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from scipy import sparse

def load_nifti(file_path):
    """Load a NIfTI file with memory mapping to avoid loading the entire file into memory."""
    print(f"Loading NIfTI file from {file_path} with memory mapping...")
    nifti_data = nib.load(file_path, mmap=True)
    print(f"NIfTI file loaded from {file_path}.")
    return nifti_data.get_fdata()

def binarize_mask(mask):
    """Convert the NIfTI data to a binary mask."""
    binary_mask = np.where(mask > 0, 1, 0)
    return binary_mask

def process_patients_sparse(ledd_data, start_patient, end_patient):
    # Base directory where patient data is stored
    base_dir = '/Volumes/MORRISON/retro_clin'
    all_patient_data = []

    # Set a default shape for missing masks if needed
    default_shape = (394, 466, 378)  # Replace with the actual shape of your NIfTI files

    print("Starting to process each patient...")

    # Loop through each patient, load their stimulation files and binary masks
    for idx, patient_number in enumerate(ledd_data['Patient Number']):
        # Remove the prefix 'PDa' and convert the rest to an integer
        patient_number_int = int(patient_number.replace('PDa', ''))

        # Only process patients within the specified range
        if patient_number_int < start_patient:
            continue
        elif patient_number_int > end_patient:
            print(f"Reached patient number {patient_number}, stopping processing.")
            break

        print(f"Processing patient {idx + 1}/{len(ledd_data['Patient Number'])}: {patient_number}")

        patient_dir = f'PDa{patient_number_int:03d}'
        file_patient_number = str(patient_number_int)
        
        # Define stimulation file paths for both hemispheres
        stim_file_paths = {
            'right': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-R.nii',
            'left': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-L.nii'
        }
        
        # Initialize patient data
        patient_data = []
        
        # Load and binarize mask for each hemisphere
        for hemi, path in stim_file_paths.items():
            print(f"Checking file existence for {hemi} hemisphere: {path}")
            if os.path.exists(path):
                print(f"File found. Loading and processing {hemi} hemisphere...")
                mask_data = load_nifti(path)
                binary_mask = binarize_mask(mask_data)
                patient_data.append(binary_mask.flatten())  # Flatten mask to a 1D array for clustering
                print(f"{hemi.capitalize()} hemisphere processed.")
                # Free up memory by deleting large variables that are no longer needed
                del mask_data, binary_mask
            else:
                print(f"File not found for {hemi} hemisphere. Using default mask.")
                default_mask = np.zeros(default_shape)
                patient_data.append(default_mask.flatten())
                # Free up memory for the default mask as well
                del default_mask

        print(f"Finished processing patient {patient_number}.")
        all_patient_data.append(np.concatenate(patient_data))
        # Clear patient_data list to free up memory
        del patient_data

    print("All patients processed. Converting to sparse matrix...")

    # Convert patient data to a sparse matrix for clustering
    patient_data_matrix_sparse = sparse.csr_matrix(np.array(all_patient_data))
    # Clear all_patient_data to free up memory
    del all_patient_data

    # Standardize the data before clustering using sparse matrix operations
    print("Standardizing data (mean centering)...")
    scaler = StandardScaler(with_mean=False)  # `with_mean=False` is used to avoid dense conversion
    scaled_data_sparse = scaler.fit_transform(patient_data_matrix_sparse)
    print("Data standardized.")

    # Apply MiniBatchKMeans clustering
    print("Applying Mini-Batch KMeans clustering on sparse data...")
    mini_batch_kmeans = MiniBatchKMeans(n_clusters=3, random_state=42, batch_size=100)
    clusters = mini_batch_kmeans.fit_predict(scaled_data_sparse)
    print("Mini-Batch KMeans clustering applied on sparse data.")

    # Add cluster labels to LEDD data
    ledd_data['Cluster'] = clusters

    # Save or display the updated DataFrame for analysis
    output_path = '/Users/rwankhalifa/Downloads/clusters.csv'
    ledd_data.to_csv(output_path, index=False)
    print(f"Cluster data saved to {output_path}.")
    print(ledd_data)

# Load LEDD.csv file
ledd_path = '/Users/rwankhalifa/Downloads/LEDD.csv' 
print(f"Loading LEDD data from {ledd_path}...")
ledd_data = pd.read_csv(ledd_path)
print("LEDD data loaded.")

# Add a new column for LEDD value categorization
print("Categorizing LEDD values...")
ledd_data['LEDD_category'] = ledd_data['Delta_LEDD'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Zero'))
print("LEDD values categorized.")

# Process patients from 20 to 144 using the optimized function for sparse matrices
process_patients_sparse(ledd_data, 20, 144)


Loading LEDD data from /Users/rwankhalifa/Downloads/LEDD.csv...
LEDD data loaded.
Categorizing LEDD values...
LEDD values categorized.
Starting to process each patient...
Processing patient 15/268: PDa020
Checking file existence for right hemisphere: /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii
File found. Loading and processing right hemisphere...
Loading NIfTI file from /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii with memory mapping...
NIfTI file loaded from /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii.
Right hemisphere processed.
Checking file existence for left hemisphere: /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/p

In [2]:
# Using dimentionality reduction before clustering

import os
import nibabel as nib
import numpy as np
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import sparse

def load_nifti(file_path):
    """Load a NIfTI file with memory mapping to avoid loading the entire file into memory."""
    print(f"Loading NIfTI file from {file_path} with memory mapping...")
    nifti_data = nib.load(file_path, mmap=True)
    print(f"NIfTI file loaded from {file_path}.")
    return nifti_data.get_fdata()

def binarize_mask(mask):
    """Convert the NIfTI data to a binary mask."""
    binary_mask = np.where(mask > 0, 1, 0)
    return binary_mask

def process_patients_with_pca(ledd_data, start_patient, end_patient):
    # Base directory where patient data is stored
    base_dir = '/Volumes/MORRISON/retro_clin'
    all_patient_data = []

    # Set a default shape for missing masks if needed
    default_shape = (394, 466, 378)  # Replace with the actual shape of your NIfTI files

    print("Starting to process each patient...")

    # Loop through each patient, load their stimulation files and binary masks
    for idx, patient_number in enumerate(ledd_data['Patient Number']):
        # Remove the prefix 'PDa' and convert the rest to an integer
        patient_number_int = int(patient_number.replace('PDa', ''))

        # Only process patients within the specified range
        if patient_number_int < start_patient:
            continue
        elif patient_number_int > end_patient:
            print(f"Reached patient number {patient_number}, stopping processing.")
            break

        print(f"Processing patient {idx + 1}/{len(ledd_data['Patient Number'])}: {patient_number}")

        patient_dir = f'PDa{patient_number_int:03d}'
        file_patient_number = str(patient_number_int)
        
        # Define stimulation file paths for both hemispheres
        stim_file_paths = {
            'right': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-R.nii',
            'left': f'{base_dir}/{patient_dir}/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd{file_patient_number}/rsub-leads_sim-binary_model-simbio_hemi-L.nii'
        }
        
        # Initialize patient data
        patient_data = []
        
        # Load and binarize mask for each hemisphere
        for hemi, path in stim_file_paths.items():
            print(f"Checking file existence for {hemi} hemisphere: {path}")
            if os.path.exists(path):
                print(f"File found. Loading and processing {hemi} hemisphere...")
                mask_data = load_nifti(path)
                binary_mask = binarize_mask(mask_data)
                patient_data.append(binary_mask.flatten())  # Flatten mask to a 1D array for clustering
                print(f"{hemi.capitalize()} hemisphere processed.")
                # Free up memory by deleting large variables that are no longer needed
                del mask_data, binary_mask
            else:
                print(f"File not found for {hemi} hemisphere. Using default mask.")
                default_mask = np.zeros(default_shape)
                patient_data.append(default_mask.flatten())
                # Free up memory for the default mask as well
                del default_mask

        print(f"Finished processing patient {patient_number}.")
        all_patient_data.append(np.concatenate(patient_data))
        # Clear patient_data list to free up memory
        del patient_data

    print("All patients processed. Converting to sparse matrix...")

    # Convert patient data to a sparse matrix for clustering
    patient_data_matrix_sparse = sparse.csr_matrix(np.array(all_patient_data))
    # Clear all_patient_data to free up memory
    del all_patient_data

    # Standardize the data before dimensionality reduction
    print("Standardizing data (mean centering)...")
    scaler = StandardScaler(with_mean=False)  # `with_mean=False` to avoid dense conversion
    scaled_data_sparse = scaler.fit_transform(patient_data_matrix_sparse)
    print("Data standardized.")

    # Apply PCA for dimensionality reduction
    print("Applying PCA for dimensionality reduction...")
    pca = PCA(n_components=50)  # Adjust n_components based on your needs
    reduced_data = pca.fit_transform(scaled_data_sparse.toarray())  # Convert to dense array for PCA
    print("PCA applied. Data dimensionality reduced.")

    # Apply Mini-Batch KMeans clustering on reduced data
    print("Applying Mini-Batch KMeans clustering on reduced data...")
    mini_batch_kmeans = MiniBatchKMeans(n_clusters=3, random_state=42, batch_size=100)
    clusters = mini_batch_kmeans.fit_predict(reduced_data)
    print("Mini-Batch KMeans clustering applied on reduced data.")

    # Add cluster labels to LEDD data
    ledd_data['Cluster'] = clusters

    # Save or display the updated DataFrame for analysis
    output_path = '/Users/rwankhalifa/Downloads/clusters.csv'
    ledd_data.to_csv(output_path, index=False)
    print(f"Cluster data saved to {output_path}.")
    print(ledd_data)

# Load LEDD.csv file
ledd_path = '/Users/rwankhalifa/Downloads/LEDD.csv' 
print(f"Loading LEDD data from {ledd_path}...")
ledd_data = pd.read_csv(ledd_path)
print("LEDD data loaded.")

# Add a new column for LEDD value categorization
print("Categorizing LEDD values...")
ledd_data['LEDD_category'] = ledd_data['Delta_LEDD'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Zero'))
print("LEDD values categorized.")


Loading LEDD data from /Users/rwankhalifa/Downloads/LEDD.csv...
LEDD data loaded.
Categorizing LEDD values...
LEDD values categorized.


In [3]:
# Process patients from 20 to 40 using PCA for dimensionality reduction
process_patients_with_pca(ledd_data, 20, 30)

Starting to process each patient...
Processing patient 15/268: PDa020
Checking file existence for right hemisphere: /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii
File found. Loading and processing right hemisphere...
Loading NIfTI file from /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii with memory mapping...
NIfTI file loaded from /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-R.nii.
Right hemisphere processed.
Checking file existence for left hemisphere: /Volumes/MORRISON/retro_clin/PDa020/derivatives/leaddbs/sub-leads/stimulations/MNI152NLin2009bAsym/pd20/rsub-leads_sim-binary_model-simbio_hemi-L.nii
File found. Loading and processing left hemisphere...
Loading NIfTI file from /Volume

ValueError: n_components=50 must be between 0 and min(n_samples, n_features)=9 with svd_solver='full'