# Parkinson MRI Preprocessing

### Data Acquiring

In [3]:
import os

# URL and file paths
url = "https://fcp-indi.s3.amazonaws.com/data/Projects/INDI/umf_pd/neurocon.tar.gz"
filename = "neurocon.tar.gz"
extracted_folder = "neurocon"

# Download the file if it doesn't exist
if not os.path.exists(filename):
    !wget {url}

# Extract the file if the folder doesn't exist
if not os.path.exists(extracted_folder):
    !tar -xzf {filename}

# Remove the archive
if os.path.exists(filename):
    os.remove(filename)


--2024-10-02 04:15:32--  https://fcp-indi.s3.amazonaws.com/data/Projects/INDI/umf_pd/neurocon.tar.gz
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving fcp-indi.s3.amazonaws.com (fcp-indi.s3.amazonaws.com)... 52.217.235.73, 3.5.27.94, 52.217.232.89, ...
Connecting to fcp-indi.s3.amazonaws.com (fcp-indi.s3.amazonaws.com)|52.217.235.73|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1617681469 (1.5G) [application/x-gzip]
Saving to: ‘neurocon.tar.gz’

neurocon.tar.gz       0%[                    ]  32.58K  49.0KB/s               ^C

gzip: stdin: unexpected end of file
tar: Unexpected EOF in archive
tar: Unexpected EOF in archive
tar: Error is not recoverable: exiting now


##### Loading labels tsv file

In [3]:
import pandas as pd

neurocon_dir = './neurocon'
tsv_file = os.path.join(neurocon_dir, "neurocon_patients.tsv")
metadata = pd.read_csv(tsv_file, sep='\t')

metadata.info()
metadata.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   code             43 non-null     object 
 1   status           43 non-null     object 
 2   UPDRS_motor_ON   27 non-null     float64
 3   UPDRS_motor_OFF  27 non-null     float64
 4   Treatment        27 non-null     object 
 5   age              43 non-null     int64  
 6   sex              43 non-null     object 
 7   H&Y              27 non-null     float64
 8   birthdate        43 non-null     object 
 9   scan_date        43 non-null     object 
dtypes: float64(3), int64(1), object(6)
memory usage: 3.5+ KB


Unnamed: 0,code,status,UPDRS_motor_ON,UPDRS_motor_OFF,Treatment,age,sex,H&Y,birthdate,scan_date
0,control032014,NORMAL,,,,53,M,,6/16/1959,11/4/2012
1,control032015,NORMAL,,,,64,M,,1/30/1948,11/4/2012
2,control032016,NORMAL,,,,58,F,,10/5/1954,11/4/2012
3,control032017,NORMAL,,,,77,F,,3/27/1935,11/11/2012
4,control032018,NORMAL,,,,55,F,,4/3/1957,11/11/2012


##### Create X and Y var with Max dimn definition

In [4]:
# Create lists to store the MRI data and the corresponding labels
X = []
Y = []
max_dims = [0, 0, 0]  # To track maximum dimensions of MRI scans

##### Loading MRI image and map with labels

In [5]:
import nibabel as nib
import numpy as np


# Iterate through the folders in 'neurocon' to load MRI data and match with labels
for patient_folder in os.listdir(neurocon_dir):
    patient_folder_path = os.path.join(neurocon_dir, patient_folder)
    
    # Skip the .tsv file itself
    if not os.path.isdir(patient_folder_path):
        continue
    
    # Look for the MRI scan inside the 'anat' subfolder
    anat_folder = os.path.join(patient_folder_path, "anat")
    if os.path.isdir(anat_folder):
        nii_files = [f for f in os.listdir(anat_folder) if f.endswith('.nii.gz')]
        
        if nii_files:
            nii_path = os.path.join(anat_folder, nii_files[0])
            
            # Load the MRI scan as a NumPy array
            mri_data = nib.load(nii_path).get_fdata()
            
            # Find maximum dimensions
            max_dims = np.maximum(max_dims, mri_data.shape)
            
            # Append the MRI data to X
            X.append(mri_data)
            
            # Extract the patient code and status from the metadata
            patient_code = patient_folder.replace("sub-", "")
            status = metadata.loc[metadata['code'] == patient_code, 'status'].values[0]
            
            # Append the status (Control or PD) to Y
            Y.append(status)


##### Padding the MRI images to match max-DIM

In [6]:
# Pad the MRI images to match the maximum dimensions
X_padded = []
for mri_data in X:
    # Create an empty array with the maximum dimensions, filled with zeros
    padded_image = np.zeros(max_dims)
    
    # Get the original shape of the MRI data
    original_shape = mri_data.shape
    
    # Place the original image in the top-left corner of the padded image
    padded_image[:original_shape[0], :original_shape[1], :original_shape[2]] = mri_data
    
    # Append the padded image to the new list
    X_padded.append(padded_image)

##### Convert to Numpy array

In [7]:
# Convert X_padded and Y into NumPy arrays
X_padded = np.array(X_padded)
Y = np.array(Y)

##### Reshape 3D to 2D to apply SMOTE

In [8]:
# Reshape X_padded for SMOTE to apply; SMOTE requires 2D data
n_samples, *image_dims = X_padded.shape
X_flat = X_padded.reshape(n_samples, -1)  # Flatten each image into a 1D array


##### Apply SMOTE

In [9]:
from imblearn.over_sampling import SMOTE

# Use SMOTE to balance the classes
sm = SMOTE(random_state=42)
X_resampled, Y_resampled = sm.fit_resample(X_flat, Y)

##### Reshape back to 3D

In [10]:
# Reshape X_resampled back to the original image dimensions
X_resampled = X_resampled.reshape(-1, *image_dims)

##### Save the data using h5py 

In [11]:
import h5py

# Save the balanced X_resampled and Y_resampled into an HDF5 file
output_file = "mri_data_balanced.h5"
with h5py.File(output_file, 'w') as h5f:
    h5f.create_dataset('X', data=X_resampled)
    h5f.create_dataset('Y', data=Y_resampled.astype('S10'))  # Convert Y to bytes to store as a string

print(f"Data saved to {output_file}")

Data saved to mri_data_balanced.h5
