<a href="https://colab.research.google.com/github/StevenVuong/MSc_Project/blob/master/v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install dependencies
# Installs the two third-party packages imported in the next cell:
# deepbrain (provides Extractor, used for skull stripping) and pydicom (DICOM reader).
# NOTE(review): neither package is version-pinned, so a fresh run may pull a
# different release than the one this notebook was written against.
# NOTE(review): the trailing ';' is handed to the shell together with the command;
# whether it really hides pip's output here is unverified - `pip install -q` is
# the documented way to reduce output. Confirm before relying on it.
!pip install deepbrain; # semi-colon to hide the output
!pip install pydicom;



In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pydicom
import pickle
from deepbrain import Extractor
from sklearn.model_selection import train_test_split
import nibabel as nb

In [3]:
# Mount Google Drive and make the project folder the working directory.
# Everything below resolves data paths relative to that folder, so this cell
# must run before any cell that touches the file system.
from google.colab import drive

# mount google drive into google colab
drive.mount('/content/gdrive')

# start from a known absolute location so the relative chdir below also works
# when this cell is re-run after the working directory has already changed
os.chdir('/content')

# go to where the data is
print (os.listdir())
os.chdir('gdrive/My Drive/msc_project')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
['.config', 'gdrive', 'sample_data']


In [4]:
# Load the scan metadata table and encode the diagnosis group as 0/1.
# https://www.kaggle.com/sentdex/first-pass-through-data-w-3d-convnet
# name of the scan directory, relative to the working directory set above
patients_file_dir = 'T1_SAG_SIEMEN_3T_CLEAN'

# NOTE(review): `patients` is not read again in the visible cells - confirm it is still needed
patients = os.listdir(patients_file_dir) # get all patients ID's in scan
patient_df = pd.read_csv('T1_SAG_SIEMEN_3T_CLEAN_5_29_2019.csv') # get dataframe too to cross reference

# Map GenCohort to regular PD and Controls
patient_df['Group'] = patient_df['Group'].replace({'GenCohort PD':'PD', 'GenCohort Unaff':'Control'})

# map control to 0 and pd to 1
# (any Group value other than the four names handled in these two mappings is left unchanged)
patient_df['Group'] = patient_df['Group'].replace({'Control':0, 'PD':1})

patient_df.head()

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,1130198,75422,0,M,73,1,MRI,MPRAGE GRAPPA,Original,11/13/2018,DCM,5/07/2019
1,1130190,75414,0,F,73,1,MRI,Sag MPRAGE GRAPPA,Original,12/13/2018,DCM,4/24/2019
2,1130191,75414,0,F,73,1,MRI,Sag MPRAGE GRAPPA,Original,12/13/2018,DCM,4/24/2019
3,1125041,74375,0,F,59,1,MRI,MPRAGE_GRAPPA,Original,9/06/2018,DCM,4/24/2019
4,1003469,72138,0,F,55,1,MRI,MPRAGE GRAPPA,Original,2/19/2018,DCM,4/24/2019


In [5]:
def get_grappa_dir(path):
  '''Return the first entry of `path` that looks like a T1 sequence folder.

  An entry qualifies when the text after its last "_" is one of
  'GRAPPA', 'MPRAGE', 'SAG' or 'ND' (an entry without "_" is compared whole).
  Entries are examined in the order os.listdir() yields them; every entry
  rejected before a match is reported on stdout as a debugging aid.

  Returns the matching entry name (not a full path), or None when nothing matches.
  '''
  accepted_suffixes = ('GRAPPA', 'MPRAGE', 'SAG', 'ND')

  for entry_name in os.listdir(path):
    suffix = entry_name.split("_")[-1]
    if suffix not in accepted_suffixes:
      # some print statements to debug
      print ("NOT ACCEPTABLE: %s " % suffix)
      print ("Path: %s " % entry_name)
      continue
    return entry_name # for the t1 weighted

  return None

def get_dcm_s(path):
  '''Return the first entry of `path` whose name starts with 'S', else None.

  Selecting on the leading 'S' keeps the lookup from picking up the GZ file
  that sits in the same folder. Only the entry name is returned, not a full path.
  '''
  s_entries = (entry_name for entry_name in os.listdir(path) if entry_name.startswith('S'))
  return next(s_entries, None)

def get_path_to_patient(patient):
  '''Build the path to the folder that holds a patient's DICOM files.

  The path is assembled level by level below `cwd/patients_file_dir`:
  patient folder -> sequence folder (get_grappa_dir) -> last acquisition
  folder in sorted order -> folder starting with 'S' (get_dcm_s).

  Args:
    patient: subject identifier; converted with str() to form the folder name.

  Returns:
    The full path on success. If any level cannot be resolved the error is
    printed and the path built so far is returned instead (None only when
    not even the first level could be formed). get_no_scans depends on this:
    it lists whatever directory comes back and treats an empty one as
    "no scans".
  '''
  path = None
  try:
    path = cwd + '/' + patients_file_dir + '/' + str(patient) # get to the GRAPPA
    path = path + '/' + get_grappa_dir(path)
    # os.listdir() returns entries in arbitrary order, so sort before taking the
    # last one; otherwise "most recent" depends on the file system.
    # assumes acquisition folder names sort chronologically - TODO confirm
    path = path + '/' + sorted(os.listdir(path))[-1] # get the most recent scan for patient
    path = path + '/' + get_dcm_s(path)
  except Exception as e:
    print ("No path found for patient %s: %s" % (str(patient),str(e)))

  return path
    

def get_no_scans(patient_ids):
  '''Return the patient IDs whose resolved scan folder contains no entries.

  For every ID the folder returned by get_path_to_patient is listed; the ID
  is kept when that listing is empty. IDs are returned in input order, and an
  ID that occurs several times in `patient_ids` is checked (and possibly
  reported) once per occurrence.
  '''
  return [
    patient_id
    for patient_id in patient_ids
    if len(os.listdir(get_path_to_patient(patient_id))) == 0
  ]
  
  
# Remember the project directory (set by the os.chdir calls in the mount cell);
# get_path_to_patient and check_patients_exist build their paths from it.
cwd = os.getcwd()
print (cwd)

def check_patients_exist(patient_ids):
  '''Return the scan-folder IDs that are not present in `patient_ids`.

  Every entry of `cwd/patients_file_dir` is converted with int() and kept
  when it does not occur in `patient_ids.values`. A non-numeric entry name
  therefore raises ValueError.

  NOTE(review): the returned IDs are, by construction, absent from
  `patient_ids`, so filtering the CSV rows by them cannot remove anything.
  If the intent is "CSV subjects that have no folder", the comparison needs
  to be reversed - confirm with the author.
  '''
  scan_root = cwd + '/' + patients_file_dir
  folder_names = os.listdir(scan_root)
  known_ids = patient_ids.values

  folder_ids = (int(folder_name) for folder_name in folder_names)
  # if file is not in list of our csv file
  return [folder_id for folder_id in folder_ids if folder_id not in known_ids]

# Drop CSV rows for subjects whose scans cannot be used.
# get patient id
# (one entry per CSV row, so a subject with several images appears more than once)
patient_ids = patient_df['Subject']

patients_missing_scans = get_no_scans(patient_ids) # get empty folders
to_remove_list = check_patients_exist(patient_ids) # check if patient numbers are there

# merge both ID lists (de-duplicated through a set) and keep only rows whose Subject is not in it
# NOTE: this reassigns patient_df, so re-running the cell filters the already-filtered frame
union_list = list(set(patients_missing_scans + to_remove_list)) # patient check for our files
patient_df = patient_df[~patient_df['Subject'].isin(union_list)] # reverse, so keep patients that are NOT missing scans, remove the rest

/content/gdrive/My Drive/msc_project


In [0]:
def get_img_no(path):
  '''Return the image identification number encoded in a file name in `path`.

  Any file in the folder will do, so only the first entry returned by
  os.listdir() is parsed. (Previously every file was parsed and the last
  result returned, so one stray file made the whole lookup fail.)

  The number is read from the text after the last "_" in the file name,
  with its first character and its last four characters removed.
  # presumably names end in "_I<id>.dcm" - verify against the data set

  Returns:
    The number as int, or None when the folder is empty.

  Raises:
    ValueError: if the examined file name does not contain a number there.
  '''
  file_names = os.listdir(path)
  if not file_names:
    return None

  last_token = file_names[0].split("_")[-1]
  return int(last_token[1:-4]) # index to get the ID

def filename_sort(filename):
    '''Sort key for scan file names.

    Splits the name on "_" and returns the third field from the end as an
    int. Raises IndexError when the name has fewer than three fields and
    ValueError when that field is not an integer.
    '''
    return int(filename.split("_")[-3])
  
def get_slices(patient):
  '''For any given patient, load the scan volume and its metadata.

  Steps: resolve the DICOM folder (get_path_to_patient), read the image
  number from a file name (get_img_no), look that number up in the
  module-level `patient_df_sample` frame, then read every file in the folder
  in filename_sort order and stack the pixel arrays.

  Args:
    patient: subject identifier as used for the scan folder names.

  Returns:
    (slices, image_info) on success, where
      slices     - numpy array of the stacked pixel arrays restricted to
                   indices 15..174 of the first axis; a scan with fewer than
                   175 files therefore yields fewer than 160 slices
      image_info - [image number, sex, group, age] from the matching row
    None when anything fails; the error is printed and swallowed, so callers
    must be prepared for a None result.
  '''
  try:
    path = get_path_to_patient(patient)

    # get information related around the image
    image_number = get_img_no(path)
    image_row = patient_df_sample.loc[patient_df_sample['Image Data ID'] == image_number] # relate to df
    image_sex = image_row.Sex.values[0]
    image_group = image_row.Group.values[0]
    image_age = image_row.Age.values[0]

    # create image object and append to total info
    image_info = [image_number, image_sex, image_group, image_age]

    # get files and sort them in order
    dcm_files = sorted(os.listdir(path), key=filename_sort)

    # read every file and build the array; dcmread is the current pydicom
    # reader (read_file is a deprecated alias of it)
    slices = [pydicom.dcmread(path + '/' + dcm_file).pixel_array for dcm_file in dcm_files]
    slices = np.array(slices)[15:175, :, :]

    return slices, image_info

  except Exception as e:
    print ("No File Found: %s" % str(e))
    return None

In [0]:
def process_slice(total_slice, extractor=None):
  '''Skull-strip, crop and mirror one scan volume.

  Args:
    total_slice: two-item sequence [slices, image_info]; `slices` is a 3-D
      numpy array, `image_info` is passed through untouched.
    extractor: optional object with a `run(volume)` method returning a
      per-voxel brain probability. When omitted a new deepbrain Extractor is
      created for this call, as before; pass one in to reuse it across calls.

  Returns:
    (slice_axial, flipped_slices, image_info). Both arrays have a trailing
    axis of length 1 appended; `flipped_slices` is `slice_axial` reversed
    along its third axis.

  Side effect: voxels outside the brain mask are zeroed in place, and because
  the transposed volume is a view this also modifies the array passed in.
  '''
  # deal with mixed slice information
  slices, image_info = total_slice[0], total_slice[1]

  # transform into axial view (axes reordered as 1, 2, 0)
  slice_axial = slices.transpose((1,2,0))

  # initialise skull stripper
  if extractor is None:
    extractor = Extractor()

  # get probability of part of image being brain tissue or not
  prob = extractor.run(slice_axial)
  mask = prob > 1e-3 # mask can be obtained as:
  slice_axial[~mask] = 0 # apply mask

  # trim blank ones; slicing clamps, so smaller in-plane sizes give a smaller crop
  slice_axial = slice_axial[30:230, 30:230, :]

  # mirrored copy: same result as flipping axis 1 of every first-axis row,
  # done in one vectorised call
  flipped_slices = np.flip(slice_axial, axis=2).copy()

  ######################
  # TODO: NEED A WAY TO DEAL WITH DIFFERENT DIMENSION IMAGE SIZES
  # LOOP THROUGH AND CHECK THE SAMPLE SIZE?
  ######################

  # expand dimensions to meet input requirements: append a channel axis.
  # axis=-1 is used because axis=4 is out of range for a 3-D array and is
  # rejected by current NumPy releases.
  slice_axial = np.expand_dims(slice_axial, axis=-1)
  flipped_slices = np.expand_dims(flipped_slices, axis=-1)

  return slice_axial, flipped_slices, image_info

In [8]:
import keras

class DataGenerator(keras.utils.Sequence):
    '''Generates data for Keras, loading and preprocessing scans on the fly.

    Each patient ID contributes two samples to a batch: the processed volume
    and its flipped copy, both with the patient's label.

    Args:
        list_IDs: indexable collection of patient IDs to draw from.
        labels: mapping from patient ID to integer class label.
        batch_size: number of patient IDs per batch (so up to
            2*batch_size samples are returned per batch).
        dim: expected shape of one processed volume, without the channel axis.
        n_channels: size of the trailing channel axis.
        n_classes: number of classes for the one-hot labels.
        shuffle: whether to reshuffle the ID order at the end of each epoch.
    '''
    def __init__(self, list_IDs, labels, batch_size=10, dim=(200,200,160), n_channels=1,
                 n_classes=2, shuffle=True):
        '''Store the configuration and build the initial index order.'''
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        '''Number of full batches per epoch; leftover IDs that do not fill a batch are skipped.'''
        return int(np.floor(len(self.list_IDs) / self.batch_size))

    def __getitem__(self, index):
        '''Generate batch number `index` as an (X, y) pair.'''
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        # Find list of IDs
        list_IDs_temp = [self.list_IDs[k] for k in indexes]

        # Generate data
        X, y = self.__data_generation(list_IDs_temp)

        return X, y

    def on_epoch_end(self):
        '''Rebuild the index order, shuffling it when `shuffle` is set.'''
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        '''Load, process and stack the samples for the given patient IDs.

        A patient whose scan cannot be loaded or does not match `dim` is
        reported and left out, so the batch only ever contains real data
        (previously its rows kept whatever np.empty had left in memory).

        Returns:
            X: float32 array of shape (n_samples, *dim, n_channels)
            y: one-hot labels of shape (n_samples, n_classes)
            where n_samples is twice the number of patients that loaded.

        Raises:
            ValueError: if no patient in the batch could be loaded.
        '''
        # room for original + flipped sample per ID; float32 halves the buffer
        # size compared with numpy's float64 default
        capacity = 2 * len(list_IDs_temp)
        X = np.empty((capacity, *self.dim, self.n_channels), dtype=np.float32)
        y = np.empty((capacity), dtype=int)
        filled = 0  # number of rows of X / y that hold valid samples

        # Generate data
        for ID in list_IDs_temp:

            try:
                # get and process sample
                slices, image_info = get_slices(ID) # grab slices for a patient
                slice_axial, flipped_slices, sl_info = process_slice([slices, image_info])
                label = self.labels[ID]

                # Store sample and class; `filled` only advances once all of
                # this succeeded, so a failure never leaves a half-written pair
                X[filled,] = slice_axial
                X[filled+1,] = flipped_slices # flipped version
                y[filled] = label
                y[filled+1] = label
                filled += 2

            except Exception as e:
                print ("Ayyah we have an error with ID %s: %s" % (str(ID),str(e)))

        if filled == 0:
            raise ValueError("No usable samples in batch for IDs %s" % str(list(list_IDs_temp)))

        return X[:filled], keras.utils.to_categorical(y[:filled], num_classes=self.n_classes)

Using TensorFlow backend.


In [0]:
from keras.utils import to_categorical

# Draw a working sample of 50 rows and split its subject IDs into train / validation.
# need to format data into the right method..
# NOTE(review): neither sample() nor train_test_split() is given a random_state,
# so every run produces a different sample and a different split.
# get_slices reads patient_df_sample as a global, so this cell must run before any generator is used.
patient_df_sample = patient_df.sample(50)
patient_df_sample_id = patient_df_sample['Subject'].values

# split to train and test set
# NOTE(review): the CSV can hold several rows per Subject, so the same subject may
# land in both sets - confirm whether IDs should be de-duplicated first.
train_set, validation_set = train_test_split(patient_df_sample_id, test_size=0.2)

# Subject -> Group lookup used by DataGenerator to label each sample
labels = pd.Series(patient_df_sample.Group.values,index=patient_df_sample.Subject).to_dict()

# Need to process labels still to keras categorical, also get skl train test split anyhow

# build dictionary and populate
partition = {}
partition['train'] = train_set
partition['validation'] = validation_set

# traditional approach
# (y_output is not read by any later cell in this notebook as shown)
y_output = np.array(to_categorical(patient_df_sample.Group.values, 2)) # convert to something categorical with keras util

In [10]:
# get distribution of our outputs, to test if our results are better than random
# np.unique returns the sorted unique values and, with return_counts=True, their counts.
distribution_array = np.unique(patient_df_sample.Group.values, return_counts=True)
# counts[0] belongs to the smallest value present; that is Control (0) only if the
# sample contains at least one control.
# NOTE(review): the PD line below assumes Group holds nothing but 0 and 1 - confirm.
percentage_control = distribution_array[1][0]/np.sum(distribution_array[1])*100

print ("Percentage Control: %f%%" % percentage_control)
print ("Percentage PD: %f%%" % (100-percentage_control))

Percentage Control: 28.000000%
Percentage PD: 72.000000%


In [0]:
# Traditional Approach: Load data, process all the data, store it in total slices
# then compile it all, split it and run it through a model

In [0]:
from keras.models import Sequential

# Parameters
# Shared settings for both generators. 'dim' plus 'n_channels' has to agree with
# the model's input_shape and with the volumes produced by process_slice.
# 'batch_size' counts patient IDs; the generator emits two samples per ID
# (the volume and its flipped copy).
params = {'dim': (200,200,160),
          'batch_size': 10,
          'n_classes': 2,
          'n_channels': 1,
          'shuffle': True}

# Generators
# one per partition, both labelled from the same Subject -> Group dictionary
training_generator = DataGenerator(partition['train'], labels, **params)
validation_generator = DataGenerator(partition['validation'], labels, **params)

In [0]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Convolution3D, MaxPooling3D
from keras.layers.normalization import BatchNormalization
from keras.optimizers import SGD, RMSprop, Adam
from keras.utils import np_utils, generic_utils
from keras.layers import LeakyReLU

In [14]:
# Design model
# Six identical stages (3-D convolution -> LeakyReLU -> max pooling), followed by
# a 512-unit dense layer and a 2-way softmax output.
model = Sequential()

# (filters, kernel_size) of each convolution stage, in order
conv_stages = [(8, 2), (16, 2), (32, 3), (64, 3), (128, 4), (256, 4)]

for stage_index, (n_filters, kernel) in enumerate(conv_stages):
  conv_kwargs = dict(filters=n_filters, kernel_size=kernel, strides=1, padding='same')
  if stage_index == 0:
    # only the first layer declares the input volume shape
    conv_kwargs['input_shape'] = (200,200,160,1)
  model.add(Convolution3D(**conv_kwargs)) # or should activation be linear?
  model.add(LeakyReLU(alpha=0.01)) # set to 0.01
  model.add(MaxPooling3D())

# classifier head
model.add(Flatten())
model.add(Dense(512))
model.add(LeakyReLU(alpha=0.01))

model.add(Dense(2, activation='softmax'))

model.compile(optimizer=Adam(lr=0.00005), loss='categorical_crossentropy',metrics = ['accuracy']) # metrics=['categorical_accuracy']

Instructions for updating:
Colocations handled automatically by placer.


In [15]:
# Train model on dataset with generator (loading and inputting data on the fly)
# https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
# No `epochs` argument is given, so a single epoch is run (the log below shows "Epoch 1/1").
# Batches are produced by 6 worker processes; each worker loads and skull-strips
# full volumes, so memory use grows with the worker count.
# NOTE(review): the recorded run ended in ResourceExhaustedError - batch size,
# input size and worker count need revisiting before this cell can complete.
model.fit_generator(generator=training_generator,
                    validation_data=validation_generator,
                    use_multiprocessing=True,
                    workers=6)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1
Instructions for updating:
Use tf.gfile.GFile.




Instructions for updating:
Use tf.gfile.GFile.




Instructions for updating:
Use tf.gfile.GFile.
Instructions for updating:
Use tf.gfile.GFile.




Instructions for updating:
Use tf.gfile.GFile.




Ayyah we have an error with ID 3160: could not broadcast input array from shape (200,200,145,1) into shape (200,200,160,1)


ResourceExhaustedError: ignored