<a href="https://colab.research.google.com/github/StevenVuong/MSc_Project/blob/master/Step1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**This notebook cleans the scan-metadata CSV, cross-references it with the downloaded DICOM scans to find the subjects that have valid entries, and then skull-strips, crops and saves those scans in pickled batches.**

In [1]:
# install dependencies
!pip install deepbrain; # semi-colon to hide the output
!pip install pydicom;



In [0]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
%matplotlib inline

import pydicom
import nibabel as nb
from deepbrain import Extractor
from sklearn.model_selection import train_test_split
from google.colab import drive

In [3]:
# mount google drive into google colab (the drive appears under /content/gdrive)
drive.mount('/content/gdrive')

# start from /content so that the relative chdir below resolves correctly
os.chdir('/content')

# go to where we will be working
# (every later relative path, e.g. the CSV and the scan folder, is resolved from here)
print (os.listdir())
os.chdir('gdrive/My Drive/msc_project/all_mprage_grappa')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
['.config', 'gdrive', 'sample_data']


**Load Dataframe**

In [4]:
# folder (relative to the working directory) holding one sub-folder per patient
patients_file_dir = 'raw_brainscans'

# patient IDs present on disk, plus the metadata table used to cross-reference them
patients = os.listdir(patients_file_dir)
patient_df = pd.read_csv('t1_mprage_grappa_ONLY.csv')

# encode the diagnosis group numerically: control -> 0, pd -> 1
group_codes = {'Control':0, 'PD':1}
patient_df['Group'] = patient_df['Group'].replace(group_codes)

# one row per scan in the CSV
n_rows = len(patient_df)
print ("There are %d number of patients prior to processing" % n_rows)
patient_df.head()

There are 259 number of patients prior to processing


Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,367385,4032,0,M,68,1,MRI,MPRAGE GRAPPA,Original,2/05/2013,DCM,5/29/2019
1,417044,4032,0,M,69,5,MRI,MPRAGE GRAPPA,Original,2/20/2014,DCM,
2,365287,4018,0,M,65,5,MRI,MPRAGE GRAPPA,Original,2/27/2013,DCM,
3,334563,4018,0,M,64,1,MRI,MPRAGE GRAPPA,Original,2/13/2012,DCM,5/29/2019
4,340891,4010,0,M,43,5,MRI,MPRAGE GRAPPA,Original,9/28/2012,DCM,


**Functions to locate and load the image files**

In [0]:
def get_grappa_dir(path, acceptable_last_items=('GRAPPA',)):
  '''Return the name of the first entry in `path` whose last "_"-separated token is accepted.

  Args:
    path: directory to search.
    acceptable_last_items: collection of tokens that the final "_"-separated
      part of an entry name may equal. Defaults to ('GRAPPA',).

  Returns:
    The matching entry name (not the full path), or None when nothing matches.

  Raises:
    OSError: if `path` cannot be listed.
  '''
  for next_path in os.listdir(path):
    last_item = next_path.split("_")[-1]
    if last_item in acceptable_last_items: # for the t1 weighted
      return next_path
    # some print statements to debug; join with a separator so the path is readable
    print ("NOT ACCEPTABLE: %s " % last_item)
    print ("Path: %s " % os.path.join(path, next_path))
  return None

In [0]:
def get_dcm_s(path):
  '''Return the first entry of `path` whose name starts with 'S', or None if there is none.

  Selecting on the leading 'S' is meant to avoid picking up the GZ file instead.
  '''
  s_entries = (entry for entry in os.listdir(path) if entry.startswith('S'))
  return next(s_entries, None)

In [0]:
cwd = os.getcwd() # working directory, captured once when this cell runs
def get_path_to_patient(patient):
  '''Return the DICOM folder of every dated scan found for `patient`.

  Looks under <cwd>/<patients_file_dir>/<patient>/<scan-type dir>/<scan date>/<S... dir>.

  Args:
    patient: patient ID; converted with str() to build the folder name.

  Returns:
    A list of folder paths, one per scan. Folders that cannot be listed or
    that have no 'S...' sub-folder are reported and skipped, so the list may
    be empty but the remaining scans of the patient are still returned.
  '''
  paths = [] # list of all dated scans for the patient
  patient_dir = cwd + '/' + patients_file_dir + '/' + str(patient)

  try:
    grappa_dir = get_grappa_dir(patient_dir)
    if grappa_dir is None:
      # report the real cause instead of a str + None TypeError
      print ("No path found for patient %s: %s" % (str(patient), "no matching scan-type directory in " + patient_dir))
      return paths
    grappa_path = patient_dir + '/' + grappa_dir
    scan_ids = os.listdir(grappa_path)
  except OSError as e:
    print ("No path found for patient %s: %s" % (str(patient),str(e)))
    return paths

  # check for multiple patient paths, add all to list
  for scan_id in scan_ids:
    path_scan = grappa_path + '/' + scan_id
    try:
      dcm_dir = get_dcm_s(path_scan)
    except OSError as e:
      # a stray file or unreadable folder should not cost us the other scans
      print ("No path found for patient %s: %s" % (str(patient),str(e)))
      continue
    if dcm_dir is None:
      print ("No path found for patient %s: %s" % (str(patient), "no 'S' directory in " + path_scan))
      continue
    paths.append(path_scan + '/' + dcm_dir)

  return paths

In [0]:
def get_img_no(path):
  '''Return the image identification number parsed from a file name in `path`.

  Any file in the folder will do, so the first one listed is used. The number
  is taken from the last "_"-separated token of the name, after dropping that
  token's first character and its last four characters.
  '''
  first_file = os.listdir(path)[0]
  id_token = first_file.rsplit("_", 1)[-1]
  return int(id_token[1:-4])

In [0]:
def filename_sort(filename):
    '''Sort key for a file name: the integer held in its third-from-last "_"-separated field.'''
    fields = filename.split("_")
    return int(fields[-3])

In [0]:
def _load_scan(path):
  '''Read one scan folder and return (slices, [image number, sex, group, age]).'''
  # get information related around the image and relate it to the dataframe
  image_number = get_img_no(path)
  image_row = patient_df.loc[patient_df['Image Data ID'] == image_number]
  if image_row.empty:
    raise LookupError("Image Data ID %d is not in the dataframe" % image_number)
  image_info = [image_number,
                image_row.Sex.values[0],
                image_row.Group.values[0],
                image_row.Age.values[0]]

  # sort dcm files in order, then stack their pixel data into one array
  dcm_files = sorted(os.listdir(path), key=filename_sort)
  # dcmread is the supported reader; read_file is a deprecated alias removed in pydicom 3
  slices = np.array([pydicom.dcmread(path + '/' + dcm_file).pixel_array
                     for dcm_file in dcm_files])
  return slices, image_info


def get_slices(patient):
  '''For any given patient, returns the slices for them.

  Args:
    patient: patient ID as used by get_path_to_patient.

  Returns:
    (all_slices, all_info): two parallel lists with one entry per scan that
    loaded successfully. all_slices holds the stacked pixel arrays and
    all_info holds [image number, sex, group, age]. Both lists are empty when
    nothing could be loaded, so the result can always be unpacked.
  '''
  all_slices = []
  all_info = []

  try:
    paths = get_path_to_patient(patient) # Should return multiple paths
  except Exception as e:
    print ("No File Found: %s" % str(e))
    return all_slices, all_info

  for path in paths:
    try:
      slices, image_info = _load_scan(path)
    except Exception as e:
      # skip only the scan that failed; keep the patient's other scans
      print ("No File Found: %s" % str(e))
      continue
    all_slices.append(slices)
    all_info.append(image_info)

  return all_slices, all_info

In [0]:
def process_slices(slices, extractor=None):
  '''Remove the skull from one scan volume, crop it, and build a mirrored copy.

  Args:
    slices: array of shape (176, 256, 240). Any other shape is rejected. The
      input array is not modified.
    extractor: optional object whose run(volume) returns a per-voxel brain
      probability with the same shape as the volume. When None, a new
      Extractor() is created for this call; pass one in to reuse it across
      many scans.

  Returns:
    (slices_axial, flipped_slices), both of shape (160, 160, 160, 1), where
    flipped_slices is slices_axial reversed along axis 2. Returns (None, None)
    when `slices` does not have the expected shape.
  '''
  # check shape is correct
  if np.shape(slices) != (176, 256, 240):
    return None, None

  # transform to axial view (NOTE(review): axis order was flagged "question this" - confirm)
  slices_axial = np.asarray(slices).transpose((1,2,0))

  # initialise skull stripper
  if extractor is None:
    extractor = Extractor()

  # get probability of part of image being brain tissue or not
  prob = extractor.run(slices_axial)
  mask = prob > 1e-3

  # index to reduce size and 'trim the fat' essentially; copying the cropped
  # region keeps the caller's array untouched when the mask is applied
  crop = (slice(35, 195), slice(50, 210), slice(10, 170))
  brain = slices_axial[crop].copy()
  brain[~mask[crop]] = 0 # apply mask

  # expand dimensions to meet input requirements (trailing length-1 axis);
  # axis=-1 is valid for a 3-D input, whereas axis=4 is out of range
  brain = np.expand_dims(brain, axis=-1)

  # flipped images: reversing axis 2 equals flipping axis 1 of every brain[i]
  flipped_slices = np.flip(brain, axis=2).copy()

  return brain, flipped_slices

**Load up and process image files (Put in one giant function)**

In [22]:
# get unique patient id's
patient_ids = patient_df['Subject']
unique_patient_ids = np.unique(patient_ids.values)

# shuffle the unique patient id's with a fixed seed, so that the processing
# order (and therefore the content of each saved batch) is the same on every run
SHUFFLE_SEED = 42
np.random.RandomState(SHUFFLE_SEED).shuffle(unique_patient_ids)
print (unique_patient_ids)

[3321 3157 3314 3768 3125 3176 3806 3320 3327 3178 3565 3174 3107 3114
 3372 3179 3569 3803 3316 3111 3358 3105 3305 3366 3756 3817 3350 3809
 3325 3173 3765 3122 3571 3118 3116 3365 3353 3554 3389 3373 3106 3123
 3168 3360 3131 3759 3184 3172 3132 3369 3769 3357 3354 3161 3113 3322
 3375 3175 3813 3150 3368 3181 3108 3171 4018 3805 3570 3572 3332 3380
 3160 3767 3555 3374 3359 3378 3311 3779 3134 3364 3124 3154 3115 3377
 3308 3371 3811 3361 3166 3182 4032 3309 3816 3367 3750 3127 3390 3300
 3120 3151 3351 3128 4004 3355 3104 3126 3563 4010 3551 3352 3112 3328
 3318 3165 3130 3301 3812 3307 3310 3804 3102 3129 3807 3119 3169]


In [0]:
total_slices = []
total_slices_info = []

# loop through, put slices and info into one giant array
for patient_id in unique_patient_ids:

  # grab slices for a patient; a failed load may come back as None, which is
  # treated here as "no scans" instead of crashing on the unpacking
  slices_array, image_info_array = get_slices(patient_id) or ([], [])

  # loop through all the scans a patient may have had
  for slices, image_info in zip(slices_array, image_info_array):
    if slices is None:
      continue

    # process each slice (problem is this process step)
    slice_axial, flipped_slice = process_slices(slices)
    if slice_axial is None:
      continue

    # add to our total array only if match shape; the flipped copy shares the
    # same info row as the scan it was made from
    if np.shape(slice_axial) == (160, 160, 160, 1):
      total_slices.append(slice_axial)
      total_slices.append(flipped_slice)
      total_slices_info.append(image_info)
      total_slices_info.append(image_info)

  print (len(total_slices))

# turn into numpy arrays
# NOTE(review): the info rows mix ints and the sex string, so NumPy stores them
# all as strings - convert back when loading if numeric values are needed
total_slices = np.array(total_slices)
total_slices_info = np.array(total_slices_info)

## Saving our dataset
# Save in chunks of BATCH_SIZE volumes. The number of chunks follows from the
# data, so nothing is dropped and no empty batch files are written.
BATCH_SIZE = 100
OUTPUT_DIR = 'processed_brains_aug'
os.makedirs(OUTPUT_DIR, exist_ok=True) # open() below does not create folders

for i, j in enumerate(range(0, len(total_slices), BATCH_SIZE)):
  k = j + BATCH_SIZE

  # create file and separate slices and information
  batch_name = (OUTPUT_DIR + '/batch' + str(i) + '.pkl')
  batch_slice = total_slices[j: k]
  batch_info = total_slices_info[j: k]

  # save
  with open(batch_name, 'wb') as f:
    pickle.dump([batch_slice, batch_info], f)
    

In [0]:
np.shape(total_slices) # sanity check of the stacked result; NOTE: must check for None's whilst loading

***Slice sample***

In [0]:
# sample slice: load one patient's scans and process the first of them
all_slices, all_info = get_slices(3320)
slices = all_slices[0]
slice_axials, flipped_slice = process_slices(slices)
np.shape(flipped_slice) # expected final shape is (160, 160, 160, 1): cropped volume plus a trailing axis

In [0]:
  # test slice: step through the processed sample volume one 2-D image at a time

  from ipywidgets import interact, interactive, fixed, interact_manual
  import ipywidgets as widgets # interactive plots
  import matplotlib.pyplot as plt
  %matplotlib inline

  # drop the trailing length-1 axis so that each slice_ex[i] is a plain 2-D image
  slice_ex = slice_axials[:,:,:,0]
  def g(i): # basic slideshow plot to get an idea of the effectiveness of the mask itself
      plt.figure(figsize=(15,8)) # make plot larger
      plt.imshow(slice_ex[i]) # i indexes the first axis of the processed volume
      plt.show()
      return None

  # slider over every index of the first axis, starting at 65
  interact(g, i=widgets.IntSlider(min=0,max=(len(slice_ex)-1),step=1,value=65)); # plots our axial view, this is it
  # img_slices, slice_info = process_slice(all_slices)