<a href="https://colab.research.google.com/github/StevenVuong/MSc_Project/blob/master/p2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**This notebook is concerned with loading all the DCM files into an array and storing them accordingly**

In [1]:
# install dependencies
!pip install deepbrain; # semi-colon to hide the output
!pip install pydicom;



In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pydicom
import pickle
from deepbrain import Extractor
from sklearn.model_selection import train_test_split
import nibabel as nb

In [3]:
from google.colab import drive

# mount google drive into google colab
drive.mount('/content/gdrive')

os.chdir('/content')

# go to where the data is
print (os.listdir())
os.chdir('gdrive/My Drive/msc_project')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
['.config', 'gdrive', 'sample_data']


**Load Dataframe and select subsample of patients to load**

In [4]:
# load the data frame pickle file
patient_df = pd.read_pickle('processed_patient_df_TRY2.pkl')
print ("Number of image samples: %s" % len(patient_df))
patient_df.head()

Number of image samples: 351


Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,1130198,75422,0,M,73,1,MRI,MPRAGE GRAPPA,Original,11/13/2018,DCM,5/07/2019
2,1130191,75414,0,F,73,1,MRI,Sag MPRAGE GRAPPA,Original,12/13/2018,DCM,4/24/2019
3,1125041,74375,0,F,59,1,MRI,MPRAGE_GRAPPA,Original,9/06/2018,DCM,4/24/2019
4,1003469,72138,0,F,55,1,MRI,MPRAGE GRAPPA,Original,2/19/2018,DCM,4/24/2019
5,1119693,71935,1,M,66,1,MRI,MPRAGE GRAPPA,Original,4/03/2018,DCM,4/24/2019


**Functions to define grab the corresponding patient files into an array**

In [0]:
cwd = os.getcwd()
patients_file_dir = 'T1_SAG_SIEMEN_3T_CLEAN_TRY2'

In [0]:
def get_grappa_dir(path):
  # get the file ending with 'GRAPPA', would need to accomodate this for grappa also
  returning_path = None
  for next_path in os.listdir(path):
    last_item = next_path.split("_")[-1]
    acceptable_last_items = ['GRAPPA', 'MPRAGE', 'SAG', 'ND', '2']
    if (last_item in acceptable_last_items): # for the t1 weighted
      returning_path = next_path
      return returning_path
    else: # some print statements to debug
      print ("NOT ACCEPTABLE: %s " % last_item)
      print ("Path: %s " % next_path)

def get_dcm_s(path):
  # get the path beginning with S, so doesn't clash with GZ File
  for next_path in os.listdir(path):
    if (next_path[0] == 'S'):
      return next_path

def get_path_to_patient(patient):
  path = None
  paths = [] # list of all dated scans for the patient
  try:
    # label = patient_df.get_value(patient, 'Subject') # cannot go by patient, must get the ID
    path = cwd + '/' + patients_file_dir + '/' + str(patient) # get to the GRAPPA 
    path = path + '/' + get_grappa_dir(path)
    
    # check for multiple patient paths, add all to list
    for scan_id in os.listdir(path):
      path_scan = path + '/' + scan_id
      path_scan = path_scan + '/' +get_dcm_s(path_scan)
      paths.append(path_scan)
      
  except Exception as e:
    print ("No path found for patient %s: %s" % (str(patient),str(e)))
    
  return paths

In [99]:
# just incase this was needed..
backup_df = pd.read_csv('T1_SAG_SIEMEN_3T_CLEAN_5_29_2019.csv') # last straw df?
backup_df.head()

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,1130198,75422,GenCohort Unaff,M,73,1,MRI,MPRAGE GRAPPA,Original,11/13/2018,DCM,5/07/2019
1,1130190,75414,GenCohort Unaff,F,73,1,MRI,Sag MPRAGE GRAPPA,Original,12/13/2018,DCM,4/24/2019
2,1130191,75414,GenCohort Unaff,F,73,1,MRI,Sag MPRAGE GRAPPA,Original,12/13/2018,DCM,4/24/2019
3,1125041,74375,GenCohort Unaff,F,59,1,MRI,MPRAGE_GRAPPA,Original,9/06/2018,DCM,4/24/2019
4,1003469,72138,GenCohort Unaff,F,55,1,MRI,MPRAGE GRAPPA,Original,2/19/2018,DCM,4/24/2019


In [101]:
# Map GenCohort to regular PD and Controls
backup_df['Group'] = backup_df['Group'].replace({'GenCohort PD':'PD', 'GenCohort Unaff':'Control'})

# map control to 0 and pd to 1
backup_df['Group'] = backup_df['Group'].replace({'Control':0, 'PD':1})

TypeError: ignored

In [0]:
def get_img_no(path):
# get the image identification numberm any image will do for this so take first
  image_number = None
  example_file = os.listdir(path)[0]
  image_number = int(example_file.split("_")[-1][1:-4]) # index to get the ID
   
  return image_number

def filename_sort(filename):
    
    # split by underlines and delimiter
    split_line = filename.split("_")
    int_return = int(split_line[-3])
    
    return int_return
  
def get_slices(patient):
  '''For any given patient, returns the slices for them'''
  try:
    paths = get_path_to_patient(patient) # Should return multiple paths
    all_slices = []
    all_info = []
    print (patient)
    for path in paths:
      
      dcm_files = os.listdir(path) # path where dcm files are
      print (len(dcm_files))
      # get information related around the image
      image_number = get_img_no(path)
      image_row = patient_df_sample.loc[patient_df_sample['Image Data ID'] == int(image_number)] # relate to df
      
      if (image_row.empty == True):
        # Pulls from the larger data frame
        print (image_number)
        image_row =  patient_df.loc[patient_df['Image Data ID'] == int(image_number)]
        
        # pulls from really big df
        if (image_row.empty == True):
          image_row = backup_df.loc[backup_df['Image Data ID'] == int(image_number)]
       
      print (image_row) # Problem is here, need to pull from the larger frame (undeleted one)
      
      image_sex = image_row.Sex.values[0]
      image_group = image_row.Group.values[0]
      image_age = image_row.Age.values[0]

      # create image object and append to total info
      image_info = [image_number, image_sex, image_group, image_age]

      # print ("Sex: %s, Age: %s, Group: %s " % (image_sex, image_age, image_group))

      # sort dcm files in order
      dcm_files = sorted(dcm_files, key=lambda filename: filename_sort(filename)) # some have length 3

      slices = []
      # loop through slices and build the array
      for dcm_file in dcm_files:
        path_to_file = path + '/' + dcm_file
        slices.append(pydicom.read_file(path_to_file).pixel_array)
      slices = np.array(slices)[15:175, :, :]
      
      # add all the information about slices and information
      all_slices.append(slices)
      all_info.append(image_info)
    
    return all_slices, all_info
 
  except Exception as e:
    print ("No File Found: %s" % str(e))
    
def process_slice(total_slice):
  # deal with mixed slice information
  slices = total_slice[0]
  
  # transform into axial view
  slice_axial = slices.transpose((1,2,0))
  
  # initialise skull stripper
  ext = Extractor()

  # get probability of part of image being brain tissue or not
  prob = ext.run(slice_axial)
  mask = prob > 1e-3 # mask can be obtained as:
  slice_axial[~mask] = 0 # apply mask
  
  slice_axial = slice_axial[30:230, 30:230, :] # trim blank ones
  
  # flip images and add to total processed arrays
  flipped_slices = np.array([np.flip(sl,1) for sl in slice_axial])
  
  # expand dimensions to meet input requirements
  slice_axial = np.expand_dims(slice_axial, axis=4)
  flipped_slices = np.expand_dims(flipped_slices, axis=4)
  
  return slice_axial, flipped_slices, total_slice[1]

In [96]:
# get the sample of patients we want to use 
patient_df_sample = patient_df # .sample(351) (USING WHOLE DF)

# load those patients and put them into an array
# get the y-values for the corresponding array
# save the file in one big numpy array (including multiple images per same patient (treated independently))
patient_df_sample_id = patient_df_sample['Subject'].values

len(patient_df_sample_id) # we got it all

351

In [97]:
total_slices = []
total_slices_info = []

# loop through, put slices and info into one giant array
for patient_id in patient_df_sample_id:
  
  slices_array, image_info_array = get_slices(patient_id) # grab slices for a patient
  
  for i in range(len(slices_array)): # reference from larger array
    slices = slices_array[i]
    image_info = image_info_array[i]
    
    # process each slice (problem is this process step)
    slice_axial, flipped_slices, sl_info = process_slice([slices, image_info]) 
    
    # add to our total array only if match shape
    if np.shape(slice_axial) == (200, 200, 160, 1): 
      total_slices.append(slice_axial)
      total_slices.append(flipped_slices)
      total_slices_info.append(sl_info)
      total_slices_info.append(sl_info)
    
# turn into numpy arrays
total_slices = np.array(total_slices)
total_slices_info = np.array(total_slices_info)

75422
176
   Image Data ID  Subject  Group Sex  ...      Type    Acq Date Format Downloaded
0        1130198    75422      0   M  ...  Original  11/13/2018    DCM  5/07/2019

[1 rows x 12 columns]




75414
176
1130190
   Image Data ID  Subject            Group  ...    Acq Date  Format  Downloaded
1        1130190    75414  GenCohort Unaff  ...  12/13/2018     DCM   4/24/2019

[1 rows x 12 columns]


KeyboardInterrupt: ignored

**Save total slices array and slice info into one giant array**

In [0]:
with open('total_slices_all_TRY2.pkl', 'wb') as f:
    pickle.dump([total_slices, total_slices_info], f)

**Put in one giant function**

In [0]:
def process_multitudes(batch_size, num_batches):
  '''Process to get multiple random batches'''
  for n_batch in range(num_batches):

    # randomly select samples 
    patient_df_sample = patient_df.sample(batch_size)

    # load those patients and put them into an array
    patient_df_sample_id = patient_df_sample['Subject'].values
    
    # to store the arrays
    total_slices = []
    total_slices_info = []

    # loop through, put slices in the sample and info into one giant array
    for patient_id in patient_df_sample_id:

      slices_array, image_info_array = get_slices(patient_id) # grab slices for a patient

      for i in range(len(slices_array)): # reference from larger array
        slices = slices_array[i]
        image_info = image_info_array[i]

        # process each slice
        slice_axial, flipped_slices, sl_info = process_slice([slices, image_info]) 
            
        # add to our total array only if match shape
        if np.shape(slice_axial) == (200, 200, 160, 1): 
          total_slices.append(slice_axial)
          total_slices.append(flipped_slices)
          total_slices_info.append(sl_info)
          total_slices_info.append(sl_info)

    # store the file as a pickle
    file_name = ('stored_batches/total_slices_batch' + str(n_batch) + '.pkl')
    with open(file_name, 'wb') as f:
      pickle.dump([total_slices, total_slices_info], f)

In [0]:
# run the function with 10 batch sizes of 50
process_multitudes(50, 10)

**Test Slice showing an image post-processing**

In [0]:
# test slice

from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets # interactive plots
import matplotlib.pyplot as plt
%matplotlib inline

slice_ex = total_slices[0,:,:,:,0]
def g(i): # basic slideshow plot to get an idea of the effectiveness of the mask itself
    plt.figure(figsize=(15,8)) # make plot larger
    plt.imshow(slice_ex[i])
    plt.show()
    return None
  
interact(g, i=widgets.IntSlider(min=0,max=(len(slice_ex)-1),step=1,value=65)); # plots our axial view, this is it