<a href="https://colab.research.google.com/github/StevenVuong/MSc_Project/blob/master/p1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**This notebook cleans the dataframe loaded from the CSV file so that only subjects with valid entries remain, and reports how many such subjects there are.**

In [1]:
# install dependencies
!pip install deepbrain; # semi-colon to hide the output
!pip install pydicom;

Collecting deepbrain
[?25l  Downloading https://files.pythonhosted.org/packages/61/e4/1f4f6c483dd9c5c1e0e38193b3c7ade8beb13c24c90bab593ca545c7da92/deepbrain-0.1-py3-none-any.whl (677kB)
[K     |████████████████████████████████| 686kB 2.8MB/s 
Installing collected packages: deepbrain
Successfully installed deepbrain-0.1
Collecting pydicom
[?25l  Downloading https://files.pythonhosted.org/packages/43/88/d3c419ab2e753e7651510882a53219373e78fb55294cb247dffd3934ea55/pydicom-1.2.2-py2.py3-none-any.whl (7.0MB)
[K     |████████████████████████████████| 7.0MB 2.8MB/s 
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-1.2.2


In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pydicom
import pickle
from deepbrain import Extractor
from sklearn.model_selection import train_test_split
import nibabel as nb

In [3]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem
drive.mount('/content/gdrive')

# Start from the Colab root and list it, as a quick check that 'gdrive' showed up
os.chdir('/content')
print (os.listdir(os.getcwd()))

# Move to where the data is; this relative path resolves against /content
os.chdir('gdrive/My Drive/msc_project')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive
['.config', 'gdrive', 'sample_data']


**Load the dataframe and relabel the GenCohort groups as PD / Control**

In [4]:
# https://www.kaggle.com/sentdex/first-pass-through-data-w-3d-convnet
# Scan folder, its directory listing, and the CSV used to cross-reference it
patients_file_dir = 'T1_SAG_SIEMEN_3T_CLEAN'
patients = os.listdir(patients_file_dir) # every entry found in the scan folder
patient_df = pd.read_csv('T1_SAG_SIEMEN_3T_CLEAN_5_29_2019.csv')

# Relabel in two chained passes: first fold the GenCohort groups into the plain
# PD / Control labels, then encode Control as 0 and PD as 1
patient_df['Group'] = (
    patient_df['Group']
    .replace({'GenCohort PD':'PD', 'GenCohort Unaff':'Control'})
    .replace({'Control':0, 'PD':1})
)

print ("There are %d number of patients prior to processing" % patient_df.shape[0])
patient_df.head()

There are 381 number of patients prior to processing


Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,1130198,75422,0,M,73,1,MRI,MPRAGE GRAPPA,Original,11/13/2018,DCM,5/07/2019
1,1130190,75414,0,F,73,1,MRI,Sag MPRAGE GRAPPA,Original,12/13/2018,DCM,4/24/2019
2,1130191,75414,0,F,73,1,MRI,Sag MPRAGE GRAPPA,Original,12/13/2018,DCM,4/24/2019
3,1125041,74375,0,F,59,1,MRI,MPRAGE_GRAPPA,Original,9/06/2018,DCM,4/24/2019
4,1003469,72138,0,F,55,1,MRI,MPRAGE GRAPPA,Original,2/19/2018,DCM,4/24/2019


In [5]:
def get_grappa_dir(path):
  """Return the first entry of `path` whose name ends in an accepted T1 tag.

  The tag is the text after the last underscore of the entry name (the whole
  name when there is no underscore). Entries that are inspected and rejected
  are printed to help debugging. Returns None when no entry qualifies.
  """
  t1_tags = ('GRAPPA', 'MPRAGE', 'SAG', 'ND')

  for entry in os.listdir(path):
    tag = entry.rsplit("_", 1)[-1]
    if tag not in t1_tags:
      # report what was skipped, then keep looking
      print ("NOT ACCEPTABLE: %s " % tag)
      print ("Path: %s " % entry)
      continue
    return entry # first match wins

  return None

def get_dcm_s(path):
  """Return the first entry of `path` whose name starts with 'S', or None.

  Matching on the leading 'S' is, per the original note, what keeps this
  from clashing with the GZ file.
  """
  s_entries = (entry for entry in os.listdir(path) if entry.startswith('S'))
  return next(s_entries, None)

def get_path_to_patient(patient):
  """Build the path to the DICOM folder of one patient.

  The layout walked is
  <cwd>/<patients_file_dir>/<patient>/<T1 folder>/<scan folder>/<S... folder>,
  where the T1 folder comes from get_grappa_dir and the S folder from get_dcm_s.

  Args:
    patient: subject id; converted with str() to form the folder name.

  Returns:
    The full path as a string, or None when any level of the layout could not
    be resolved (the reason is printed). None is returned rather than the
    half-built path so callers cannot mistake a parent folder for the DICOM
    folder.
  """
  try:
    path = os.path.join(cwd, patients_file_dir, str(patient)) # patient folder
    path = os.path.join(path, get_grappa_dir(path)) # get to the GRAPPA
    # os.listdir() gives no ordering guarantee, so sort before taking the last
    # entry. Assumes scan folders are named so that lexicographic order follows
    # acquisition time, which makes the last one the most recent scan - TODO confirm.
    path = os.path.join(path, sorted(os.listdir(path))[-1])
    path = os.path.join(path, get_dcm_s(path))
  except Exception as e:
    print ("No path found for patient %s: %s" % (str(patient),str(e)))
    return None

  return path
    
def get_no_scans(patient_ids):
  '''Return patient list which have no scans.

  A patient counts as having no scans when its DICOM folder is empty, or when
  the path returned by get_path_to_patient is None or is not an existing folder.

  Args:
    patient_ids: iterable of subject ids.

  Returns:
    List of the ids (in input order) that have no scans.
  '''
  no_scan_list = []

  for patient_id in patient_ids:
    patient_path = get_path_to_patient(patient_id)

    # If the lookup failed the result may be None or may not point at an
    # existing folder. os.listdir(None) would silently list the working
    # directory and a missing folder would raise, so flag the patient instead.
    if patient_path is None or not os.path.isdir(patient_path):
      no_scan_list.append(patient_id)
      continue

    if not os.listdir(patient_path): # folder exists but holds nothing
      no_scan_list.append(patient_id)

  return no_scan_list
  
# Capture the working directory at the time this cell runs; get_path_to_patient
# and check_patients_exist build their scan paths from this value.
cwd = os.getcwd()
print (cwd)

def check_patients_exist(patient_ids, scan_root=None):
  '''Check patient number is stored in files, otherwise remove from df.

  Returns the ids from `patient_ids` that have NO matching folder on disk, so
  the caller can drop exactly those rows from the dataframe.

  Args:
    patient_ids: iterable of subject ids (e.g. the 'Subject' column).
    scan_root: folder holding one entry per patient; defaults to
      <cwd>/<patients_file_dir>.

  Returns:
    List of ids without a folder, each listed once, in first-seen order.
  '''
  if scan_root is None:
    scan_root = os.path.join(cwd, patients_file_dir)

  # Compare as strings: folder names are strings, and this way a stray
  # non-numeric entry in the folder cannot break the check.
  folders_on_disk = set(os.listdir(scan_root))

  to_remove_list = []
  already_listed = set()
  for patient_id in patient_ids:
    folder_name = str(patient_id)
    if folder_name in folders_on_disk or folder_name in already_listed:
      continue
    already_listed.add(folder_name) # ids can repeat in the input
    to_remove_list.append(patient_id)

  return to_remove_list

def check_img_size(patient_ids, min_slices=160, min_rows=240, min_cols=240):
  '''Return the patients whose scan is too small to use.

  For each patient the DICOM folder is located, its files are counted, and the
  pixel array of the first listed file is measured. A patient is reported when
  the folder cannot be resolved, holds fewer than `min_slices` files, or the
  measured image is smaller than `min_rows` x `min_cols`.
  The original note says scans must meet a size of [15:175, 30:230, 30:230];
  presumably that is a later crop - TODO confirm against the code that crops.

  Args:
    patient_ids: iterable of subject ids.
    min_slices: minimum number of files required in the folder.
    min_rows: minimum size of the first pixel-array dimension.
    min_cols: minimum size of the second pixel-array dimension.

  Returns:
    List of the offending ids, each listed at most once per input entry.
  '''
  mismatched_patients = []

  for patient_id in patient_ids:
    path = get_path_to_patient(patient_id)

    # an unresolved folder cannot meet the size requirement
    if path is None or not os.path.isdir(path):
      mismatched_patients.append(patient_id)
      continue

    dcm_files = os.listdir(path)

    # too few files (including none at all): report once and skip the read
    if not dcm_files or len(dcm_files) < min_slices:
      mismatched_patients.append(patient_id)
      continue

    # only one file is inspected; it stands in for the whole series
    first_dcm_path = os.path.join(path, dcm_files[0])
    rows, cols = np.shape(pydicom.dcmread(first_dcm_path).pixel_array)[:2]

    if rows < min_rows or cols < min_cols:
      mismatched_patients.append(patient_id)

  return mismatched_patients

/content/gdrive/My Drive/msc_project


**Clean up Dataframe**

In [0]:
# Subject ids as listed in the dataframe
patient_ids = patient_df['Subject']

# First pass: collect the ids flagged by the no-scan check and the existence check
patients_missing_scans = get_no_scans(patient_ids)
patients_absent = check_patients_exist(patient_ids)
union_list = list(set(patients_missing_scans + patients_absent))

# keep only the rows whose subject is NOT in the flagged list
patient_df = patient_df.loc[~patient_df['Subject'].isin(union_list)]

# Second pass, on the remaining subjects only: flag those failing the image-size check
patient_ids = patient_df['Subject']
patients_wrong_imgsize = check_img_size(patient_ids)
patient_df = patient_df.loc[~patient_df['Subject'].isin(patients_wrong_imgsize)]

# One row per subject: where a subject id repeats, the last row is kept
# (per the original note, the subject's multiple images are picked up later, when its folder is opened)
patient_df = patient_df.drop_duplicates(subset='Subject', keep='last')

In [7]:
# Report the row count of the cleaned dataframe, then preview its first rows
print ("There are %d number of patients after processing" % patient_df.shape[0])
patient_df.head()

There are 181 number of patients after processing


Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
197,353477,3750,0,M,54,5,MRI,MPRAGE GRAPPA,Original,10/10/2012,DCM,
198,368584,3638,1,M,66,1,MRI,SAG T1 3D MPRAGE,Original,4/02/2013,DCM,
199,363979,3637,0,M,57,1,MRI,SAG T1 3D MPRAGE,Original,2/27/2013,DCM,
200,362046,3636,0,M,64,1,MRI,SAG T1 3D MPRAGE,Original,2/22/2013,DCM,
201,362045,3635,0,M,57,1,MRI,SAG T1 3D MPRAGE,Original,2/07/2013,DCM,


**Save df as a pickle file**

In [0]:
# Write the cleaned dataframe to 'processed_patient_df.pkl'; the path is relative,
# so the file lands in the current working directory.
patient_df.to_pickle('processed_patient_df.pkl')