# This Notebook...
...preprocesses the patient metadata and saves it locally as "extracted_meta.pkl".

# Dependencies

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

from utils import unique, combine_mr_seqs

import pickle

# Load and investigate data

In [13]:
# Read the extended MRI summary spreadsheet into a DataFrame.
mri_summary_path = "/home/simjo484/Desktop/link_to_xml_data/MRI_summary_extended.xlsx"
data_raw = pd.read_excel(mri_summary_path)

# Quick look at the size of the table and the columns it provides.
print(f"Shape:  {data_raw.shape} \n")
print(data_raw.columns)


Shape:  (252144, 38) 

Index([                 'subjetID',                    'gender',
                       'ethnicity',                  'survival',
                    'session_name',           'type_of_session',
                'From prev. study',            'session_status',
                       'diagnosis',                     'Notes',
                'tumor_descriptor',          'age_at_diagnosis',
       'age_at_sample_acquisition',            'tumor_location',
                      'image_type',             'magnification',
                         'scanner',               'dimension_x',
                     'dimension_y',               'dimension_z',
                        'pixels_x',                  'pixels_y',
                        'pixels_z',                 'file_name',
                         'Column1',                        '_1',
                     'Unnamed: 26',    'session-previous study',
                     'Unnamed: 28',            'session_name.1',
  

# Filter on the *pre-op* sequences of patients

In [14]:
# Step 1: keep only the rows whose session_status is "pre_op".
is_preop = data_raw["session_status"] == "pre_op"
data_preop = data_raw[is_preop]
print(data_preop.shape)

# Step 2: count patients per diagnosis. Dropping duplicate (subjetID, diagnosis)
# pairs leaves one row per patient/diagnosis combination, so the tally returned
# by `unique` ("Values"/"Counts" columns) is a per-diagnosis patient count.
patient_diagnosis_pairs = data_preop.drop_duplicates(subset=["subjetID", "diagnosis"])
unique_subjetID = unique(patient_diagnosis_pairs["diagnosis"])

# Step 3: find the diagnoses that have at least 18 patients.
min_patients_per_diagnosis = 18
has_enough_patients = unique_subjetID["Counts"] >= min_patients_per_diagnosis
diagnoses = unique_subjetID[has_enough_patients]["Values"].tolist()

# Step 4: restrict the pre-op rows to those diagnoses.
in_kept_diagnoses = data_preop["diagnosis"].isin(diagnoses)
data_preop_diags = data_preop[in_kept_diagnoses]
print(data_preop_diags.shape)

(27813, 38)
(26526, 38)


# Rename and combine features
Diagnosis names are abbreviated as follows:
* "Low-grade glioma/astrocytoma (WHO grade I/II)" ---> "L-GA"
* "Medulloblastoma" ---> "Medu"
* "High-grade glioma/astrocytoma (WHO grade III/IV)" ---> "H-GA"
* "Ganglioglioma" ---> "Gang"
* "Ependymoma" ---> "Epen"
* "Atypical Teratoid Rhabdoid Tumor (ATRT)" ---> "ATRT"
* "Brainstem glioma- Diffuse intrinsic pontine glioma" ---> "DIPG"
* "Craniopharyngioma" ---> "Cran"

# How many unique patient and session pairs are there?

In [2]:
# Tally the MR sequence labels of the filtered pre-op rows.
# Needs `data_preop_diags` from the pre-op filtering cell; running this cell
# before that one raises a NameError.
# `combine_mr_seqs` comes from utils — presumably it maps the raw image_type
# values onto a smaller set of sequence labels (TODO confirm in utils).
mr_sequence_labels = combine_mr_seqs(data_preop_diags["image_type"])
unique(mr_sequence_labels)

NameError: name 'data_preop_diags' is not defined

In [48]:
# Build a separate frame (data_preop_diags itself is left untouched) with one
# row per (subjetID, session_name, image_type) combination, then replace the
# raw image_type values with the labels returned by combine_mr_seqs.
# Note that de-duplication happens on the raw image_type, before relabelling.
df_patient_sessions = (
    data_preop_diags
    .drop_duplicates(subset=["subjetID", "session_name", "image_type"])
    .assign(image_type=lambda scans: combine_mr_seqs(scans["image_type"]))
)

# Show how often each combined label occurs.
unique(df_patient_sessions["image_type"])

Unnamed: 0,Values,Counts
17,remove,1973
9,T1W,1403
14,T2W,1310
6,FLAIR,1116
10,T1W-GD,1063
0,ADC,1044
16,TRACE,849
3,DIFFUSION,735
7,MPR,559
4,EXP,501


# Save the extracted data locally

In [6]:
# The filtered pre-op rows (diagnoses with enough patients) are the main
# dataset of sequences for the downstream analysis.
# NOTE(review): this is data_preop_diags as filtered above — the relabelled
# image_type values in df_patient_sessions and the diagnosis abbreviations
# listed in the markdown are not applied to it; confirm that is intended.
extracted_meta = data_preop_diags

# Persist it to local storage as a pickle file.
extracted_meta_path = "/local/data1/simjo484/mt_data/all_data/MRI/extracted_meta.pkl"
with open(extracted_meta_path, "wb") as pickle_file:
    pickle.dump(extracted_meta, pickle_file)