In [1]:
import pandas as pd
pd.set_option('display.max_rows', None)
import os
from os.path import join, exists
from tqdm import tqdm
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
# Clinical metadata spreadsheet (loaded with pd.read_csv further down).
dataset_path = '../../spreadsheet/Clinical_Metadata_FDG_PET_CT_Lesions.csv'
# Root directory that is walked for SUV_MIP* / SEG_MIP* image files.
# Alternative root, currently disabled: mips_dir = '../../MIP'
mips_dir = '../../MIP_square512'

# Output: study-level table including the fold and train_val/test allocation.
export_split_path = '../../spreadsheet/data_split_study_level.xlsx'
# Output: one row per SUV MIP image with its metadata and text prompt.
# Alternative output, currently disabled: export_path = '../../spreadsheet/MIP_paths_and_prompts.xlsx'
export_path = '../../spreadsheet/MIP_paths_and_prompts_square512.xlsx'

In [3]:
# Collect the paths of every SUV and segmentation MIP image found under mips_dir.
suv_mip_paths = []
seg_mip_paths = []
for root, _dirs, filenames in tqdm(os.walk(mips_dir)):
    for filename in filenames:
        # The two prefixes are mutually exclusive, so one branch at most applies.
        if filename.startswith('SUV_MIP'):
            suv_mip_paths.append(join(root, filename))
        elif filename.startswith('SEG_MIP'):
            seg_mip_paths.append(join(root, filename))

# os.walk yields directory entries in arbitrary, filesystem-dependent order.
# Sort so that everything derived from these lists (and the exported
# spreadsheet's row order) is reproducible across machines and runs.
suv_mip_paths.sort()
seg_mip_paths.sort()

3943it [00:00, 6362.44it/s]


In [4]:
# Build one metadata record per SUV MIP image.
# Subject and study IDs are taken from the first two directory levels below
# mips_dir, so the result does not depend on how deep mips_dir itself is.
records = []

for suv_mip_path in tqdm(suv_mip_paths):
    # Same substitution as before, applied to the whole path (directory names
    # may contain the token too).
    seg_mip_path = suv_mip_path.replace('SUV_MIP', 'SEG_MIP')

    # Expected layout relative to mips_dir: <subject>/<study>/.../<filename>
    rel_parts = os.path.relpath(suv_mip_path, mips_dir).split(os.sep)
    if len(rel_parts) < 3:
        raise ValueError(
            'Expected <subject>/<study>/.../<file> below {0!r}, got {1!r}'.format(
                mips_dir, suv_mip_path
            )
        )
    subject_id = rel_parts[0]
    study_id = rel_parts[1]

    # Axis and rotation are the 3rd- and 2nd-to-last '_'-separated filename fields.
    filename = rel_parts[-1]
    filename_comps = filename.split('_')
    axis = filename_comps[-3]
    degree = filename_comps[-2]

    records.append({
        'Subject ID': subject_id,
        'Study ID': study_id,
        'SUV_MIP_path': suv_mip_path,
        'SEG_MIP_path': seg_mip_path,
        'projection_axis': axis,
        'rotation_degrees': degree
    })

100%|██████████| 12168/12168 [00:00<00:00, 122670.42it/s]


In [5]:
# One row per SUV MIP image; the columns are the keys of the record dicts.
df = pd.DataFrame(records)
print(df.shape)

(12168, 6)


In [6]:
# Load the clinical metadata with every column kept as a string.
annot_df = pd.read_csv(dataset_path, dtype=str)
print(annot_df.shape)

# Keep a single row per (subject, study) pair and renumber the index 0..n-1.
study_key = ['Subject ID', 'Study UID']
annot_df = annot_df.drop_duplicates(subset=study_key, ignore_index=True)
print(annot_df.shape)

# 5-fold split stratified by diagnosis and grouped by subject. Each row is
# labelled with the fold in which it is held out. The positional indices
# returned by split() equal the index labels because the index was just reset.
sgkf = StratifiedGroupKFold(n_splits=5)
fold_splits = sgkf.split(X=annot_df, y=annot_df['diagnosis'], groups=annot_df['Subject ID'])
for fold, (_, test_index) in enumerate(fold_splits):
    annot_df.loc[test_index, 'fold'] = fold

(3042, 20)
(1014, 20)


In [7]:
# Fold 3 is held out as the test set; every other row goes to train/validation.
annot_df['allocated_set'] = (annot_df['fold'] == 3).map({True: 'test', False: 'train_val'})

In [8]:
# Derive 'Study ID' as the 4th '/'-separated component of 'File Location'
# (index 3), in one vectorized pass instead of row-by-row .loc writes.
study_ids = annot_df['File Location'].str.split('/').str[3]

# A missing value means the location was absent or had fewer than four
# components; fail here rather than let the later merge silently drop rows.
if study_ids.isna().any():
    raise ValueError(
        "{0} row(s) have a 'File Location' without a study component".format(
            int(study_ids.isna().sum())
        )
    )

annot_df['Study ID'] = study_ids

In [9]:
# Write the study-level split table, without the index column.
annot_df.to_excel(excel_writer=export_split_path, index=False)

In [10]:
# Show the annotation table's columns, including the added 'fold',
# 'allocated_set' and 'Study ID'.
annot_df.columns

Index(['Series UID', 'Collection', '3rd Party Analysis',
       'Data Description URI', 'Subject ID', 'Study UID', 'Study Description',
       'Study Date', 'Series Description', 'Manufacturer', 'Modality',
       'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size',
       'File Location', 'Download Timestamp', 'diagnosis', 'age', 'sex',
       'fold', 'allocated_set', 'Study ID'],
      dtype='object')

In [11]:
# Attach study-level clinical metadata and the split allocation to every MIP row.
metadata_cols = ['diagnosis', 'age', 'sex', 'fold', 'allocated_set']

df = (
    df
    # Drop metadata left over from a previous run of this cell, so a re-run
    # does not produce suffixed duplicate columns.
    .drop(columns=metadata_cols, errors='ignore')
    .merge(
        annot_df[['Study ID'] + metadata_cols],
        how='left',
        on='Study ID',
        # Each image must match at most one study row; a duplicated
        # 'Study ID' would otherwise silently multiply image rows.
        validate='many_to_one',
    )
    .reset_index(drop=True)
)

# 'allocated_set' is filled for every annotation row, so a missing value here
# can only mean the image's study was not found in the annotation table.
n_unmatched = int(df['allocated_set'].isna().sum())
if n_unmatched:
    raise ValueError(
        '{0} MIP row(s) have no matching Study ID in the annotation table'.format(n_unmatched)
    )

print(df.shape)

(12168, 11)


In [12]:
# Number of MIP rows per diagnosis label.
df['diagnosis'].value_counts()

diagnosis
NEGATIVE       6156
MELANOMA       2256
LUNG_CANCER    2016
LYMPHOMA       1740
Name: count, dtype: int64

In [13]:
# Human-readable diagnosis for the prompts: lower case, underscores as spaces.
df['diagnosis_lower'] = df['diagnosis'].str.lower().str.replace('_', ' ', regex=False)

In [14]:
# Check the normalised labels: the counts should match the raw 'diagnosis' counts.
df['diagnosis_lower'].value_counts()

diagnosis_lower
negative       6156
melanoma       2256
lung cancer    2016
lymphoma       1740
Name: count, dtype: int64

In [15]:
# Show the columns available for building the prompts.
df.columns

Index(['Subject ID', 'Study ID', 'SUV_MIP_path', 'SEG_MIP_path',
       'projection_axis', 'rotation_degrees', 'diagnosis', 'age', 'sex',
       'fold', 'allocated_set', 'diagnosis_lower'],
      dtype='object')

# Create a text prompt for each MIP image

In [16]:
# Lookup tables used when turning row metadata into prompt text.
_AXIS_TO_PLANE = {'x': 'sagittal', 'y': 'coronal', 'z': 'cross-sectional'}
_SEX_TO_WORD = {'M': 'male', 'F': 'female'}


def _build_prompt(axis, degree, age, sex, diagnosis):
    """Return the text prompt describing one MIP image.

    Args:
        axis: projection axis, one of 'x', 'y' or 'z'.
        degree: rotation in degrees, inserted into the prompt as given.
        age: age string whose last character is dropped before conversion to
            int (presumably a trailing unit letter such as 'Y' - confirm).
        sex: 'M' or 'F'; any other value is rendered as 'unknown-sex'.
        diagnosis: lower-case diagnosis; 'negative' selects the
            "without cancer" wording.

    Raises:
        ValueError: if ``axis`` is not a known projection axis. The original
            loop left ``plane`` unbound, or stale from the previous row, in
            that case.
    """
    if axis not in _AXIS_TO_PLANE:
        raise ValueError('Unknown projection axis: {0!r}'.format(axis))
    plane = _AXIS_TO_PLANE[axis]
    age_years = int(age[:-1])
    sex_word = _SEX_TO_WORD.get(sex, 'unknown-sex')

    if diagnosis == 'negative':
        return 'Positron emission tomography (PET) of a {0}-year-old {1} patient without cancer. The PET scan is rotated by {2} degrees. Maximum intensity projection is calculated along the {3} plane.'.format(
            age_years,
            sex_word,
            degree,
            plane
        )
    return 'Positron emission tomography (PET) of a {0}-year-old {1} patient with {2}. The PET scan is rotated by {3} degrees. Maximum intensity projection is calculated along the {4} plane.'.format(
        age_years,
        sex_word,
        diagnosis,
        degree,
        plane
    )


# Build all prompts in one pass and assign the column once, instead of writing
# cell-by-cell with df.loc.
df['prompt'] = [
    _build_prompt(axis, degree, age, sex, diagnosis)
    for axis, degree, age, sex, diagnosis in zip(
        df['projection_axis'],
        df['rotation_degrees'],
        df['age'],
        df['sex'],
        df['diagnosis_lower'],
    )
]

In [17]:
# Write the per-image table (paths, metadata, prompts), without the index column.
df.to_excel(excel_writer=export_path, index=False)