In [1]:
import os
import numpy as np
import pandas as pd

In [7]:
def data_summary(database_dir, summary_path):
    """Summarize every CT/MR case found under a database directory.

    The database is expected to be laid out as
    ``<database_dir>/<patient-dir>/<case-dir>/<modality>`` where

    * ``<patient-dir>`` looks like ``patient-00000`` (the part after the
      first ``-`` becomes ``PatientID``),
    * ``<case-dir>`` carries the date in characters 4-13 (``YYYY.MM.DD``)
      and the time from character 15 onwards (``HH.MM.SS``),
    * ``<modality>`` is a directory named exactly ``CT`` or ``MR``.

    One row is produced per modality directory. Rows are sorted by
    ``PatientID``, ``CaseDate``, ``CaseTime`` and ``Modality`` in ascending
    order, and the table is written as CSV without the index.

    Args:
        database_dir (str): Path to the database directory, e.g.
            ``'../data/patients-samples'``. A trailing path separator is
            accepted.
        summary_path (str): Where to save the summary. If it ends with
            ``.csv`` it is used as the output file path; otherwise it is
            treated as a directory and the file ``summary.csv`` is written
            inside it (with or without a trailing separator).

    Returns:
        pandas.DataFrame: The sorted summary with columns ``PatientID``,
        ``CaseDate``, ``CaseTime`` and ``Modality`` and a 0..n-1 index.
    """
    columns = ['PatientID', 'CaseDate', 'CaseTime', 'Modality']

    records = []
    for root, _dirs, _files in os.walk(database_dir):
        # Match the directory *name*, not the last two characters of the
        # whole path, so e.g. ".../fooCT" is not mistaken for a CT folder.
        if os.path.basename(root) not in ('CT', 'MR'):
            continue

        # relpath copes with a trailing separator on database_dir and uses
        # the platform separator, unlike str.replace + split('/').
        parts = os.path.relpath(root, database_dir).split(os.sep)
        if len(parts) != 3:
            # Not at <patient>/<case>/<modality> depth: not a case folder.
            continue

        patient_dir, case_dir, modality = parts
        records.append([
            patient_dir.split('-')[1],
            case_dir[4:14].replace('.', '/'),   # YYYY.MM.DD -> YYYY/MM/DD
            case_dir[15:].replace('.', ':'),    # HH.MM.SS   -> HH:MM:SS
            modality,
        ])

    # Build the frame once instead of growing it row by row.
    df = pd.DataFrame(records, columns=columns)
    df = df.sort_values(by=columns).reset_index(drop=True)

    # Resolve the output file: explicit *.csv path, or <dir>/summary.csv.
    summary_path = os.fspath(summary_path)
    if summary_path.lower().endswith('.csv'):
        filename = summary_path
    else:
        filename = os.path.join(summary_path, 'summary.csv')
    df.to_csv(filename, index=False)

    return df

# Summarize the sample database, write the CSV under the current directory,
# and show the resulting dataframe as this cell's output.
summary_df = data_summary(database_dir='data/patients-samples', summary_path='./')
summary_df

Unnamed: 0,PatientID,CaseDate,CaseTime,Modality
0,00000,2008/04/12,21:53:41,CT
1,00000,2008/04/12,23:08:33,MR
2,00001,2005/05/29,17:28:11,CT
3,00001,2012/04/04,14:36:43,CT
4,00001,2012/04/04,18:07:09,MR
5,00002,2006/02/12,05:40:24,CT
6,00002,2006/02/12,10:41:17,MR
7,00002,2010/01/01,18:00:54,CT
8,00002,2010/01/01,22:33:49,MR
9,00003,2009/08/25,04:18:22,CT


In [None]:
### ref: https://stackoverflow.com/a/9728478

def list_files(startpath, show_files=False):
    """Print the directory tree under ``startpath``, indented by depth.

    Each directory is printed on its own line as ``<name>/`` with four
    spaces of indentation per level below ``startpath``. Directories (and
    files, when shown) are listed in sorted order so the output is
    reproducible across runs and platforms.

    Args:
        startpath (str): Directory to start from. A trailing path
            separator is accepted and does not affect the output.
        show_files (bool): If True, also print the files of each directory
            one level deeper than the directory itself. Defaults to False
            (directories only).
    """
    for root, dirs, files in os.walk(startpath):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()

        # Depth relative to startpath; relpath normalizes both sides, so a
        # trailing separator on startpath no longer shifts the level.
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 4 * level

        # normpath drops a trailing separator so basename is never empty.
        name = os.path.basename(os.path.normpath(root))
        print('{}{}/'.format(indent, name))

        if show_files:
            subindent = ' ' * 4 * (level + 1)
            for f in sorted(files):
                print('{}{}'.format(subindent, f))

# Print the directory tree rooted at the sample database.
list_files('data/patients-samples/')

In [17]:
def get_target_cases_df(summary_df):
    """Select the target case of every patient.

    A target case is defined as follows:
        1. Its modality must be CT.
        2. If a patient has several CT cases, only the latest one
           (by ``CaseDate``, then ``CaseTime``) is kept.

    The input does not need to be pre-sorted and is not modified.

    Args:
        summary_df (pandas.core.frame.DataFrame):
            Info summary dataframe with columns ``PatientID``,
            ``CaseDate``, ``CaseTime`` and ``Modality``.

    Returns:
        target_cases_df (pandas.core.frame.DataFrame):
            One CT row per patient, ordered by ``PatientID``, with a
            successive 0..n-1 index.
    """
    # Keep only CT cases.
    ct_cases = summary_df[summary_df['Modality'] == 'CT']

    # Order each patient's cases chronologically so that "last" below is
    # really the latest case rather than whatever row came last in the
    # input. Zero-padded 'YYYY/MM/DD' and 'HH:MM:SS' strings sort
    # chronologically as plain text.
    ct_cases = ct_cases.sort_values(by=['PatientID', 'CaseDate', 'CaseTime'])

    # Keep the latest case per patient.
    latest_cases = ct_cases.drop_duplicates(subset='PatientID', keep='last')

    # Make the index successive.
    return latest_cases.reset_index(drop=True)

# Display the target cases selected from `summary_df`.
get_target_cases_df(summary_df)

Unnamed: 0,PatientID,CaseDate,CaseTime,Modality
0,0,2008/04/12,21:53:41,CT
1,1,2012/04/04,14:36:43,CT
2,2,2010/01/01,18:00:54,CT
3,3,2009/08/25,04:18:22,CT
4,4,2011/03/30,08:33:18,CT
5,5,2013/11/03,02:12:07,CT
6,6,2011/09/12,03:37:06,CT
7,7,2009/07/09,06:43:47,CT
8,8,2013/02/22,16:42:03,CT
9,9,2012/02/14,18:58:20,CT
