In [1]:
import os
import pandas as pd
import numpy as np
import datetime


In [121]:
def get_cell_line_code(redcap: pd.DataFrame,
                       record_id: str):
    """
    Get the cell line code from the redcap data set.add

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data set
    record_id: str
    """

    selected_row = redcap[(redcap['record_id'] == record_id) & 
                          (redcap['redcap_repeat_instrument'] == 'organoides')]
    
    # check that there is only one PDO cell line
    if len(selected_row) > 1:
        raise ValueError(f'There are {len(selected_row)} PDO cell lines for record_id {record_id}.')
    elif len(selected_row) == 0:
        raise ValueError(f'There are no PDO cell lines for record_id {record_id}.')
    else:
    
        cell_line_code = selected_row['nom_lign_e'].values[0]
        date_cell_line = selected_row['date_sample'].values[0]

    return cell_line_code, date_cell_line

def get_content_matching_type_1(single_patient_treatment_row,
                                redcap_CRC_conversion_table,
                                column_name,
                                ):
    
    assert column_name in redcap_CRC_conversion_table.orakloncology_name.values, 'column name not found in conversion table'
    assert len(redcap_CRC_conversion_table[(redcap_CRC_conversion_table.orakloncology_name == column_name)&(redcap_CRC_conversion_table.matching_type == 1)]['redcap_name']) == 1, 'multiple redcap names found for column name in a type 1 match'

    redcap_column_name = redcap_CRC_conversion_table[(redcap_CRC_conversion_table.orakloncology_name == column_name)&(redcap_CRC_conversion_table.matching_type == 1)]['redcap_name'].values[0]
    content = single_patient_treatment_row[redcap_column_name]

    return content

def get_content_matching_type_2(single_patient_treatment_row,
                                redcap_CRC_conversion_table,
                                column_name,
                                ):
    
    content = []
    redcap_column_rows = redcap_CRC_conversion_table[(redcap_CRC_conversion_table.orakloncology_name == column_name)&(redcap_CRC_conversion_table.matching_type == 2)]

    for index, row in redcap_column_rows.iterrows():

        if single_patient_treatment_row[row['redcap_name']] == 1:
            content.append(row['orakloncology_options'])

    return ';'.join(content)

def get_content_matching_type_3(single_patient_treatment_row,
                                redcap_CRC_conversion_table,
                                column_name,
                                ):
    
    content = []
    redcap_column_rows = redcap_CRC_conversion_table[(redcap_CRC_conversion_table.orakloncology_name == column_name)&(redcap_CRC_conversion_table.matching_type == 3)]

    for index, row in redcap_column_rows.iterrows():

        if row['redcap_options'] == single_patient_treatment_row[row['redcap_name']]:
            content.append(row['orakloncology_options'])

    return ';'.join(content)

def get_content_matching_type_4(single_patient_treatment_row,
                                redcap_CRC_conversion_table,
                                column_name,
                                ):
    
    content = []
    redcap_column_rows = redcap_CRC_conversion_table[(redcap_CRC_conversion_table.orakloncology_name == column_name)&(redcap_CRC_conversion_table.matching_type == 4)]

    for index, row in redcap_column_rows.iterrows():
        
        # test if not nan, then add to content
        if single_patient_treatment_row[row['redcap_name']] == single_patient_treatment_row[row['redcap_name']]:
            content.append(single_patient_treatment_row[row['redcap_name']])

    return ';'.join(content)

def add_content(content, prior_content):

    # test if empty str
    if prior_content == '':
        return content
    
    # test if nan
    elif prior_content != prior_content:
        return content
    
    # test if empty str
    elif content == '':
        return prior_content
    
    # test if nan
    elif content != content:
        return prior_content
    
    else:
        # test if the content and prior content are of the same type
        assert type(content) == type(prior_content), 'content and prior content are not of the same type'
        
        if type(content) == str:
            return prior_content + ';' + content
        
        else:
            print(content)

def get_single_patient_treatment_data(patient_treatment_data,
                                      redcap_CRC_conversion_table):

    cleaned_patient_treatment_data = pd.DataFrame('',
                                                index = patient_treatment_data.index,
                                                columns = redcap_CRC_conversion_table['orakloncology_name'].unique(),)

    # Select single therapy row
    for index, row in patient_treatment_data.iterrows():

        # loop through all the columns in the redcap table
        for column_name in cleaned_patient_treatment_data.columns:

            matching_types = redcap_CRC_conversion_table[redcap_CRC_conversion_table['orakloncology_name'] == column_name]['matching_type'].unique()

            for matching_type in matching_types:

                if matching_type == 1:

                    content = get_content_matching_type_1(row, redcap_CRC_conversion_table, column_name)

                if matching_type == 2:

                    content = get_content_matching_type_2(row, redcap_CRC_conversion_table, column_name)

                if matching_type == 3:

                    content = get_content_matching_type_3(row, redcap_CRC_conversion_table, column_name)

                if matching_type == 4:

                    content = get_content_matching_type_4(row, redcap_CRC_conversion_table, column_name)


                # special case for chemotherapy_type
                if (column_name == 'chemotherapy_type')&(content == 'other'):
                    content = row['other_iv'].lower()

                # formating
                elif (column_name == 'targeted_therapy_type')&(content != '')&(content == content):
                    content = content.lower()
                
                content= add_content(content, cleaned_patient_treatment_data.loc[index, column_name])
                cleaned_patient_treatment_data.loc[index, column_name] = content

    return cleaned_patient_treatment_data
    
def make_single_patient_treatment_files(redcap_path: str,
                       redcap_conversion_table_path: str,
                       output_dir: str,
                       ):
    """
    Get the treatment data from the redcap data set.

    Parameters
    ----------
    redcap_path: str
        path to redcap data set
    redcap_conversion_table_path: str
        path to redcap CRC conversion table
    output_dir: str
        output directory
    """

    # Read in the data
    redcap = pd.read_csv(redcap_path, sep=';')

    # column name mapping
    redcap_CRC_conversion_table = pd.read_csv(redcap_conversion_table_path, sep=';')
    redcap_CRC_conversion_table = redcap_CRC_conversion_table[redcap_CRC_conversion_table.data_type == 'treatment']
    redcap_CRC_conversion_table['redcap_name'] = redcap_CRC_conversion_table['redcap_name'].str.strip()

    # get the unique record ids
    record_ids = redcap.record_id.unique()

    # loop through all the record ids
    for record_id in record_ids:

        # get the cell line code
        cell_line_code, date_cell_line = get_cell_line_code(redcap, record_id)

        # select the patient data
        patient_treatment_data = redcap[(redcap.record_id == record_id) &
                                        (redcap['redcap_repeat_instrument'].isin(['ligne_mtastatique_de_traitement', 'hai_chemotherapy']))]

        # get the single patient treatment data
        cleaned_patient_treatment_data = get_single_patient_treatment_data(patient_treatment_data,
                                                                            redcap_CRC_conversion_table)

        # add the cell line code and date
        cleaned_patient_treatment_data['cell_line_code'] = cell_line_code
        cleaned_patient_treatment_data['date_cell_line'] = date_cell_line

        # create a file name
        filename = f'{output_dir}/TR_C_PID_{cell_line_code}_SID_0001.csv'   

        # save the data
        cleaned_patient_treatment_data.to_csv(filename, sep=';')

    return None

In [122]:
redcap_path = '../data/prototype_redcap.csv'
redcap_conversion_table_path = '../redcap_CRC_conversion_table.csv'
output_dir = '../data/treatment_data'

make_single_patient_treatment_files(redcap_path, redcap_conversion_table_path, output_dir)

In [108]:
# select single patient
single_record_id = redcap['record_id'].unique()[0]

cleaned_patient_treatment_data = pd.DataFrame(columns = column_names['orakloncology_name'].unique())
i = 0

# get the cell line code
cell_line_code,date_cell_line = get_cell_line_code(redcap, single_record_id)
cell_line_code = cell_line_code[1:]

# select the therapy rows of the data frame
patient_treatment_data = redcap[(redcap.record_id == single_record_id) &
                                redcap['redcap_repeat_instrument'].isin(['ligne_mtastatique_de_traitement', 'hai_chemotherapy'])]


# get the single patient treatment data
cleaned_patient_treatment_data = get_single_patient_treatment_data(patient_treatment_data,
                                                                   redcap_CRC_conversion_table)



# save the row as a csv file
cleaned_patient_treatment_data.to_csv(filename, index=True, sep=';')
print(f'Saved {filename}')
    


[3 2]
[3 2]
[3 2]
[3 2]
[3 2]


AttributeError: 'str' object has no attribute 'record_id'

In [None]:
column_map = column_names.set_index('redcap_name').to_dict()['orakloncology_name']

# Extract the clinical data row from the data set
redcap_clinical_data = redcap.groupby('record_id').first().reset_index()

In [5]:
def get_cell_line_code(redcap: pd.DataFrame,
                       record_id: str):
    """
    Get the cell line code from the redcap data set.add

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data set
    record_id: str
    """

    selected_row = redcap[(redcap['record_id'] == record_id) & 
                          (redcap['redcap_repeat_instrument'] == 'organoides')]
    
    # check that there is only one PDO cell line
    if len(selected_row) > 1:
        raise ValueError(f'There are {len(selected_row)} PDO cell lines for record_id {record_id}.')
    elif len(selected_row) == 0:
        raise ValueError(f'There are no PDO cell lines for record_id {record_id}.')
    else:
    
        cell_line_code = selected_row['nom_lign_e'].values[0]
        date_cell_line = selected_row['date_sample'].values[0]

    return cell_line_code, date_cell_line

def preprocess_single_patient_clinical_data(redcap: pd.DataFrame,
                                            row: pd.Series,
                                            column_map: dict,
                                            redcap_CRC_conversion_table: pd.DataFrame):
    
    """
    Preprocess a single patient clinical data row from the redcap data set.

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data for all patients
    row: pd.Series
        row of the redcap data set
    column_map: dict
        mapping of the column names between redcap and orakloncology
    redcap_CRC_conversion_table: pd.DataFrame
        conversion table between redcap and orakloncology
    """


    # get the cell line code
    cell_line_code,date_cell_line = get_cell_line_code(redcap, row['record_id'])
    cell_line_code = cell_line_code[1:]
    
    clinical_data_row = row.copy()

    # if relevant, change the content of each row
    for column_name in clinical_data_row.index:
        if column_name in redcap_CRC_conversion_table['redcap_name'].values:

            # limit the conversion table to the rows relative to the variable
            restricted_conversion_table = redcap_CRC_conversion_table[redcap_CRC_conversion_table['redcap_name'] == column_name]

            # get the value in the clinical data row
            row_value = clinical_data_row[column_name]

            if len(restricted_conversion_table) > 1:

                # get the conversion table row
                converted_row_value = restricted_conversion_table[restricted_conversion_table['redcap_options'] == row_value]['orakloncology_options']

                # check that there is only one value
                if len(converted_row_value) > 1:
                    raise ValueError(f'There are {len(converted_row_value)} values for {column_name}.')
                elif len(converted_row_value) == 0:
                    clinical_data_row[column_name] = np.nan
                else:
                    # replace the value in clinical_data_row
                    clinical_data_row[column_name] = converted_row_value.values[0]


    # rename the columns to match target mapping
    clinical_data_row = clinical_data_row.rename(index=column_map)
    clinical_data_row = clinical_data_row.loc[list(column_map.values())]
    

    # add the cell line code and date
    clinical_data_row['cell_line_code'] = cell_line_code
    clinical_data_row['date_cell_line'] = date_cell_line

    # normalize date format
    clinical_data_row['date_birth'] = datetime.datetime.strptime(clinical_data_row['date_birth'], '%d/%m/%Y')

    return clinical_data_row

Target:
- split redcap excel into sub-excels containing the right information
- each excel is of one of three types: clinical, molecular profile or treatment information
- each file is a single patient

Left:
- build the file to split the molecular and the treatment data

In [28]:
def split_clinical_data_from_redcap(redcap: pd.DataFrame,
                 column_map: dict,
                 redcap_CRC_conversion_table: pd.DataFrame,
                 output_dir: str):
    """
    Split the redcap data set into individual clinical data files.

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data set
    column_map: dict
        dictionary mapping redcap column names to orakl column names
    output_dir: str
        output directory
    """

    # create a 'clinical_data' folder if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Extract the clinical data row from the data set
    redcap_clinical_data = redcap.groupby('record_id').first().reset_index()

    # iterate over the rows of the clinical data set to split them
    # and find the PDO-related information
    for index, row in redcap_clinical_data.iterrows():

        clinical_data_row = preprocess_single_patient_clinical_data(redcap, row, column_map, redcap_CRC_conversion_table)
        cell_line_code = clinical_data_row['cell_line_code']

        # create a file name
        filename = f'{output_dir}/CL_C_PID_{cell_line_code}_SID_0001.csv'

        # save the row as a csv file
        clinical_data_row.to_csv(filename, index=True)
        print(f'Saved {filename}')

    return 

def split_molecular_data_from_redcap(redcap: pd.DataFrame,
                 column_map: dict,
                 redcap_CRC_conversion_table: pd.DataFrame,
                 output_dir: str):
    """
    Split the redcap data set into individual molecular data files.

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data set
    column_map: dict
        dictionary mapping redcap column names to orakl column names
    output_dir: str
        output directory
    """

    # create a 'clinical_data' folder if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Extract the molecular data row from the data set
    redcap_molecular_data = redcap[redcap['redcap_repeat_instrument'] == 'molecular_data'] 

    # iterate over the rows of the molecular data set to split them
    # and find the PDO-related information
    for index, row in redcap_clinical_data.iterrows():

        clinical_data_row = preprocess_single_patient_clinical_data(redcap, row)
        cell_line_code = clinical_data_row['cell_line_code']

        # create a file name
        filename = f'{output_dir}/CL_C_PID_{cell_line_code}_SID_0001.csv'

        # save the row as a csv file
        clinical_data_row.to_csv(filename, index=True)
        print(f'Saved {filename}')

    return 

In [29]:
output_dir = '../data/clinical_data'

split_clinical_data_from_redcap(redcap,column_map,redcap_CRC_conversion_table,output_dir)

Saved ../data/clinical_data/CL_C_PID_GR0069_SID_0001.csv
Saved ../data/clinical_data/CL_C_PID_GR0001_SID_0001.csv


In [1]:
import redcap_preprocessing.split_clinical_data_from_redcap as split_clinical_data_from_redcap

# load redcap data
redcap_filepath = '../tests/test_data/sample_test_data.csv'
output_dir = '../tests/test_script_result'
conversion_table_filepath = '../redcap_CRC_conversion_table.csv'

split_clinical_data_from_redcap.split_clinical_data_from_redcap_directory(redcap_filepath,
                                                        conversion_table_filepath,
                                                        output_dir)

Saved ../tests/test_script_result/CL_C_PID_GR0069_SID_0001.csv
