In [1]:
import os
import pandas as pd
import numpy as np
import datetime


In [103]:
redcap_filepath = '../data/prototype_redcap.csv'

# Read in the raw redcap export (semicolon separated)
redcap = pd.read_csv(redcap_filepath, sep=';')

# Conversion table between the redcap names/options and the orakloncology ones.
# Only the rows describing treatment variables are kept. The explicit .copy()
# makes the filtered table independent of the frame it was sliced from, so the
# column assignment below neither triggers SettingWithCopyWarning nor depends
# on pandas' view-versus-copy rules.
redcap_CRC_conversion_table = pd.read_csv('../redcap_CRC_conversion_table.csv', sep=';')
redcap_CRC_conversion_table = redcap_CRC_conversion_table[redcap_CRC_conversion_table.data_type == 'treatment'].copy()

# remove stray whitespace around the redcap names so they match the redcap column labels
redcap_CRC_conversion_table['redcap_name'] = redcap_CRC_conversion_table['redcap_name'].str.strip()

# (redcap name, orakloncology name) pairs used to build the output columns
column_names = redcap_CRC_conversion_table[['redcap_name', 'orakloncology_name']]



In [104]:
# Display the conversion table (restricted to the treatment rows by the loading cell)
redcap_CRC_conversion_table

Unnamed: 0,redcap_name,redcap_options,orakloncology_name,orakloncology_options,data_type,Unnamed: 5
45,redcap_repeat_instrument,,redcap_repeat_instrument,,treatment,
48,line_num,,line_number,,treatment,
49,ind_line,1.0,line_indication,induction_or_perioperative,treatment,
50,ind_line,2.0,line_indication,metastatic,treatment,
51,ind_line,3.0,line_indication,intensification,treatment,
...,...,...,...,...,...,...
132,recist_ia,1.0,chemotherapy_RECIST_response,partial_response,treatment,
133,recist_ia,2.0,chemotherapy_RECIST_response,stable_disease,treatment,
134,recist_ia,3.0,chemotherapy_RECIST_response,progressive_disease,treatment,
135,recist_ia,4.0,chemotherapy_RECIST_response,complete_response,treatment,


In [128]:
# Folder that receives the per-patient treatment CSV files
output_dir = '../data/treatment_data'

In [131]:
# Export one treatment CSV per patient: TR_C_PID_<cell line code>_SID_0001.csv
#
# NOTE(review): this cell calls get_cell_line_code, which is defined in a later
# cell of the notebook; on a fresh kernel that cell has to be run first.
# NOTE(review): output rows are keyed by the repeat-instance number only, so two
# instruments that share an instance number write into the same row.
# NOTE(review): conversion-table entries that have a single row for both their
# redcap name and their orakloncology name are not copied by either pass below.

# DataFrame.to_csv does not create missing folders, so make sure the target exists
os.makedirs(output_dir, exist_ok=True)

# number of conversion-table rows per name; loop-invariant, so computed once
redcap_name_counts = redcap_CRC_conversion_table['redcap_name'].value_counts()
orakloncology_name_counts = redcap_CRC_conversion_table['orakloncology_name'].value_counts()

# loop through each individual patient
for record_id in redcap['record_id'].unique():

    cleaned_patient_treatment_data = pd.DataFrame(columns = column_names['orakloncology_name'].unique())

    # get the cell line code (its first character is dropped for the file name)
    cell_line_code,date_cell_line = get_cell_line_code(redcap, record_id)
    cell_line_code = cell_line_code[1:]

    # select the therapy rows of the data frame
    patient_treatment_data = redcap[(redcap.record_id == record_id) &
                                    redcap['redcap_repeat_instrument'].isin(['ligne_mtastatique_de_traitement', 'hai_chemotherapy'])]

    # Pass 1: redcap columns that have several coded options in the conversion
    # table; the code found in redcap is translated to its orakloncology option.
    for column_name in redcap_CRC_conversion_table['redcap_name'].unique():

        # remove any ' ' at the end of the word
        column_name = column_name.strip()

        # conversion rows of this redcap column (one per coded option)
        column_options = redcap_CRC_conversion_table[redcap_CRC_conversion_table.redcap_name == column_name]
        has_coded_options = redcap_name_counts[column_name] > 1

        # loop through the different instruments
        for redcap_instrument in patient_treatment_data['redcap_repeat_instrument'].unique():

            single_treatment_data = patient_treatment_data[patient_treatment_data.redcap_repeat_instrument == redcap_instrument]

            # loop through the repetitions
            for repetition in sorted(single_treatment_data['redcap_repeat_instance']):

                # creates the output row on first use
                cleaned_patient_treatment_data.loc[repetition, 'record_id'] = record_id

                # single_treatment_data is already restricted to this patient and instrument
                single_treatment_instance_data = single_treatment_data[single_treatment_data.redcap_repeat_instance == repetition]

                content = single_treatment_instance_data[column_name].values[0]

                if has_coded_options:

                    converted_content = column_options[column_options.redcap_options == content]

                    # codes without a conversion row (including missing values) are left empty
                    if len(converted_content) > 0:
                        content = converted_content['orakloncology_options'].values[0]

                        cleaned_patient_treatment_data.loc[repetition, converted_content['orakloncology_name'].values[0]] = content

    # Pass 2: orakloncology columns that are fed by several distinct redcap columns
    for column_name in redcap_CRC_conversion_table['orakloncology_name'].unique():

        if orakloncology_name_counts[column_name] <= 1:
            continue

        target_rows = redcap_CRC_conversion_table[redcap_CRC_conversion_table.orakloncology_name == column_name]
        source_redcap_names = target_rows['redcap_name'].unique()

        # when the rows are several options of the same redcap column, pass 1 already handled them
        if len(target_rows) != len(source_redcap_names):
            continue

        for redcap_name in source_redcap_names:

            # option written when this redcap column is ticked (equals 1)
            option_label = redcap_CRC_conversion_table[redcap_CRC_conversion_table.redcap_name == redcap_name]['orakloncology_options'].values[0]

            # loop through the different instruments
            for redcap_instrument in patient_treatment_data['redcap_repeat_instrument'].unique():

                single_treatment_data = patient_treatment_data[patient_treatment_data.redcap_repeat_instrument == redcap_instrument]

                # loop through the repetitions
                for repetition in sorted(single_treatment_data['redcap_repeat_instance']):

                    cleaned_patient_treatment_data.loc[repetition, 'record_id'] = record_id

                    single_treatment_instance_data = single_treatment_data[single_treatment_data.redcap_repeat_instance == repetition]

                    content = single_treatment_instance_data[redcap_name].values[0]

                    if "date" in column_name:
                        # date columns are copied as they are; when several redcap
                        # columns feed the same date, the last one written wins
                        cleaned_patient_treatment_data.loc[repetition, column_name] = content

                    elif content == 1:
                        cleaned_patient_treatment_data.loc[repetition, column_name] = option_label

    # create a file name
    filename = f'{output_dir}/TR_C_PID_{cell_line_code}_SID_0001.csv'

    # save the patient table as a csv file
    cleaned_patient_treatment_data.to_csv(filename, index=True, sep=';')
    print(f'Saved {filename}')


Saved ../data/treatment_data/TR_C_PID_GR0069_SID_0001.csv
Saved ../data/treatment_data/TR_C_PID_GR0001_SID_0001.csv


In [125]:
# Inspect the treatment table left over from the last patient handled by the export loop
cleaned_patient_treatment_data

Unnamed: 0,redcap_repeat_instrument,line_number,line_indication,CIAH,associated_chip,chemotherapy_type,targeted_therapy_type,chip_type,immunotherapy_type,first_dose_date,...,stop_chemotherapy_cause,neurotoxicity,progression_date,progression_type,chemotherapy_RECIST_response,PFS,intra_arterial_chemotherapy,line_ia_number,chemotherapy_iv_ia,record_id
1,,,,,,folfox,none,,,,...,stop_chemotherapy_cause_instensification,,,progression_local,progressive_disease,,,,,38.0


In [96]:
# Look up the conversion row for option 1.0 of the redcap column 'neurotox'
redcap_CRC_conversion_table[(redcap_CRC_conversion_table.redcap_name == 'neurotox') &
                            (redcap_CRC_conversion_table.redcap_options == 1.0)]

Unnamed: 0,redcap_name,redcap_options,orakloncology_name,orakloncology_options,data_type,Unnamed: 5
98,neurotox,1.0,neurotoxicity,grade_1,treatment,


In [83]:
# Inspect the instance slice left over from the last iteration of the treatment export loop
single_treatment_instance_data

Unnamed: 0,record_id,redcap_repeat_instrument,redcap_repeat_instance,cohorte___1,cohorte___2,cohorte___3,cohorte___4,cohorte___5,sexe,dob,...,hit_chimiogramme_drug___23,hit_chimiogramme_drug___24,hit_chimiogramme_drug___25,hit_chimiogramme_drug___26,hit_chimiogramme_drug___27,hit_chimiogramme_drug___28,excel_chimiogramme,ctb_yes_no,date_ctb,organoides_complete


In [None]:
# redcap column name -> orakloncology column name
column_map = column_names.set_index('redcap_name')['orakloncology_name'].to_dict()

# One row per patient: first value of every column within each record_id group
first_values_per_record = redcap.groupby('record_id').first()
redcap_clinical_data = first_values_per_record.reset_index()

In [126]:
def get_cell_line_code(redcap: pd.DataFrame,
                       record_id: str):
    """
    Look up the PDO cell line of one patient in the redcap data set.

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data set; the columns 'record_id', 'redcap_repeat_instrument',
        'nom_lign_e' and 'date_sample' are read
    record_id: str
        identifier compared for equality with the 'record_id' column

    Returns
    -------
    tuple
        (cell_line_code, date_cell_line), taken from the 'nom_lign_e' and
        'date_sample' columns of the patient's single 'organoides' row

    Raises
    ------
    ValueError
        if the patient has no 'organoides' row, or more than one
    """

    belongs_to_patient = redcap['record_id'] == record_id
    is_organoid_row = redcap['redcap_repeat_instrument'] == 'organoides'
    organoid_rows = redcap[belongs_to_patient & is_organoid_row]

    # exactly one PDO cell line is expected per patient
    n_cell_lines = len(organoid_rows)
    if n_cell_lines > 1:
        raise ValueError(f'There are {n_cell_lines} PDO cell lines for record_id {record_id}.')
    if n_cell_lines == 0:
        raise ValueError(f'There are no PDO cell lines for record_id {record_id}.')

    return organoid_rows['nom_lign_e'].values[0], organoid_rows['date_sample'].values[0]

def preprocess_single_patient_clinical_data(redcap: pd.DataFrame,
                                            row: pd.Series,
                                            column_map: dict,
                                            redcap_CRC_conversion_table: pd.DataFrame):

    """
    Preprocess a single patient clinical data row from the redcap data set.

    Coded values are translated with the conversion table, the columns are
    renamed and restricted to the targets of ``column_map``, and the PDO cell
    line code and sampling date are appended.

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data for all patients
    row: pd.Series
        row of the redcap data set; must contain 'record_id'
    column_map: dict
        mapping of the column names between redcap and orakloncology
    redcap_CRC_conversion_table: pd.DataFrame
        conversion table between redcap and orakloncology

    Returns
    -------
    pd.Series
        the converted row, with the extra entries 'cell_line_code' and
        'date_cell_line'; 'date_birth' is a datetime when it was a
        'dd/mm/YYYY' string

    Raises
    ------
    ValueError
        if a value matches several conversion rows, if the patient does not
        have exactly one PDO cell line, or if the birth date string does not
        follow 'dd/mm/YYYY'
    KeyError
        if a target of ``column_map`` is missing from the renamed row
    """

    # get the cell line code (its first character is dropped)
    cell_line_code,date_cell_line = get_cell_line_code(redcap, row['record_id'])
    cell_line_code = cell_line_code[1:]

    clinical_data_row = row.copy()

    # names that have at least one conversion row; loop-invariant, so built once
    convertible_names = set(redcap_CRC_conversion_table['redcap_name'].values)

    # if relevant, change the content of each row
    for column_name in clinical_data_row.index:
        if column_name not in convertible_names:
            continue

        # limit the conversion table to the rows relative to the variable
        restricted_conversion_table = redcap_CRC_conversion_table[redcap_CRC_conversion_table['redcap_name'] == column_name]

        # a single conversion row means there are no coded options to translate
        if len(restricted_conversion_table) <= 1:
            continue

        # get the value in the clinical data row
        row_value = clinical_data_row[column_name]

        # conversion rows whose redcap option equals the value
        converted_row_value = restricted_conversion_table[restricted_conversion_table['redcap_options'] == row_value]['orakloncology_options']

        # check that there is only one value
        if len(converted_row_value) > 1:
            raise ValueError(f'There are {len(converted_row_value)} values for {column_name}.')
        elif len(converted_row_value) == 0:
            # unknown or missing code
            clinical_data_row[column_name] = np.nan
        else:
            # replace the value in clinical_data_row
            clinical_data_row[column_name] = converted_row_value.values[0]

    # rename the columns to match target mapping
    clinical_data_row = clinical_data_row.rename(index=column_map)
    clinical_data_row = clinical_data_row.loc[list(column_map.values())]

    # add the cell line code and date
    clinical_data_row['cell_line_code'] = cell_line_code
    clinical_data_row['date_cell_line'] = date_cell_line

    # normalize date format; strptime only accepts strings, so a missing birth
    # date (NaN) or a mapping without 'date_birth' is left untouched instead of
    # aborting the whole export
    if 'date_birth' in clinical_data_row.index:
        birth_date = clinical_data_row['date_birth']
        if isinstance(birth_date, str):
            clinical_data_row['date_birth'] = datetime.datetime.strptime(birth_date, '%d/%m/%Y')

    return clinical_data_row

Target:
- split redcap excel into sub-excels containing the right information
- each excel is of one of three types: clinical, molecular profile or treatment information
- each file is a single patient

Remaining work:
- build the file to split the molecular and the treatment data

In [28]:
def split_clinical_data_from_redcap(redcap: pd.DataFrame,
                 column_map: dict,
                 redcap_CRC_conversion_table: pd.DataFrame,
                 output_dir: str):
    """
    Write one clinical data CSV file per patient of the redcap data set.

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data set
    column_map: dict
        dictionary mapping redcap column names to orakl column names
    redcap_CRC_conversion_table: pd.DataFrame
        conversion table, forwarded to preprocess_single_patient_clinical_data
    output_dir: str
        output directory, created when it does not exist
    """

    # make sure the destination folder is there before writing into it
    folder_is_missing = not os.path.exists(output_dir)
    if folder_is_missing:
        os.makedirs(output_dir)

    # one row per patient: first value of every column within each record_id group
    per_patient_rows = redcap.groupby('record_id').first().reset_index()

    # convert each patient row (this also looks up the PDO-related
    # information) and save it under a name built from its cell line code
    for _, patient_row in per_patient_rows.iterrows():

        processed_row = preprocess_single_patient_clinical_data(redcap, patient_row, column_map, redcap_CRC_conversion_table)
        pdo_code = processed_row['cell_line_code']

        target_path = f'{output_dir}/CL_C_PID_{pdo_code}_SID_0001.csv'

        processed_row.to_csv(target_path, index=True)
        print(f'Saved {target_path}')

    return

def split_molecular_data_from_redcap(redcap: pd.DataFrame,
                 column_map: dict,
                 redcap_CRC_conversion_table: pd.DataFrame,
                 output_dir: str):
    """
    Split the redcap data set into individual molecular data files.

    Parameters
    ----------
    redcap: pd.DataFrame
        redcap data set
    column_map: dict
        dictionary mapping redcap column names to orakl column names
    redcap_CRC_conversion_table: pd.DataFrame
        conversion table, forwarded to preprocess_single_patient_clinical_data
    output_dir: str
        output directory, created when it does not exist
    """

    # create the output folder if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Extract the molecular data rows from the data set
    redcap_molecular_data = redcap[redcap['redcap_repeat_instrument'] == 'molecular_data']

    # iterate over the molecular rows selected above (not over a notebook-level
    # clinical table) and find the PDO-related information
    for _, row in redcap_molecular_data.iterrows():

        # the preprocessing helper needs the column map and the conversion table as well
        # NOTE(review): the helper was written for clinical rows — confirm it suits molecular rows
        molecular_data_row = preprocess_single_patient_clinical_data(redcap, row, column_map, redcap_CRC_conversion_table)
        cell_line_code = molecular_data_row['cell_line_code']

        # create a file name
        # NOTE(review): same 'CL_C_PID' pattern as the clinical files, and a
        # patient with several molecular rows is written to the same path each
        # time — confirm the intended naming
        filename = f'{output_dir}/CL_C_PID_{cell_line_code}_SID_0001.csv'

        # save the row as a csv file
        molecular_data_row.to_csv(filename, index=True)
        print(f'Saved {filename}')

    return

In [29]:
# Write the per-patient clinical CSV files.
# NOTE: this rebinds output_dir, the same name the treatment export cell reads.
output_dir = '../data/clinical_data'

split_clinical_data_from_redcap(redcap,column_map,redcap_CRC_conversion_table,output_dir)

Saved ../data/clinical_data/CL_C_PID_GR0069_SID_0001.csv
Saved ../data/clinical_data/CL_C_PID_GR0001_SID_0001.csv


In [1]:
# Run split_clinical_data_from_redcap_directory from the redcap_preprocessing
# package on the sample test data, passing file paths instead of data frames
import redcap_preprocessing.split_clinical_data_from_redcap as split_clinical_data_from_redcap

# load redcap data
redcap_filepath = '../tests/test_data/sample_test_data.csv'
output_dir = '../tests/test_script_result'
conversion_table_filepath = '../redcap_CRC_conversion_table.csv'

split_clinical_data_from_redcap.split_clinical_data_from_redcap_directory(redcap_filepath,
                                                        conversion_table_filepath,
                                                        output_dir)

Saved ../tests/test_script_result/CL_C_PID_GR0069_SID_0001.csv
