In [9]:
import pandas as pd
import numpy as np

# Preprocessing clinical data

We do this prior to upload because of patient merging issues.

In [10]:
from redcap_preprocessing.utils import get_delimiter, get_content_matching_type_1, get_content_matching_type_2, get_content_matching_type_3, get_content_matching_type_4, add_content


def get_single_patient_treatment_data(patient_treatment_data,
                                      redcap_CRC_conversion_table,
                                      disease_type,):

    cleaned_patient_treatment_data = pd.DataFrame('',
                                                index = patient_treatment_data.index,
                                                columns = redcap_CRC_conversion_table['orakloncology_name'].unique(),)
    
    if disease_type == 'PDAC':
        slice_df = patient_treatment_data[patient_treatment_data['bras_essai_clinique_met'].isna()]
        patient_treatment_data.loc[slice_df.index, 'bras_essai_clinique_met'] = patient_treatment_data['bras_essai_clinique_met'].fillna('')

        slice_df = patient_treatment_data[patient_treatment_data['spe_autre_met'].isna()]
        patient_treatment_data.loc[slice_df.index, 'spe_autre_met'] = patient_treatment_data['spe_autre_met'].fillna('')

    elif disease_type == 'CRC':
        slice_df = patient_treatment_data[patient_treatment_data['other_iv'].isna()]
        patient_treatment_data.loc[slice_df.index, 'other_iv'] = patient_treatment_data['other_iv'].fillna('')

    # Select single therapy row
    for index, row in patient_treatment_data.iterrows():

        # loop through all the columns in the redcap table
        for column_name in cleaned_patient_treatment_data.columns:

            matching_types = redcap_CRC_conversion_table[redcap_CRC_conversion_table['orakloncology_name'] == column_name]['matching_type'].unique()

            for matching_type in matching_types:

                if matching_type == 1:

                    content = get_content_matching_type_1(row, redcap_CRC_conversion_table, column_name)

                if matching_type == 2:

                    content = get_content_matching_type_2(row, redcap_CRC_conversion_table, column_name)

                if matching_type == 3:

                    content = get_content_matching_type_3(row, redcap_CRC_conversion_table, column_name)

                if matching_type == 4:

                    content = get_content_matching_type_4(row, redcap_CRC_conversion_table, column_name)

                # special case for chemotherapy_type
                if disease_type == 'CRC':
                    if (column_name == 'chemotherapy_type')&(content == 'other'):
                        content = row['other_iv'].lower()
                elif disease_type == 'PDAC':
                    if (column_name == 'chemotherapy_type')&(content == 'other'):
                        content = row['spe_autre_met'].lower()
                    elif (column_name == 'chemotherapy_type')&(content == 'clinical_trial'):
                        content = row['bras_essai_clinique_met'].lower()

                # formating
                elif (column_name == 'targeted_therapy_type')&(content != '')&(content == content):
                    content = content.lower()
                
                content= add_content(content, cleaned_patient_treatment_data.loc[index, column_name])
                cleaned_patient_treatment_data.loc[index, column_name] = content

    # add treatment type            
    cleaned_patient_treatment_data['treatment_type'] = patient_treatment_data['redcap_repeat_instrument'].replace('ligne_mtastatique_de_traitement', 'metastatic').replace('hai_chemotherapy', 'hai').replace('', 'neo_adjuvant')

    # if neo_adjuvant, change stop cause
    cleaned_patient_treatment_data.loc[cleaned_patient_treatment_data['treatment_type'] == 'neo_adjuvant', 'stop_chemotherapy_cause'] = 'surgery'

    return cleaned_patient_treatment_data

In [13]:

redcap_filepath = '/Users/gustaveronteix/Documents/data/OraklOnco/clinical/20240403/raw/PANCREASGR-PDO_DATA_2024-03-29_1423.csv'
output_dir = '/Users/gustaveronteix/Documents/data/OraklOnco/clinical/20240403/single_files'
conversion_table_filepath = 'conversion_table/redcap_PDAC_conversion_table.csv'
disease_type = 'PDAC'


redcap = pd.read_csv(redcap_filepath, sep=';')
redcap['redcap_repeat_instrument'] = redcap['redcap_repeat_instrument'].fillna('')
conversion_table_delimiter = get_delimiter(conversion_table_filepath)
redcap_CRC_conversion_table = pd.read_csv(conversion_table_filepath, delimiter=conversion_table_delimiter)    
redcap_CRC_conversion_table = redcap_CRC_conversion_table[redcap_CRC_conversion_table.data_type == 'treatment']
redcap_CRC_conversion_table['redcap_name'] = redcap_CRC_conversion_table['redcap_name'].str.strip()

patient_treatment_data = redcap[(redcap.record_id == 755) &
                                (redcap['redcap_repeat_instrument'].isin(['','ligne_mtastatique_de_traitement', 'hai_chemotherapy']))]
            



In [14]:
get_single_patient_treatment_data(patient_treatment_data,
                                  redcap_CRC_conversion_table,
                                  disease_type,)

Unnamed: 0,redcap_repeat_instrument,line_number,PFS,chemotherapy_type,pre_metastasis_treatment,other_chemotherapy_type,first_dose_date,last_dose_date,number_chemo_cycles,chemotherapy_RECIST_response,stop_chemotherapy_cause,progression_date,progression_type,treatment_type
137,,,,,,,,,,,surgery,,,neo_adjuvant
138,ligne_mtastatique_de_traitement,1.0,1.9,folfirinox,,,2021-02-15,2021-04-15,,progressive_disease,progression,2021-04-15,progression_metastasis,metastatic
139,ligne_mtastatique_de_traitement,2.0,4.0,gemcitabine carboplatine,,Gemcitabine carboplatine,2021-04-15,2021-08-15,,progressive_disease,progression,2021-08-15,progression_metastasis,metastatic
140,ligne_mtastatique_de_traitement,3.0,2.7,paclitaxel,,,2021-08-15,2021-10-15,,progressive_disease,progression,2021-11-04,progression_metastasis,metastatic
141,ligne_mtastatique_de_traitement,4.0,7.6,capécitabine erlotinib,,Capécitabine erlotinib,2021-12-21,2022-08-16,,progressive_disease,AEG,2022-08-10,progression_metastasis,metastatic


In [5]:

redcap_filepath = '/Users/gustaveronteix/Documents/data/OraklOnco/clinical/20240403/raw/ColonGR-PDO_DATA_2024-03-29_1424.csv'
output_dir = '/Users/gustaveronteix/Documents/data/OraklOnco/clinical/20240403/single_files'
conversion_table_filepath = 'conversion_table/redcap_CRC_conversion_table.csv'
disease_type = 'CRC'


redcap = pd.read_csv(redcap_filepath, sep=';')
redcap['redcap_repeat_instrument'] = redcap['redcap_repeat_instrument'].fillna('')
conversion_table_delimiter = get_delimiter(conversion_table_filepath)
redcap_CRC_conversion_table = pd.read_csv(conversion_table_filepath, delimiter=conversion_table_delimiter)    
redcap_CRC_conversion_table = redcap_CRC_conversion_table[redcap_CRC_conversion_table.data_type == 'treatment']
redcap_CRC_conversion_table['redcap_name'] = redcap_CRC_conversion_table['redcap_name'].str.strip()

patient_treatment_data = redcap[(redcap.record_id == 22) &
                                (redcap['redcap_repeat_instrument'].isin(['','ligne_mtastatique_de_traitement', 'hai_chemotherapy']))]
            

get_single_patient_treatment_data(patient_treatment_data,
                                  redcap_CRC_conversion_table,
                                  disease_type,)

Unnamed: 0,redcap_repeat_instrument,line_number,line_indication,CIAH,associated_chip,chemotherapy_type,targeted_therapy_type,immunotherapy_type,first_dose_date,last_dose_date,stop_chemotherapy_cause,neurotoxicity,progression_date,date_RECIST_evaluation,progression_type,chemotherapy_RECIST_response,PFS,treatment_type
0,,,,,,,,,,,,,,,,,,neo_adjuvant
1,ligne_mtastatique_de_traitement,1.0,induction_or_perioperative,0.0,0.0,folfirinox,none,none,2020-07-29,2021-01-28,curative;maintenance,grade_2,2021-04-28,,progression_metastasis,partial_response,9.0,metastatic_line
2,ligne_mtastatique_de_traitement,1.0,induction_or_perioperative,0.0,0.0,folfiri,bevacizumab,none,2020-12-30,2021-05-05,progression;instensification,grade_0,2021-04-28,,progression_metastasis,partial_response,3.9,metastatic_line
3,ligne_mtastatique_de_traitement,2.0,intensification,1.0,0.0,folfirinox,bevacizumab,none,2021-09-30,2022-01-13,progression,grade_1,2022-01-18,,progression_metastasis,stable_disease,3.6,metastatic_line
4,ligne_mtastatique_de_traitement,3.0,metastatic,0.0,0.0,folfiri3,aflibercept,none,2022-01-26,2022-03-23,progression;AEG,grade_1,2022-04-01,,progression_metastasis,progressive_disease,2.1,metastatic_line
5,hai_chemotherapy,,,,,oxaliplatine;folfiri,Bevacizumab,,2021-06-25,2021-08-06,,,2021-08-31;2021-08-31,2021-07-29,,stable_disease,,hai


In [5]:
from redcap_preprocessing import redcap_preprocessing

redcap_filepath = '/Users/gustaveronteix/Documents/data/OraklOnco/clinical/20240403/raw/ColonGR-PDO_DATA_2024-03-29_1424.csv'
output_dir = '/Users/gustaveronteix/Documents/data/OraklOnco/clinical/20240403/single_files'
conversion_table_filepath = 'conversion_table/redcap_CRC_conversion_table.csv'
disease_type = 'CRC'

redcap_preprocessing.preprocess_redcap_data(redcap_filepath,
                                            disease_type=disease_type,
                                            output_dir=output_dir,
                                            save_as_single_file = False)

There are no PDO cell lines for record_id 53.
There are no PDO cell lines for record_id 53.
There are no PDO cell lines for record_id 53.
There are no PDO cell lines for record_id 53.
There are no PDO cell lines for record_id 53.
There are no PDO cell lines for record_id 53.


In [3]:
from redcap_preprocessing import redcap_preprocessing

redcap_filepath = '/Users/gustaveronteix/Documents/data/OraklOnco/clinical/20240403/raw/PANCREASGR-PDO_DATA_2024-03-29_1423.csv'
output_dir = '/Users/gustaveronteix/Documents/data/OraklOnco/clinical/20240403/single_files'
conversion_table_filepath = 'conversion_table/redcap_PGR_conversion_table.csv'
disease_type = 'PDAC'

redcap_preprocessing.preprocess_redcap_data(redcap_filepath,
                                            disease_type=disease_type,
                                            output_dir=output_dir,
                                            save_as_single_file = False)

There are no PDO cell lines for record_id 12.
There are no PDO cell lines for record_id 12.
There are no PDO cell lines for record_id 101.
There are no PDO cell lines for record_id 101.
There are no PDO cell lines for record_id 450.
There are no PDO cell lines for record_id 450.
There are no PDO cell lines for record_id 1174.
There are no PDO cell lines for record_id 1174.
There are no PDO cell lines for record_id 12.
There are no PDO cell lines for record_id 12.
There are no PDO cell lines for record_id 101.
There are no PDO cell lines for record_id 101.
There are no PDO cell lines for record_id 450.
There are no PDO cell lines for record_id 450.
There are no PDO cell lines for record_id 1174.
There are no PDO cell lines for record_id 1174.
There are no PDO cell lines for record_id 12.
There are no PDO cell lines for record_id 12.
There are no PDO cell lines for record_id 101.
There are no PDO cell lines for record_id 101.
There are no PDO cell lines for record_id 450.
There are no PD