In [22]:
import pandas as pd
import numpy as np
import warnings
from functools import reduce
import re

In [23]:
# Name of the pan-cancer clinical table and its location (a relative path).
# The path is built from the name so the two can never drift apart.
file_name = "clinical_Pan-cancer.Dec2020.tsv"
file_path = "../../../input/" + file_name

In [24]:
# Cohort selector; must be one of the cohort keys that load_clin accepts.
filter_type = "pancanccrcc"

In [31]:
# Maps every `filter_type` accepted by load_clin to the value stored in the
# table's `tumor_code` column.
_TUMOR_CODES = {
    'pancanbrca': 'BR', 'pancanccrcc': 'CCRCC', 'pancanendometrial': 'UCEC',
    'pancangbm': 'GBM', 'pancanhnscc': 'HNSCC', 'pancanlscc': 'LSCC',
    'pancanluad': 'LUAD', 'pancanpda': 'PDA', 'pancanhcc': 'HCC',
    'pancancolon': 'CO', 'pancanovarian': 'OV',
}

# The only file name load_clin knows how to parse.
_CLINICAL_FILE_NAME = "clinical_Pan-cancer.Dec2020.tsv"

# Raw column labels that make up each derived table, and the category
# prefixes that are stripped from those labels afterwards.
_DEMOGRAPHIC_COLUMNS = [
    'discovery_study',
    'discovery_study/type_of_analyzed_samples',
    'consent/age',
    'consent/sex',
    'consent/race',
    'consent/ethnicity',
    'consent/ethnicity_race_ancestry_identified',
    'consent/collection_in_us',
    'consent/participant_country',
    'consent/maternal_grandmother_country',
    'consent/maternal_grandfather_country',
    'consent/paternal_grandmother_country',
    'consent/paternal_grandfather_country',
    'consent/deaf_or_difficulty_hearing',
    'consent/blind_or_difficulty_seeing',
    'consent/difficulty_concentrating_remembering_or_making_decisions',
    'consent/difficulty_walking_or_climbing_stairs',
    'consent/difficulty_dressing_or_bathing',
    'consent/difficulty_doing_errands',
    'consent/consent_form_signed',
    'consent/case_stopped',
    'medications/medication_name_vitamins_supplements',
    'medications/history_source',
    'medical_history/height_at_time_of_surgery_cm',
    'medical_history/weight_at_time_of_surgery_kg',
    'medical_history/bmi',
    'medical_history/history_of_cancer',
    'medical_history/alcohol_consumption',
    'medical_history/tobacco_smoking_history',
    'medical_history/age_at_which_the_participant_started_smoking',
    'medical_history/age_at_which_the_participant_stopped_smoking',
    'medical_history/on_the_days_participant_smoked_how_many_cigarettes_did_he_she_usually_smoke',
    'medical_history/number_of_pack_years_smoked',
    'medical_history/was_the_participant_exposed_to_secondhand_smoke',
    'medical_history/exposure_to_secondhand_smoke_in_household_during_participants_childhood',
    'medical_history/exposure_to_secondhand_smoke_in_participants_current_household',
    'medical_history/number_of_years_participant_has_consumed_more_than_2_drinks_per_day_for_men_and_more_than_1_drink_per_day_for_women',
]
_DEMOGRAPHIC_PREFIXES = ['consent/', 'medications/', 'medical_history/']

_PREVIOUS_CANCER_COLUMNS = [
    'cancer_history/cancer_type',
    'cancer_history/history_source',
    'cancer_history/history_of_any_treatment',
    'cancer_history/medical_record_documentation_of_this_history_of_cancer_and_treatment',
]
_PREVIOUS_CANCER_PREFIXES = ['cancer_history/']

_MEDICAL_CONDITION_COLUMNS = [
    'general_medical_history/medical_condition',
    'general_medical_history/history_of_treatment',
    'general_medical_history/history_source',
]
_MEDICAL_CONDITION_PREFIXES = ['general_medical_history/']

_CANCER_DIAGNOSIS_COLUMNS = [
    'baseline/tumor_site',
    'baseline/tumor_site_other',
    'baseline/tumor_laterality',
    'baseline/tumor_focality',
    'baseline/tumor_size_cm',
    'baseline/histologic_type',
    'cptac_path/histologic_grade',
    'baseline/tumor_necrosis',
    'baseline/margin_status',
    'baseline/ajcc_tnm_cancer_staging_edition',
    'baseline/pathologic_staging_primary_tumor',
    'baseline/pathologic_staging_regional_lymph_nodes',
    'baseline/number_of_lymph_nodes_examined',
    'baseline/ihc_staining_done',
    'baseline/he_staining_done',
    'baseline/number_of_positive_lymph_nodes_by_he_staining',
    'baseline/clinical_staging_distant_metastasis',
    'baseline/pathologic_staging_distant_metastasis',
    'baseline/specify_distant_metastasis_documented_sites',
    'baseline/residual_tumor',
    'baseline/tumor_stage_pathological',
    'baseline/paraneoplastic_syndrome_present',
    'baseline/performance_status_assessment_ecog_performance_status_score',
    'baseline/performance_status_assessment_karnofsky_performance_status_score',
    'baseline/number_of_positive_lymph_nodes_by_ihc_staining',
    'baseline/perineural_invasion',
    'procurement/blood_collection_minimum_required_blood_collected',
    'procurement/blood_collection_number_of_blood_tubes_collected',
    'procurement/tumor_tissue_collection_tumor_type',
    'procurement/tumor_tissue_collection_number_of_tumor_segments_collected',
    'procurement/tumor_tissue_collection_clamps_used',
    'procurement/tumor_tissue_collection_frozen_with_oct',
    'procurement/normal_adjacent_tissue_collection_number_of_normal_segments_collected',
    'Recurrence-free survival',
    'Overall survial',
    'Recurrence status (1, yes; 0, no)',
    'Survial status (1, dead; 0, alive)',
]
_CANCER_DIAGNOSIS_PREFIXES = ['baseline/', 'cptac_path/', 'procurement/']

_FOLLOWUP_COLUMNS = [
    'follow-up/follow_up_period',
    'follow-up/is_this_patient_lost_to_follow-up',
    'follow-up/vital_status_at_date_of_last_contact',
    'follow-up/days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact',
    'follow-up/adjuvant_post-operative_radiation_therapy',
    'follow-up/adjuvant_post-operative_pharmaceutical_therapy',
    'follow-up/adjuvant_post-operative_immunological_therapy',
    'follow-up/tumor_status_at_date_of_last_contact_or_death',
    'follow-up/measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment',
    'follow-up/measure_of_success_of_outcome_at_last_available_follow-up',
    'follow-up/eastern_cooperative_oncology_group_at_last_available_follow-up',
    'follow-up/karnofsky_score_preoperative_at_last_available_follow-up',
    'follow-up/performance_status_scale_timing_at_last_available_follow-up',
    'follow-up/measure_of_success_of_outcome_at_first_NTE',
    'follow-up/eastern_cooperative_oncology_group_at_first_NTE',
    'follow-up/karnofsky_score_preoperative_at_first_NTE',
    'follow-up/performance_status_scale_timing_at_first_NTE',
    'follow-up/new_tumor_after_initial_treatment',
    'follow-up/days_from_date_of_initial_pathologic_diagnosis_to_date_of_new_tumor_after_initial_treatment',
    'follow-up/type_of_new_tumor',
    'follow-up/site_of_new_tumor',
    'follow-up/other_site_of_new_tumor',
    'follow-up/diagnostic_evidence_of_recurrence_or_relapse',
    'follow-up/additional_surgery_for_new_tumor',
    'follow-up/residual_tumor_after_surgery_for_new_tumor',
    'follow-up/additional_treatment_for_new_tumor_radiation',
    'follow-up/additional_treatment_for_new_tumor_pharmaceutical',
    'follow-up/additional_treatment_for_new_tumor_immunological',
    'follow-up/days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor',
    'follow-up/cause_of_death',
    'follow-up/days_from_date_of_initial_pathologic_diagnosis_to_date_of_death',
]
_FOLLOWUP_PREFIXES = ['follow-up/']


def _strip_prefixes(frame, prefixes):
    """Return a copy of `frame` whose column labels have a leading category prefix removed."""
    def strip(label):
        # Only a prefix at the very start of the label is removed, so a label
        # that merely contains the text elsewhere (e.g. the trailing
        # "follow-up" in "..._lost_to_follow-up") is left intact.
        for prefix in prefixes:
            if label.startswith(prefix):
                return label[len(prefix):]
        return label
    return frame.rename(columns=strip)


def _split_multivalue(column, sep="|"):
    """Turn each `sep`-delimited string of `column` into a list; missing values stay missing."""
    # A column with no values at all may not be string-typed, in which case
    # the `.str` accessor would raise; there is nothing to split anyway.
    if column.isna().all():
        return column
    return column.str.split(sep)


def _explode_multivalue(frame, sep="|"):
    """Split every column of `frame` on `sep` and give each item its own row."""
    split = frame.copy()
    for label in split.columns:
        split[label] = _split_multivalue(split[label], sep)
    # Each column is exploded on its own and the results are re-assembled, so
    # all columns of a row must hold the same number of items; otherwise
    # pandas cannot align the exploded columns and raises.
    return split.apply(pd.Series.explode)


def load_clin(filter_type, clinical_file_path=None):
    """Load the pan-cancer clinical table and split one cohort into topical tables.

    Parameters
    ----------
    filter_type : str
        Cohort key, e.g. ``'pancanccrcc'``; one of the keys of ``_TUMOR_CODES``.
    clinical_file_path : str or path-like, optional
        Location of the clinical TSV. When omitted, the notebook-level
        ``file_name`` and ``file_path`` variables are used, as before.

    Returns
    -------
    dict
        Maps table name to DataFrame, every frame indexed by ``Patient_ID``:

        * ``'clinical'`` - all columns for the cohort, sorted by patient.
        * ``'demographic'`` - category prefixes stripped; the two medication
          columns hold lists (split on ``"|"``) and the medication
          ``history_source`` column is renamed ``med_history_source``.
        * ``'previous_cancer'`` / ``'medical_conditions'`` - only present when
          at least one patient has a value; one row per ``"|"``-separated item.
        * ``'cancer_diagnosis'`` and ``'followup'`` - category prefixes stripped.

        The dict is empty when the file name is not the expected
        ``clinical_Pan-cancer.Dec2020.tsv``.

    Raises
    ------
    KeyError
        If `filter_type` is not a known cohort key, or (from pandas) if an
        expected column is missing from the file.
    """
    import os

    # Fail before the file is read, and say which keys are valid.
    if filter_type not in _TUMOR_CODES:
        raise KeyError(
            f"unknown filter_type {filter_type!r}; expected one of {sorted(_TUMOR_CODES)}"
        )

    if clinical_file_path is None:
        # Backward-compatible default: the notebook-level configuration.
        source_name, source_path = file_name, file_path
    else:
        source_path = clinical_file_path
        source_name = os.path.basename(source_path)

    data = {}
    if source_name != _CLINICAL_FILE_NAME:
        # Always hand back a dict rather than falling through to None.
        return data

    clinical = pd.read_csv(source_path, sep="\t")
    clinical = clinical.loc[clinical['tumor_code'] == _TUMOR_CODES[filter_type]]
    clinical = clinical.set_index("case_id")
    clinical.index.name = 'Patient_ID'
    clinical = clinical.sort_values(by=["Patient_ID"])
    data["clinical"] = clinical

    # Demographic table: the medication columns become lists but are not
    # exploded, so this table keeps exactly one row per patient.
    demographic = _strip_prefixes(clinical[_DEMOGRAPHIC_COLUMNS], _DEMOGRAPHIC_PREFIXES)
    demographic = demographic.assign(
        medication_name_vitamins_supplements=_split_multivalue(
            demographic['medication_name_vitamins_supplements']),
        history_source=_split_multivalue(demographic['history_source']),
    )
    # After prefix stripping this is the medications source column; presumably
    # renamed so it is not confused with `history_source` in the other tables.
    data['demographic'] = demographic.rename(columns={'history_source': 'med_history_source'})

    # Optional tables: included only when some patient has a value.
    previous_cancer = clinical[_PREVIOUS_CANCER_COLUMNS].dropna(how='all')
    if not previous_cancer.empty:
        data['previous_cancer'] = _explode_multivalue(
            _strip_prefixes(previous_cancer, _PREVIOUS_CANCER_PREFIXES))

    medical_conditions = clinical[_MEDICAL_CONDITION_COLUMNS].dropna(how='all')
    if not medical_conditions.empty:
        data['medical_conditions'] = _explode_multivalue(
            _strip_prefixes(medical_conditions, _MEDICAL_CONDITION_PREFIXES))

    data['cancer_diagnosis'] = _strip_prefixes(
        clinical[_CANCER_DIAGNOSIS_COLUMNS], _CANCER_DIAGNOSIS_PREFIXES)
    data['followup'] = _strip_prefixes(clinical[_FOLLOWUP_COLUMNS], _FOLLOWUP_PREFIXES)

    return data


In [32]:
# Use the cohort chosen in the configuration cell instead of repeating the
# literal, so changing `filter_type` there is enough to switch cohorts.
data = load_clin(filter_type)

In [42]:
# Demographic table: total column count vs. columns holding at least one value.
dem = data['demographic']
demv = dem.dropna(axis='columns', how='all')
print(f"cols demographic: {dem.shape[1]}")
print(f"cols with values (not all NA): {demv.shape[1]}")
dem

cols demographic: 37
cols with values (not all NA): 37


Unnamed: 0_level_0,discovery_study,discovery_study/type_of_analyzed_samples,age,sex,race,ethnicity,ethnicity_race_ancestry_identified,collection_in_us,participant_country,maternal_grandmother_country,...,alcohol_consumption,tobacco_smoking_history,age_at_which_the_participant_started_smoking,age_at_which_the_participant_stopped_smoking,on_the_days_participant_smoked_how_many_cigarettes_did_he_she_usually_smoke,number_of_pack_years_smoked,was_the_participant_exposed_to_secondhand_smoke,exposure_to_secondhand_smoke_in_household_during_participants_childhood,exposure_to_secondhand_smoke_in_participants_current_household,number_of_years_participant_has_consumed_more_than_2_drinks_per_day_for_men_and_more_than_1_drink_per_day_for_women
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,Yes,Tumor_and_Normal,72,Male,White,Not-Hispanic or Latino,White,Yes,United States,Other: Unknown,...,Lifelong non-drinker,Current reformed smoker within past 15 years,18,62,20,44.0,Exposure to secondhand smoke history not avail...,,,
C3L-00010,Yes,Tumor_and_Normal,30,Male,White,Not-Hispanic or Latino,White,Yes,United States,Other: Unknown,...,Alcohol consumption history not available,Current smoker: Includes daily and non-daily s...,20,,20,10.0,Exposure to secondhand smoke history not avail...,,,
C3L-00011,Yes,Tumor_and_Normal,63,Female,White,Not-Hispanic or Latino,White,Yes,Other: Unknown,Other: Unknown,...,Alcohol consumption equal to or less than 2 dr...,Lifelong non-smoker: Less than 100 cigarettes ...,,,,,Exposure to secondhand smoke history not avail...,,,
C3L-00026,Yes,Tumor_and_Normal,65,Female,White,Not-Hispanic or Latino,White,Yes,United States,Other: Unknown,...,Lifelong non-drinker,Lifelong non-smoker: Less than 100 cigarettes ...,,,,,Exposure to secondhand smoke history not avail...,,,
C3L-00079,Yes,Tumor_and_Normal,49,Male,White,Not-Hispanic or Latino,white,Yes,United States,Other: Unknown,...,Lifelong non-drinker,Lifelong non-smoker: Less than 100 cigarettes ...,,,,,Yes,Yes,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02945,No,,66,Female,White,Not-Hispanic or Latino,white,Yes,United States,United States,...,Lifelong non-drinker,Lifelong non-smoker: Less than 100 cigarettes ...,,,,,Exposure to secondhand smoke history not avail...,,,
C3N-03018,No,,65,Male,,,Caucasian,No,Poland,Poland,...,Alcohol consumption equal to or less than 2 dr...,"Current reformed smoker, more than 15 years",16,21,20,5.0,Exposure to secondhand smoke history not avail...,,,
C3N-03019,No,,46,Male,,,Caucasian,No,Poland,Poland,...,Alcohol consumption equal to or less than 2 dr...,Current reformed smoker within past 15 years,17,37,20,20.0,Exposure to secondhand smoke history not avail...,,,
C3N-03020,No,,65,Male,,,Caucasian,No,Poland,Poland,...,Alcohol consumption equal to or less than 2 dr...,Lifelong non-smoker: Less than 100 cigarettes ...,,,,,Yes,Yes,,


In [44]:
# Previous-cancer table: total column count vs. columns holding at least one value.
# The key only exists when load_clin found at least one patient with a
# previous-cancer record.
pc = data['previous_cancer']
pcv = pc.dropna(axis=1, how='all')
print('cols previous_cancer:', len(pc.columns))
print('cols with values (not all NA):', len(pcv.columns))
pc

cols demographic: 4
cols with values (not all NA): 4


Unnamed: 0_level_0,cancer_type,history_source,history_of_any_treatment,medical_record_documentation_of_this_history_of_cancer_and_treatment
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C3L-00011,Metastatic Renal Cell Carcinoma to Rt Patella,Medical Report,Surgery,Yes
C3L-00902,Left Breast Cancer,Medical Record,"Radiation,Surgery,Chemotherapy",Yes
C3L-00902,Left Breast Cancer,Medical Record,"Radiation,Surgery,Chemotherapy",Yes
C3L-00907,melanoma,Medical Record,Surgery,Yes
C3L-01465,Prostate,Medical Record,Surgery,No
C3L-01861,Basal cell carcinoma of skin,Medical Record,Unknown,No
C3L-01885,Renal Cancer,Medical Record,Surgery,Yes
C3L-01885,Prostate Cancer,Medical Record,Surgery,Yes
C3L-02346,Metastatic CRCC to Left Sacrum,Medical Record,Radiation,Yes
C3L-02352,Breast,Medical Record,"Radiation,Surgery",Yes


In [47]:
# Medical-conditions table: total column count vs. columns holding at least one value.
med = data['medical_conditions']
medv = med.dropna(axis='columns', how='all')
print(f"cols medical_conditions: {med.shape[1]}")
print(f"cols with values (not all NA): {medv.shape[1]}")
med

cols medical_conditions: 3
cols with values (not all NA): 3


Unnamed: 0_level_0,medical_condition,history_of_treatment,history_source
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C3L-00004,cataracts,Yes,Medical Record
C3L-00004,pulmonary embolus,Yes,Medical Record
C3L-00004,Hypertension,Yes,Medical Record
C3L-00010,Chronic Hepatitis C without hepatic coma,Unknown,Medical Record
C3L-00010,Intravenous Drug User,Unknown,Medical Record
...,...,...,...
C3N-03021,cirrhosis,Yes,Self Report
C3N-03021,varicose veins,Yes,Self Report
C3N-03021,inferior vena cava thrombosis,Yes,Self Report
C3N-03021,erythremia,Yes,Self Report


In [49]:
# Cancer-diagnosis table: total column count vs. columns holding at least one value.
cd = data['cancer_diagnosis']
cdv = cd.dropna(axis='columns', how='all')
print(f"cols cancer_diagnosis: {cd.shape[1]}")
print(f"cols with values (not all NA): {cdv.shape[1]}")
cd

cols cancer_diagnosis: 37
cols with values (not all NA): 34


Unnamed: 0_level_0,tumor_site,tumor_site_other,tumor_laterality,tumor_focality,tumor_size_cm,histologic_type,histologic_grade,tumor_necrosis,margin_status,ajcc_tnm_cancer_staging_edition,...,blood_collection_number_of_blood_tubes_collected,tumor_tissue_collection_tumor_type,tumor_tissue_collection_number_of_tumor_segments_collected,tumor_tissue_collection_clamps_used,tumor_tissue_collection_frozen_with_oct,normal_adjacent_tissue_collection_number_of_normal_segments_collected,Recurrence-free survival,Overall survial,"Recurrence status (1, yes; 0, no)","Survial status (1, dead; 0, alive)"
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,,,Left Kidney,Unifocal,12.0,Clear cell renal cell carcinoma,"G3: Nuclei very irregular, approximately 20µm;...",Present,Margin(s) involved by invasive carcinoma,Seventh Edition (2010),...,1.0,Kidney: Clear Cell Renal Cell Carcinoma,1.0,No,No,1.0,,384.0,0,0.0
C3L-00010,,,Left Kidney,Unifocal,6.5,Clear cell renal cell carcinoma,"G3: Nuclei very irregular, approximately 20µm;...",Not identified,Margins uninvolved by invasive carcinoma,Seventh Edition (2010),...,1.0,Kidney: Clear Cell Renal Cell Carcinoma,1.0,No,No,1.0,,896.0,0,0.0
C3L-00011,,,Right Kidney,Unifocal,12.0,Clear cell renal cell carcinoma,"G4: Nuclei bizarre and multilobulated, 20µm or...",Present,Margin(s) involved by invasive carcinoma,Seventh Edition (2010),...,1.0,Kidney: Clear Cell Renal Cell Carcinoma,1.0,No,No,1.0,,241.0,0,1.0
C3L-00026,,,Right Kidney,Unifocal,2.0,Clear cell renal cell carcinoma,"G3: Nuclei very irregular, approximately 20µm;...",Not identified,Margins uninvolved by invasive carcinoma,Seventh Edition (2010),...,1.0,Kidney: Clear Cell Renal Cell Carcinoma,1.0,Yes,No,1.0,,1458.0,0,0.0
C3L-00079,,,Right Kidney,Unifocal,14.0,Clear cell renal cell carcinoma,"G3: Nuclei very irregular, approximately 20µm;...",Present,Margins uninvolved by invasive carcinoma,Seventh Edition (2010),...,1.0,Kidney: Clear Cell Renal Cell Carcinoma,1.0,No,No,1.0,,274.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02945,,,Right Kidney,Unifocal,11.6,Clear cell renal cell carcinoma,"G4: Nuclei bizarre and multilobulated, 20µm or...",Present,Margins uninvolved by invasive carcinoma,Seventh Edition (2010),...,2.0,Kidney: Clear Cell Renal Cell Carcinoma,3.0,No,No,2.0,,95.0,0,0.0
C3N-03018,,,Right Kidney,Unifocal,5.0,Clear cell renal cell carcinoma,"G2: Nuclei slightly irregular, approximately 1...",Not identified,Margins uninvolved by invasive carcinoma,Seventh Edition (2010),...,1.0,Kidney: Clear Cell Renal Cell Carcinoma,1.0,No,No,1.0,,736.0,0,0.0
C3N-03019,,,Right Kidney,Unifocal,2.5,Clear cell renal cell carcinoma,"G2: Nuclei slightly irregular, approximately 1...",Not identified,Margins uninvolved by invasive carcinoma,Seventh Edition (2010),...,1.0,Kidney: Clear Cell Renal Cell Carcinoma,1.0,No,No,1.0,,742.0,0,0.0
C3N-03020,,,Right Kidney,Unifocal,2.0,Clear cell renal cell carcinoma,"G1: Nuclei round, uniform, approximately 10µm;...",Not identified,Margin(s) involved by invasive carcinoma,Seventh Edition (2010),...,1.0,Kidney: Clear Cell Renal Cell Carcinoma,1.0,No,No,0.0,,776.0,0,0.0


In [51]:
# Follow-up table: total column count vs. columns holding at least one value.
fol = data['followup']
folv = fol.dropna(axis='columns', how='all')
print(f"cols followup: {fol.shape[1]}")
print(f"cols with values (not all NA): {folv.shape[1]}")
fol

cols followup: 31
cols with values (not all NA): 26


Unnamed: 0_level_0,follow_up_period,is_this_patient_lost_to_follow-up,vital_status_at_date_of_last_contact,days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact,adjuvant_post-operative_radiation_therapy,adjuvant_post-operative_pharmaceutical_therapy,adjuvant_post-operative_immunological_therapy,tumor_status_at_date_of_last_contact_or_death,measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment,measure_of_success_of_outcome_at_last_available_follow-up,...,other_site_of_new_tumor,diagnostic_evidence_of_recurrence_or_relapse,additional_surgery_for_new_tumor,residual_tumor_after_surgery_for_new_tumor,additional_treatment_for_new_tumor_radiation,additional_treatment_for_new_tumor_pharmaceutical,additional_treatment_for_new_tumor_immunological,days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor,cause_of_death,days_from_date_of_initial_pathologic_diagnosis_to_date_of_death
Patient_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C3L-00004,24 Months,Yes,Living,384.0,No,No,No,Tumor Free,Not Applicable,Unknown,...,,,,,,,,,,
C3L-00010,48 Months,Yes,Living,896.0,No,No,No,Tumor Free,Not Applicable,Complete Remission,...,,,,,,,,,,
C3L-00011,12 Months,No,Deceased,229.0,No,Yes,Yes,With Tumor,Persistent Disease,Patient Deceased,...,,,,,,,,,Main malignancy,241.0
C3L-00026,48 Months,No,Living,1458.0,No,No,No,Tumor Free,Not Applicable,Complete Remission,...,,,,,,,,,,
C3L-00079,12 Months,No,Deceased,245.0,Yes,Yes,Yes,With Tumor,Patient Deceased,Patient Deceased,...,,,,,,,,,Unknown,274.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C3N-02945,12 Months,Yes,Living,95.0,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,...,,,,,,,,,,
C3N-03018,24 Months,No,Living,736.0,No,No,No,Tumor Free,Complete Remission,Complete Remission,...,,,,,,,,,,
C3N-03019,24 Months,No,Living,742.0,No,No,No,Tumor Free,Complete Remission,Complete Remission,...,,,,,,,,,,
C3N-03020,24 Months,No,Living,776.0,No,No,No,Tumor Free,Persistent Disease,Complete Remission,...,,,,,,,,,,


In [None]:
# Sanity check: number of demographic rows whose Patient_ID contains a '#'.
n = dem.loc[dem.index.str.contains('#')]
len(n)

In [None]:
# Sanity check: True when any Patient_ID appears more than once in the demographic table.
dem.index.duplicated(keep='first').any()

In [None]:
# Keep cols with general category name (in case need later)
def load_clin(filter_type='pancanbrca'):
    """Load one cohort of the clinical table, keeping the category prefixes on column labels.

    Variant of the loader that leaves labels such as ``consent/age`` untouched
    and does not split or explode multi-value cells. The ``cancer_history/*``
    columns are not extracted by this variant.

    Reads the notebook-level ``file_name`` and ``file_path`` variables.

    Parameters
    ----------
    filter_type : str, default ``'pancanbrca'``
        Cohort key. The default selects tumor code ``'BR'``, which is what
        this function always loaded before the parameter existed; accepting
        the key keeps ``load_clin('pancanccrcc')``-style calls working after
        this cell has been run.

    Returns
    -------
    dict
        ``'demographic'``, ``'general_medical_history'``, ``'cancer_diagnosis'``
        and ``'followup'`` DataFrames indexed by ``Patient_ID``; an empty dict
        when ``file_name`` is not the expected clinical file.

    Raises
    ------
    KeyError
        If `filter_type` is not a known cohort key, or (from pandas) if an
        expected column is missing from the file.
    """
    tumor_codes = {'pancanbrca': 'BR', 'pancanccrcc': 'CCRCC',
                   'pancanendometrial': 'UCEC', 'pancangbm': 'GBM', 'pancanhnscc': 'HNSCC',
                   'pancanlscc': 'LSCC', 'pancanluad': 'LUAD', 'pancanpda': 'PDA',
                   'pancanhcc': 'HCC', 'pancancolon': 'CO', 'pancanovarian': 'OV'}
    # Fail before the file is read, and say which keys are valid.
    if filter_type not in tumor_codes:
        raise KeyError(
            f"unknown filter_type {filter_type!r}; expected one of {sorted(tumor_codes)}"
        )

    data = {}
    if file_name != "clinical_Pan-cancer.Dec2020.tsv":
        # Always hand back a dict rather than falling through to None.
        return data

    df = pd.read_csv(file_path, sep="\t")
    df = df.loc[df['tumor_code'] == tumor_codes[filter_type]]
    df = df.set_index("case_id")
    df.index.name = 'Patient_ID'
    df = df.sort_values(by=["Patient_ID"])

    # Separate out demographic, general_medical_history, cancer_diagnosis, and followup dfs
    data['demographic'] = df[[
        'discovery_study', 'discovery_study/type_of_analyzed_samples', 'consent/age',
        'consent/sex', 'consent/race', 'consent/ethnicity', 'consent/ethnicity_race_ancestry_identified',
        'consent/collection_in_us', 'consent/participant_country', 'consent/maternal_grandmother_country',
        'consent/maternal_grandfather_country', 'consent/paternal_grandmother_country',
        'consent/paternal_grandfather_country', 'consent/deaf_or_difficulty_hearing',
        'consent/blind_or_difficulty_seeing',
        'consent/difficulty_concentrating_remembering_or_making_decisions',
        'consent/difficulty_walking_or_climbing_stairs', 'consent/difficulty_dressing_or_bathing',
        'consent/difficulty_doing_errands', 'consent/consent_form_signed', 'consent/case_stopped',
        'medical_history/history_of_cancer', 'medical_history/alcohol_consumption',
        'medical_history/tobacco_smoking_history',
        'medical_history/age_at_which_the_participant_started_smoking',
        'medical_history/age_at_which_the_participant_stopped_smoking',
        'medical_history/on_the_days_participant_smoked_how_many_cigarettes_did_he_she_usually_smoke',
        'medical_history/number_of_pack_years_smoked',
        'medical_history/was_the_participant_exposed_to_secondhand_smoke',
        'medical_history/exposure_to_secondhand_smoke_in_household_during_participants_childhood',
        'medical_history/exposure_to_secondhand_smoke_in_participants_current_household',
        'medical_history/number_of_years_participant_has_consumed_more_than_2_drinks_per_day_for_men_and_more_than_1_drink_per_day_for_women',
    ]]

    # In this variant the medication columns travel with the general medical history.
    data['general_medical_history'] = df[[
        'general_medical_history/medical_condition',
        'general_medical_history/history_of_treatment',
        'general_medical_history/history_source',
        'medications/medication_name_vitamins_supplements',
        'medications/history_source',
    ]]

    data['cancer_diagnosis'] = df[[
        'baseline/tumor_site', 'baseline/tumor_site_other', 'baseline/tumor_laterality',
        'baseline/tumor_focality', 'baseline/tumor_size_cm', 'baseline/histologic_type',
        'cptac_path/histologic_grade', 'baseline/tumor_necrosis', 'baseline/margin_status',
        'baseline/ajcc_tnm_cancer_staging_edition', 'baseline/pathologic_staging_primary_tumor',
        'baseline/pathologic_staging_regional_lymph_nodes', 'baseline/number_of_lymph_nodes_examined',
        'baseline/ihc_staining_done', 'baseline/he_staining_done',
        'baseline/number_of_positive_lymph_nodes_by_he_staining',
        'baseline/clinical_staging_distant_metastasis', 'baseline/pathologic_staging_distant_metastasis',
        'baseline/specify_distant_metastasis_documented_sites', 'baseline/residual_tumor',
        'baseline/tumor_stage_pathological', 'baseline/paraneoplastic_syndrome_present',
        'baseline/performance_status_assessment_ecog_performance_status_score',
        'baseline/performance_status_assessment_karnofsky_performance_status_score',
        'baseline/number_of_positive_lymph_nodes_by_ihc_staining', 'baseline/perineural_invasion',
        'procurement/blood_collection_minimum_required_blood_collected',
        'procurement/blood_collection_number_of_blood_tubes_collected',
        'procurement/tumor_tissue_collection_tumor_type',
        'procurement/tumor_tissue_collection_number_of_tumor_segments_collected',
        'procurement/tumor_tissue_collection_clamps_used',
        'procurement/tumor_tissue_collection_frozen_with_oct',
        'procurement/normal_adjacent_tissue_collection_number_of_normal_segments_collected',
        'Recurrence-free survival', 'Overall survial', 'Recurrence status (1, yes; 0, no)',
        'Survial status (1, dead; 0, alive)',
    ]]

    data['followup'] = df[[
        'follow-up/follow_up_period', 'follow-up/is_this_patient_lost_to_follow-up',
        'follow-up/vital_status_at_date_of_last_contact',
        'follow-up/days_from_date_of_initial_pathologic_diagnosis_to_date_of_last_contact',
        'follow-up/adjuvant_post-operative_radiation_therapy',
        'follow-up/adjuvant_post-operative_pharmaceutical_therapy',
        'follow-up/adjuvant_post-operative_immunological_therapy',
        'follow-up/tumor_status_at_date_of_last_contact_or_death',
        'follow-up/measure_of_success_of_outcome_at_the_completion_of_initial_first_course_treatment',
        'follow-up/measure_of_success_of_outcome_at_last_available_follow-up',
        'follow-up/eastern_cooperative_oncology_group_at_last_available_follow-up',
        'follow-up/karnofsky_score_preoperative_at_last_available_follow-up',
        'follow-up/performance_status_scale_timing_at_last_available_follow-up',
        'follow-up/measure_of_success_of_outcome_at_first_NTE',
        'follow-up/eastern_cooperative_oncology_group_at_first_NTE',
        'follow-up/karnofsky_score_preoperative_at_first_NTE',
        'follow-up/performance_status_scale_timing_at_first_NTE',
        'follow-up/new_tumor_after_initial_treatment',
        'follow-up/days_from_date_of_initial_pathologic_diagnosis_to_date_of_new_tumor_after_initial_treatment',
        'follow-up/type_of_new_tumor', 'follow-up/site_of_new_tumor', 'follow-up/other_site_of_new_tumor',
        'follow-up/diagnostic_evidence_of_recurrence_or_relapse', 'follow-up/additional_surgery_for_new_tumor',
        'follow-up/residual_tumor_after_surgery_for_new_tumor',
        'follow-up/additional_treatment_for_new_tumor_radiation',
        'follow-up/additional_treatment_for_new_tumor_pharmaceutical',
        'follow-up/additional_treatment_for_new_tumor_immunological',
        'follow-up/days_from_date_of_initial_pathologic_diagnosis_to_date_of_additional_surgery_for_new_tumor',
        'follow-up/cause_of_death', 'follow-up/days_from_date_of_initial_pathologic_diagnosis_to_date_of_death',
    ]]
    return data
        