In [5]:
import pandas as pd
import os

# Directory that holds every metadata workbook read by this notebook.
base_dir = "/data_isilon_main/isilon_images/10_MetaSystems/MetaSystemsData/Multimodal_Imaging_Daria/_Evaluations/Metadata"

# Per-sample technical sheets (two workbooks that are concatenated after loading).
samples_metadata_file_1, samples_metadata_file_2 = (
    os.path.join(base_dir, workbook_name)
    for workbook_name in ("Samples_1.xlsx", "Samples_2.xlsx")
)

# Patient-level sheets: cohort/infiltration, aberrations and clinical data.
patients_metadata_file, aberrations_file, clinical_data_file = (
    os.path.join(base_dir, workbook_name)
    for workbook_name in (
        "20211228_Sample_cohort_with_infiltration.xlsx",
        "20211213_Patient_Aberrations.xlsx",
        "20211213_Patient_ClinicalData.xlsx",
    )
)

In [7]:
#Load data
samples_metadata_1 = pd.read_excel(samples_metadata_file_1)
samples_metadata_2 = pd.read_excel(samples_metadata_file_2)

# Make sure they have the same column names and remove evaluation scoring scheme.
# ignore_index=True gives every row a unique label. Without it both workbooks
# contribute labels 0..n-1, and any label-based operation on the combined frame
# (such as dropping excluded rows) hits the same-numbered row of BOTH workbooks.
samples_metadata = pd.concat([samples_metadata_1, samples_metadata_2], ignore_index=True)
# 1-based running position of each row in the combined (unfiltered) sheet.
samples_metadata['Position'] = list(range(1, len(samples_metadata) + 1))

patients_metadata = pd.read_excel(patients_metadata_file)
aberrations_metadata = pd.read_excel(aberrations_file)
clinical_metadata = pd.read_excel(clinical_data_file)

#Only use the ones that worked: 'exclude' must be filled in and must not be 'yes'.
# A boolean mask filters row by row, so it never removes more rows than intended.
keep_sample = samples_metadata['exclude'].notna() & (samples_metadata['exclude'] != 'yes')
samples_metadata = samples_metadata.loc[keep_sample].copy()

#Load technical data and eliminate unnecessary information
# Maps each column name used in the sample workbook to the name used in `metadata`.
technical_columns = {
    'Slide ': 'Sample',
    'Side [R/L]': 'Puncture side',
    'Date of analysis': 'Staining date',
    'Staining done by': 'Stainer',
    'Date2': 'IMC date',
    'mean IMC score': 'IMC score',
    'exclude': 'exclude',
}
# Built from the raw values, so the new frame gets a fresh 0..n-1 index.
metadata = pd.DataFrame(
    data=samples_metadata[list(technical_columns.keys())].values,
    columns=list(technical_columns.values()),
)

#Split into positive and negative samples - check whether clinical data and aberrations should also be determined for negative samples
# A sample counts as negative when its name contains the letter 'N'.
is_negative = metadata['Sample'].str.contains('N')
metadata_neg = metadata[is_negative]
metadata = metadata.loc[~is_negative]

#Load patient data and infiltration - here REL is wrongly called RE2 in excel sheet
patient_columns = ['Study ID', 'Infiltration right', 'Infiltration left', 'DE date', 'Timepoint']
metadata = metadata.reindex(columns=metadata.columns.tolist() + patient_columns)

# Bone-marrow ID column of every time point present in the patient sheet, as
# string dtype so that empty cells never break the `.str` accessor.
# Insertion order defines the search priority: DE, RE1, RE2, REL.
timepoint_ids = {}
for timepoint in ['DE', 'RE1', 'RE2', 'REL']:
    id_column = 'BM (' + timepoint + ') ID'
    if id_column in patients_metadata.columns:
        timepoint_ids[timepoint] = patients_metadata[id_column].astype("string")

# Iterate over the real index labels: after the negative samples were dropped
# the index has gaps, so a positional counter would write to the wrong rows
# (or create new, empty ones) when used with `.at`.
for row_label, sample in metadata['Sample'].items():
    for tmp, bm_ids in timepoint_ids.items():
        # Literal (non-regex) substring match; empty cells count as "no match".
        hits = bm_ids.str.contains(str(sample), regex=False, na=False).to_numpy(dtype=bool)
        if hits.any():
            break
    else:
        raise LookupError(
            f"Sample {sample!r} was not found in any 'BM (<time point>) ID' column of the patient sheet"
        )

    # If several patient rows match, the first one is used.
    patient_rows = patients_metadata.loc[hits]
    metadata.at[row_label, 'Study ID'] = patient_rows['ID'].values[0]
    metadata.at[row_label, 'Infiltration right'] = patient_rows['BM (' + tmp + ') DTCs (Right)'].values[0]
    metadata.at[row_label, 'Infiltration left'] = patient_rows['BM (' + tmp + ') DTCs (Left)'].values[0]
    metadata.at[row_label, 'DE date'] = patient_rows['DE_date'].values[0]
    metadata.at[row_label, 'Timepoint'] = tmp

def _first_match_positions(id_column, study_ids):
    """Locate, for every study ID, the first row of ``id_column`` that contains it.

    Parameters
    ----------
    id_column : pandas.Series
        Patient identifiers of a lookup sheet; converted to string dtype here.
    study_ids : iterable
        Study IDs to look up, in the row order of the target frame.

    Returns
    -------
    list
        One entry per study ID: the 0-based row position of the first literal
        substring match, or ``None`` when the ID is missing or has no match.
    """
    ids = id_column.astype("string")
    positions = []
    for study_id in study_ids:
        if pd.isna(study_id):
            positions.append(None)
            continue
        hits = ids.str.contains(str(study_id), regex=False, na=False).to_numpy(dtype=bool)
        positions.append(int(hits.argmax()) if hits.any() else None)
    return positions


def _take_or_nan(column, positions):
    """Return ``column`` values at ``positions``, using NaN where a position is ``None``."""
    values = column.values
    return [values[pos] if pos is not None else float('nan') for pos in positions]


#Load aberrations - Following patients are missing in aberrations: AT-0060, AT-0100, AT-0130 (double value), AT-0131
aberrations = ['MYCN amp.', 'ALK amp.', 'ALK mut.', 'ATRX del.', 'TERT rear.', '1p loss', '1q gain', '11q loss', '17q gain']

aberrations_metadata['Patient ID'] = aberrations_metadata['Patient ID'].astype("string")

# Match each patient once and reuse the row position for every aberration column.
# Patients without an entry get NaN instead of aborting the cell with IndexError.
aberration_rows = _first_match_positions(aberrations_metadata['Patient ID'], metadata['Study ID'])
for a in aberrations:
    metadata[a] = _take_or_nan(aberrations_metadata[a], aberration_rows)

missing_aberrations = sorted({str(sid) for sid, pos in zip(metadata['Study ID'], aberration_rows) if pos is None})
if missing_aberrations:
    print("No aberration data for:", ", ".join(missing_aberrations))

#Load clinical data
clinical_col = ['stage4', 'rez_dat', 'prog_dat', 'event_dat', 'tod_dat1', 'efs', 'tod', 'efs_dur', 'tod_dur', 'info']

clinical_rows = _first_match_positions(clinical_metadata['uid'], metadata['Study ID'])
for c in clinical_col:
    metadata[c] = _take_or_nan(clinical_metadata[c], clinical_rows)

missing_clinical = sorted({str(sid) for sid, pos in zip(metadata['Study ID'], clinical_rows) if pos is None})
if missing_clinical:
    print("No clinical data for:", ", ".join(missing_clinical))

# Persist the combined per-sample metadata next to the source workbooks.
metadata.to_csv(os.path.join(base_dir, 'metadata.csv'))
