In [1]:
import pandas as pd
import os
import numpy as np

# Directory holding every metadata spreadsheet this notebook reads (and the CSV it writes).
# The default is the shared-storage location; set the MULTIMODAL_METADATA_DIR environment
# variable to run the notebook against a copy elsewhere without editing this cell.
base_dir = os.environ.get(
    "MULTIMODAL_METADATA_DIR",
    "/data_isilon_main/isilon_images/10_MetaSystems/MetaSystemsData/Multimodal_Imaging_Daria/_Evaluations/Metadata",
)

# Per-sample technical sheets (two workbooks that are concatenated after loading)
samples_metadata_file_1 = os.path.join(base_dir, "Samples_1.xlsx")
samples_metadata_file_2 = os.path.join(base_dir, "Samples_2.xlsx")

# Patient-level workbooks: sample cohort with infiltration, aberrations, clinical data
patients_metadata_file = os.path.join(base_dir, "20211228_Sample_cohort_with_infiltration.xlsx")
aberrations_file = os.path.join(base_dir, "20211213_Patient_Aberrations.xlsx")
clinical_data_file = os.path.join(base_dir, "20211213_Patient_ClinicalData.xlsx")

In [2]:
#Load data
# Patient-level workbooks
patients_metadata = pd.read_excel(patients_metadata_file)
aberrations_metadata = pd.read_excel(aberrations_file)
clinical_metadata = pd.read_excel(clinical_data_file)

# Sample-level workbooks, stacked into one table with a fresh 0..n-1 index.
# NOTE: make sure both sheets have the same column names and remove the evaluation scoring scheme.
samples_metadata_1, samples_metadata_2 = (
    pd.read_excel(sheet) for sheet in (samples_metadata_file_1, samples_metadata_file_2)
)
samples_metadata = pd.concat([samples_metadata_1, samples_metadata_2], ignore_index=True)

# 1-based running number of every row, assigned before any row is filtered out
samples_metadata['Position'] = list(range(1, len(samples_metadata) + 1))

#Only use the ones that worked
excluded = samples_metadata['exclude'] == 'yes'    # explicitly marked for exclusion
empty_slot = samples_metadata['Slide '].isna()     # no slide recorded in this row
# Disabled alternative from the original cell (its note: "also include samples not yet ablated"):
#samples_metadata = samples_metadata.dropna(subset=['exclude'])

# Filter and renumber in one step so the index has no gaps afterwards
samples_metadata = samples_metadata[~(excluded | empty_slot)].reset_index(drop=True)

#Load technical data and eliminate unnecessary information
# Mapping: column name in the sample sheets -> column name used in the analysis table.
technical_columns = {
    'Slide ': 'Sample',
    'Side [R/L]': 'Puncture_side',
    'Date of analysis': 'Staining_date',
    'Staining done by': 'Stainer',
    'Date2': 'IMC_date',
    'mean IMC score': 'IMC_score',
    'exclude': 'exclude',
}
# Built from the raw values (not a column rename) so the resulting table is created exactly
# as before, with a fresh 0..n-1 index.
metadata = pd.DataFrame(columns=list(technical_columns.values()),
    data=samples_metadata[list(technical_columns)].values)

#Split into positive and negative samples - check whether clinical data and aberrations should also be determined for negative samples
# Any sample name containing the letter 'N' is treated as negative; the mask is computed once
# and reused for both halves. Missing names count as "not negative" (na=False).
is_negative = metadata['Sample'].str.contains('N', na=False)
metadata_neg = metadata[is_negative]  # keeps the row labels of the unsplit table
metadata = metadata[~is_negative].reset_index(drop=True)

# Strip the ' (CZ)' suffix from sample names. Only real strings are split: the previous
# `is not np.nan` identity test let other missing markers (None, pd.NA, a fresh float NaN)
# through and then failed on `.split`.
metadata['Sample'] = [name.split(' (CZ)')[0] if isinstance(name, str) else name for name in metadata['Sample']]

#Load patient data and infiltration
# For every sample, find the first timepoint (in the order DE, RE1, RE2, REL) whose
# 'BM (<timepoint>) ID' column contains the sample ID, then copy that patient's study ID,
# right/left infiltration and DE date into the sample table.
patient_columns = ['Study_ID', 'Infiltration_right', 'Infiltration_left', 'DE_date', 'Timepoint']
metadata = metadata.reindex(columns=metadata.columns.tolist() + patient_columns)
# reindex creates the new columns as float64 NaN. Study_ID and Timepoint receive strings
# (DE_date presumably dates - TODO confirm); writing those into float columns relies on silent
# upcasting, which pandas deprecated in 2.1, so these columns are made object dtype up front.
metadata = metadata.astype({'Study_ID': object, 'DE_date': object, 'Timepoint': object})

timepoints = ['DE', 'RE1', 'RE2', 'REL']

# NOTE: `.at[_i, ...]` addresses rows by label, so this relies on metadata having a 0..n-1 index.
for _i, s in enumerate(metadata['Sample']):
    # Literal (non-regex) substring match, evaluated once per timepoint and reused for all lookups.
    timepoint = None
    patient_rows = None
    for candidate in timepoints:
        candidate_rows = patients_metadata['BM (' + candidate + ') ID'].str.contains(s, regex=False, na=False)
        if candidate_rows.any():
            timepoint, patient_rows = candidate, candidate_rows
            break
    if timepoint is None:
        # Same exception type as the former `[...][0]`, but it now names the offending sample.
        raise IndexError("Sample %r was not found in any 'BM (<timepoint>) ID' column of the patient sheet" % (s,))

    # If several patient rows match, the first one is used.
    patient = patients_metadata.loc[patient_rows]
    metadata.at[_i, 'Study_ID'] = patient['ID'].values[0]
    metadata.at[_i, 'Infiltration_right'] = patient['BM (' + timepoint + ') DTCs (Right)'].values[0]
    metadata.at[_i, 'Infiltration_left'] = patient['BM (' + timepoint + ') DTCs (Left)'].values[0]
    metadata.at[_i, 'DE_date'] = patient['DE_date'].values[0]
    metadata.at[_i, 'Timepoint'] = timepoint

#Load aberrations - Following patients are missing in aberrations: AT-0060, AT-0100, AT-0130 (double value), AT-0131
aberrations = ['MYCN amp.', 'ALK amp.', 'ALK mut.', 'ATRX del.', 'TERT rear.', '1p loss', '1q gain', '11q loss', '17q gain']

# Cast to pandas' string dtype so the .str accessor works on the ID column
aberrations_metadata['Patient ID'] = aberrations_metadata['Patient ID'].astype("string")

# One row mask per sample (literal substring match on the study ID), computed once and reused
# for every aberration column instead of once per column.
aberration_matches = [
    aberrations_metadata['Patient ID'].str.contains(study_id, regex=False, na=False)
    for study_id in metadata['Study_ID']
]
aberration_found = [bool(match.any()) for match in aberration_matches]

# Patients without an aberration entry used to abort the cell with a bare IndexError from
# `.values[0]`; they are now reported and filled with NaN.
patients_without_aberrations = sorted({
    study_id for study_id, found in zip(metadata['Study_ID'], aberration_found) if not found
})
if patients_without_aberrations:
    print('No aberration entry found (values set to NaN) for: ' + ', '.join(patients_without_aberrations))

for a in aberrations:
    # If a patient has several rows, the first one is used.
    metadata[a] = [
        aberrations_metadata.loc[match, a].values[0] if found else np.nan
        for match, found in zip(aberration_matches, aberration_found)
    ]

# Replace the spreadsheet headers with identifier-style names (no spaces or dots, no leading digit)
metadata = metadata.rename(columns={'MYCN amp.':'MYCN_amp', 'ALK amp.':'ALK_amp', 'ALK mut.':'ALK_mut', 'ATRX del.':'ATRX_del', 'TERT rear.':'TERT_rear',
    '1p loss':'X1p_loss', '1q gain':'X1q_gain', '11q loss':'X11q_loss', '17q gain':'X17q_gain'})

#Load clinical data
clinical_col = ['stage4', 'rez_dat', 'prog_dat', 'event_dat', 'tod_dat1', 'efs', 'tod', 'efs_dur', 'tod_dur', 'info']

# One row mask per sample (literal substring match of the study ID against 'uid'), computed
# once and reused for every clinical column instead of once per column.
clinical_matches = [
    clinical_metadata['uid'].str.contains(study_id, regex=False, na=False)
    for study_id in metadata['Study_ID']
]
clinical_found = [bool(match.any()) for match in clinical_matches]

# Patients without a clinical entry used to abort the cell with a bare IndexError from
# `.values[0]`; they are now reported and filled with NaN.
patients_without_clinical_data = sorted({
    study_id for study_id, found in zip(metadata['Study_ID'], clinical_found) if not found
})
if patients_without_clinical_data:
    print('No clinical entry found (values set to NaN) for: ' + ', '.join(patients_without_clinical_data))

for c in clinical_col:
    # If a patient has several rows, the first one is used.
    metadata[c] = [
        clinical_metadata.loc[match, c].values[0] if found else np.nan
        for match, found in zip(clinical_matches, clinical_found)
    ]

#metadata['Study ID'] = [patients_metadata.loc[patients_metadata['BM (DE) ID'].str.contains(i) | patients_metadata['BM (RE1) ID'].str.contains(i) 
#    | patients_metadata['BM (RE2) ID'].str.contains(i) | patients_metadata['BM (REL) ID'].str.contains(i), 'ID'].values[0] for i in metadata['Study ID']]

# Save the assembled per-sample table next to the input spreadsheets
metadata_csv_file = os.path.join(base_dir, 'metadata_all.csv')
metadata.to_csv(metadata_csv_file)


03-0313
0
06-2373
1
17-3502
2
15-2666
3
07-1861
4
06-2847
5
11-0262
6
11-3238
7
06-3069
8
14-3450
9
09-0920
10
18-5369
11
11-2786
12
07-2453
13
14-1331
14
04-0968
15
15-1320
16
18-2796
17
14-0309
18
13-4339
19
19-2548
20
18-2306
21
15-0229
22
07-3302
23
04-2136
24
13-4453
25
04-0813
26
18-1571
27
16-3300
28
14-2253
29
18-4876
30
16-1454
31
16-5841
32
04-0435
33
14-3965
34
19-4756
35
04-0767
36
04-2066
37
16-0124
38
18-2231
39
15-0950
40
03-0298
41
18-3728
42
09-1021
43
18-2600
44
04-1614
45
15-3617
46
09-0092
47
14-2892
48
13-2083
49
15-1576
50
18-1404
51
18-2341
52
09-1107
53
15-2770
54
15-4510
55
15-1549
56
12-2237
57
06-3859
58
15-1814
59
15-4368
60
15-0813
61
16-1783
62
17-0492
63
15-2468
64
17-0390
65
18-3107
66
03-3812
67
17-4599
68
11-0521
69
14-0734
70
18-5770
71
17-2022
72
10-1771
73
03-4975
74
17-3394
75
15-2444
76
14-3093
77
15-2176
78
06-2655
79
16-0213
80
16-3184
81
15-3531
82
04-0164
83
14-0025
84
14-2675
85
18-1462
86
16-1370
87
11-1272
88
14-2108
89
15-2831
90
18-3011
9