In [28]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [29]:
save_dir = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/metadata_2025_july_05'

In [30]:
def extract_subject_factors_dataframe(data):
    '''
    From the Json files saved from Metabolomics Workbench
    '''
    samples = []
    for sample in data['SUBJECT_SAMPLE_FACTORS']:
        sample_info = {
            'Subject ID': sample['Subject ID'],
            'Sample ID': sample['Sample ID']
        }
        
        # Extract subkeys from 'Factors'
        for key, value in sample['Factors'].items():
            sample_info[key] = value
        
        # Extract subkeys from 'Additional sample data'
        for key, value in sample['Additional sample data'].items():
            sample_info[key] = value
        
        samples.append(sample_info)

    # Create a DataFrame from the extracted information
    df = pd.DataFrame(samples)
    return df

def increment_the_last_number(df, column_name, amount=2):
    def decrement(s):
        head, _, tail = s.rpartition('_')
        return f"{head}_{int(tail) + int(amount):03}"
    
    df[column_name] = df[column_name].apply(decrement)
    return df


sex_label_matching_dict = {
    '1': 'M',
    '2': 'F',
    'Male': 'M',
    'Female': 'F',
    'M': 'M',
    'F': 'F',
    1 : 'M',
    2 : 'F'}


In [31]:
old_data_loc = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/April_05_Data'

new_data_loc = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/May_28_Data'


In [32]:
metadata_file = f'{old_data_loc}/metadata.csv'
# Read the metadata file
metadata = pd.read_csv(metadata_file, index_col=0)

In [33]:
metadata.columns

Index(['cohort_id', 'Study ID', 'Cohort Label', 'Cohort ID', 'OS', 'OS_Event',
       'Age', 'subject ID', 'study_week', 'Region', 'Sex', 'Race',
       'Dose (mg/kg)', 'phase', 'Treatment', 'Prior_2', 'batch_id',
       'runtime_hour', 'run_order', 'MSKCC', 'ORR', 'Benefit',
       'ExtremeResponder', 'PFS', 'PFS_Event', 'MV', 'Age_Group',
       'Benefit BINARY', 'Benefit ORDINAL', 'Nivo Benefit BINARY',
       'MSKCC BINARY', 'MSKCC ORDINAL', 'Matt Set', 'Set', 'Pretrain',
       'is Pediatric', 'Cohort Label ENC', 'Study ID ENC', 'file id',
       'Age Range (min)', 'Age Range (max)', 'IMDC', 'IMDC ORDINAL',
       'IMDC BINARY', 'is Female'],
      dtype='object')

In [34]:
metadata['Study ID'].value_counts()

Study ID
ST001932    4482
ST001422    2051
ST001931    2044
ST001428    1522
ST001237    1379
ST002331    1315
ST001423    1192
ST000909     742
ST001849     691
ST002027     356
ST001408     349
ST002112     335
ST001236     271
ST001918     271
ST002251     242
ST001519     166
ST002244     122
ST000388      95
ST000422      60
Name: count, dtype: int64

## ST000422
no useful metadata

### Correct the cohort label for ST000422
previously the cohort label for ST000422 was set to adult_cancer, but it should be adult_other

In [35]:
metadata.groupby('Cohort Label')['Cohort Label ENC'].mean()

Cohort Label
adult_cancer       0.0
adult_other        1.0
pediatric_CMD      2.0
pediatric_other    3.0
Name: Cohort Label ENC, dtype: float64

In [36]:
print(metadata[metadata['Study ID'] == 'ST000422']['Cohort Label'].value_counts())

Cohort Label
adult_cancer    60
Name: count, dtype: int64


In [37]:
metadata.loc[metadata['Study ID'] == 'ST000422','Cohort Label'] = 'adult_other'
metadata.loc[metadata['Study ID'] == 'ST000422','Cohort Label ENC'] = 1

In [38]:
existing_ST000422 = metadata[metadata['Study ID'] == 'ST000422'].copy()
existing_ST000422.dropna(axis=1, how='all', inplace=True)
existing_ST000422.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id
10jan12_26-r001.mzML,550,ST000422,adult_other,550,80.226195,Pretrain,Test,False,1,1,10jan12_26-r001.mzML
10jan12_56-r001.mzML,550,ST000422,adult_other,550,79.095221,Pretrain,Test,False,1,1,10jan12_56-r001.mzML
10jan12_13-r002.mzML,550,ST000422,adult_other,550,78.912806,Pretrain,Test,False,1,1,10jan12_13-r002.mzML
10jan12_81-r002.mzML,550,ST000422,adult_other,550,77.891281,Pretrain,Test,False,1,1,10jan12_81-r002.mzML
10jan12_55-r002.mzML,550,ST000422,adult_other,550,77.818314,Pretrain,Test,False,1,1,10jan12_55-r002.mzML


In [39]:
metadata_ST000422 = existing_ST000422[['Study ID']].copy()
metadata_ST000422['Cohort Label v0'] = existing_ST000422['Cohort Label'].iloc[0]
metadata_ST000422['Job ID'] = existing_ST000422['Cohort ID'].iloc[0]
metadata_ST000422['is Pediatric'] = existing_ST000422['is Pediatric'].iloc[0]
metadata_ST000422['Cancer Risk'] = False
metadata_ST000422['Column'] = 'Hilic'
metadata_ST000422['Polarity'] = 'Positive'

In [40]:
metadata_ST000422.to_csv(f'{save_dir}/metadata_ST000422.csv', index=True)

In [41]:
metadata_ST000422.head()

Unnamed: 0,Study ID,Cohort Label v0,Job ID,is Pediatric,Cancer Risk,Column,Polarity
10jan12_26-r001.mzML,ST000422,adult_other,550,False,False,Hilic,Positive
10jan12_56-r001.mzML,ST000422,adult_other,550,False,False,Hilic,Positive
10jan12_13-r002.mzML,ST000422,adult_other,550,False,False,Hilic,Positive
10jan12_81-r002.mzML,ST000422,adult_other,550,False,False,Hilic,Positive
10jan12_55-r002.mzML,ST000422,adult_other,550,False,False,Hilic,Positive


### Leila Note (Friday May 24th):
So I realized we don't need to know baseline and 3 years for cancer labeling
Because if a patient has "-" value in the cancer detection columns, its label would be non-cancer
and if a patient has "1" in in the cancer detection columns, its label would be "cancer", even the cancer diagnosed later, the earlier time point is very interesting, as it reflect cancer risk

## ST001422, ST001423
- lots of patients have gender metadata
- many have Age, BMI and cancer status


In [42]:
existing_ST001422 = metadata[metadata['Study ID'] == 'ST001422'].copy()
existing_ST001423 = metadata[metadata['Study ID'] == 'ST001423'].copy()


In [43]:
asprin_metadata_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/Aspirin_PR000730/combined_patient_samples.xlsx'

asprin_metadata = pd.read_excel(asprin_metadata_file,sheet_name='combined_patient_samples (2)', index_col=0, header=0)
asprin_metadata = asprin_metadata[~(asprin_metadata['IonizationMode'] == 'C18neg')].copy()

### Compare to paper

2020_Metabolomics Analysis of Aspirin’s Effects in Human Colon Tissue and Associations with Adenoma Risk
In table 1 of the supplements

Total number of patients considered: 100+114+111 = 325 patients

"≥1 Adenoma at Year 3 Colonoscopy" has 33+39+47 = 119 patients

"≥1 Advanced Adenoma at Year 3 Colonoscopy" has 6+8+13 = 27 patients

"High-risk Findings at Year 3 Colonoscopy" has 9+11+16= 36 patients

In [44]:
asprin_patients = asprin_metadata.groupby('PatientID').first()
asprin_patients = asprin_patients[~asprin_patients['BMI'].isna()].copy()
patient_total = asprin_patients.shape[0]
print(f'Number of patients in aspirin study with BMI: {patient_total}')


Tradyn_total = asprin_patients['Tradyn'].value_counts()[1]
print(f'Number of patients in aspirin study with Tradyn: {Tradyn_total}')

Advtradyn_total = asprin_patients['Advtradyn'].value_counts()[1]
print(f'Number of patients in aspirin study with Advtradyn: {Advtradyn_total}')

Hradvtradyn_total = asprin_patients['Hradvtradyn'].value_counts()[1]
print(f'Number of patients in aspirin study with Hradvtradyn: {Hradvtradyn_total}')

Number of patients in aspirin study with BMI: 325
Number of patients in aspirin study with Tradyn: 119
Number of patients in aspirin study with Advtradyn: 27
Number of patients in aspirin study with Hradvtradyn: 36


In [45]:
subset_ST001423 = asprin_metadata[~asprin_metadata['RAW_FILE_NAME_ST001423'].isna()].copy()
subset_ST001423.index = subset_ST001423['RAW_FILE_NAME_ST001423'].apply(lambda x: x+'.mzML')

subset_ST001422 = asprin_metadata[~asprin_metadata['RAW_FILE_NAME_ST001422'].isna()].copy()
subset_ST001422.index = subset_ST001422['RAW_FILE_NAME_ST001422'].apply(lambda x: x+'.mzML')


In [46]:
missing_files_ST001423 = subset_ST001423.index.difference(metadata.index)
missing_files_ST001422 = subset_ST001422.index.difference(metadata.index)

print(f'Number of missing files in ST001423: {len(missing_files_ST001423)}')
print(f'Number of missing files in ST001422: {len(missing_files_ST001422)}')

Number of missing files in ST001423: 86
Number of missing files in ST001422: 8


In [47]:
print('Missing with Sex info', np.sum(~subset_ST001423.loc[missing_files_ST001423,'Sex'].isna()))
print('Missing with Age info', np.sum(~subset_ST001423.loc[missing_files_ST001423,'Age'].isna()))

print('Missing with Sex info', np.sum(~subset_ST001422.loc[missing_files_ST001422,'Sex'].isna()))
print('Missing with Age info', np.sum(~subset_ST001422.loc[missing_files_ST001422,'Age'].isna()))



Missing with Sex info 86
Missing with Age info 46
Missing with Sex info 8
Missing with Age info 2


#### Note 

<span style="color:blue">I think I was too quick to remove outliers from this dataset. 
between these two studies, there are another 94 samples that have Sex
with 48 of those having Age, BMI and cancer information, </span>

Leila says we can add some of these back in, focus more on removing the outliers due to the missing values



In [48]:
Patient_total = (subset_ST001423[~subset_ST001423['BMI'].isna()].shape[0] + subset_ST001422[~subset_ST001422['BMI'].isna()].shape[0])/2
print(f'Patient total: {Patient_total}')

Tradyn_total = (subset_ST001423['Tradyn'].value_counts()[1] + subset_ST001422['Tradyn'].value_counts()[1])/2
print(f'Tradyn total: {Tradyn_total}') #119 

Advtradyn_total = (subset_ST001423['Advtradyn'].value_counts()[1] + subset_ST001422['Advtradyn'].value_counts()[1])/2
print(f'Advtradyn total: {Advtradyn_total}')

Hradvtradyn_total = (subset_ST001423['Hradvtradyn'].value_counts()[1] + subset_ST001422['Hradvtradyn'].value_counts()[1])/2
print(f'Hradvtradyn total: {Hradvtradyn_total}')

Patient total: 293.0
Tradyn total: 109.0
Advtradyn total: 24.0
Hradvtradyn total: 33.0


### Update the metadata based on the paper knowledge

In [49]:
# we want to add columns for has_cancer, bmi, 
# update the age, update the Sex
# Leila says we can add some of these 


subset_ST001423['Cancer Risk'] = (subset_ST001423['Tradyn']==1) | (subset_ST001423['Advtradyn']==1) | (subset_ST001423['Hradvtradyn']==1)
subset_ST001422['Cancer Risk'] = (subset_ST001422['Tradyn']==1) | (subset_ST001422['Advtradyn']==1) | (subset_ST001422['Hradvtradyn']==1)

In [50]:
subset_ST001423['Diagnosis'] = subset_ST001423['Cancer Risk'].apply(lambda x: 'Colon Cancer Risk' if x else 'Healthy')
subset_ST001422['Diagnosis'] = subset_ST001422['Cancer Risk'].apply(lambda x: 'Colon Cancer Risk' if x else 'Healthy')

In [51]:
subset_ST001423.groupby(['Tradyn','Advtradyn','Hradvtradyn']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PatientID,Treatment,Race,Age,BMI,Sex,Center,IonizationMode,Date Analyzed,Batch,Available data ST001091,Raw File - colon tissue - ST001091,Available data_ST001422,RAW_FILE_NAME_ST001422,Available data_ST001423,RAW_FILE_NAME_ST001423,Cancer Risk,Diagnosis
Tradyn,Advtradyn,Hradvtradyn,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1,1,20,20,20,20,20,20,20,20,20,20,20,20,0,0,20,20,20,20
1,-,1,4,4,4,4,4,4,4,4,4,4,4,4,0,0,4,4,4,4
1,-,-,64,64,64,64,64,64,64,64,64,64,64,64,0,0,64,64,64,64
-,-,-,148,148,148,148,148,148,148,148,148,148,148,148,0,0,148,148,148,148


In [52]:
subset_ST001423['Diagnosis Type'] = ''
subset_ST001423.loc[subset_ST001423['Tradyn']==1,'Diagnosis Type'] = '≥1 Adenoma at Year 3 Colonoscopy'
subset_ST001423.loc[subset_ST001423['Advtradyn']==1,'Diagnosis Type'] = '≥1 Advanced Adenoma at Year 3 Colonoscopy'
subset_ST001423.loc[subset_ST001423['Hradvtradyn']==1,'Diagnosis Type'] = '≥1 High Risk Findings at Year 3 Colonoscopy'
subset_ST001423.loc[(subset_ST001423['Hradvtradyn']==1) & (subset_ST001423['Tradyn']==1),'Diagnosis Type'] = '≥1 High Risk Findings and ≥1 Advanced Adenoma at Year 3 Colonoscopy'

subset_ST001422['Diagnosis Type'] = ''
subset_ST001422.loc[subset_ST001422['Tradyn']==1,'Diagnosis Type'] = '≥1 Adenoma at Year 3 Colonoscopy'
subset_ST001422.loc[subset_ST001422['Advtradyn']==1,'Diagnosis Type'] = '≥1 Advanced Adenoma at Year 3 Colonoscopy'
subset_ST001422.loc[subset_ST001422['Hradvtradyn']==1,'Diagnosis Type'] = '≥1 High Risk Findings at Year 3 Colonoscopy'
subset_ST001422.loc[(subset_ST001422['Hradvtradyn']==1) & (subset_ST001422['Tradyn']==1),'Diagnosis Type'] = '≥1 High Risk Findings and ≥1 Advanced Adenoma at Year 3 Colonoscopy'

In [79]:
metadata_ST001423 = subset_ST001423[['PatientID','Treatment','Race','Age','BMI','Sex','Cancer Risk','Diagnosis','Diagnosis Type']].copy()
metadata_ST001423.index = subset_ST001423.index.to_list()
metadata_ST001423['Study ID'] = 'ST001423'
metadata_ST001423.rename(columns={'PatientID':'Subject ID'}, inplace=True)

metadata_ST001422 = subset_ST001422[['PatientID','Treatment','Race','Age','BMI','Sex','Cancer Risk','Diagnosis','Diagnosis Type']].copy()
metadata_ST001422.index = subset_ST001422.index.to_list()
metadata_ST001422['Study ID'] = 'ST001422' 
metadata_ST001422.rename(columns={'PatientID':'Subject ID'}, inplace=True)

In [80]:
metadata_ST001423['Job ID'] = existing_ST001422['Cohort ID'].iloc[0]
metadata_ST001422['Job ID'] = existing_ST001423['Cohort ID'].iloc[0]

metadata_ST001423['Cohort Label v0'] = existing_ST001423['Cohort Label'].iloc[0]
metadata_ST001422['Cohort Label v0'] = existing_ST001422['Cohort Label'].iloc[0]

metadata_ST001423['is Pediatric'] = existing_ST001423['is Pediatric'].iloc[0]
metadata_ST001422['is Pediatric'] = existing_ST001422['is Pediatric'].iloc[0]

# add column and polarity
metadata_ST001423['Column'] = 'Hilic'
metadata_ST001422['Column'] = 'Hilic'

metadata_ST001423['Polarity'] = 'Positive'
metadata_ST001422['Polarity'] = 'Positive'

In [81]:
metadata_ST001423['mzml_file'] = metadata_ST001423.index.to_list()
metadata_ST001423['file_id']  = metadata_ST001423['mzml_file'].str.replace('.mzML','')
metadata_ST001423_part1 = metadata_ST001423.copy()


In [82]:
metadata_ST001423_part2 = increment_the_last_number(metadata_ST001423_part1.copy(), 'file_id', 2)
metadata_ST001423_part2['mzml_file'] = metadata_ST001423_part2['file_id'].apply(lambda x: x+'.mzML')
metadata_ST001423_part2.index = metadata_ST001423_part2['mzml_file'].to_list()

In [83]:
metadata_ST001423_part3 = increment_the_last_number(metadata_ST001423_part1.copy(), 'file_id', 4)
metadata_ST001423_part3['mzml_file'] = metadata_ST001423_part3['file_id'].apply(lambda x: x+'.mzML')
metadata_ST001423_part3.index = metadata_ST001423_part3['mzml_file'].to_list()

In [84]:
metadata_ST001423 = pd.concat([metadata_ST001423_part1, metadata_ST001423_part2, metadata_ST001423_part3])
metadata_ST001423.drop(columns=['mzml_file','file_id'], inplace=True)

In [85]:
metadata_ST001423

Unnamed: 0,Subject ID,Treatment,Race,Age,BMI,Sex,Cancer Risk,Diagnosis,Diagnosis Type,Study ID,Job ID,Cohort Label v0,is Pediatric,Column,Polarity
VT_160802_M198_115.mzML,10550,Aspirin 81 mg,1.0,55.0,27.322199,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive
VT_160802_M198_121.mzML,10550,Aspirin 81 mg,1.0,55.0,27.322199,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive
VT_160730_M198_031.mzML,10056,Aspirin 81 mg,,,,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive
VT_160730_M198_037.mzML,10056,Aspirin 81 mg,,,,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive
VT_160801_M198_079.mzML,10640,Aspirin 325 mg,1.0,66.0,25.109628,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VT_160717_M198_101.mzML,91000,Placebo,,,,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive
VT_160722_M198_095.mzML,91065,Aspirin 81 mg,,,,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive
VT_160722_M198_101.mzML,91065,Aspirin 81 mg,,,,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive
VT_160721_M198_071.mzML,91559,Aspirin 81 mg,,,,M,False,Healthy,,ST001423,502,adult_cancer,False,Hilic,Positive


In [86]:
metadata_ST001422['mzml_file'] = metadata_ST001422.index.to_list()
metadata_ST001422['file_id']  = metadata_ST001422['mzml_file'].str.replace('.mzML','')
metadata_ST001422_part1 = metadata_ST001422.copy()

metadata_ST001422_part2 = increment_the_last_number(metadata_ST001422_part1.copy(), 'file_id', 2)
metadata_ST001422_part2['mzml_file'] = metadata_ST001422_part2['file_id'].apply(lambda x: x+'.mzML')
metadata_ST001422_part2.index = metadata_ST001422_part2['mzml_file'].to_list()
metadata_ST001422_part3 = increment_the_last_number(metadata_ST001422_part1.copy(), 'file_id', 4)
metadata_ST001422_part3['mzml_file'] = metadata_ST001422_part3['file_id'].apply(lambda x: x+'.mzML')
metadata_ST001422_part3.index = metadata_ST001422_part3['mzml_file'].to_list()
metadata_ST001422 = pd.concat([metadata_ST001422_part1, metadata_ST001422_part2, metadata_ST001422_part3])
metadata_ST001422.drop(columns=['mzml_file','file_id'], inplace=True)

In [87]:
existing_ST001422['is Female'].value_counts()

is Female
0.0    1176
1.0     599
Name: count, dtype: int64

In [88]:
existing_ST001423['is Female'].value_counts()

is Female
0.0    234
1.0    126
Name: count, dtype: int64

In [110]:

metadata_ST001423_all = metadata_ST001423.join(existing_ST001423[['is Female']],how='outer')
metadata_ST001422_all = metadata_ST001422.join(existing_ST001422[['is Female']],how='outer')

metadata_ST001423_all['Column'] = 'Hilic'
metadata_ST001422_all['Column'] = 'Hilic'

metadata_ST001423_all['Polarity'] = 'Positive'
metadata_ST001422_all['Polarity'] = 'Positive'

metadata_ST001423_all['Study ID'] = 'ST001423'
metadata_ST001422_all['Study ID'] = 'ST001422'

metadata_ST001422_all['Job ID'] = metadata_ST001422_all['Job ID'].mode().iloc[0]
metadata_ST001423_all['Job ID'] = metadata_ST001423_all['Job ID'].mode().iloc[0]

metadata_ST001422_all['Cohort Label v0'] = metadata_ST001422_all['Cohort Label v0'].mode().iloc[0]
metadata_ST001423_all['Cohort Label v0'] = metadata_ST001423_all['Cohort Label v0'].mode().iloc[0]

metadata_ST001422_all['is Pediatric'] = metadata_ST001422_all['is Pediatric'].mode().iloc[0]
metadata_ST001423_all['is Pediatric'] = metadata_ST001423_all['is Pediatric'].mode().iloc[0]

metadata_ST001422_all['Cancer Risk'] = metadata_ST001422_all['Cancer Risk'].mode().iloc[0]
metadata_ST001423_all['Cancer Risk'] = metadata_ST001423_all['Cancer Risk'].mode().iloc[0]

In [113]:
metadata_ST001423_all['Sample_Class'] = 'Study_Sample'
metadata_ST001422_all['Sample_Class'] = 'Study_Sample'

metadata_ST001422_all.loc[metadata_ST001422_all['Sex'].isna(),'Sample_Class'] = 'Study_QC_Sample'
metadata_ST001423_all.loc[metadata_ST001423_all['Sex'].isna(),'Sample_Class'] = 'Study_QC_Sample'

In [95]:
existing_ST001423.shape

(1192, 45)

In [93]:
existing_ST001422.shape

(2051, 45)

In [96]:
metadata_ST001423_all.shape

(1455, 16)

In [97]:
metadata_ST001422_all.shape

(2076, 16)

In [114]:
# save the metadata
metadata_ST001423_all.to_csv(f'{save_dir}/metadata_ST001423.csv', index=True)
metadata_ST001422_all.to_csv(f'{save_dir}/metadata_ST001422.csv', index=True)

In [112]:
metadata_ST001422_all.head(10)

Unnamed: 0,Subject ID,Treatment,Race,Age,BMI,Sex,Cancer Risk,Diagnosis,Diagnosis Type,Study ID,Job ID,Cohort Label v0,is Pediatric,Column,Polarity,is Female
VT_160120_001.mzML,,,,,,,False,,,ST001422,526.0,adult_cancer,False,Hilic,Positive,
VT_160120_003.mzML,,,,,,,False,,,ST001422,526.0,adult_cancer,False,Hilic,Positive,
VT_160120_005.mzML,,,,,,,False,,,ST001422,526.0,adult_cancer,False,Hilic,Positive,
VT_160120_007.mzML,,,,,,,False,,,ST001422,526.0,adult_cancer,False,Hilic,Positive,
VT_160120_009.mzML,,,,,,,False,,,ST001422,526.0,adult_cancer,False,Hilic,Positive,
VT_160120_011.mzML,,,,,,,False,,,ST001422,526.0,adult_cancer,False,Hilic,Positive,
VT_160120_013.mzML,21115.0,Aspirin 325 mg,1.0,54.0,37.305733,M,False,Colon Cancer Risk,≥1 Adenoma at Year 3 Colonoscopy,ST001422,526.0,adult_cancer,False,Hilic,Positive,0.0
VT_160120_015.mzML,21115.0,Aspirin 325 mg,1.0,54.0,37.305733,M,False,Colon Cancer Risk,≥1 Adenoma at Year 3 Colonoscopy,ST001422,526.0,adult_cancer,False,Hilic,Positive,0.0
VT_160120_017.mzML,21115.0,Aspirin 325 mg,1.0,54.0,37.305733,M,False,Colon Cancer Risk,≥1 Adenoma at Year 3 Colonoscopy,ST001422,526.0,adult_cancer,False,Hilic,Positive,0.0
VT_160120_019.mzML,20966.0,Placebo,,,,M,False,Healthy,,ST001422,526.0,adult_cancer,False,Hilic,Positive,0.0


In [108]:
metadata_ST001423_all.head(10)

Unnamed: 0,Subject ID,Treatment,Race,Age,BMI,Sex,Cancer Risk,Diagnosis,Diagnosis Type,Study ID,Job ID,Cohort Label v0,is Pediatric,Column,Polarity,is Female
VT_160715_M198_001.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_003.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_005.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_007.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_009.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_011.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_013.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_015.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_017.mzML,,,,,,,,,,,,,,,,
VT_160715_M198_019.mzML,70905.0,Aspirin 325 mg,,,,F,False,Healthy,,ST001423,502.0,adult_cancer,False,Hilic,Positive,1.0


In [30]:
metadata_ST001422.head()

Unnamed: 0,Subject ID,Treatment,Race,Age,BMI,Sex,Cancer Risk,Diagnosis,Diagnosis Type,Study ID,Job ID,Cohort Label v0,is Pediatric,Column,Polarity
VT_160123_079.mzML,10203,Aspirin 81 mg,1.0,48.0,30.083829,M,True,Colon Cancer Risk,≥1 High Risk Findings and ≥1 Advanced Adenoma ...,ST001422,526,adult_cancer,False,Hilic,Positive
VT_160123_085.mzML,10203,Aspirin 81 mg,1.0,48.0,30.083829,M,True,Colon Cancer Risk,≥1 High Risk Findings and ≥1 Advanced Adenoma ...,ST001422,526,adult_cancer,False,Hilic,Positive
VT_160213_019.mzML,10023,Aspirin 325 mg,,,,M,False,Healthy,,ST001422,526,adult_cancer,False,Hilic,Positive
VT_160213_055.mzML,10023,Aspirin 325 mg,,,,M,False,Healthy,,ST001422,526,adult_cancer,False,Hilic,Positive
VT_160213_049.mzML,10034,Aspirin 81 mg,,,,M,False,Healthy,,ST001422,526,adult_cancer,False,Hilic,Positive


## ST000388
- there should already be gender
- we need to add cancer status
- we can add smoking status

- three smoking categories: never, former, current

In [31]:
existing_ST000388 = metadata[metadata['Study ID'] == 'ST000388'].copy()
existing_ST000388.dropna(axis=1, how='all', inplace=True)

In [32]:
new_ST000388_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/Lung-Cancer_ST000388_metadata.tsv'
new_ST000388 = pd.read_csv(new_ST000388_file, sep='\t', index_col=0)

In [33]:
new_ST000388['mzml_file'] = new_ST000388['file name'].apply(lambda x: x+'.mzML')
new_ST000388['Sex'] = new_ST000388['Gender'].map({'Female':'F', 'Male': 'M'})
new_ST000388['subject ID'] = new_ST000388.index

In [34]:
# new_ST000388['Diagnosis'] = new_ST000388['Group'].copy()
new_ST000388['Diagnosis'] = new_ST000388['Group'].map({'Cancer':'Lung Cancer','Benign':'Benign'})
new_ST000388['Cancer Risk'] = new_ST000388['Group'].map({'Cancer':True,'Benign':False})
new_ST000388['Diagnosis Type'] = new_ST000388['Cancer Type'].copy()
new_ST000388['Diagnosis Details'] = new_ST000388['Nodule Classification'].apply(lambda x: f'Nodule Classification: {x}')

In [35]:
metadata_ST000388 = new_ST000388[['subject ID','Sex','Cancer Risk','Diagnosis','Diagnosis Type',\
                                  'Diagnosis Details','Smoking Status','Emphysema/COPD']].copy()
metadata_ST000388.index = new_ST000388['mzml_file'].to_list()
metadata_ST000388['Study ID'] = 'ST000388'
metadata_ST000388.rename(columns={'subject ID':'Subject ID'}, inplace=True)
metadata_ST000388['Job ID'] = existing_ST000388['Cohort ID'].iloc[0]
metadata_ST000388['Cohort Label v0'] = existing_ST000388['Cohort Label'].iloc[0]
metadata_ST000388['is Pediatric'] = existing_ST000388['is Pediatric'].iloc[0]

# add column and polarity
metadata_ST000388['Column'] = 'Hilic'
metadata_ST000388['Polarity'] = 'Positive'

In [36]:
# save the metadata
metadata_ST000388.to_csv(f'{save_dir}/metadata_ST000388.csv', index=True)

In [37]:
metadata_ST000388['Smoking Status'].value_counts()

Smoking Status
Former     58
Current    36
Name: count, dtype: int64

In [38]:
metadata_ST000388.head()

Unnamed: 0,Subject ID,Sex,Cancer Risk,Diagnosis,Diagnosis Type,Diagnosis Details,Smoking Status,Emphysema/COPD,Study ID,Job ID,Cohort Label v0,is Pediatric,Column,Polarity
LungNodule_HILIC_Pos_23.mzML,SA018129,F,False,Benign,,Nodule Classification: SOLID NODULES ONLY,Current,No,ST000388,581,adult_cancer,False,Hilic,Positive
LungNodule_HILIC_Pos_74.mzML,SA018130,F,False,Benign,,Nodule Classification: SOLID NODULES ONLY,Current,No,ST000388,581,adult_cancer,False,Hilic,Positive
LungNodule_HILIC_Pos_113.mzML,SA018134,F,True,Lung Cancer,adeno stage 2,Nodule Classification: SOLID NODULES ONLY,Current,No,ST000388,581,adult_cancer,False,Hilic,Positive
LungNodule_HILIC_Pos_14.mzML,SA018131,F,True,Lung Cancer,adeno 1a,Nodule Classification: No NODULES,Current,No,ST000388,581,adult_cancer,False,Hilic,Positive
LungNodule_HILIC_Pos_46.mzML,SA018133,F,True,Lung Cancer,adeno stage 2,Nodule Classification: SOLID NODULES ONLY,Current,No,ST000388,581,adult_cancer,False,Hilic,Positive


In [39]:
# json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST000388_AN000624.json'

# with open(json_file) as f:
#     data = json.load(f)
# data['SUBJECT_SAMPLE_FACTORS']

## ST001408
- we need to add gender, all patients are men
- all patients have cancer

In [40]:
existing_ST001408 = metadata[metadata['Study ID'] == 'ST001408'].copy()
existing_ST001408.dropna(axis=1, how='all', inplace=True)
existing_ST001408.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id
P_HA_PL25_B1_045_50510.mzML,522,ST001408,adult_cancer,522,58.847136,Pretrain,Test,False,0,5,P_HA_PL25_B1_045_50510.mzML
P_HA_PL25_B1_055_56908.mzML,522,ST001408,adult_cancer,522,58.482306,Pretrain,Test,False,0,5,P_HA_PL25_B1_055_56908.mzML
P_HA_PL25_B1_097_50161.mzML,522,ST001408,adult_cancer,522,58.40934,Pretrain,Test,False,0,5,P_HA_PL25_B1_097_50161.mzML
P_HA_PL25_B1_067_47200.mzML,522,ST001408,adult_cancer,522,58.299891,Pretrain,Test,False,0,5,P_HA_PL25_B1_067_47200.mzML
P_HA_PL25_B1_041_52819.mzML,522,ST001408,adult_cancer,522,58.080992,Pretrain,Test,False,0,5,P_HA_PL25_B1_041_52819.mzML


In [41]:
metadata_ST001408 = existing_ST001408[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()
metadata_ST001408.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                  'Cohort ID':'Job ID'}, inplace=True)

metadata_ST001408['Sex'] = 'M'
metadata_ST001408['Cancer Risk'] = True

# add column and polarity
metadata_ST001408['Column'] = 'Hilic'
metadata_ST001408['Polarity'] = 'Positive'

In [42]:
# save the metadata
metadata_ST001408.to_csv(f'{save_dir}/metadata_ST001408.csv', index=True)

In [43]:
metadata_ST001408.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Sex,Cancer Risk,Column,Polarity
P_HA_PL25_B1_045_50510.mzML,ST001408,False,adult_cancer,522,M,True,Hilic,Positive
P_HA_PL25_B1_055_56908.mzML,ST001408,False,adult_cancer,522,M,True,Hilic,Positive
P_HA_PL25_B1_097_50161.mzML,ST001408,False,adult_cancer,522,M,True,Hilic,Positive
P_HA_PL25_B1_067_47200.mzML,ST001408,False,adult_cancer,522,M,True,Hilic,Positive
P_HA_PL25_B1_041_52819.mzML,ST001408,False,adult_cancer,522,M,True,Hilic,Positive


## ST001236, ST001237
- all patients have cancer


In [127]:
existing_ST001236 = metadata[metadata['Study ID'] == 'ST001236'].copy()
existing_ST001236.dropna(axis=1, how='all', inplace=True)
existing_ST001236.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,OS,OS_Event,Age,subject ID,study_week,Region,...,Matt Set,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),is Female
0196_Marios_RCC_HP-001300051-2.mzML,541,ST001236,adult_cancer,541,32.233,0.0,45.0,CA209009-13-51,week 4,US/CANADA,...,Other,Pretrain,Test,False,0,3,0196_Marios_RCC_HP-001300051-2.mzML,45.0,45.0,0.0
0019_Marios_RCC_HP-000100118-1.mzML,541,ST001236,adult_cancer,541,5.9,1.0,82.0,CA209009-1-118,baseline,US/CANADA,...,Other,Pretrain,Test,False,0,3,0019_Marios_RCC_HP-000100118-1.mzML,82.0,82.0,0.0
0016_Marios_RCC_HP-000100086-1.mzML,541,ST001236,adult_cancer,541,15.467,1.0,71.0,CA209009-1-86,baseline,US/CANADA,...,Other,Pretrain,Test,False,0,3,0016_Marios_RCC_HP-000100086-1.mzML,71.0,71.0,0.0
0232_Marios_RCC_HP-001500075-3.mzML,541,ST001236,adult_cancer,541,27.367,1.0,70.0,CA209009-15-75,week 9,OTHER,...,Other,Pretrain,Test,False,0,3,0232_Marios_RCC_HP-001500075-3.mzML,70.0,70.0,1.0
0245_Marios_RCC_HP-001500094-3.mzML,541,ST001236,adult_cancer,541,30.4,0.0,41.0,CA209009-15-94,week 9,OTHER,...,Other,Pretrain,Test,False,0,3,0245_Marios_RCC_HP-001500094-3.mzML,41.0,41.0,1.0


In [128]:
existing_ST001237 = metadata[metadata['Study ID'] == 'ST001237'].copy()
existing_ST001237.dropna(axis=1, how='all', inplace=True)
existing_ST001237.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,OS,OS_Event,Age,subject ID,study_week,Region,...,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),IMDC,IMDC ORDINAL,IMDC BINARY,is Female
0582_Marios_PD1_Inhib2_HP-EA003798-7.mzML,541,ST001237,adult_cancer,541,50.562628,0.0,62.0,CA209025-111-657,baseline,REST OF WORLD,...,False,0,4,0582_Marios_PD1_Inhib2_HP-EA003798-7.mzML,62.0,62.0,FAVORABLE,2.0,1.0,1.0
0293_Marios_PD1_Inhib2_HP-E9101785-7.mzML,541,ST001237,adult_cancer,541,24.607803,1.0,59.0,CA209025-33-12,baseline,WESTERN EUROPE,...,False,0,4,0293_Marios_PD1_Inhib2_HP-E9101785-7.mzML,59.0,59.0,INTERMEDIATE,1.0,,0.0
0219_Marios_PD1_Inhib2_HP-EA003799-7.mzML,541,ST001237,adult_cancer,541,52.073922,0.0,66.0,CA209025-111-778,baseline,REST OF WORLD,...,False,0,4,0219_Marios_PD1_Inhib2_HP-EA003799-7.mzML,66.0,66.0,INTERMEDIATE,1.0,,0.0
0962_Marios_PD1_Inhib2_HP-E9784758-7.mzML,541,ST001237,adult_cancer,541,24.246407,1.0,80.0,CA209025-180-1001,baseline,REST OF WORLD,...,False,0,4,0962_Marios_PD1_Inhib2_HP-E9784758-7.mzML,80.0,80.0,POOR,0.0,0.0,1.0
0218_Marios_PD1_Inhib2_HP-E9123067-7.mzML,541,ST001237,adult_cancer,541,3.614,1.0,60.0,CA209025-113-50,baseline,WESTERN EUROPE,...,False,0,4,0218_Marios_PD1_Inhib2_HP-E9123067-7.mzML,60.0,60.0,NOT REPORTED,,,0.0


In [129]:
metadata_ST001236 = existing_ST001236[['Study ID','is Pediatric','Cohort Label','Cohort ID',\
                                       'OS','OS_Event','Age','Sex','subject ID','study_week','Region',\
                                        'Treatment','Dose (mg/kg)','Race','PFS','PFS_Event',\
                                        'ORR','Benefit','phase']].copy()

metadata_ST001236.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'study_week': 'Timepoint',
                                  'subject ID': 'Subject ID',
                                  'phase': 'Clinical Trial Phase',
                                  'Cohort ID':'Job ID'}, inplace=True)

metadata_ST001236['Cancer Risk'] = True
# add column and polarity
metadata_ST001236['Column'] = 'Hilic'
metadata_ST001236['Polarity'] = 'Positive'

In [130]:
metadata_ST001236.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,OS,OS_Event,Age,Sex,Subject ID,Timepoint,...,Dose (mg/kg),Race,PFS,PFS_Event,ORR,Benefit,Clinical Trial Phase,Cancer Risk,Column,Polarity
0196_Marios_RCC_HP-001300051-2.mzML,ST001236,False,adult_cancer,541,32.233,0.0,45.0,M,CA209009-13-51,week 4,...,10.0,WHITE,,,,,RCC1,True,Hilic,Positive
0019_Marios_RCC_HP-000100118-1.mzML,ST001236,False,adult_cancer,541,5.9,1.0,82.0,M,CA209009-1-118,baseline,...,0.3,WHITE,,,,,RCC1,True,Hilic,Positive
0016_Marios_RCC_HP-000100086-1.mzML,ST001236,False,adult_cancer,541,15.467,1.0,71.0,M,CA209009-1-86,baseline,...,10.0,WHITE,,,,,RCC1,True,Hilic,Positive
0232_Marios_RCC_HP-001500075-3.mzML,ST001236,False,adult_cancer,541,27.367,1.0,70.0,F,CA209009-15-75,week 9,...,2.0,WHITE,,,,,RCC1,True,Hilic,Positive
0245_Marios_RCC_HP-001500094-3.mzML,ST001236,False,adult_cancer,541,30.4,0.0,41.0,F,CA209009-15-94,week 9,...,10.0,WHITE,,,,,RCC1,True,Hilic,Positive


In [131]:
metadata_ST001237 = existing_ST001237[['Study ID','is Pediatric','Cohort Label','Cohort ID',\
                                       'OS','OS_Event','Age','Sex','subject ID','study_week','Region',\
                                        'IMDC','MSKCC','Treatment','Dose (mg/kg)','Race','PFS','PFS_Event',\
                                        'ORR','Benefit','Prior_2','phase']].copy()
                                        
metadata_ST001237.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'study_week': 'Timepoint',
                                'subject ID': 'Subject ID',
                                  'phase': 'Clinical Trial Phase',
                                  'Cohort ID':'Job ID'}, inplace=True)

metadata_ST001237['Cancer Risk'] = True

# add column and polarity
metadata_ST001237['Column'] = 'Hilic'
metadata_ST001237['Polarity'] = 'Positive'

In [132]:
metadata_ST001237.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,OS,OS_Event,Age,Sex,Subject ID,Timepoint,...,Race,PFS,PFS_Event,ORR,Benefit,Prior_2,Clinical Trial Phase,Cancer Risk,Column,Polarity
0582_Marios_PD1_Inhib2_HP-EA003798-7.mzML,ST001237,False,adult_cancer,541,50.562628,0.0,62.0,F,CA209025-111-657,baseline,...,WHITE,18.299795,1.0,SD,ICB,True,RCC3,True,Hilic,Positive
0293_Marios_PD1_Inhib2_HP-E9101785-7.mzML,ST001237,False,adult_cancer,541,24.607803,1.0,59.0,M,CA209025-33-12,baseline,...,WHITE,1.905544,1.0,PD,NCB,False,RCC3,True,Hilic,Positive
0219_Marios_PD1_Inhib2_HP-EA003799-7.mzML,ST001237,False,adult_cancer,541,52.073922,0.0,66.0,M,CA209025-111-778,baseline,...,WHITE,29.470226,1.0,CRPR,CB,False,RCC3,True,Hilic,Positive
0962_Marios_PD1_Inhib2_HP-E9784758-7.mzML,ST001237,False,adult_cancer,541,24.246407,1.0,80.0,F,CA209025-180-1001,baseline,...,WHITE,3.712526,1.0,SD,ICB,True,RCC3,True,Hilic,Positive
0218_Marios_PD1_Inhib2_HP-E9123067-7.mzML,ST001237,False,adult_cancer,541,3.614,1.0,60.0,M,CA209025-113-50,baseline,...,WHITE,2.759754,1.0,SD,ICB,False,RCC3,True,Hilic,Positive


In [50]:
# save the metadata
metadata_ST001236.to_csv(f'{save_dir}/metadata_ST001236.csv', index=True)
metadata_ST001237.to_csv(f'{save_dir}/metadata_ST001237.csv', index=True)

## ST002244
- all patients are non-cancer

In [51]:
existing_ST002244 = metadata[metadata['Study ID'] == 'ST002244'].copy()
existing_ST002244.dropna(axis=1, how='all', inplace=True)
existing_ST002244.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id
Positive_QC_2.mzML,557,ST002244,adult_other,557,78.657424,Pretrain,Test,False,1,16,Positive_QC_2.mzML
Tech_Plasma_Pos_1.mzML,557,ST002244,adult_other,557,75.11857,Pretrain,Test,False,1,16,Tech_Plasma_Pos_1.mzML
Positive_QC_4.mzML,557,ST002244,adult_other,557,74.75374,Pretrain,Test,False,1,16,Positive_QC_4.mzML
Tech_Plasma_Pos_3.mzML,557,ST002244,adult_other,557,74.425392,Pretrain,Test,False,1,16,Tech_Plasma_Pos_3.mzML
Tech_Plasma_Pos_5.mzML,557,ST002244,adult_other,557,72.309376,Pretrain,Test,False,1,16,Tech_Plasma_Pos_5.mzML


In [52]:
metadata_ST002244 = existing_ST002244[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()

metadata_ST002244.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)

metadata_ST002244['Cancer Risk'] = False
# add column and polarity
metadata_ST002244['Column'] = 'Hilic'
metadata_ST002244['Polarity'] = 'Positive'

In [53]:
metadata_ST002244.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Cancer Risk,Column,Polarity
Positive_QC_2.mzML,ST002244,False,adult_other,557,False,Hilic,Positive
Tech_Plasma_Pos_1.mzML,ST002244,False,adult_other,557,False,Hilic,Positive
Positive_QC_4.mzML,ST002244,False,adult_other,557,False,Hilic,Positive
Tech_Plasma_Pos_3.mzML,ST002244,False,adult_other,557,False,Hilic,Positive
Tech_Plasma_Pos_5.mzML,ST002244,False,adult_other,557,False,Hilic,Positive


In [54]:
# save the metadata
metadata_ST002244.to_csv(f'{save_dir}/metadata_ST002244.csv', index=True)

## ST002112
- we shoud already have gender
- all patients are non-cancer

In [55]:
existing_ST002112 = metadata[metadata['Study ID'] == 'ST002112'].copy()
existing_ST002112.dropna(axis=1, how='all', inplace=True)
existing_ST002112.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,is Female
pHILIC_2_284.mzML,507,ST002112,adult_other,507,F,64.976286,Pretrain,Test,False,1,15,pHILIC_2_284.mzML,1.0
pHILIC_2_245.mzML,507,ST002112,adult_other,507,M,64.027727,Pretrain,Test,False,1,15,pHILIC_2_245.mzML,0.0
pHILIC_2_93.mzML,507,ST002112,adult_other,507,F,62.823787,Pretrain,Test,False,1,15,pHILIC_2_93.mzML,1.0
pHILIC_2_75.mzML,507,ST002112,adult_other,507,F,62.349507,Pretrain,Test,False,1,15,pHILIC_2_75.mzML,1.0
pHILIC_2_50.mzML,507,ST002112,adult_other,507,M,61.510398,Pretrain,Test,False,1,15,pHILIC_2_50.mzML,0.0


In [56]:
metadata_ST002112 = existing_ST002112[['Study ID','is Pediatric','Cohort Label','Cohort ID',\
                                        'Sex']].copy()

metadata_ST002112.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)

metadata_ST002112['Cancer Risk'] = False

# add column and polarity
metadata_ST002112['Column'] = 'Hilic'
metadata_ST002112['Polarity'] = 'Positive'

In [57]:
metadata_ST002112.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Sex,Cancer Risk,Column,Polarity
pHILIC_2_284.mzML,ST002112,False,adult_other,507,F,False,Hilic,Positive
pHILIC_2_245.mzML,ST002112,False,adult_other,507,M,False,Hilic,Positive
pHILIC_2_93.mzML,ST002112,False,adult_other,507,F,False,Hilic,Positive
pHILIC_2_75.mzML,ST002112,False,adult_other,507,F,False,Hilic,Positive
pHILIC_2_50.mzML,ST002112,False,adult_other,507,M,False,Hilic,Positive


In [58]:
# save the metadata
metadata_ST002112.to_csv(f'{save_dir}/metadata_ST002112.csv', index=True)

## ST002027
- add gender info, all patients are women
- all patients are non cancer

In [59]:
existing_ST002027 = metadata[metadata['Study ID'] == 'ST002027'].copy()
existing_ST002027.dropna(axis=1, how='all', inplace=True)
existing_ST002027.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id
McCann_812029070_011_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,79.751915,Pretrain,Test,False,1,14,McCann_812029070_011_MX452646_posHILIC.mzML
McCann_812239150_349_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,79.423568,Pretrain,Test,False,1,14,McCann_812239150_349_MX452646_posHILIC.mzML
McCann_812222070_334_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,79.277636,Pretrain,Test,False,1,14,McCann_812222070_334_MX452646_posHILIC.mzML
McCann_812001070_009_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,78.985772,Pretrain,Test,False,1,14,McCann_812001070_009_MX452646_posHILIC.mzML
McCann_812222000_333_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,78.876323,Pretrain,Test,False,1,14,McCann_812222000_333_MX452646_posHILIC.mzML


In [60]:
metadata_ST002027 = existing_ST002027[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()
metadata_ST002027.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)


metadata_ST002027['Sex'] = 'F'
metadata_ST002027['Cancer Risk'] = False

# add column and polarity
metadata_ST002027['Column'] = 'Hilic'
metadata_ST002027['Polarity'] = 'Positive'


In [61]:
metadata_ST002027.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Sex,Cancer Risk,Column,Polarity
McCann_812029070_011_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False,Hilic,Positive
McCann_812239150_349_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False,Hilic,Positive
McCann_812222070_334_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False,Hilic,Positive
McCann_812001070_009_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False,Hilic,Positive
McCann_812222000_333_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False,Hilic,Positive


In [62]:
# save the metadata
metadata_ST002027.to_csv(f'{save_dir}/metadata_ST002027.csv', index=True)

## ST001918
- add BMI
- we should already have gender info
- can optionally add smoking status
- all patients are non-cancer
- associated paper:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8598381/

- Smoking Status: for this data set "Smoking status defined as having smoked more than 20 cigarettes over the lifetime"
    - 1 is No, which we translate to "Never"
    - 2 is Yes, which we translate to "Current or Former"

In [26]:
existing_ST001918 = metadata[metadata['Study ID'] == 'ST001918'].copy()
existing_ST001918.dropna(axis=1, how='all', inplace=True)
existing_ST001918.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,is Female
VT_170729_M021_179.mzML,559,ST001918,adult_other,559,,89.675301,Pretrain,Test,False,1,11,VT_170729_M021_179.mzML,
VT_170728_M021_103.mzML,559,ST001918,adult_other,559,,87.887632,Pretrain,Test,False,1,11,VT_170728_M021_103.mzML,
VT_170729_M021_153.mzML,559,ST001918,adult_other,559,F,69.35425,Pretrain,Test,False,1,11,VT_170729_M021_153.mzML,1.0
VT_170728_M021_127.mzML,559,ST001918,adult_other,559,F,68.843488,Pretrain,Test,False,1,11,VT_170728_M021_127.mzML,1.0
VT_170729_M021_085.mzML,559,ST001918,adult_other,559,F,68.697556,Pretrain,Test,False,1,11,VT_170729_M021_085.mzML,1.0


In [27]:
existing_ST001918.sort_index()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,is Female
VT_170728_M021_001.mzML,559,ST001918,adult_other,559,,76.760306,Pretrain,Train,False,1,11,VT_170728_M021_001.mzML,
VT_170728_M021_003.mzML,559,ST001918,adult_other,559,,67.603065,Pretrain,Train,False,1,11,VT_170728_M021_003.mzML,
VT_170728_M021_005.mzML,559,ST001918,adult_other,559,,67.457132,Pretrain,Train,False,1,11,VT_170728_M021_005.mzML,
VT_170728_M021_007.mzML,559,ST001918,adult_other,559,,69.901496,Pretrain,Train,False,1,11,VT_170728_M021_007.mzML,
VT_170728_M021_009.mzML,559,ST001918,adult_other,559,,68.296242,Pretrain,Val,False,1,11,VT_170728_M021_009.mzML,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
VT_170729_M021_255.mzML,559,ST001918,adult_other,559,,68.843488,Pretrain,Val,False,1,11,VT_170729_M021_255.mzML,
VT_170729_M021_257.mzML,559,ST001918,adult_other,559,,69.244801,Pretrain,Train,False,1,11,VT_170729_M021_257.mzML,
VT_170729_M021_259.mzML,559,ST001918,adult_other,559,,68.770522,Pretrain,Train,False,1,11,VT_170729_M021_259.mzML,
VT_170729_M021_261.mzML,559,ST001918,adult_other,559,,67.894929,Pretrain,Train,False,1,11,VT_170729_M021_261.mzML,


In [28]:
existing_ST001918.shape

(271, 13)

In [29]:
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST001918_AN003116.json'

with open(json_file) as f:
    data = json.load(f)

# data['SUBJECT_SAMPLE_FACTORS']
# convert dictionary to dataframe

new_ST001918 = extract_subject_factors_dataframe(data)
# new_ST001918 = pd.DataFrame(data['SUBJECT_SAMPLE_FACTORS'])

In [30]:
new_ST001918.head()

Unnamed: 0,Subject ID,Sample ID,Sex,Smoking_status,RAW_FILE_NAME,Batch,Sample_Class,Benzene_Exposure_Category,bmi
0,nist1,nist1_1,,,VT_170728_M021_002,1,NIST1950,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_008,1,Pooled_Plasma,,
2,q3June2014_1b,q3June2014_1b_1,,,VT_170728_M021_014,1,Pooled_Plasma,,
3,CZ.0067.005,CZ.0067.005_1,1.0,2.0,VT_170728_M021_020,1,Study_Sample,1.0,23.183391
4,CZ.0002.004,CZ.0002.004_1,2.0,2.0,VT_170728_M021_026,1,Study_Sample,2.0,22.265625


In [68]:
# new_ST001918['mzml file'] = new_ST001918['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
# new_ST001918.index = new_ST001918['mzml file'].tolist()

In [31]:
new_ST001918_RP_part1 = new_ST001918.copy()
new_ST001918_RP_part2 = increment_the_last_number(new_ST001918.copy(), 'RAW_FILE_NAME', 2)
new_ST001918_RP_part3 = increment_the_last_number(new_ST001918.copy(), 'RAW_FILE_NAME', 4)

new_ST001918_RP = pd.concat([new_ST001918_RP_part1, new_ST001918_RP_part2, new_ST001918_RP_part3], axis=0)
new_ST001918_RP.sort_values('RAW_FILE_NAME').head()

Unnamed: 0,Subject ID,Sample ID,Sex,Smoking_status,RAW_FILE_NAME,Batch,Sample_Class,Benzene_Exposure_Category,bmi
0,nist1,nist1_1,,,VT_170728_M021_002,1,NIST1950,,
0,nist1,nist1_1,,,VT_170728_M021_004,1,NIST1950,,
0,nist1,nist1_1,,,VT_170728_M021_006,1,NIST1950,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_008,1,Pooled_Plasma,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_010,1,Pooled_Plasma,,


In [32]:
new_ST001918_Hilic = increment_the_last_number(new_ST001918_RP.copy(),'RAW_FILE_NAME',-1)

In [33]:
new_ST001918_Hilic.sort_values('RAW_FILE_NAME').head()

Unnamed: 0,Subject ID,Sample ID,Sex,Smoking_status,RAW_FILE_NAME,Batch,Sample_Class,Benzene_Exposure_Category,bmi
0,nist1,nist1_1,,,VT_170728_M021_001,1,NIST1950,,
0,nist1,nist1_1,,,VT_170728_M021_003,1,NIST1950,,
0,nist1,nist1_1,,,VT_170728_M021_005,1,NIST1950,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_007,1,Pooled_Plasma,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_009,1,Pooled_Plasma,,


In [34]:
new_ST001918_Hilic['Column'] = 'Hilic'
new_ST001918_Hilic['Polarity'] = 'Positive'
new_ST001918_RP['Column'] = 'RP'
new_ST001918_RP['Polarity'] = 'Negative'

new_ST001918 = pd.concat([new_ST001918_Hilic, new_ST001918_RP], axis=0)

In [35]:
new_ST001918['mzml file'] = new_ST001918['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001918.index = new_ST001918['mzml file'].tolist()

In [36]:
new_ST001918['Sex'] = new_ST001918['Sex'].map(sex_label_matching_dict)

In [37]:
metadata_ST001918 = new_ST001918[['Column','Polarity','Sex','Subject ID','Sample_Class','Batch','bmi','Benzene_Exposure_Category','Smoking_status']].copy()
metadata_ST001918.rename(columns=
    {'Smoking_status': 'Smoking Status',
    'bmi': 'BMI'}, inplace=True)



In [38]:
metadata_ST001918

Unnamed: 0,Column,Polarity,Sex,Subject ID,Sample_Class,Batch,BMI,Benzene_Exposure_Category,Smoking Status
VT_170728_M021_001.mzML,Hilic,Positive,,nist1,NIST1950,1,,,
VT_170728_M021_007.mzML,Hilic,Positive,,q3June2014_1a,Pooled_Plasma,1,,,
VT_170728_M021_013.mzML,Hilic,Positive,,q3June2014_1b,Pooled_Plasma,1,,,
VT_170728_M021_019.mzML,Hilic,Positive,M,CZ.0067.005,Study_Sample,1,23.183391,1,2
VT_170728_M021_025.mzML,Hilic,Positive,F,CZ.0002.004,Study_Sample,1,22.265625,2,2
...,...,...,...,...,...,...,...,...,...
VT_170729_M021_240.mzML,RP,Negative,M,CZ.0180.002,Study_Sample,2,22.64737696,-,2
VT_170729_M021_246.mzML,RP,Negative,M,CZ.0072.004,Study_Sample,2,19.71332153,1,1
VT_170729_M021_252.mzML,RP,Negative,,q3June2014_2e,Pooled_Plasma,2,,,
VT_170729_M021_258.mzML,RP,Negative,,q3June2014_2f,Pooled_Plasma,2,,,


In [39]:
metadata_ST001918['Study ID'] = existing_ST001918['Study ID'].iloc[0]
metadata_ST001918['Job ID'] = existing_ST001918['Cohort ID'].iloc[0]
metadata_ST001918['Cohort Label v0'] = 'NA'
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Study_Sample','Cohort Label v0'] = existing_ST001918['Cohort Label'].iloc[0]
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Pooled_Plasma','Cohort Label v0'] = existing_ST001918['Cohort Label'].iloc[0]

metadata_ST001918['is Pediatric'] = 'NA'
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Study_Sample','is Pediatric'] = False
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Pooled_Plasma','is Pediatric'] = False

metadata_ST001918['Cancer Risk'] = 'NA'
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Study_Sample','Cancer Risk'] = False
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Pooled_Plasma','Cancer Risk'] = False

metadata_ST001918['Smoking Status'] = metadata_ST001918['Smoking Status'].map({'2': 'Current or Former', '1': 'Never', 2: 'Current or Former', 1: 'Never'})

In [40]:
metadata_ST001918

Unnamed: 0,Column,Polarity,Sex,Subject ID,Sample_Class,Batch,BMI,Benzene_Exposure_Category,Smoking Status,Study ID,Job ID,Cohort Label v0,is Pediatric,Cancer Risk
VT_170728_M021_001.mzML,Hilic,Positive,,nist1,NIST1950,1,,,,ST001918,559,,,
VT_170728_M021_007.mzML,Hilic,Positive,,q3June2014_1a,Pooled_Plasma,1,,,,ST001918,559,adult_other,False,False
VT_170728_M021_013.mzML,Hilic,Positive,,q3June2014_1b,Pooled_Plasma,1,,,,ST001918,559,adult_other,False,False
VT_170728_M021_019.mzML,Hilic,Positive,M,CZ.0067.005,Study_Sample,1,23.183391,1,Current or Former,ST001918,559,adult_other,False,False
VT_170728_M021_025.mzML,Hilic,Positive,F,CZ.0002.004,Study_Sample,1,22.265625,2,Current or Former,ST001918,559,adult_other,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VT_170729_M021_240.mzML,RP,Negative,M,CZ.0180.002,Study_Sample,2,22.64737696,-,Current or Former,ST001918,559,adult_other,False,False
VT_170729_M021_246.mzML,RP,Negative,M,CZ.0072.004,Study_Sample,2,19.71332153,1,Never,ST001918,559,adult_other,False,False
VT_170729_M021_252.mzML,RP,Negative,,q3June2014_2e,Pooled_Plasma,2,,,,ST001918,559,adult_other,False,False
VT_170729_M021_258.mzML,RP,Negative,,q3June2014_2f,Pooled_Plasma,2,,,,ST001918,559,adult_other,False,False


In [41]:
metadata_ST001918['Smoking Status'].value_counts()

Smoking Status
Current or Former    192
Never                156
Name: count, dtype: int64

In [42]:
metadata_ST001918['Sex'].value_counts()

Sex
M    180
F    168
Name: count, dtype: int64

In [43]:
metadata_ST001918['Sample_Class'].value_counts()

Sample_Class
Study_Sample       366
Study_QC_Sample     96
Pooled_Plasma       72
NIST1950            12
Name: count, dtype: int64

In [44]:
metadata_ST001918.head()

Unnamed: 0,Column,Polarity,Sex,Subject ID,Sample_Class,Batch,BMI,Benzene_Exposure_Category,Smoking Status,Study ID,Job ID,Cohort Label v0,is Pediatric,Cancer Risk
VT_170728_M021_001.mzML,Hilic,Positive,,nist1,NIST1950,1,,,,ST001918,559,,,
VT_170728_M021_007.mzML,Hilic,Positive,,q3June2014_1a,Pooled_Plasma,1,,,,ST001918,559,adult_other,False,False
VT_170728_M021_013.mzML,Hilic,Positive,,q3June2014_1b,Pooled_Plasma,1,,,,ST001918,559,adult_other,False,False
VT_170728_M021_019.mzML,Hilic,Positive,M,CZ.0067.005,Study_Sample,1,23.183391,1.0,Current or Former,ST001918,559,adult_other,False,False
VT_170728_M021_025.mzML,Hilic,Positive,F,CZ.0002.004,Study_Sample,1,22.265625,2.0,Current or Former,ST001918,559,adult_other,False,False


In [45]:
# save the metadata
metadata_ST001918.to_csv(f'{save_dir}/metadata_ST001918.csv', index=True)

## ST001849
- we should already have age and gender information
- we can add BMI
- we can optionally add smoking status
- we have some cancer labels, but since patients have covid, unclear if we can use all of them


for cancer labels
- cancer without covid : cancer label
- no-cancer w/wo covid: no-cancer label
- cancer with covid: NA label

In [84]:
existing_ST001849 = metadata[metadata['Study ID'] == 'ST001849'].copy()
existing_ST001849.dropna(axis=1, how='all', inplace=True)
existing_ST001849.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Age,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),is Female
B8_WU350-355_d0_polar_pos.mzML,504,ST001849,adult_other,504,47.9,M,83.035389,Pretrain,Test,False,1,10,B8_WU350-355_d0_polar_pos.mzML,47.9,47.9,0.0
B9_WU350-347_d7_polar_pos.mzML,504,ST001849,adult_other,504,46.1,M,82.925939,Pretrain,Test,False,1,10,B9_WU350-347_d7_polar_pos.mzML,46.1,46.1,0.0
B8_WU350-323_d0_polar_pos.mzML,504,ST001849,adult_other,504,30.4,F,82.81649,Pretrain,Test,False,1,10,B8_WU350-323_d0_polar_pos.mzML,30.4,30.4,1.0
B8_WU350-345_d0_polar_pos.mzML,504,ST001849,adult_other,504,68.2,M,82.743524,Pretrain,Test,False,1,10,B8_WU350-345_d0_polar_pos.mzML,68.2,68.2,0.0
B8_WU350-311_d0_polar_pos.mzML,504,ST001849,adult_other,504,76.8,M,82.196279,Pretrain,Test,False,1,10,B8_WU350-311_d0_polar_pos.mzML,76.8,76.8,0.0


In [85]:
existing_ST001849.shape

(691, 16)

In [86]:
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST001849_AN002993.json'

with open(json_file) as f:
    data = json.load(f)

new_ST001849 = extract_subject_factors_dataframe(data)    

In [87]:
new_ST001849

Unnamed: 0,Subject ID,Sample ID,batch,WU day of presentation,SARS-CoV-2 Positive,Admitted to the ICU,REMDESIVIR on,DEXAMETHOSONE on,RAW_FILE_NAME,Days post symptom onset,...,MIP1a,GMCSF,MCP1,IL15,HGF,VEGF,IL1Ra,IL2R,MIG,IL8
0,WU350-013,B1_WU350-013_d0,1,d0,Yes,Yes,-,-,B1_WU350-013_d0_polar_pos.mzML B1_WU350-013_d0...,11,...,10.0981817,10.04841148,1303.844512,22.82324687,726.5621765,6.876694719,1048.239677,124.3491115,84.900486,25.72906886
1,WU350-014,B1_WU350-014_d0,1,d0,Yes,Yes,-,-,B1_WU350-014_d0_polar_pos.mzML B1_WU350-014_d0...,3,...,27.25334281,1.497099241,505.525744,61.77101181,66.59597601,0.975492592,54.78235533,35.01993895,43.56103392,26.27460047
2,WU350-021,B1_WU350-021_d0,1,d0,Yes,Yes,-,-,B1_WU350-021_d0_polar_pos.mzML B1_WU350-021_d0...,15,...,17.92951592,1.306346239,318.8497179,82.76911121,1851.039214,5.260831608,1464.274665,71.55218583,147.4940306,20.04562796
3,WU350-029,B1_WU350-029_d0,1,d0,Yes,Yes,-,-,B1_WU350-029_d0_polar_pos.mzML B1_WU350-029_d0...,2,...,12.36174818,18.83099743,570.1185771,52.50121538,119.1798849,1.87128378,62.29341051,93.5288825,37.41580246,17.5441893
4,WU350-031,B1_WU350-031_d0,1,d0,Yes,Yes,-,-,B1_WU350-031_d0_polar_pos.mzML B1_WU350-031_d0...,2,...,1.00684193,0.950151926,803.1537554,<8.3,218.3478923,0.466090286,68.23701628,107.2469566,45.53312978,47.66419023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,WU350-376,B9_WU350-376_d7,9,d7,Yes,Yes,-,1,B9_WU350-376_d7_polar_pos.mzML B9_WU350-376_d7...,10,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
696,WU350-377,B9_WU350-377_d7,9,d7,Yes,Yes,-,1,B9_WU350-377_d7_polar_pos.mzML B9_WU350-377_d7...,10,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
697,WU350-393,B9_WU350-393_d7,9,d7,Yes,Yes,-,1,B9_WU350-393_d7_polar_pos.mzML B9_WU350-393_d7...,-,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
698,WU350-398,B9_WU350-398_d7,9,d7,Yes,Yes,-,1,B9_WU350-398_d7_polar_pos.mzML B9_WU350-398_d7...,11,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [88]:
new_ST001849_polar_pos = new_ST001849.copy()
new_ST001849_polar_pos['RAW_FILE_NAME'] = new_ST001849_polar_pos['RAW_FILE_NAME'].apply(lambda x: x.split(' ')[0])
# new_ST001849_polar_pos['mzml file'] = new_ST001849_polar_pos['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001849_polar_pos['Column'] = 'Hilic'
new_ST001849_polar_pos['Polarity'] = 'Positive'
new_ST001849_polar_pos.index = new_ST001849_polar_pos['RAW_FILE_NAME'].tolist()

new_ST001849_polar_neg = new_ST001849.copy()
new_ST001849_polar_neg['RAW_FILE_NAME'] = new_ST001849_polar_neg['RAW_FILE_NAME'].apply(lambda x: x.split(' ')[1])
# new_ST001849_polar_neg['mzml file'] = new_ST001849_polar_neg['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001849_polar_neg['Column'] = 'Hilic'
new_ST001849_polar_neg['Polarity'] = 'Negative'
new_ST001849_polar_neg.index = new_ST001849_polar_neg['RAW_FILE_NAME'].tolist()


new_ST001849_lipid_pos = new_ST001849.copy()
new_ST001849_lipid_pos['RAW_FILE_NAME'] = new_ST001849_lipid_pos['RAW_FILE_NAME'].apply(lambda x: x.split(' ')[2])
# new_ST001849_lipid_pos['mzml file'] = new_ST001849_lipid_pos['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001849_lipid_pos['Column'] = 'RP'
new_ST001849_lipid_pos['Polarity'] = 'Positive'
new_ST001849_lipid_pos.index = new_ST001849_lipid_pos['RAW_FILE_NAME'].tolist()


new_ST001849_lipid_neg = new_ST001849.copy()
new_ST001849_lipid_neg['RAW_FILE_NAME'] = new_ST001849_lipid_neg['RAW_FILE_NAME'].apply(lambda x: x.split(' ')[3])
# new_ST001849_lipid_neg['mzml file'] = new_ST001849_lipid_neg['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001849_lipid_neg['Column'] = 'RP'
new_ST001849_lipid_neg['Polarity'] = 'Negative'
new_ST001849_lipid_neg.index = new_ST001849_lipid_neg['RAW_FILE_NAME'].tolist()


new_ST001849_combined = pd.concat([new_ST001849_polar_pos, new_ST001849_polar_neg, new_ST001849_lipid_pos, new_ST001849_lipid_neg], axis=0)

In [89]:
new_ST001849_combined['Sex'] = new_ST001849_combined['Sex'].map(sex_label_matching_dict)

In [90]:
new_ST001849_combined[['SARS-CoV-2 Positive', 'Diabetes','Acute renal failure','Chronic kidney disease','Cancer','Age','Smoker']]

Unnamed: 0,SARS-CoV-2 Positive,Diabetes,Acute renal failure,Chronic kidney disease,Cancer,Age,Smoker
B1_WU350-013_d0_polar_pos.mzML,Yes,-,1,1,-,80.8,-1
B1_WU350-014_d0_polar_pos.mzML,Yes,1,1,-,-,66.9,-1
B1_WU350-021_d0_polar_pos.mzML,Yes,1,1,-,-,68,-1
B1_WU350-029_d0_polar_pos.mzML,Yes,1,1,1,-,88.3,-
B1_WU350-031_d0_polar_pos.mzML,Yes,1,1,1,-,88.3,-
...,...,...,...,...,...,...,...
B9_WU350-376_d7_lipid_neg.mzML,Yes,1,-,1,-,70.3,1
B9_WU350-377_d7_lipid_neg.mzML,Yes,1,-,-,-,63.9,-1
B9_WU350-393_d7_lipid_neg.mzML,Yes,1,1,-,-,83.5,-
B9_WU350-398_d7_lipid_neg.mzML,Yes,-,-,-,-,53.7,-1


In [91]:
new_ST001849['RAW_FILE_NAME'].iloc[0]

'B1_WU350-013_d0_polar_pos.mzML B1_WU350-013_d0_polar_neg.mzML B1_WU350-013_d0_lipid_pos.mzML B1_WU350-013_d0_lipid_neg.mzML'

In [92]:
new_ST001849['Sex'].value_counts()

Sex
Male      405
Female    295
Name: count, dtype: int64

In [93]:
new_ST001849.columns.tolist()

['Subject ID',
 'Sample ID',
 'batch',
 'WU day of presentation',
 'SARS-CoV-2 Positive',
 'Admitted to the ICU',
 'REMDESIVIR on',
 'DEXAMETHOSONE on',
 'RAW_FILE_NAME',
 'Days post symptom onset',
 'Sex',
 'BMI',
 'Age at Symptom onset (years)',
 'Hospitalized?',
 'Time from symptom onset to ICU (days)',
 'Subject on ventilation at any point after d0',
 'Mortality Status',
 'Symptom Fever',
 'Symptom Headache',
 'Symptom Cough',
 'Symptom Shortness of breath',
 'Symptom: Sore throat',
 'Asymptomatic',
 'Death due to COVID-19?',
 '30 day mortality',
 '60 day mortality',
 '90 day mortality',
 'CRP',
 'D-dimer',
 'Neutrophil %',
 'CO2',
 'Acute respiratory failure',
 'Diabetes',
 'Acute renal failure',
 'Chronic kidney disease',
 'Cancer',
 'Age',
 'Smoker',
 'High/Low arterial pH',
 'Neutrophil absolute',
 'Lymphocyte absolute',
 'Lymphocyte %',
 'IL1b',
 'IL10',
 'IL6',
 'RANTES',
 'MIP1a',
 'GMCSF',
 'MCP1',
 'IL15',
 'HGF',
 'VEGF',
 'IL1Ra',
 'IL2R',
 'MIG',
 'IL8']

In [None]:
new_ST001849.shape

(700, 56)

In [None]:
new_ST001849['WU day of presentation'].value_counts()

WU day of presentation
d0     322
d3     164
d7     110
d14     54
d28     31
d84     19
Name: count, dtype: int64

In [None]:
new_ST001849['Subject ID'].nunique()

339

In [131]:
new_ST001849_combined['BMI']

B1_WU350-013_d0_polar_pos.mzML     30.6
B1_WU350-014_d0_polar_pos.mzML     25.5
B1_WU350-021_d0_polar_pos.mzML     30.1
B1_WU350-029_d0_polar_pos.mzML     32.8
B1_WU350-031_d0_polar_pos.mzML     35.5
                                   ... 
B9_WU350-376_d7_lipid_neg.mzML     24.1
B9_WU350-377_d7_lipid_neg.mzML       54
B9_WU350-393_d7_lipid_neg.mzML     23.9
B9_WU350-398_d7_lipid_neg.mzML     27.9
B9_WU350-229_d84_lipid_neg.mzML    19.4
Name: BMI, Length: 2800, dtype: object

In [154]:
metadata_ST001849 = new_ST001849_combined[['Column','Polarity','Sex','Subject ID','batch','BMI','Smoker','Age at Symptom onset (years)',
                                           'Cancer','SARS-CoV-2 Positive']].copy()
metadata_ST001849.rename(columns=
    {'Smoker': 'Smoking Status',
     'batch' : 'Batch',
    'Age at Symptom onset (years)': 'Age',
    'Cancer' : 'Cancer Risk',
    'bmi': 'BMI'}, inplace=True)



In [155]:

metadata_ST001849['Smoking Status'].value_counts()

Smoking Status
-1    1516
-      924
1      360
Name: count, dtype: int64

In [156]:
metadata_ST001849['Study ID'] = existing_ST001849['Study ID'].iloc[0]
metadata_ST001849['Job ID'] = existing_ST001849['Cohort ID'].iloc[0]
metadata_ST001849['Cohort Label v0'] = existing_ST001849['Cohort Label'].iloc[0]

metadata_ST001849['Age'] = metadata_ST001849['Age'].astype(float)
metadata_ST001849['is Pediatric'] = False
metadata_ST001849['Cancer Risk'] = metadata_ST001849['Cancer Risk'].map({'-': False, '1': True, 1: True})

# There is some uncertainty about how to assign this smoking status
metadata_ST001849['Smoking Status'] = metadata_ST001849['Smoking Status'].map({'1': 'Current or Former', '-1': 'Never', 1: 'Current or Former', -1: 'Never'})

In [157]:
# save the metadata
metadata_ST001849.to_csv(f'{save_dir}/metadata_ST001849.csv', index=True)

## ST001519
- we should already have age and gender labels
- add BMI
- all patients are non-cancer

In [143]:
existing_ST001519 = metadata[metadata['Study ID'] == 'ST001519'].copy()
existing_ST001519.dropna(axis=1, how='all', inplace=True)
existing_ST001519.head()


Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Age,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),is Female
HP_mzml/Plasma/0133_WUG_FARMM_HIL-9009-2-PC.mzML,605,ST001519,adult_other,605,23.0,M,41.116381,Pretrain,Test,False,1,9,HP_mzml/Plasma/0133_WUG_FARMM_HIL-9009-2-PC.mzML,23.0,23.0,0.0
HP_mzml/Plasma/0088_WUG_FARMM_HIL-9038-2-PC.mzML,605,ST001519,adult_other,605,59.0,F,40.788034,Pretrain,Test,False,1,9,HP_mzml/Plasma/0088_WUG_FARMM_HIL-9038-2-PC.mzML,59.0,59.0,1.0
HP_mzml/Plasma/0119_WUG_FARMM_HIL-9040-1-PC.mzML,605,ST001519,adult_other,605,22.0,M,40.642101,Pretrain,Test,False,1,9,HP_mzml/Plasma/0119_WUG_FARMM_HIL-9040-1-PC.mzML,22.0,22.0,0.0
HP_mzml/Plasma/0127_WUG_FARMM_HIL-9029-2-PD.mzML,605,ST001519,adult_other,605,26.0,M,40.423203,Pretrain,Test,False,1,9,HP_mzml/Plasma/0127_WUG_FARMM_HIL-9029-2-PD.mzML,26.0,26.0,0.0
HP_mzml/Plasma/0074_WUG_FARMM_HIL-9013-2-PB.mzML,605,ST001519,adult_other,605,40.0,M,40.423203,Pretrain,Test,False,1,9,HP_mzml/Plasma/0074_WUG_FARMM_HIL-9013-2-PB.mzML,40.0,40.0,0.0


In [144]:
# All of these are Plasma Samples
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST001521_AN002533.json'

with open(json_file) as f:
    data = json.load(f)

new_ST001519 = extract_subject_factors_dataframe(data)

In [145]:
new_ST001519['RAW_FILE_NAME'].iloc[0]
new_ST001519['Sex'] = new_ST001519['Sex'].map(sex_label_matching_dict)

In [146]:
# All of these are Plasma Samples

new_ST001519_combined = None
col_id_list = ['Hilic','RP','Hilic','RP']
pol_id_list = ['Positive','Positive','Negative','Negative']
exp_desc_list = ['HP','CP','HN','CN']
for i in range(4):
    print(i)
    exp_desc = exp_desc_list[i]
    new_ST001519_part = new_ST001519.copy()
    # new_ST001519_part['RAW_FILE_NAME']  = new_ST001519_part['RAW_FILE_NAME'].apply(lambda x: x.split(';')[i])
    new_ST001519_part['RAW_FILE'] = ''
    for ii, x in enumerate(new_ST001519_part['RAW_FILE_NAME'].tolist()):
        x_splits = x.split(';')
        if len(x_splits) > 2:
            x_new = x_splits[i]
            new_ST001519_part.iloc[ii,-1] = x_new

    new_ST001519_part['mzml file'] = new_ST001519_part['RAW_FILE'].apply(lambda x: x.replace('.raw','.mzML'))
    new_ST001519_part['mzml path'] = new_ST001519_part['mzml file'].apply(lambda x: f'{exp_desc}_mzml/Plasma/'+x)
    new_ST001519_part['Column'] = col_id_list[i]
    new_ST001519_part['Polarity'] = pol_id_list[i]
    new_ST001519_part = new_ST001519_part[~(new_ST001519_part['mzml file'] == 'NA')]
    new_ST001519_part.index = new_ST001519_part['mzml file']

    if new_ST001519_combined is None:
        new_ST001519_combined = new_ST001519_part.copy()
    else:
        new_ST001519_combined = pd.concat([new_ST001519_combined,new_ST001519_part], axis=0)

0
1
2
3


In [147]:
new_ST001519_part['Type'].value_counts()

Type
Plasma    157
Name: count, dtype: int64

In [148]:
new_ST001519_combined

Unnamed: 0_level_0,Subject ID,Sample ID,Study_Diet,Age,Sex,Race,Time,Type,BMI,Ethnicity,RAW_FILE_NAME,RAW_FILE,mzml file,mzml path,Column,Polarity
mzml file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0078_WUG_FARMM_HIL-9002-3-PA.mzML,9002,9002-3-PA,Vegan,20,M,White,Day 15,Plasma,21.3,Not Hispanic or Latino,0078_WUG_FARMM_HIL-9002-3-PA.raw;0078_WUG_FARM...,0078_WUG_FARMM_HIL-9002-3-PA.raw,0078_WUG_FARMM_HIL-9002-3-PA.mzML,HP_mzml/Plasma/0078_WUG_FARMM_HIL-9002-3-PA.mzML,Hilic,Positive
0076_WUG_FARMM_HIL-9002-3-PB.mzML,9002,9002-3-PB,Vegan,20,M,White,Day 12,Plasma,21.3,Not Hispanic or Latino,0076_WUG_FARMM_HIL-9002-3-PB.raw;0076_WUG_FARM...,0076_WUG_FARMM_HIL-9002-3-PB.raw,0076_WUG_FARMM_HIL-9002-3-PB.mzML,HP_mzml/Plasma/0076_WUG_FARMM_HIL-9002-3-PB.mzML,Hilic,Positive
0079_WUG_FARMM_HIL-9002-3-PC.mzML,9002,9002-3-PC,Vegan,20,M,White,Day 9,Plasma,21.3,Not Hispanic or Latino,0079_WUG_FARMM_HIL-9002-3-PC.raw;0079_WUG_FARM...,0079_WUG_FARMM_HIL-9002-3-PC.raw,0079_WUG_FARMM_HIL-9002-3-PC.mzML,HP_mzml/Plasma/0079_WUG_FARMM_HIL-9002-3-PC.mzML,Hilic,Positive
0080_WUG_FARMM_HIL-9002-3-PD.mzML,9002,9002-3-PD,Vegan,20,M,White,Day 5,Plasma,21.3,Not Hispanic or Latino,0080_WUG_FARMM_HIL-9002-3-PD.raw;0080_WUG_FARM...,0080_WUG_FARMM_HIL-9002-3-PD.raw,0080_WUG_FARMM_HIL-9002-3-PD.mzML,HP_mzml/Plasma/0080_WUG_FARMM_HIL-9002-3-PD.mzML,Hilic,Positive
0077_WUG_FARMM_HIL-9002-3-PE.mzML,9002,9002-3-PE,Vegan,20,M,White,Baseline,Plasma,21.3,Not Hispanic or Latino,0077_WUG_FARMM_HIL-9002-3-PE.raw;0077_WUG_FARM...,0077_WUG_FARMM_HIL-9002-3-PE.raw,0077_WUG_FARMM_HIL-9002-3-PE.mzML,HP_mzml/Plasma/0077_WUG_FARMM_HIL-9002-3-PE.mzML,Hilic,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0060a_WUG_FARMM_FFA-QCPP04.mzML,QC-pooled_plasma,QPP04,,,,,,Plasma,,,0060a_WUG_FARMM_HIL-QPP04.raw;0060a_WUG_FARMM_...,0060a_WUG_FARMM_FFA-QCPP04.raw,0060a_WUG_FARMM_FFA-QCPP04.mzML,CN_mzml/Plasma/0060a_WUG_FARMM_FFA-QCPP04.mzML,RP,Negative
0080a_WUG_FARMM_FFA-QCPP05.mzML,QC-pooled_plasma,QPP05,,,,,,Plasma,,,0080a_WUG_FARMM_HIL-QPP05.raw;0080a_WUG_FARMM_...,0080a_WUG_FARMM_FFA-QCPP05.raw,0080a_WUG_FARMM_FFA-QCPP05.mzML,CN_mzml/Plasma/0080a_WUG_FARMM_FFA-QCPP05.mzML,RP,Negative
0100a_WUG_FARMM_FFA-QCPP06.mzML,QC-pooled_plasma,QPP06,,,,,,Plasma,,,0100a_WUG_FARMM_HIL-QPP06.raw;0100a_WUG_FARMM_...,0100a_WUG_FARMM_FFA-QCPP06.raw,0100a_WUG_FARMM_FFA-QCPP06.mzML,CN_mzml/Plasma/0100a_WUG_FARMM_FFA-QCPP06.mzML,RP,Negative
0120a_WUG_FARMM_FFA-QCPP07.mzML,QC-pooled_plasma,QPP07,,,,,,Plasma,,,0120a_WUG_FARMM_HIL-QPP07.raw;NA;0100a_WU_FARM...,0120a_WUG_FARMM_FFA-QCPP07.raw,0120a_WUG_FARMM_FFA-QCPP07.mzML,CN_mzml/Plasma/0120a_WUG_FARMM_FFA-QCPP07.mzML,RP,Negative


In [149]:
new_ST001519_combined['Cancer Risk'] = 'NA'
new_ST001519_combined.loc[~(new_ST001519_combined['Subject ID'].str.contains('QC')),'Cancer Risk'] = False

In [103]:
# save the metadata


In [150]:
metadata_ST001519 = new_ST001519_combined[['Column','Polarity', 'Age', 'Sex', 'Subject ID', 'Race', 'Type','BMI','Cancer Risk','Ethnicity','mzml path']].copy()

metadata_ST001519['Study ID'] = existing_ST001519['Study ID'].iloc[0]
metadata_ST001519['Job ID'] = existing_ST001519['Cohort ID'].iloc[0]
metadata_ST001519['Cohort Label v0'] = existing_ST001519['Cohort Label'].iloc[0]
metadata_ST001519['is Pediatric'] = existing_ST001519['is Pediatric'].iloc[0]

# metadata_ST001519['Cancer Risk'] = metadata_ST001519['Cancer Risk'].map({False: False, 'NA': False})

In [151]:
metadata_ST001519.to_csv(f'{save_dir}/metadata_ST001519.csv', index=True)

## ST001932
- all patients are non-cancer

In [105]:
existing_ST001932 = metadata[metadata['Study ID'] == 'ST001932'].copy()
existing_ST001932.dropna(axis=1, how='all', inplace=True)
existing_ST001932.head()


Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max)
batch10/KL_200518_M436_011.mzML,587,ST001932,pediatric_CMD,587,88.763225,Pretrain,Test,True,2,13,batch10/KL_200518_M436_011.mzML,0.0,20.0
batch9/KL_200516_M436_279.mzML,579,ST001932,pediatric_CMD,579,88.070047,Pretrain,Test,True,2,13,batch9/KL_200516_M436_279.mzML,0.0,20.0
batch10/KL_200518_M436_003.mzML,587,ST001932,pediatric_CMD,587,87.449836,Pretrain,Test,True,2,13,batch10/KL_200518_M436_003.mzML,0.0,20.0
batch10/KL_200518_M436_281.mzML,585,ST001932,pediatric_CMD,585,85.625684,Pretrain,Test,True,2,13,batch10/KL_200518_M436_281.mzML,0.0,20.0
batch10/KL_200518_M436_167.mzML,579,ST001932,pediatric_CMD,579,83.692083,Pretrain,Test,True,2,13,batch10/KL_200518_M436_167.mzML,0.0,20.0


In [117]:
metadata_ST001932 = existing_ST001932[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()
metadata_ST001932.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)


metadata_ST001932['Cancer Risk'] = False

# add column and polarity
metadata_ST001932['Column'] = 'Hilic'
metadata_ST001932['Polarity'] = 'Positive'

In [118]:
metadata_ST001932.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Cancer Risk,Column,Polarity
batch10/KL_200518_M436_011.mzML,ST001932,True,pediatric_CMD,587,False,Hilic,Positive
batch9/KL_200516_M436_279.mzML,ST001932,True,pediatric_CMD,579,False,Hilic,Positive
batch10/KL_200518_M436_003.mzML,ST001932,True,pediatric_CMD,587,False,Hilic,Positive
batch10/KL_200518_M436_281.mzML,ST001932,True,pediatric_CMD,585,False,Hilic,Positive
batch10/KL_200518_M436_167.mzML,ST001932,True,pediatric_CMD,579,False,Hilic,Positive


In [119]:
# save the metadata
metadata_ST001932.to_csv(f'{save_dir}/metadata_ST001932.csv', index=True)

## ST001428
- all patients are non cancer

In [106]:
existing_ST001428 = metadata[metadata['Study ID'] == 'ST001428'].copy()
existing_ST001428.dropna(axis=1, how='all', inplace=True)
existing_ST001428.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max)
YW_200318_M431_001.mzML,503,ST001428,pediatric_CMD,503,88.033564,Pretrain,Test,True,2,8,YW_200318_M431_001.mzML,2.0,18.0
YW_200318_M431_007.mzML,503,ST001428,pediatric_CMD,503,87.449836,Pretrain,Test,True,2,8,YW_200318_M431_007.mzML,2.0,18.0
YW_200307_M431_185.mzML,503,ST001428,pediatric_CMD,503,87.303904,Pretrain,Test,True,2,8,YW_200307_M431_185.mzML,2.0,18.0
YW_200307_M431_187.mzML,503,ST001428,pediatric_CMD,503,87.157972,Pretrain,Test,True,2,8,YW_200307_M431_187.mzML,2.0,18.0
YW_200319_M431_253.mzML,503,ST001428,pediatric_CMD,503,86.099964,Pretrain,Test,True,2,8,YW_200319_M431_253.mzML,2.0,18.0


In [120]:
metadata_ST001428 = existing_ST001428[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()
metadata_ST001428.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)


metadata_ST001428['Cancer Risk'] = False

# add column and polarity
metadata_ST001428['Column'] = 'Hilic'
metadata_ST001428['Polarity'] = 'Positive'

metadata_ST001428.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Cancer Risk,Column,Polarity
YW_200318_M431_001.mzML,ST001428,True,pediatric_CMD,503,False,Hilic,Positive
YW_200318_M431_007.mzML,ST001428,True,pediatric_CMD,503,False,Hilic,Positive
YW_200307_M431_185.mzML,ST001428,True,pediatric_CMD,503,False,Hilic,Positive
YW_200307_M431_187.mzML,ST001428,True,pediatric_CMD,503,False,Hilic,Positive
YW_200319_M431_253.mzML,ST001428,True,pediatric_CMD,503,False,Hilic,Positive


In [121]:
# save the metadata
metadata_ST001428.to_csv(f'{save_dir}/metadata_ST001428.csv', index=True)

## ST000909
- all patients are non cancer

In [116]:
existing_ST000909 = metadata[metadata['Study ID'] == 'ST000909'].copy()
existing_ST000909.dropna(axis=1, how='all', inplace=True)
existing_ST000909.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max)
VT_171031_M314_009.mzML,556,ST000909,pediatric_CMD,556,71.433783,Pretrain,Test,True,2,2,VT_171031_M314_009.mzML,0.0,18.0
VT_171031_M314_269.mzML,556,ST000909,pediatric_CMD,556,70.886538,Pretrain,Test,True,2,2,VT_171031_M314_269.mzML,0.0,18.0
VT_171031_M314_143.mzML,556,ST000909,pediatric_CMD,556,70.55819,Pretrain,Test,True,2,2,VT_171031_M314_143.mzML,0.0,18.0
VT_171031_M314_141.mzML,556,ST000909,pediatric_CMD,556,70.55819,Pretrain,Test,True,2,2,VT_171031_M314_141.mzML,0.0,18.0
VT_171028_M314_143.mzML,556,ST000909,pediatric_CMD,556,70.302809,Pretrain,Test,True,2,2,VT_171028_M314_143.mzML,0.0,18.0


In [122]:
metadata_ST000909 = existing_ST000909[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()
metadata_ST000909.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)


metadata_ST000909['Cancer Risk'] = False

# add column and polarity
metadata_ST000909['Column'] = 'Hilic'
metadata_ST000909['Polarity'] = 'Positive'

metadata_ST000909.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Cancer Risk,Column,Polarity
VT_171031_M314_009.mzML,ST000909,True,pediatric_CMD,556,False,Hilic,Positive
VT_171031_M314_269.mzML,ST000909,True,pediatric_CMD,556,False,Hilic,Positive
VT_171031_M314_143.mzML,ST000909,True,pediatric_CMD,556,False,Hilic,Positive
VT_171031_M314_141.mzML,ST000909,True,pediatric_CMD,556,False,Hilic,Positive
VT_171028_M314_143.mzML,ST000909,True,pediatric_CMD,556,False,Hilic,Positive


In [123]:
metadata_ST000909.to_csv(f'{save_dir}/metadata_ST000909.csv', index=True)

## ST002331
- all patients are non cancer
- we should already have gender and age information

In [108]:
existing_ST002331 = metadata[metadata['Study ID'] == 'ST002331'].copy()
existing_ST002331.dropna(axis=1, how='all', inplace=True)
existing_ST002331.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Age,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),is Female
VT_181203_M338_055.mzML,509,ST002331,pediatric_other,509,16.0,M,81.174754,Pretrain,Test,True,3,18,VT_181203_M338_055.mzML,16.0,16.0,0.0
VT_181203_M338_057.mzML,509,ST002331,pediatric_other,509,16.0,M,79.605983,Pretrain,Test,True,3,18,VT_181203_M338_057.mzML,16.0,16.0,0.0
VT_181203_M338_059.mzML,509,ST002331,pediatric_other,509,16.0,M,79.241153,Pretrain,Test,True,3,18,VT_181203_M338_059.mzML,16.0,16.0,0.0
VT_181210_M338_047.mzML,509,ST002331,pediatric_other,509,12.1,F,75.994163,Pretrain,Test,True,3,18,VT_181210_M338_047.mzML,12.1,12.1,1.0
VT_181013_M338_069.mzML,509,ST002331,pediatric_other,509,23.9,M,75.921197,Pretrain,Test,True,3,18,VT_181013_M338_069.mzML,23.9,23.9,0.0


In [124]:
metadata_ST002331 = existing_ST002331[['Study ID','is Pediatric','Cohort Label','Cohort ID','Age','Sex']].copy()
metadata_ST002331.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)


metadata_ST002331['Cancer Risk'] = False

# add column and polarity
metadata_ST002331['Column'] = 'Hilic'
metadata_ST002331['Polarity'] = 'Positive'

metadata_ST002331.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Age,Sex,Cancer Risk,Column,Polarity
VT_181203_M338_055.mzML,ST002331,True,pediatric_other,509,16.0,M,False,Hilic,Positive
VT_181203_M338_057.mzML,ST002331,True,pediatric_other,509,16.0,M,False,Hilic,Positive
VT_181203_M338_059.mzML,ST002331,True,pediatric_other,509,16.0,M,False,Hilic,Positive
VT_181210_M338_047.mzML,ST002331,True,pediatric_other,509,12.1,F,False,Hilic,Positive
VT_181013_M338_069.mzML,ST002331,True,pediatric_other,509,23.9,M,False,Hilic,Positive


In [125]:
# save the metadata
metadata_ST002331.to_csv(f'{save_dir}/metadata_ST002331.csv', index=True)

## ST002251
- all patients are non cancer
-  we should already have age and gender information
-  add BMI data

In [133]:
existing_ST002251 = metadata[metadata['Study ID'] == 'ST002251'].copy()
existing_ST002251.dropna(axis=1, how='all', inplace=True)
existing_ST002251.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,is Female
20200721_031_FIT01-063-pos.mzML,555,ST002251,pediatric_other,555,M,73.294418,Pretrain,Test,True,3,17,20200721_031_FIT01-063-pos.mzML,0.0
20200722_023_FIT01-056-pos.mzML,555,ST002251,pediatric_other,555,F,72.60124,Pretrain,Test,True,3,17,20200722_023_FIT01-056-pos.mzML,1.0
20200724_041_FIT01-194-pos.mzML,555,ST002251,pediatric_other,555,M,72.418825,Pretrain,Test,True,3,17,20200724_041_FIT01-194-pos.mzML,0.0
20200724_013_FIT01-050-pos.mzML,555,ST002251,pediatric_other,555,M,72.418825,Pretrain,Test,True,3,17,20200724_013_FIT01-050-pos.mzML,0.0
20200721_021_FIT01-086-pos.mzML,555,ST002251,pediatric_other,555,M,72.272893,Pretrain,Test,True,3,17,20200721_021_FIT01-086-pos.mzML,0.0


In [134]:
existing_ST002251.shape

(242, 13)

In [135]:
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST002251_AN003677.json'

with open(json_file) as f:
    data = json.load(f)

new_ST002251 = extract_subject_factors_dataframe(data)    

In [136]:
new_ST002251

Unnamed: 0,Subject ID,Sample ID,ThreeorMoreBursts,ICShighdose,Study,AgeatEnrollment,AsthmaDurationYears,Sex,Ethnicity,Race,Height,Weight,BMI,LABA,Montelukast,Batch,RAW_FILE_NAME
0,Blank,20200715_001_Blank-pos,,,,,,,,,,,,,,StartBatch,20200715_001_Blank-pos.raw
1,Blank,20200715_002_Blank-neg,,,,,,,,,,,,,,StartBatch,20200715_002_Blank-neg.raw
2,SRM1950,20200715_003_srm1950,,,,,,,,,,,,,,StartBatch,20200715_003_srm1950.raw
3,SRM1950,20200715_004_srm1950,,,,,,,,,,,,,,StartBatch,20200715_004_srm1950.raw
4,QC,20200715_005_qc,,,,,,,,,,,,,,StartBatch,20200715_005_qc.raw
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,FIT01-171,FIT01-171-neg,Yes,Yes,Dietary SAA,14,13.5,M,Not Hispanic,Black,172,111.1,37.55,Yes,Yes,Batch11,20200725_040_FIT01-171-neg.raw
542,FIT01-008,FIT01-008-pos,No,No,SARP3,9.25,7.5,M,Not Hispanic,Black,130,34.9,20.65,No,No,Batch11,20200725_041_FIT01-008-pos.raw
543,FIT01-008,FIT01-008-neg,No,No,SARP3,9.25,7.5,M,Not Hispanic,Black,130,34.9,20.65,No,No,Batch11,20200725_042_FIT01-008-neg.raw
544,QC,20200725_043_QC-pos,,,,,,,,,,,,,,Batch11,20200725_043_QC-pos.raw


In [137]:
new_ST002251['mzml file'] = new_ST002251['RAW_FILE_NAME'].apply(lambda x: x.replace('.raw','.mzML'))
new_ST002251.index = new_ST002251['mzml file'].tolist()
new_ST002251['Sex'] = new_ST002251['Sex'].map(sex_label_matching_dict)

In [138]:
def assign_sample_class(x):
    if 'QC' in x:
        return 'Study_QC_Sample'
    elif 'Blank' in x:
        return 'Study_QC_Sample'
    elif 'srm1950' in x:
        return 'NIST1950'
    else:
        return 'Study_Sample'
    
def assign_charge(x):
    if 'pos' in x:
        return 'Positive'
    elif 'neg' in x:
        return 'Negative'
    else:
        return 'Unknown'    

new_ST002251['Sample_Class'] = new_ST002251['Subject ID'].apply(assign_sample_class)


new_ST002251['Column'] = 'Hilic'
new_ST002251['Polarity']= new_ST002251['mzml file'].apply(assign_charge)

In [139]:
new_ST002251

Unnamed: 0,Subject ID,Sample ID,ThreeorMoreBursts,ICShighdose,Study,AgeatEnrollment,AsthmaDurationYears,Sex,Ethnicity,Race,...,Weight,BMI,LABA,Montelukast,Batch,RAW_FILE_NAME,mzml file,Sample_Class,Column,Polarity
20200715_001_Blank-pos.mzML,Blank,20200715_001_Blank-pos,,,,,,,,,...,,,,,StartBatch,20200715_001_Blank-pos.raw,20200715_001_Blank-pos.mzML,Study_QC_Sample,Hilic,Positive
20200715_002_Blank-neg.mzML,Blank,20200715_002_Blank-neg,,,,,,,,,...,,,,,StartBatch,20200715_002_Blank-neg.raw,20200715_002_Blank-neg.mzML,Study_QC_Sample,Hilic,Negative
20200715_003_srm1950.mzML,SRM1950,20200715_003_srm1950,,,,,,,,,...,,,,,StartBatch,20200715_003_srm1950.raw,20200715_003_srm1950.mzML,Study_Sample,Hilic,Unknown
20200715_004_srm1950.mzML,SRM1950,20200715_004_srm1950,,,,,,,,,...,,,,,StartBatch,20200715_004_srm1950.raw,20200715_004_srm1950.mzML,Study_Sample,Hilic,Unknown
20200715_005_qc.mzML,QC,20200715_005_qc,,,,,,,,,...,,,,,StartBatch,20200715_005_qc.raw,20200715_005_qc.mzML,Study_QC_Sample,Hilic,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200725_040_FIT01-171-neg.mzML,FIT01-171,FIT01-171-neg,Yes,Yes,Dietary SAA,14,13.5,M,Not Hispanic,Black,...,111.1,37.55,Yes,Yes,Batch11,20200725_040_FIT01-171-neg.raw,20200725_040_FIT01-171-neg.mzML,Study_Sample,Hilic,Negative
20200725_041_FIT01-008-pos.mzML,FIT01-008,FIT01-008-pos,No,No,SARP3,9.25,7.5,M,Not Hispanic,Black,...,34.9,20.65,No,No,Batch11,20200725_041_FIT01-008-pos.raw,20200725_041_FIT01-008-pos.mzML,Study_Sample,Hilic,Positive
20200725_042_FIT01-008-neg.mzML,FIT01-008,FIT01-008-neg,No,No,SARP3,9.25,7.5,M,Not Hispanic,Black,...,34.9,20.65,No,No,Batch11,20200725_042_FIT01-008-neg.raw,20200725_042_FIT01-008-neg.mzML,Study_Sample,Hilic,Negative
20200725_043_QC-pos.mzML,QC,20200725_043_QC-pos,,,,,,,,,...,,,,,Batch11,20200725_043_QC-pos.raw,20200725_043_QC-pos.mzML,Study_QC_Sample,Hilic,Positive


In [140]:

metadata_ST002251 = new_ST002251[['Column','Polarity','Sex','Subject ID','Sample_Class','AgeatEnrollment','BMI','Ethnicity','Race','Batch']].copy()
metadata_ST002251.rename(columns=
    {'AgeatEnrollment': 'Age'}, inplace=True)


metadata_ST002251['Study ID'] = existing_ST002251['Study ID'].iloc[0]
metadata_ST002251['Job ID'] = existing_ST002251['Cohort ID'].iloc[0]
metadata_ST002251['Cohort Label v0'] = 'NA'

In [141]:
metadata_ST002251.loc[metadata_ST002251['Sample_Class']=='Study_Sample','Cohort Label v0'] = existing_ST002251['Cohort Label'].iloc[0]
metadata_ST002251.loc[metadata_ST002251['Sample_Class']=='Pooled_Plasma','Cohort Label v0'] = existing_ST002251['Cohort Label'].iloc[0]

metadata_ST002251['is Pediatric'] = 'NA'
metadata_ST002251.loc[metadata_ST002251['Sample_Class']=='Study_Sample','is Pediatric'] = False
metadata_ST002251.loc[metadata_ST002251['Sample_Class']=='Pooled_Plasma','is Pediatric'] = False

metadata_ST002251['Cancer Risk'] = 'NA'
metadata_ST002251.loc[metadata_ST002251['Sample_Class']=='Study_Sample','Cancer Risk'] = False
metadata_ST002251.loc[metadata_ST002251['Sample_Class']=='Pooled_Plasma','Cancer Risk'] = False


In [142]:
# save the metadata
metadata_ST002251.to_csv(f'{save_dir}/metadata_ST002251.csv', index=True)

## ST001931
- all patients are non cancer

In [126]:
existing_ST001931 = metadata[metadata['Study ID'] == 'ST001931'].copy()
existing_ST001931.dropna(axis=1, how='all', inplace=True)
existing_ST001931.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max)
F088_210524_M462_279.mzML,505,ST001931,pediatric_other,505,86.610726,Pretrain,Test,True,3,12,F088_210524_M462_279.mzML,12.0,17.0
F088_210524_M462_017.mzML,505,ST001931,pediatric_other,505,86.501277,Pretrain,Test,True,3,12,F088_210524_M462_017.mzML,12.0,17.0
F088_210524_M462_293.mzML,505,ST001931,pediatric_other,505,86.136447,Pretrain,Test,True,3,12,F088_210524_M462_293.mzML,12.0,17.0
F088_210524_M462_011.mzML,505,ST001931,pediatric_other,505,85.917548,Pretrain,Test,True,3,12,F088_210524_M462_011.mzML,12.0,17.0
F088_210524_M462_289.mzML,505,ST001931,pediatric_other,505,85.917548,Pretrain,Test,True,3,12,F088_210524_M462_289.mzML,12.0,17.0


In [127]:
metadata_ST001931 = existing_ST001931[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()
metadata_ST001931.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)


metadata_ST001931['Cancer Risk'] = False

# add column and polarity
metadata_ST001931['Column'] = 'Hilic'
metadata_ST001931['Polarity'] = 'Positive'

metadata_ST001931.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Cancer Risk,Column,Polarity
F088_210524_M462_279.mzML,ST001931,True,pediatric_other,505,False,Hilic,Positive
F088_210524_M462_017.mzML,ST001931,True,pediatric_other,505,False,Hilic,Positive
F088_210524_M462_293.mzML,ST001931,True,pediatric_other,505,False,Hilic,Positive
F088_210524_M462_011.mzML,ST001931,True,pediatric_other,505,False,Hilic,Positive
F088_210524_M462_289.mzML,ST001931,True,pediatric_other,505,False,Hilic,Positive


In [128]:
metadata_ST001931.to_csv(f'{save_dir}/metadata_ST001931.csv', index=True)

## ST000601

In [46]:
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST000601_AN000920.json'

with open(json_file) as f:
    data = json.load(f)

new_ST000601 = extract_subject_factors_dataframe(data)    

In [5]:
ST000601_files_path = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/MS_Data/ST000601/job_id_547/filenames_order.txt'

ST000601_files = pd.read_csv(ST000601_files_path, sep='\t', header=None)[0].to_list()

In [6]:
ST000601_sample_info_path = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/MS_Data/ST000601/job_id_547/sample_info/sample_info.csv'
ST000601_sample_info = pd.read_csv(ST000601_sample_info_path, index_col=0)

In [16]:
ST000601_sample_info.head()

Unnamed: 0_level_0,timestamp,Ref,runtime_hour,batch_id,run_order,mzml_file
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10300S_4x_Batch_3_B.mzML,2013-07-18 00:17:29,0,0.0,0,0,10300S_4x_Batch_3_B.mzML
103280_1X_Batch_3_C.mzML,2013-07-18 00:43:02,0,0.425833,0,1,103280_1X_Batch_3_C.mzML
22753T_2X_Batch_3_A.mzML,2013-07-18 01:08:33,0,0.851111,0,2,22753T_2X_Batch_3_A.mzML
10300S_4x_Batch_3_A.mzML,2013-07-18 01:34:04,0,1.276389,0,3,10300S_4x_Batch_3_A.mzML
10300S_4x_Batch_3_C.mzML,2013-07-18 01:59:35,0,1.701667,0,4,10300S_4x_Batch_3_C.mzML


In [47]:
new_ST000601['Column'] = 'Hilic'
new_ST000601['Polarity'] = 'Positive'
new_ST000601['Cohort Label v0'] = 'adult_other'
new_ST000601['is Pediatric'] = False
new_ST000601['Study ID'] = 'ST000601'
new_ST000601['Job ID'] = 547
new_ST000601['Cancer Risk'] = False
new_ST000601['Sex'] = new_ST000601['Gender'].map(sex_label_matching_dict)
new_ST000601['Subject ID'] = new_ST000601['Sample ID'].values

new_ST000601['Smoking Status'] = new_ST000601['Smoking Status'].map({'1': 'Current', '-':'Former'})


In [48]:
new_ST000601

Unnamed: 0,Subject ID,Sample ID,Smoking Status,Gold Stage,Age,Gender,BMI,FEV1/FVC,FEV1 percent predicted,BDR,% Emphysema,Exacerbation Frequency,Column,Polarity,Cohort Label v0,is Pediatric,Study ID,Job ID,Cancer Risk,Sex
0,10062C,10062C,Former,3,64.6,1,27.71,0.46,36.6,1,,-,Hilic,Positive,adult_other,False,ST000601,547,False,M
1,10071D,10071D,Former,2,66.1,1,32.64,0.65,54,-,4.0476,-,Hilic,Positive,adult_other,False,ST000601,547,False,M
2,10087S,10087S,Former,2,65.5,2,26.58,0.49,52.7,1,21.2527,1,Hilic,Positive,adult_other,False,ST000601,547,False,F
3,10097V,10097V,Former,3,75.7,2,23.62,0.43,31.7,-,16.3921,-,Hilic,Positive,adult_other,False,ST000601,547,False,F
4,10102O,10102O,Former,-,61.2,1,30.82,0.78,94,-,1.5828,-,Hilic,Positive,adult_other,False,ST000601,547,False,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,25036G,25036G,Former,-,65.5,1,23.68,0.75,106.7,-,0.4214,-,Hilic,Positive,adult_other,False,ST000601,547,False,M
127,25214E,25214E,Former,-,73.3,1,27.74,0.73,84.5,-,0.2548,-,Hilic,Positive,adult_other,False,ST000601,547,False,M
128,25229R,25229R,Former,-,59.9,1,32.09,0.73,80.2,1,4.4928,1,Hilic,Positive,adult_other,False,ST000601,547,False,M
129,25237Q,25237Q,Former,3,67.9,1,27.29,0.52,45.4,-,1.5127,-,Hilic,Positive,adult_other,False,ST000601,547,False,M


In [49]:
new_ST000601['Smoking Status'].value_counts()

Smoking Status
Former     101
Current     30
Name: count, dtype: int64

In [50]:
[x for x in ST000601_files if '10087S' in x]

['10087S_2x_Batch_15_A.mzML',
 '10087S_2x_Batch_15_B.mzML',
 '10087S_2x_Batch_15_C.mzML']

In [51]:
ST000601_sample_info['mzml_file'] = ST000601_sample_info.index

In [52]:
def extract_sample_id(x):
    return x.split('_')[0]

def extract_batch_number(x):
    return int(x.split('_')[3])

In [53]:
ST000601_sample_info['Batch'] = ST000601_sample_info['mzml_file'].apply(extract_batch_number)
ST000601_sample_info['Sample ID'] = ST000601_sample_info['mzml_file'].apply(extract_sample_id)

In [54]:
ST000601_sample_info

Unnamed: 0_level_0,timestamp,Ref,runtime_hour,batch_id,run_order,mzml_file,Batch,Sample ID
file_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10300S_4x_Batch_3_B.mzML,2013-07-18 00:17:29,0,0.000000,0,0,10300S_4x_Batch_3_B.mzML,3,10300S
103280_1X_Batch_3_C.mzML,2013-07-18 00:43:02,0,0.425833,0,1,103280_1X_Batch_3_C.mzML,3,103280
22753T_2X_Batch_3_A.mzML,2013-07-18 01:08:33,0,0.851111,0,2,22753T_2X_Batch_3_A.mzML,3,22753T
10300S_4x_Batch_3_A.mzML,2013-07-18 01:34:04,0,1.276389,0,3,10300S_4x_Batch_3_A.mzML,3,10300S
10300S_4x_Batch_3_C.mzML,2013-07-18 01:59:35,0,1.701667,0,4,10300S_4x_Batch_3_C.mzML,3,10300S
...,...,...,...,...,...,...,...,...
10136F_2x_Batch_20_A.mzML,2013-08-26 11:24:40,0,947.119722,0,388,10136F_2x_Batch_20_A.mzML,20,10136F
10136F_2x_Batch_20_C.mzML,2013-08-26 12:41:22,0,948.398056,0,389,10136F_2x_Batch_20_C.mzML,20,10136F
23090C_2x_Batch_20_C.mzML,2013-08-26 13:06:56,0,948.824167,0,390,23090C_2x_Batch_20_C.mzML,20,23090C
23090C_2x_Batch_20_A.mzML,2013-08-26 13:32:31,0,949.250556,0,391,23090C_2x_Batch_20_A.mzML,20,23090C


In [55]:
combined_ST000601 = pd.merge(new_ST000601, ST000601_sample_info, on='Sample ID', how='inner')

In [57]:
combined_ST000601.index = combined_ST000601['mzml_file'].tolist()
combined_ST000601.head()

Unnamed: 0,Subject ID,Sample ID,Smoking Status,Gold Stage,Age,Gender,BMI,FEV1/FVC,FEV1 percent predicted,BDR,...,Job ID,Cancer Risk,Sex,timestamp,Ref,runtime_hour,batch_id,run_order,mzml_file,Batch
10062C_1x_Batch_6_C.mzML,10062C,10062C,Former,3,64.6,1,27.71,0.46,36.6,1,...,547,False,M,2013-07-23 07:51:45,0,127.571111,0,73,10062C_1x_Batch_6_C.mzML,6
10062C_1x_Batch_6_A.mzML,10062C,10062C,Former,3,64.6,1,27.71,0.46,36.6,1,...,547,False,M,2013-07-23 09:33:50,0,129.2725,0,75,10062C_1x_Batch_6_A.mzML,6
10062C_1x_Batch_6_B.mzML,10062C,10062C,Former,3,64.6,1,27.71,0.46,36.6,1,...,547,False,M,2013-07-23 13:49:05,0,133.526667,0,82,10062C_1x_Batch_6_B.mzML,6
10071D_4x_Batch_18_C.mzML,10071D,10071D,Former,2,66.1,1,32.64,0.65,54.0,-,...,547,False,M,2013-08-19 03:12:49,0,770.922222,0,333,10071D_4x_Batch_18_C.mzML,18
10071D_4x_Batch_18_A.mzML,10071D,10071D,Former,2,66.1,1,32.64,0.65,54.0,-,...,547,False,M,2013-08-19 13:01:01,0,780.725556,0,343,10071D_4x_Batch_18_A.mzML,18


In [58]:
combined_ST000601.columns.tolist()

['Subject ID',
 'Sample ID',
 'Smoking Status',
 'Gold Stage',
 'Age',
 'Gender',
 'BMI',
 'FEV1/FVC',
 'FEV1 percent predicted',
 'BDR',
 '% Emphysema',
 'Exacerbation Frequency',
 'Column',
 'Polarity',
 'Cohort Label v0',
 'is Pediatric',
 'Study ID',
 'Job ID',
 'Cancer Risk',
 'Sex',
 'timestamp',
 'Ref',
 'runtime_hour',
 'batch_id',
 'run_order',
 'mzml_file',
 'Batch']

In [60]:
metadata_ST000601 = combined_ST000601[['Column','Polarity','Subject ID','BMI','Sex',\
                                       'Smoking Status','Cohort Label v0','is Pediatric',\
                                        'Study ID', 'Job ID', 'Cancer Risk']].copy()


In [62]:
metadata_ST000601.to_csv(f'{save_dir}/metadata_ST000601.csv', index=True)

## ST002773

In [106]:
load_dir = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/July_05_Data'

In [107]:
new_ST002773 = pd.read_csv(f'{load_dir}/shanghai_lung_cancer_metadata.csv', index_col=0)

In [108]:
new_ST002773

Unnamed: 0,Sample ID,Factors.Subject_ID,Factors.LungCancer,Batch,Raw_files,AGE,Column Name,mzml_file,timestamp,npy_file,...,Group,QC,Pool,Age,Cancer,Sex,Region,Smoker,Ref,batch_id
F076_200714_M331_001.mzML,q3June2014_2a,PooledQAQC,-,2,F076_200714_M331_001,-,Hilic Positive,F076_200714_M331_001.mzML,2020-07-15 13:51:16,F076_200714_M331_001.npy,...,Odds Group 0,True,True,,,F,China,Never,1,2
F076_200714_M331_009.mzML,q3June2014_2b,PooledQAQC,-,2,F076_200714_M331_009,-,Hilic Positive,F076_200714_M331_009.mzML,2020-07-15 14:44:23,F076_200714_M331_009.npy,...,Odds Group 0,True,True,,,F,China,Never,1,2
F076_200714_M331_017.mzML,SWHS0787,Study_sample,control,2,F076_200714_M331_017,58,Hilic Positive,F076_200714_M331_017.mzML,2020-07-15 15:37:34,F076_200714_M331_017.npy,...,Odds Group 0,False,False,58.0,0.0,F,China,Never,0,2
F076_200714_M331_025.mzML,SWHS0108,Study_sample,control,2,F076_200714_M331_025,66,Hilic Positive,F076_200714_M331_025.mzML,2020-07-15 16:30:38,F076_200714_M331_025.npy,...,Odds Group 0,False,False,66.0,0.0,F,China,Never,0,2
F076_200714_M331_033.mzML,SWHS0185,Study_sample,case,2,F076_200714_M331_033,65,Hilic Positive,F076_200714_M331_033.mzML,2020-07-15 17:23:42,F076_200714_M331_033.npy,...,Odds Group 0,False,False,65.0,1.0,F,China,Never,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F076_201011_M331_249.mzML,SWHS0085,Study_sample,control,25,F076_201011_M331_249,45,Hilic Positive,F076_201011_M331_249.mzML,2020-10-25 22:29:14,F076_201011_M331_249.npy,...,Odds Group 3,False,False,45.0,0.0,F,China,Never,0,25
F076_201011_M331_257.mzML,SWHS0646,Study_sample,case,25,F076_201011_M331_257,65,Hilic Positive,F076_201011_M331_257.mzML,2020-10-25 23:34:43,F076_201011_M331_257.npy,...,Odds Group 3,False,False,65.0,1.0,F,China,Never,0,25
F076_201011_M331_265.mzML,q3June2014_25e,PooledQAQC,-,25,F076_201011_M331_265,-,Hilic Positive,F076_201011_M331_265.mzML,2020-10-26 00:40:20,F076_201011_M331_265.npy,...,Odds Group 3,True,True,,,F,China,Never,1,25
F076_201011_M331_273.mzML,q3June2014_25f,PooledQAQC,-,25,F076_201011_M331_273,-,Hilic Positive,F076_201011_M331_273.mzML,2020-10-26 01:45:55,F076_201011_M331_273.npy,...,Odds Group 3,True,True,,,F,China,Never,1,25


In [109]:
new_ST002773['Column'] = 'Hilic'
new_ST002773['Polarity'] = 'Positive'
new_ST002773.rename(columns={'Sample ID':'Subject ID',
                             'Factors.Subject_ID':'Sample_Class',
                             'Factors.LungCancer': 'Diagnosis',
                             'Smoker' : 'Smoking Status'}, inplace=True)

new_ST002773['Cancer Risk'] = new_ST002773['Cancer'].map({0.0: False, 1.0: True})
new_ST002773.head()

Unnamed: 0,Subject ID,Sample_Class,Diagnosis,Batch,Raw_files,AGE,Column Name,mzml_file,timestamp,npy_file,...,Age,Cancer,Sex,Region,Smoking Status,Ref,batch_id,Column,Polarity,Cancer Risk
F076_200714_M331_001.mzML,q3June2014_2a,PooledQAQC,-,2,F076_200714_M331_001,-,Hilic Positive,F076_200714_M331_001.mzML,2020-07-15 13:51:16,F076_200714_M331_001.npy,...,,,F,China,Never,1,2,Hilic,Positive,
F076_200714_M331_009.mzML,q3June2014_2b,PooledQAQC,-,2,F076_200714_M331_009,-,Hilic Positive,F076_200714_M331_009.mzML,2020-07-15 14:44:23,F076_200714_M331_009.npy,...,,,F,China,Never,1,2,Hilic,Positive,
F076_200714_M331_017.mzML,SWHS0787,Study_sample,control,2,F076_200714_M331_017,58,Hilic Positive,F076_200714_M331_017.mzML,2020-07-15 15:37:34,F076_200714_M331_017.npy,...,58.0,0.0,F,China,Never,0,2,Hilic,Positive,False
F076_200714_M331_025.mzML,SWHS0108,Study_sample,control,2,F076_200714_M331_025,66,Hilic Positive,F076_200714_M331_025.mzML,2020-07-15 16:30:38,F076_200714_M331_025.npy,...,66.0,0.0,F,China,Never,0,2,Hilic,Positive,False
F076_200714_M331_033.mzML,SWHS0185,Study_sample,case,2,F076_200714_M331_033,65,Hilic Positive,F076_200714_M331_033.mzML,2020-07-15 17:23:42,F076_200714_M331_033.npy,...,65.0,1.0,F,China,Never,0,2,Hilic,Positive,True


In [110]:
new_ST002773.columns

Index(['Subject ID', 'Sample_Class', 'Diagnosis', 'Batch', 'Raw_files', 'AGE',
       'Column Name', 'mzml_file', 'timestamp', 'npy_file', 'run_order',
       'ms precision', 'npy file size (mb)', 'mzml file size (mb)',
       'runtime_hour', 'Group', 'QC', 'Pool', 'Age', 'Cancer', 'Sex', 'Region',
       'Smoking Status', 'Ref', 'batch_id', 'Column', 'Polarity',
       'Cancer Risk'],
      dtype='object')

In [111]:
metadata_ST002773 = new_ST002773[['Column','Polarity','Subject ID','Age','Cancer Risk','Diagnosis',\
                                  'Region','Smoking Status','Sex','Sample_Class']].copy()

# drop duplicate columns



metadata_ST002773['Study ID'] = 'ST002773'
metadata_ST002773['Job ID'] = -1
metadata_ST002773['Cohort Label v0'] = 'adult_cancer'
metadata_ST002773['is Pediatric'] = False

# remove the extra age column
# metadata_ST002773.drop('AGE', axis=1, inplace=True)

metadata_ST002773.to_csv(f'{save_dir}/metadata_ST002773.csv', index=True)

In [112]:
metadata_ST002773

Unnamed: 0,Column,Polarity,Subject ID,Age,Cancer Risk,Diagnosis,Region,Smoking Status,Sex,Sample_Class,Study ID,Job ID,Cohort Label v0,is Pediatric
F076_200714_M331_001.mzML,Hilic,Positive,q3June2014_2a,,,-,China,Never,F,PooledQAQC,ST002773,-1,adult_cancer,False
F076_200714_M331_009.mzML,Hilic,Positive,q3June2014_2b,,,-,China,Never,F,PooledQAQC,ST002773,-1,adult_cancer,False
F076_200714_M331_017.mzML,Hilic,Positive,SWHS0787,58.0,False,control,China,Never,F,Study_sample,ST002773,-1,adult_cancer,False
F076_200714_M331_025.mzML,Hilic,Positive,SWHS0108,66.0,False,control,China,Never,F,Study_sample,ST002773,-1,adult_cancer,False
F076_200714_M331_033.mzML,Hilic,Positive,SWHS0185,65.0,True,case,China,Never,F,Study_sample,ST002773,-1,adult_cancer,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F076_201011_M331_249.mzML,Hilic,Positive,SWHS0085,45.0,False,control,China,Never,F,Study_sample,ST002773,-1,adult_cancer,False
F076_201011_M331_257.mzML,Hilic,Positive,SWHS0646,65.0,True,case,China,Never,F,Study_sample,ST002773,-1,adult_cancer,False
F076_201011_M331_265.mzML,Hilic,Positive,q3June2014_25e,,,-,China,Never,F,PooledQAQC,ST002773,-1,adult_cancer,False
F076_201011_M331_273.mzML,Hilic,Positive,q3June2014_25f,,,-,China,Never,F,PooledQAQC,ST002773,-1,adult_cancer,False


## Stanford BMI Data

In [81]:
new_stanfordbmi = pd.read_csv(f'{load_dir}/stanford_hmp2_metadata.csv', index_col=0)


In [82]:
new_stanfordbmi.head()

Unnamed: 0_level_0,subject ID,Study,Race,Age,BMI,SSPG,IR_IS_classification,CollectionDate,Event,Event_Note1,Event_Note2,Event_Note3,SubStudy,file name,mzml_file.1,timestamp,Ref,runtime_hour,batch_id,run_order
mzml_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ZN3TBJM-1013_HILIC-pos-B1.mzML,ZN3TBJM,HMP,C,67.47,30.18,133.5,IS,623.0,Infection,D7,,Infection_Late,HMP,ZN3TBJM-1013_HILIC-pos-B1,ZN3TBJM-1013_HILIC-pos-B1.mzML,2016-12-15 05:17:10,0,8.908333,0,9
ZNED4XZ-4014_HILIC-pos-B1.mzML,ZNED4XZ,HMP,C,59.21,31.64,,Unknown,503.0,Ant_L,D21,amoxicillin,Ant_Recovery_Early,HMP,ZNED4XZ-4014_HILIC-pos-B1,ZNED4XZ-4014_HILIC-pos-B1.mzML,2016-12-15 06:37:19,0,10.244167,0,10
ZLZNCLZ-2013_HILIC-pos-B1.mzML,ZLZNCLZ,HMP,C,54.74,27.33,130.0,IS,622.0,Imz,D3,flu,Imz_Middle,HMP,ZLZNCLZ-2013_HILIC-pos-B1,ZLZNCLZ-2013_HILIC-pos-B1.mzML,2016-12-15 07:04:03,0,10.689722,0,11
ZLZQMEV-07_HILIC-pos-B1.mzML,ZLZQMEV,HMP,H,61.0,29.99,221.0,IR,727.0,Healthy,,,,HMP,ZLZQMEV-07_HILIC-pos-B1,ZLZQMEV-07_HILIC-pos-B1.mzML,2016-12-15 08:24:11,0,12.025278,0,12
ZN0JE53-02_HILIC-pos-B1.mzML,ZN0JE53,HMP,B,66.81,33.32,148.0,IS,327.0,Weight-gain,,,,HMP,ZN0JE53-02_HILIC-pos-B1,ZN0JE53-02_HILIC-pos-B1.mzML,2016-12-15 10:11:02,0,13.806111,0,15


In [83]:
new_stanfordbmi['Column'] = 'Hilic'
new_stanfordbmi['Polarity'] = 'Positive'
new_stanfordbmi.rename(columns={'subject ID':'Subject ID',
                             'Event' : 'Diagnosis'}, inplace=True)

new_stanfordbmi['Cancer Risk'] = False
new_stanfordbmi.head()

Unnamed: 0_level_0,Subject ID,Study,Race,Age,BMI,SSPG,IR_IS_classification,CollectionDate,Diagnosis,Event_Note1,...,file name,mzml_file.1,timestamp,Ref,runtime_hour,batch_id,run_order,Column,Polarity,Cancer Risk
mzml_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ZN3TBJM-1013_HILIC-pos-B1.mzML,ZN3TBJM,HMP,C,67.47,30.18,133.5,IS,623.0,Infection,D7,...,ZN3TBJM-1013_HILIC-pos-B1,ZN3TBJM-1013_HILIC-pos-B1.mzML,2016-12-15 05:17:10,0,8.908333,0,9,Hilic,Positive,False
ZNED4XZ-4014_HILIC-pos-B1.mzML,ZNED4XZ,HMP,C,59.21,31.64,,Unknown,503.0,Ant_L,D21,...,ZNED4XZ-4014_HILIC-pos-B1,ZNED4XZ-4014_HILIC-pos-B1.mzML,2016-12-15 06:37:19,0,10.244167,0,10,Hilic,Positive,False
ZLZNCLZ-2013_HILIC-pos-B1.mzML,ZLZNCLZ,HMP,C,54.74,27.33,130.0,IS,622.0,Imz,D3,...,ZLZNCLZ-2013_HILIC-pos-B1,ZLZNCLZ-2013_HILIC-pos-B1.mzML,2016-12-15 07:04:03,0,10.689722,0,11,Hilic,Positive,False
ZLZQMEV-07_HILIC-pos-B1.mzML,ZLZQMEV,HMP,H,61.0,29.99,221.0,IR,727.0,Healthy,,...,ZLZQMEV-07_HILIC-pos-B1,ZLZQMEV-07_HILIC-pos-B1.mzML,2016-12-15 08:24:11,0,12.025278,0,12,Hilic,Positive,False
ZN0JE53-02_HILIC-pos-B1.mzML,ZN0JE53,HMP,B,66.81,33.32,148.0,IS,327.0,Weight-gain,,...,ZN0JE53-02_HILIC-pos-B1,ZN0JE53-02_HILIC-pos-B1.mzML,2016-12-15 10:11:02,0,13.806111,0,15,Hilic,Positive,False


In [85]:
new_stanfordbmi.columns

Index(['Subject ID', 'Study', 'Race', 'Age', 'BMI', 'SSPG',
       'IR_IS_classification', 'CollectionDate', 'Diagnosis', 'Event_Note1',
       'Event_Note2', 'Event_Note3', 'SubStudy', 'file name', 'mzml_file.1',
       'timestamp', 'Ref', 'runtime_hour', 'batch_id', 'run_order', 'Column',
       'Polarity', 'Cancer Risk'],
      dtype='object')

In [86]:
metadata_stanfordbmi = new_stanfordbmi[['Column','Polarity','BMI','Race','Subject ID','Age','Cancer Risk','Diagnosis']].copy()
metadata_stanfordbmi['is Pediatric'] = False
metadata_stanfordbmi['Cohort Label v0'] = 'adult_other'
metadata_stanfordbmi['Study ID'] = 'stanford-hmp2'
metadata_stanfordbmi['Job ID'] = 627

metadata_stanfordbmi.to_csv(f'{save_dir}/metadata_stanford-hmp2.csv', index=True)

In [87]:
metadata_stanfordbmi

Unnamed: 0_level_0,Column,Polarity,BMI,Race,Subject ID,Age,Cancer Risk,Diagnosis,is Pediatric,Cohort Label v0,Study ID,Job ID
mzml_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ZN3TBJM-1013_HILIC-pos-B1.mzML,Hilic,Positive,30.18,C,ZN3TBJM,67.47,False,Infection,False,adult_other,stanfordbmi,627
ZNED4XZ-4014_HILIC-pos-B1.mzML,Hilic,Positive,31.64,C,ZNED4XZ,59.21,False,Ant_L,False,adult_other,stanfordbmi,627
ZLZNCLZ-2013_HILIC-pos-B1.mzML,Hilic,Positive,27.33,C,ZLZNCLZ,54.74,False,Imz,False,adult_other,stanfordbmi,627
ZLZQMEV-07_HILIC-pos-B1.mzML,Hilic,Positive,29.99,H,ZLZQMEV,61.00,False,Healthy,False,adult_other,stanfordbmi,627
ZN0JE53-02_HILIC-pos-B1.mzML,Hilic,Positive,33.32,B,ZN0JE53,66.81,False,Weight-gain,False,adult_other,stanfordbmi,627
...,...,...,...,...,...,...,...,...,...,...,...,...
Exercise_HILIC-pos_ZN3TBJM-E17.mzML,Hilic,Positive,30.18,C,ZN3TBJM,67.47,False,Exercise,False,adult_other,stanfordbmi,627
Exercise_HILIC-pos_ZPMBHPS-E18.mzML,Hilic,Positive,32.29,C,ZPMBHPS,54.39,False,Exercise,False,adult_other,stanfordbmi,627
Exercise_HILIC-pos_ZPEL6L3-E18.mzML,Hilic,Positive,24.44,C,ZPEL6L3,52.33,False,Exercise,False,adult_other,stanfordbmi,627
Exercise_HILIC-pos_ZWFDEY0-E18.mzML,Hilic,Positive,30.79,A,ZWFDEY0,62.92,False,Exercise,False,adult_other,stanfordbmi,627
