In [30]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [154]:
def extract_subject_factors_dataframe(data):
    '''
    From the Json files saved from Metabolomics Workbench
    '''
    samples = []
    for sample in data['SUBJECT_SAMPLE_FACTORS']:
        sample_info = {
            'Subject ID': sample['Subject ID'],
            'Sample ID': sample['Sample ID']
        }
        
        # Extract subkeys from 'Factors'
        for key, value in sample['Factors'].items():
            sample_info[key] = value
        
        # Extract subkeys from 'Additional sample data'
        for key, value in sample['Additional sample data'].items():
            sample_info[key] = value
        
        samples.append(sample_info)

    # Create a DataFrame from the extracted information
    df = pd.DataFrame(samples)
    return df

def increment_the_last_number(df, column_name, amount=2):
    def decrement(s):
        head, _, tail = s.rpartition('_')
        return f"{head}_{int(tail) + int(amount):03}"
    
    df[column_name] = df[column_name].apply(decrement)
    return df


sex_label_matching_dict = {
    '1': 'M',
    '2': 'F',
    'Male': 'M',
    'Female': 'F',
    'M': 'M',
    'F': 'F',
    1 : 'M',
    2 : 'F'}


In [3]:
old_data_loc = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/April_05_Data'

new_data_loc = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/development_CohortCombination/alignment_RCC_2024_Feb_27/May_28_Data'


In [99]:
metadata_file = f'{old_data_loc}/metadata.csv'
# Read the metadata file
metadata = pd.read_csv(metadata_file, index_col=0)

In [100]:
metadata.columns

Index(['cohort_id', 'Study ID', 'Cohort Label', 'Cohort ID', 'OS', 'OS_Event',
       'Age', 'subject ID', 'study_week', 'Region', 'Sex', 'Race',
       'Dose (mg/kg)', 'phase', 'Treatment', 'Prior_2', 'batch_id',
       'runtime_hour', 'run_order', 'MSKCC', 'ORR', 'Benefit',
       'ExtremeResponder', 'PFS', 'PFS_Event', 'MV', 'Age_Group',
       'Benefit BINARY', 'Benefit ORDINAL', 'Nivo Benefit BINARY',
       'MSKCC BINARY', 'MSKCC ORDINAL', 'Matt Set', 'Set', 'Pretrain',
       'is Pediatric', 'Cohort Label ENC', 'Study ID ENC', 'file id',
       'Age Range (min)', 'Age Range (max)', 'IMDC', 'IMDC ORDINAL',
       'IMDC BINARY', 'is Female'],
      dtype='object')

In [6]:
metadata['Study ID'].value_counts()

Study ID
ST001932    4482
ST001422    2051
ST001931    2044
ST001428    1522
ST001237    1379
ST002331    1315
ST001423    1192
ST000909     742
ST001849     691
ST002027     356
ST001408     349
ST002112     335
ST001236     271
ST001918     271
ST002251     242
ST001519     166
ST002244     122
ST000388      95
ST000422      60
Name: count, dtype: int64

## ST000422
no useful metadata

### Correct the cohort label for ST000422
previously the cohort label for ST000422 was set to adult_cancer, but it should be adult_other

In [7]:
metadata.groupby('Cohort Label')['Cohort Label ENC'].mean()

Cohort Label
adult_cancer       0.0
adult_other        1.0
pediatric_CMD      2.0
pediatric_other    3.0
Name: Cohort Label ENC, dtype: float64

In [8]:
print(metadata[metadata['Study ID'] == 'ST000422']['Cohort Label'].value_counts())

Cohort Label
adult_cancer    60
Name: count, dtype: int64


In [9]:
metadata.loc[metadata['Study ID'] == 'ST000422','Cohort Label'] = 'adult_other'
metadata.loc[metadata['Study ID'] == 'ST000422','Cohort Label ENC'] = 1

In [118]:
existing_ST000422 = metadata[metadata['Study ID'] == 'ST000422'].copy()
existing_ST000422.dropna(axis=1, how='all', inplace=True)
existing_ST000422.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id
10jan12_26-r001.mzML,550,ST000422,adult_cancer,550,80.226195,Pretrain,Test,False,0,1,10jan12_26-r001.mzML
10jan12_56-r001.mzML,550,ST000422,adult_cancer,550,79.095221,Pretrain,Test,False,0,1,10jan12_56-r001.mzML
10jan12_13-r002.mzML,550,ST000422,adult_cancer,550,78.912806,Pretrain,Test,False,0,1,10jan12_13-r002.mzML
10jan12_81-r002.mzML,550,ST000422,adult_cancer,550,77.891281,Pretrain,Test,False,0,1,10jan12_81-r002.mzML
10jan12_55-r002.mzML,550,ST000422,adult_cancer,550,77.818314,Pretrain,Test,False,0,1,10jan12_55-r002.mzML


In [119]:
metadata_ST000422 = existing_ST000422[['Study ID']].copy()
metadata_ST000422['Cohort Label v0'] = existing_ST000422['Cohort Label'].iloc[0]
metadata_ST000422['Job ID'] = existing_ST000422['Cohort ID'].iloc[0]
metadata_ST000422['is Pediatric'] = existing_ST000422['is Pediatric'].iloc[0]
metadata_ST000422['Cancer Risk'] = False

In [None]:
metadata_ST000422.head()

### Leila Note (Friday May 24th):
So I realized we don't need to know baseline and 3 years for cancer labeling
Because if a patient has "-" value in the cancer detection columns, its label would be non-cancer
and if a patient has "1" in in the cancer detection columns, its label would be "cancer", even the cancer diagnosed later, the earlier time point is very interesting, as it reflect cancer risk

## ST001422, ST001423
- lots of patients have gender metadata
- many have Age, BMI and cancer status


In [109]:
existing_ST001422 = metadata[metadata['Study ID'] == 'ST001422'].copy()
existing_ST001423 = metadata[metadata['Study ID'] == 'ST001423'].copy()


In [10]:
asprin_metadata_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/Aspirin_PR000730/combined_patient_samples.xlsx'

asprin_metadata = pd.read_excel(asprin_metadata_file,sheet_name='combined_patient_samples (2)', index_col=0, header=0)
asprin_metadata = asprin_metadata[~(asprin_metadata['IonizationMode'] == 'C18neg')].copy()

### Compare to paper

2020_Metabolomics Analysis of Aspirin’s Effects in Human Colon Tissue and Associations with Adenoma Risk
In table 1 of the supplements

Total number of patients considered: 100+114+111 = 325 patients

"≥1 Adenoma at Year 3 Colonoscopy" has 33+39+47 = 119 patients

"≥1 Advanced Adenoma at Year 3 Colonoscopy" has 6+8+13 = 27 patients

"High-risk Findings at Year 3 Colonoscopy" has 9+11+16= 36 patients

In [11]:
asprin_patients = asprin_metadata.groupby('PatientID').first()
asprin_patients = asprin_patients[~asprin_patients['BMI'].isna()].copy()
patient_total = asprin_patients.shape[0]
print(f'Number of patients in aspirin study with BMI: {patient_total}')


Tradyn_total = asprin_patients['Tradyn'].value_counts()[1]
print(f'Number of patients in aspirin study with Tradyn: {Tradyn_total}')

Advtradyn_total = asprin_patients['Advtradyn'].value_counts()[1]
print(f'Number of patients in aspirin study with Advtradyn: {Advtradyn_total}')

Hradvtradyn_total = asprin_patients['Hradvtradyn'].value_counts()[1]
print(f'Number of patients in aspirin study with Hradvtradyn: {Hradvtradyn_total}')

Number of patients in aspirin study with BMI: 325
Number of patients in aspirin study with Tradyn: 119
Number of patients in aspirin study with Advtradyn: 27
Number of patients in aspirin study with Hradvtradyn: 36


In [12]:
subset_ST001423 = asprin_metadata[~asprin_metadata['RAW_FILE_NAME_ST001423'].isna()].copy()
subset_ST001423.index = subset_ST001423['RAW_FILE_NAME_ST001423'].apply(lambda x: x+'.mzML')

subset_ST001422 = asprin_metadata[~asprin_metadata['RAW_FILE_NAME_ST001422'].isna()].copy()
subset_ST001422.index = subset_ST001422['RAW_FILE_NAME_ST001422'].apply(lambda x: x+'.mzML')


In [13]:
missing_files_ST001423 = subset_ST001423.index.difference(metadata.index)
missing_files_ST001422 = subset_ST001422.index.difference(metadata.index)

print(f'Number of missing files in ST001423: {len(missing_files_ST001423)}')
print(f'Number of missing files in ST001422: {len(missing_files_ST001422)}')

Number of missing files in ST001423: 86
Number of missing files in ST001422: 8


In [14]:
print('Missing with Sex info', np.sum(~subset_ST001423.loc[missing_files_ST001423,'Sex'].isna()))
print('Missing with Age info', np.sum(~subset_ST001423.loc[missing_files_ST001423,'Age'].isna()))

print('Missing with Sex info', np.sum(~subset_ST001422.loc[missing_files_ST001422,'Sex'].isna()))
print('Missing with Age info', np.sum(~subset_ST001422.loc[missing_files_ST001422,'Age'].isna()))



Missing with Sex info 86
Missing with Age info 46
Missing with Sex info 8
Missing with Age info 2


#### Note 

<span style="color:blue">I think I was too quick to remove outliers from this dataset. 
between these two studies, there are another 94 samples that have Sex
with 48 of those having Age, BMI and cancer information, </span>

Leila says we can add some of these back in, focus more on removing the outliers due to the missing values



In [15]:
Patient_total = (subset_ST001423[~subset_ST001423['BMI'].isna()].shape[0] + subset_ST001422[~subset_ST001422['BMI'].isna()].shape[0])/2
print(f'Patient total: {Patient_total}')

Tradyn_total = (subset_ST001423['Tradyn'].value_counts()[1] + subset_ST001422['Tradyn'].value_counts()[1])/2
print(f'Tradyn total: {Tradyn_total}') #119 

Advtradyn_total = (subset_ST001423['Advtradyn'].value_counts()[1] + subset_ST001422['Advtradyn'].value_counts()[1])/2
print(f'Advtradyn total: {Advtradyn_total}')

Hradvtradyn_total = (subset_ST001423['Hradvtradyn'].value_counts()[1] + subset_ST001422['Hradvtradyn'].value_counts()[1])/2
print(f'Hradvtradyn total: {Hradvtradyn_total}')

Patient total: 293.0
Tradyn total: 109.0
Advtradyn total: 24.0
Hradvtradyn total: 33.0


In [58]:
# we want to add columns for has_cancer, bmi, 
# update the age, update the Sex
# Leila says we can add some of these 


subset_ST001423['Cancer Risk'] = (subset_ST001423['Tradyn']==1) | (subset_ST001423['Advtradyn']==1) | (subset_ST001423['Hradvtradyn']==1)
subset_ST001422['Cancer Risk'] = (subset_ST001422['Tradyn']==1) | (subset_ST001422['Advtradyn']==1) | (subset_ST001422['Hradvtradyn']==1)

In [61]:
subset_ST001423['Diagnosis'] = subset_ST001423['Cancer Risk'].apply(lambda x: 'Colon Cancer Risk' if x else 'Healthy')
subset_ST001422['Diagnosis'] = subset_ST001422['Cancer Risk'].apply(lambda x: 'Colon Cancer Risk' if x else 'Healthy')

In [63]:
subset_ST001423.groupby(['Tradyn','Advtradyn','Hradvtradyn']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PatientID,Treatment,Race,Age,BMI,Sex,Center,IonizationMode,Date Analyzed,Batch,Available data ST001091,Raw File - colon tissue - ST001091,Available data_ST001422,RAW_FILE_NAME_ST001422,Available data_ST001423,RAW_FILE_NAME_ST001423,Cancer Risk,Diagnosis
Tradyn,Advtradyn,Hradvtradyn,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1,1,20,20,20,20,20,20,20,20,20,20,20,20,0,0,20,20,20,20
1,-,1,4,4,4,4,4,4,4,4,4,4,4,4,0,0,4,4,4,4
1,-,-,64,64,64,64,64,64,64,64,64,64,64,64,0,0,64,64,64,64
-,-,-,148,148,148,148,148,148,148,148,148,148,148,148,0,0,148,148,148,148


In [64]:
subset_ST001423['Diagnosis Type'] = ''
subset_ST001423.loc[subset_ST001423['Tradyn']==1,'Diagnosis Type'] = '≥1 Adenoma at Year 3 Colonoscopy'
subset_ST001423.loc[subset_ST001423['Advtradyn']==1,'Diagnosis Type'] = '≥1 Advanced Adenoma at Year 3 Colonoscopy'
subset_ST001423.loc[subset_ST001423['Hradvtradyn']==1,'Diagnosis Type'] = '≥1 High Risk Findings at Year 3 Colonoscopy'
subset_ST001423.loc[(subset_ST001423['Hradvtradyn']==1) & (subset_ST001423['Tradyn']==1),'Diagnosis Type'] = '≥1 High Risk Findings and ≥1 Advanced Adenoma at Year 3 Colonoscopy'

subset_ST001422['Diagnosis Type'] = ''
subset_ST001422.loc[subset_ST001422['Tradyn']==1,'Diagnosis Type'] = '≥1 Adenoma at Year 3 Colonoscopy'
subset_ST001422.loc[subset_ST001422['Advtradyn']==1,'Diagnosis Type'] = '≥1 Advanced Adenoma at Year 3 Colonoscopy'
subset_ST001422.loc[subset_ST001422['Hradvtradyn']==1,'Diagnosis Type'] = '≥1 High Risk Findings at Year 3 Colonoscopy'
subset_ST001422.loc[(subset_ST001422['Hradvtradyn']==1) & (subset_ST001422['Tradyn']==1),'Diagnosis Type'] = '≥1 High Risk Findings and ≥1 Advanced Adenoma at Year 3 Colonoscopy'

In [67]:
metadata_ST001423 = subset_ST001423[['PatientID','Treatment','Race','Age','BMI','Sex','Cancer Risk','Diagnosis','Diagnosis Type']].copy()
metadata_ST001423.index = subset_ST001423.index.to_list()
metadata_ST001423['Study ID'] = 'ST001423'
metadata_ST001423.rename(columns={'PatientID':'Subject ID'}, inplace=True)

metadata_ST001422 = subset_ST001422[['PatientID','Treatment','Race','Age','BMI','Sex','Cancer Risk','Diagnosis','Diagnosis Type']].copy()
metadata_ST001422.index = subset_ST001422.index.to_list()
metadata_ST001422['Study ID'] = 'ST001422' 
metadata_ST001422.rename(columns={'PatientID':'Subject ID'}, inplace=True)

In [110]:
metadata_ST001423['Job ID'] = existing_ST001422['Cohort ID'].iloc[0]
metadata_ST001422['Job ID'] = existing_ST001423['Cohort ID'].iloc[0]

metadata_ST001423['Cohort Label v0'] = existing_ST001423['Cohort Label'].iloc[0]
metadata_ST001422['Cohort Label v0'] = existing_ST001422['Cohort Label'].iloc[0]

metadata_ST001423['is Pediatric'] = existing_ST001423['is Pediatric'].iloc[0]
metadata_ST001422['is Pediatric'] = existing_ST001422['is Pediatric'].iloc[0]

In [None]:
metadata_ST001423.head()

In [111]:
metadata_ST001422.head()

Unnamed: 0,subject ID,Treatment,Race,Age,BMI,Sex,Cancer Risk,Diagnosis,Diagnosis Type,Study ID,Job ID
VT_160123_079.mzML,10203,Aspirin 81 mg,1.0,48.0,30.083829,M,True,Colon Cancer Risk,≥1 High Risk Findings and ≥1 Advanced Adenoma ...,ST001422,526
VT_160123_085.mzML,10203,Aspirin 81 mg,1.0,48.0,30.083829,M,True,Colon Cancer Risk,≥1 High Risk Findings and ≥1 Advanced Adenoma ...,ST001422,526
VT_160213_019.mzML,10023,Aspirin 325 mg,,,,M,False,Healthy,,ST001422,526
VT_160213_055.mzML,10023,Aspirin 325 mg,,,,M,False,Healthy,,ST001422,526
VT_160213_049.mzML,10034,Aspirin 81 mg,,,,M,False,Healthy,,ST001422,526
...,...,...,...,...,...,...,...,...,...,...,...
VT_160201_115.mzML,90738,Aspirin 81 mg,,,,F,False,Healthy,,ST001422,526
VT_160225_025.mzML,90806,Aspirin 81 mg,,,,M,False,Healthy,,ST001422,526
VT_160225_043.mzML,90806,Aspirin 81 mg,,,,M,False,Healthy,,ST001422,526
VT_160225_019.mzML,91111,Placebo,,,,M,False,Healthy,,ST001422,526


## ST000388
- there should already be gender
- we need to add cancer status
- we can add smoking status

- three smoking categories: never, former, current

In [28]:
existing_ST000388 = metadata[metadata['Study ID'] == 'ST000388'].copy()
existing_ST000388.dropna(axis=1, how='all', inplace=True)

In [83]:
new_ST000388_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/Lung-Cancer_ST000388_metadata.tsv'
new_ST000388 = pd.read_csv(new_ST000388_file, sep='\t', index_col=0)

In [84]:
new_ST000388['mzml_file'] = new_ST000388['file name'].apply(lambda x: x+'.mzML')
new_ST000388['Sex'] = new_ST000388['Gender'].map({'Female':'F', 'Male': 'M'})
new_ST000388['subject ID'] = new_ST000388.index

In [89]:
# new_ST000388['Diagnosis'] = new_ST000388['Group'].copy()
new_ST000388['Diagnosis'] = new_ST000388['Group'].map({'Cancer':'Lung Cancer','Benign':'Benign'})
new_ST000388['Cancer Risk'] = new_ST000388['Group'].map({'Cancer':True,'Benign':False})
new_ST000388['Diagnosis Type'] = new_ST000388['Cancer Type'].copy()
new_ST000388['Diagnosis Details'] = new_ST000388['Nodule Classification'].apply(lambda x: f'Nodule Classification: {x}')

In [112]:
metadata_ST000388 = new_ST000388[['subject ID','Sex','Cancer Risk','Diagnosis','Diagnosis Type',\
                                  'Diagnosis Details','Smoking Status','Emphysema/COPD']].copy()
metadata_ST000388.index = new_ST000388['mzml_file'].to_list()
metadata_ST000388['Study ID'] = 'ST000388'
metadata_ST000388.rename(columns={'subject ID':'Subject ID'}, inplace=True)
metadata_ST000388['Job ID'] = existing_ST000388['Cohort ID'].iloc[0]
metadata_ST000388['Cohort Label v0'] = existing_ST000388['Cohort Label'].iloc[0]
metadata_ST000388['is Pediatric'] = existing_ST000388['is Pediatric'].iloc[0]

In [192]:
metadata_ST000388['Smoking Status'].value_counts()

Smoking Status
Former     58
Current    36
Name: count, dtype: int64

In [113]:
metadata_ST000388.head()

Unnamed: 0,subject ID,Sex,Cancer Risk,Diagnosis,Diagnosis Type,Diagnosis Details,Smoking Status,Emphysema/COPD,Study ID,Job ID
LungNodule_HILIC_Pos_23.mzML,SA018129,F,False,Benign,,Nodule Classification: SOLID NODULES ONLY,Current,No,ST000388,581
LungNodule_HILIC_Pos_74.mzML,SA018130,F,False,Benign,,Nodule Classification: SOLID NODULES ONLY,Current,No,ST000388,581
LungNodule_HILIC_Pos_113.mzML,SA018134,F,True,Lung Cancer,adeno stage 2,Nodule Classification: SOLID NODULES ONLY,Current,No,ST000388,581
LungNodule_HILIC_Pos_14.mzML,SA018131,F,True,Lung Cancer,adeno 1a,Nodule Classification: No NODULES,Current,No,ST000388,581
LungNodule_HILIC_Pos_46.mzML,SA018133,F,True,Lung Cancer,adeno stage 2,Nodule Classification: SOLID NODULES ONLY,Current,No,ST000388,581


In [32]:
# json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST000388_AN000624.json'

# with open(json_file) as f:
#     data = json.load(f)
# data['SUBJECT_SAMPLE_FACTORS']

## ST001408
- we need to add gender, all patients are men
- all patients have cancer

In [93]:
existing_ST001408 = metadata[metadata['Study ID'] == 'ST001408'].copy()
existing_ST001408.dropna(axis=1, how='all', inplace=True)
existing_ST001408.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id
P_HA_PL25_B1_045_50510.mzML,522,ST001408,adult_cancer,522,58.847136,Pretrain,Test,False,0,5,P_HA_PL25_B1_045_50510.mzML
P_HA_PL25_B1_055_56908.mzML,522,ST001408,adult_cancer,522,58.482306,Pretrain,Test,False,0,5,P_HA_PL25_B1_055_56908.mzML
P_HA_PL25_B1_097_50161.mzML,522,ST001408,adult_cancer,522,58.40934,Pretrain,Test,False,0,5,P_HA_PL25_B1_097_50161.mzML
P_HA_PL25_B1_067_47200.mzML,522,ST001408,adult_cancer,522,58.299891,Pretrain,Test,False,0,5,P_HA_PL25_B1_067_47200.mzML
P_HA_PL25_B1_041_52819.mzML,522,ST001408,adult_cancer,522,58.080992,Pretrain,Test,False,0,5,P_HA_PL25_B1_041_52819.mzML


In [124]:
metadata_ST001408 = existing_ST001408[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()
metadata_ST001408.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                  'Cohort ID':'Job ID'}, inplace=True)

metadata_ST001408['Sex'] = 'M'
metadata_ST001408['Cancer Risk'] = True

In [125]:
metadata_ST001408.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Sex,Cancer Risk
P_HA_PL25_B1_045_50510.mzML,ST001408,False,adult_cancer,522,M,True
P_HA_PL25_B1_055_56908.mzML,ST001408,False,adult_cancer,522,M,True
P_HA_PL25_B1_097_50161.mzML,ST001408,False,adult_cancer,522,M,True
P_HA_PL25_B1_067_47200.mzML,ST001408,False,adult_cancer,522,M,True
P_HA_PL25_B1_041_52819.mzML,ST001408,False,adult_cancer,522,M,True


## ST001236, ST001237
- all patients have cancer


In [130]:
existing_ST001236 = metadata[metadata['Study ID'] == 'ST001236'].copy()
existing_ST001236.dropna(axis=1, how='all', inplace=True)
existing_ST001236.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,OS,OS_Event,Age,subject ID,study_week,Region,...,Matt Set,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),is Female
0196_Marios_RCC_HP-001300051-2.mzML,541,ST001236,adult_cancer,541,32.233,0.0,45.0,CA209009-13-51,week 4,US/CANADA,...,Other,Pretrain,Test,False,0,3,0196_Marios_RCC_HP-001300051-2.mzML,45.0,45.0,0.0
0019_Marios_RCC_HP-000100118-1.mzML,541,ST001236,adult_cancer,541,5.9,1.0,82.0,CA209009-1-118,baseline,US/CANADA,...,Other,Pretrain,Test,False,0,3,0019_Marios_RCC_HP-000100118-1.mzML,82.0,82.0,0.0
0016_Marios_RCC_HP-000100086-1.mzML,541,ST001236,adult_cancer,541,15.467,1.0,71.0,CA209009-1-86,baseline,US/CANADA,...,Other,Pretrain,Test,False,0,3,0016_Marios_RCC_HP-000100086-1.mzML,71.0,71.0,0.0
0232_Marios_RCC_HP-001500075-3.mzML,541,ST001236,adult_cancer,541,27.367,1.0,70.0,CA209009-15-75,week 9,OTHER,...,Other,Pretrain,Test,False,0,3,0232_Marios_RCC_HP-001500075-3.mzML,70.0,70.0,1.0
0245_Marios_RCC_HP-001500094-3.mzML,541,ST001236,adult_cancer,541,30.4,0.0,41.0,CA209009-15-94,week 9,OTHER,...,Other,Pretrain,Test,False,0,3,0245_Marios_RCC_HP-001500094-3.mzML,41.0,41.0,1.0


In [123]:
existing_ST001237 = metadata[metadata['Study ID'] == 'ST001237'].copy()
existing_ST001237.dropna(axis=1, how='all', inplace=True)
existing_ST001237.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,OS,OS_Event,Age,subject ID,study_week,Region,...,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),IMDC,IMDC ORDINAL,IMDC BINARY,is Female
0582_Marios_PD1_Inhib2_HP-EA003798-7.mzML,541,ST001237,adult_cancer,541,50.562628,0.0,62.0,CA209025-111-657,baseline,REST OF WORLD,...,False,0,4,0582_Marios_PD1_Inhib2_HP-EA003798-7.mzML,62.0,62.0,FAVORABLE,2.0,1.0,1.0
0293_Marios_PD1_Inhib2_HP-E9101785-7.mzML,541,ST001237,adult_cancer,541,24.607803,1.0,59.0,CA209025-33-12,baseline,WESTERN EUROPE,...,False,0,4,0293_Marios_PD1_Inhib2_HP-E9101785-7.mzML,59.0,59.0,INTERMEDIATE,1.0,,0.0
0219_Marios_PD1_Inhib2_HP-EA003799-7.mzML,541,ST001237,adult_cancer,541,52.073922,0.0,66.0,CA209025-111-778,baseline,REST OF WORLD,...,False,0,4,0219_Marios_PD1_Inhib2_HP-EA003799-7.mzML,66.0,66.0,INTERMEDIATE,1.0,,0.0
0962_Marios_PD1_Inhib2_HP-E9784758-7.mzML,541,ST001237,adult_cancer,541,24.246407,1.0,80.0,CA209025-180-1001,baseline,REST OF WORLD,...,False,0,4,0962_Marios_PD1_Inhib2_HP-E9784758-7.mzML,80.0,80.0,POOR,0.0,0.0,1.0
0218_Marios_PD1_Inhib2_HP-E9123067-7.mzML,541,ST001237,adult_cancer,541,3.614,1.0,60.0,CA209025-113-50,baseline,WESTERN EUROPE,...,False,0,4,0218_Marios_PD1_Inhib2_HP-E9123067-7.mzML,60.0,60.0,NOT REPORTED,,,0.0


In [132]:
metadata_ST001236 = existing_ST001236[['Study ID','is Pediatric','Cohort Label','Cohort ID',\
                                       'OS','OS_Event','Age','Sex','subject ID','study_week','Region',\
                                        'PFS','Treatment','Dose (mg/kg)','Race','PFS','PFS_Event',\
                                        'ORR','Benefit','phase']].copy()

metadata_ST001236.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'study_week': 'Timepoint',
                                  'subject ID': 'Subject ID',
                                  'phase': 'Clinical Trial Phase',
                                  'Cohort ID':'Job ID'}, inplace=True)

metadata_ST001236['Cancer Risk'] = True

In [135]:
metadata_ST001236.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,OS,OS_Event,Age,Sex,subject ID,Timepoint,...,PFS,Treatment,Dose (mg/kg),Race,PFS.1,PFS_Event,ORR,Benefit,Clinical Trial Phase,Cancer Risk
0196_Marios_RCC_HP-001300051-2.mzML,ST001236,False,adult_cancer,541,32.233,0.0,45.0,M,CA209009-13-51,week 4,...,,NIVOLUMAB,10.0,WHITE,,,,,RCC1,True
0019_Marios_RCC_HP-000100118-1.mzML,ST001236,False,adult_cancer,541,5.9,1.0,82.0,M,CA209009-1-118,baseline,...,,NIVOLUMAB,0.3,WHITE,,,,,RCC1,True
0016_Marios_RCC_HP-000100086-1.mzML,ST001236,False,adult_cancer,541,15.467,1.0,71.0,M,CA209009-1-86,baseline,...,,NIVOLUMAB,10.0,WHITE,,,,,RCC1,True
0232_Marios_RCC_HP-001500075-3.mzML,ST001236,False,adult_cancer,541,27.367,1.0,70.0,F,CA209009-15-75,week 9,...,,NIVOLUMAB,2.0,WHITE,,,,,RCC1,True
0245_Marios_RCC_HP-001500094-3.mzML,ST001236,False,adult_cancer,541,30.4,0.0,41.0,F,CA209009-15-94,week 9,...,,NIVOLUMAB,10.0,WHITE,,,,,RCC1,True


In [133]:
metadata_ST001237 = existing_ST001237[['Study ID','is Pediatric','Cohort Label','Cohort ID',\
                                       'OS','OS_Event','Age','Sex','subject ID','study_week','Region',\
                                        'IMDC','MSKCC','PFS','Treatment','Dose (mg/kg)','Race','PFS','PFS_Event',\
                                        'ORR','Benefit','Prior_2','phase']].copy()
                                        
metadata_ST001237.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'study_week': 'Timepoint',
                                'subject ID': 'Subject ID',
                                  'phase': 'Clinical Trial Phase',
                                  'Cohort ID':'Job ID'}, inplace=True)

metadata_ST001237['Cancer Risk'] = True

In [134]:
metadata_ST001237.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,OS,OS_Event,Age,Sex,subject ID,Timepoint,...,Treatment,Dose (mg/kg),Race,PFS,PFS_Event,ORR,Benefit,Prior_2,Clinical Trial Phase,Cancer Risk
0582_Marios_PD1_Inhib2_HP-EA003798-7.mzML,ST001237,False,adult_cancer,541,50.562628,0.0,62.0,F,CA209025-111-657,baseline,...,EVEROLIMUS,,WHITE,18.299795,1.0,SD,ICB,True,RCC3,True
0293_Marios_PD1_Inhib2_HP-E9101785-7.mzML,ST001237,False,adult_cancer,541,24.607803,1.0,59.0,M,CA209025-33-12,baseline,...,NIVOLUMAB,3.0,WHITE,1.905544,1.0,PD,NCB,False,RCC3,True
0219_Marios_PD1_Inhib2_HP-EA003799-7.mzML,ST001237,False,adult_cancer,541,52.073922,0.0,66.0,M,CA209025-111-778,baseline,...,NIVOLUMAB,3.0,WHITE,29.470226,1.0,CRPR,CB,False,RCC3,True
0962_Marios_PD1_Inhib2_HP-E9784758-7.mzML,ST001237,False,adult_cancer,541,24.246407,1.0,80.0,F,CA209025-180-1001,baseline,...,EVEROLIMUS,,WHITE,3.712526,1.0,SD,ICB,True,RCC3,True
0218_Marios_PD1_Inhib2_HP-E9123067-7.mzML,ST001237,False,adult_cancer,541,3.614,1.0,60.0,M,CA209025-113-50,baseline,...,NIVOLUMAB,3.0,WHITE,2.759754,1.0,SD,ICB,False,RCC3,True


## ST002244
- all patients are non-cancer

In [136]:
existing_ST002244 = metadata[metadata['Study ID'] == 'ST002244'].copy()
existing_ST002244.dropna(axis=1, how='all', inplace=True)
existing_ST002244.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id
Positive_QC_2.mzML,557,ST002244,adult_other,557,78.657424,Pretrain,Test,False,1,16,Positive_QC_2.mzML
Tech_Plasma_Pos_1.mzML,557,ST002244,adult_other,557,75.11857,Pretrain,Test,False,1,16,Tech_Plasma_Pos_1.mzML
Positive_QC_4.mzML,557,ST002244,adult_other,557,74.75374,Pretrain,Test,False,1,16,Positive_QC_4.mzML
Tech_Plasma_Pos_3.mzML,557,ST002244,adult_other,557,74.425392,Pretrain,Test,False,1,16,Tech_Plasma_Pos_3.mzML
Tech_Plasma_Pos_5.mzML,557,ST002244,adult_other,557,72.309376,Pretrain,Test,False,1,16,Tech_Plasma_Pos_5.mzML


In [137]:
metadata_ST002244 = existing_ST002244[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()

metadata_ST002244.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)

metadata_ST002244['Cancer Risk'] = False

In [138]:
metadata_ST002244.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Cancer Risk
Positive_QC_2.mzML,ST002244,False,adult_other,557,False
Tech_Plasma_Pos_1.mzML,ST002244,False,adult_other,557,False
Positive_QC_4.mzML,ST002244,False,adult_other,557,False
Tech_Plasma_Pos_3.mzML,ST002244,False,adult_other,557,False
Tech_Plasma_Pos_5.mzML,ST002244,False,adult_other,557,False


## ST002112
- we shoud already have gender
- all patients are non-cancer

In [139]:
existing_ST002112 = metadata[metadata['Study ID'] == 'ST002112'].copy()
existing_ST002112.dropna(axis=1, how='all', inplace=True)
existing_ST002112.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,is Female
pHILIC_2_284.mzML,507,ST002112,adult_other,507,F,64.976286,Pretrain,Test,False,1,15,pHILIC_2_284.mzML,1.0
pHILIC_2_245.mzML,507,ST002112,adult_other,507,M,64.027727,Pretrain,Test,False,1,15,pHILIC_2_245.mzML,0.0
pHILIC_2_93.mzML,507,ST002112,adult_other,507,F,62.823787,Pretrain,Test,False,1,15,pHILIC_2_93.mzML,1.0
pHILIC_2_75.mzML,507,ST002112,adult_other,507,F,62.349507,Pretrain,Test,False,1,15,pHILIC_2_75.mzML,1.0
pHILIC_2_50.mzML,507,ST002112,adult_other,507,M,61.510398,Pretrain,Test,False,1,15,pHILIC_2_50.mzML,0.0


In [140]:
metadata_ST002112 = existing_ST002112[['Study ID','is Pediatric','Cohort Label','Cohort ID',\
                                        'Sex']].copy()

metadata_ST002112.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)

metadata_ST002112['Cancer Risk'] = False

In [141]:
metadata_ST002112.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Sex,Cancer Risk
pHILIC_2_284.mzML,ST002112,False,adult_other,507,F,False
pHILIC_2_245.mzML,ST002112,False,adult_other,507,M,False
pHILIC_2_93.mzML,ST002112,False,adult_other,507,F,False
pHILIC_2_75.mzML,ST002112,False,adult_other,507,F,False
pHILIC_2_50.mzML,ST002112,False,adult_other,507,M,False


## ST002027
- add gender info, all patients are women
- all patients are non cancer

In [142]:
existing_ST002027 = metadata[metadata['Study ID'] == 'ST002027'].copy()
existing_ST002027.dropna(axis=1, how='all', inplace=True)
existing_ST002027.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id
McCann_812029070_011_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,79.751915,Pretrain,Test,False,1,14,McCann_812029070_011_MX452646_posHILIC.mzML
McCann_812239150_349_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,79.423568,Pretrain,Test,False,1,14,McCann_812239150_349_MX452646_posHILIC.mzML
McCann_812222070_334_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,79.277636,Pretrain,Test,False,1,14,McCann_812222070_334_MX452646_posHILIC.mzML
McCann_812001070_009_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,78.985772,Pretrain,Test,False,1,14,McCann_812001070_009_MX452646_posHILIC.mzML
McCann_812222000_333_MX452646_posHILIC.mzML,558,ST002027,adult_other,558,78.876323,Pretrain,Test,False,1,14,McCann_812222000_333_MX452646_posHILIC.mzML


In [143]:
metadata_ST002027 = existing_ST002027[['Study ID','is Pediatric','Cohort Label','Cohort ID']].copy()
metadata_ST002027.rename(columns={'Cohort Label':'Cohort Label v0',
                                  'subject ID': 'Subject ID',
                                    'Cohort ID':'Job ID'}, inplace=True)


metadata_ST002027['Sex'] = 'F'
metadata_ST002027['Cancer Risk'] = False


In [144]:
metadata_ST002027.head()

Unnamed: 0,Study ID,is Pediatric,Cohort Label v0,Job ID,Sex,Cancer Risk
McCann_812029070_011_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False
McCann_812239150_349_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False
McCann_812222070_334_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False
McCann_812001070_009_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False
McCann_812222000_333_MX452646_posHILIC.mzML,ST002027,False,adult_other,558,F,False


## ST001918
- add BMI
- we should already have gender info
- can optionally add smoking status
- all patients are non-cancer
- associated paper:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8598381/

- Smoking Status: for this data set "Smoking status defined as having smoked more than 20 cigarettes over the lifetime"
    - 1 is No, which we translate to "Never"
    - 2 is Yes, which we translate to "Current or Former"

In [145]:
existing_ST001918 = metadata[metadata['Study ID'] == 'ST001918'].copy()
existing_ST001918.dropna(axis=1, how='all', inplace=True)
existing_ST001918.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,is Female
VT_170729_M021_179.mzML,559,ST001918,adult_other,559,,89.675301,Pretrain,Test,False,1,11,VT_170729_M021_179.mzML,
VT_170728_M021_103.mzML,559,ST001918,adult_other,559,,87.887632,Pretrain,Test,False,1,11,VT_170728_M021_103.mzML,
VT_170729_M021_153.mzML,559,ST001918,adult_other,559,F,69.35425,Pretrain,Test,False,1,11,VT_170729_M021_153.mzML,1.0
VT_170728_M021_127.mzML,559,ST001918,adult_other,559,F,68.843488,Pretrain,Test,False,1,11,VT_170728_M021_127.mzML,1.0
VT_170729_M021_085.mzML,559,ST001918,adult_other,559,F,68.697556,Pretrain,Test,False,1,11,VT_170729_M021_085.mzML,1.0


In [160]:
existing_ST001918.sort_index()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,is Female
VT_170728_M021_001.mzML,559,ST001918,adult_other,559,,76.760306,Pretrain,Train,False,1,11,VT_170728_M021_001.mzML,
VT_170728_M021_003.mzML,559,ST001918,adult_other,559,,67.603065,Pretrain,Train,False,1,11,VT_170728_M021_003.mzML,
VT_170728_M021_005.mzML,559,ST001918,adult_other,559,,67.457132,Pretrain,Train,False,1,11,VT_170728_M021_005.mzML,
VT_170728_M021_007.mzML,559,ST001918,adult_other,559,,69.901496,Pretrain,Train,False,1,11,VT_170728_M021_007.mzML,
VT_170728_M021_009.mzML,559,ST001918,adult_other,559,,68.296242,Pretrain,Val,False,1,11,VT_170728_M021_009.mzML,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
VT_170729_M021_255.mzML,559,ST001918,adult_other,559,,68.843488,Pretrain,Val,False,1,11,VT_170729_M021_255.mzML,
VT_170729_M021_257.mzML,559,ST001918,adult_other,559,,69.244801,Pretrain,Train,False,1,11,VT_170729_M021_257.mzML,
VT_170729_M021_259.mzML,559,ST001918,adult_other,559,,68.770522,Pretrain,Train,False,1,11,VT_170729_M021_259.mzML,
VT_170729_M021_261.mzML,559,ST001918,adult_other,559,,67.894929,Pretrain,Train,False,1,11,VT_170729_M021_261.mzML,


In [147]:
existing_ST001918.shape

(271, 13)

In [164]:
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST001918_AN003116.json'

with open(json_file) as f:
    data = json.load(f)

# data['SUBJECT_SAMPLE_FACTORS']
# convert dictionary to dataframe

new_ST001918 = extract_subject_factors_dataframe(data)
# new_ST001918 = pd.DataFrame(data['SUBJECT_SAMPLE_FACTORS'])

In [167]:
new_ST001918.head()

Unnamed: 0,Subject ID,Sample ID,Sex,Smoking_status,RAW_FILE_NAME,Batch,Sample_Class,Benzene_Exposure_Category,bmi
0,nist1,nist1_1,,,VT_170728_M021_002,1,NIST1950,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_008,1,Pooled_Plasma,,
2,q3June2014_1b,q3June2014_1b_1,,,VT_170728_M021_014,1,Pooled_Plasma,,
3,CZ.0067.005,CZ.0067.005_1,1.0,2.0,VT_170728_M021_020,1,Study_Sample,1.0,23.183391
4,CZ.0002.004,CZ.0002.004_1,2.0,2.0,VT_170728_M021_026,1,Study_Sample,2.0,22.265625


In [155]:
# new_ST001918['mzml file'] = new_ST001918['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
# new_ST001918.index = new_ST001918['mzml file'].tolist()

In [166]:
new_ST001918_RP_part1 = new_ST001918.copy()
new_ST001918_RP_part2 = increment_the_last_number(new_ST001918.copy(), 'RAW_FILE_NAME', 2)
new_ST001918_RP_part3 = increment_the_last_number(new_ST001918.copy(), 'RAW_FILE_NAME', 4)

new_ST001918_RP = pd.concat([new_ST001918_RP_part1, new_ST001918_RP_part2, new_ST001918_RP_part3], axis=0)
new_ST001918_RP.sort_values('RAW_FILE_NAME').head()

Unnamed: 0,Subject ID,Sample ID,Sex,Smoking_status,RAW_FILE_NAME,Batch,Sample_Class,Benzene_Exposure_Category,bmi
0,nist1,nist1_1,,,VT_170728_M021_002,1,NIST1950,,
0,nist1,nist1_1,,,VT_170728_M021_004,1,NIST1950,,
0,nist1,nist1_1,,,VT_170728_M021_006,1,NIST1950,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_008,1,Pooled_Plasma,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_010,1,Pooled_Plasma,,


In [168]:
new_ST001918_Hilic = increment_the_last_number(new_ST001918_RP.copy(),'RAW_FILE_NAME',-1)

In [170]:
new_ST001918_Hilic.sort_values('RAW_FILE_NAME').head()

Unnamed: 0,Subject ID,Sample ID,Sex,Smoking_status,RAW_FILE_NAME,Batch,Sample_Class,Benzene_Exposure_Category,bmi
0,nist1,nist1_1,,,VT_170728_M021_001,1,NIST1950,,
0,nist1,nist1_1,,,VT_170728_M021_003,1,NIST1950,,
0,nist1,nist1_1,,,VT_170728_M021_005,1,NIST1950,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_007,1,Pooled_Plasma,,
1,q3June2014_1a,q3June2014_1a_1,,,VT_170728_M021_009,1,Pooled_Plasma,,


In [171]:
new_ST001918_Hilic['Column'] = 'Hilic'
new_ST001918_Hilic['Polarity'] = 'Positive'
new_ST001918_RP['Column'] = 'RP'
new_ST001918_RP['Polarity'] = 'Negative'

new_ST001918 = pd.concat([new_ST001918_Hilic, new_ST001918_RP], axis=0)

In [173]:
new_ST001918['mzml file'] = new_ST001918['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001918.index = new_ST001918['mzml file'].tolist()

In [183]:
new_ST001918['Sex'] = new_ST001918['Sex'].map(sex_label_matching_dict)

In [195]:
metadata_ST001918 = new_ST001918[['Column','Polarity','Sex','Subject ID','Sample_Class','Batch','bmi','Benzene_Exposure_Category','Smoking_status']].copy()
metadata_ST001918.rename(columns=
    {'Smoking_status': 'Smoking Status',
    'bmi': 'BMI'}, inplace=True)



In [207]:
metadata_ST001918['Study ID'] = existing_ST001918['Study ID'].iloc[0]
metadata_ST001918['Job ID'] = existing_ST001918['Cohort ID'].iloc[0]
metadata_ST001918['Cohort Label v0'] = 'NA'
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Study_Sample','Cohort Label v0'] = existing_ST001918['Cohort Label'].iloc[0]
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Pooled_Plasma','Cohort Label v0'] = existing_ST001918['Cohort Label'].iloc[0]

metadata_ST001918['is Pediatric'] = 'NA'
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Study_Sample','is Pediatric'] = False
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Pooled_Plasma','is Pediatric'] = False

metadata_ST001918['Cancer Risk'] = 'NA'
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Study_Sample','Cancer Risk'] = False
metadata_ST001918.loc[metadata_ST001918['Sample_Class']=='Pooled_Plasma','Cancer Risk'] = False

metadata_ST001918['Smoking Status'] = metadata_ST001918['Smoking Status'].map({'2': 'Current or Former', '1': 'Never', 2: 'Current or Former', 1: 'Never'})

In [208]:
metadata_ST001918

Unnamed: 0,Column,Polarity,Sex,Subject ID,Sample_Class,Batch,BMI,Benzene_Exposure_Category,Smoking Status,Study ID,Job ID,Cohort Label v0,is Pediatric,Cancer Risk
VT_170728_M021_001.mzML,Hilic,Positive,,nist1,NIST1950,1,,,,ST001918,559,,,
VT_170728_M021_007.mzML,Hilic,Positive,,q3June2014_1a,Pooled_Plasma,1,,,,ST001918,559,adult_other,False,False
VT_170728_M021_013.mzML,Hilic,Positive,,q3June2014_1b,Pooled_Plasma,1,,,,ST001918,559,adult_other,False,False
VT_170728_M021_019.mzML,Hilic,Positive,M,CZ.0067.005,Study_Sample,1,23.183391,1,,ST001918,559,adult_other,False,False
VT_170728_M021_025.mzML,Hilic,Positive,F,CZ.0002.004,Study_Sample,1,22.265625,2,,ST001918,559,adult_other,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VT_170729_M021_240.mzML,RP,Negative,M,CZ.0180.002,Study_Sample,2,22.64737696,-,,ST001918,559,adult_other,False,False
VT_170729_M021_246.mzML,RP,Negative,M,CZ.0072.004,Study_Sample,2,19.71332153,1,,ST001918,559,adult_other,False,False
VT_170729_M021_252.mzML,RP,Negative,,q3June2014_2e,Pooled_Plasma,2,,,,ST001918,559,adult_other,False,False
VT_170729_M021_258.mzML,RP,Negative,,q3June2014_2f,Pooled_Plasma,2,,,,ST001918,559,adult_other,False,False


In [200]:
metadata_ST001918['Smoking Status'] = metadata_ST001918['Smoking Status'].map({'2': 'Current or Former', '1': 'Never', 2: 'Current or Former', 1: 'Never'})

In [201]:
metadata_ST001918['Smoking Status'].value_counts()

Smoking Status
Current or Former    192
Never                156
Name: count, dtype: int64

In [186]:
metadata_ST001918['Sex'].value_counts()

Sex
M    180
F    168
Name: count, dtype: int64

In [187]:
metadata_ST001918['Sample_Class'].value_counts()

Sample_Class
Study_Sample       366
Study_QC_Sample     96
Pooled_Plasma       72
NIST1950            12
Name: count, dtype: int64

In [202]:
metadata_ST001918.head()

Unnamed: 0,Column,Polarity,Sex,Subject ID,Sample_Class,Batch,BMI,Benzene_Exposure_Category,Smoking Status,Study ID,Job ID,Cohort Label v0,is Pediatric,Cancer Risk
VT_170728_M021_001.mzML,Hilic,Positive,,nist1,NIST1950,1,,,,ST001918,559,,,
VT_170728_M021_007.mzML,Hilic,Positive,,q3June2014_1a,Pooled_Plasma,1,,,,ST001918,559,adult_other,0.0,0.0
VT_170728_M021_013.mzML,Hilic,Positive,,q3June2014_1b,Pooled_Plasma,1,,,,ST001918,559,adult_other,0.0,0.0
VT_170728_M021_019.mzML,Hilic,Positive,M,CZ.0067.005,Study_Sample,1,23.183391,1.0,Current or Former,ST001918,559,adult_other,0.0,0.0
VT_170728_M021_025.mzML,Hilic,Positive,F,CZ.0002.004,Study_Sample,1,22.265625,2.0,Current or Former,ST001918,559,adult_other,0.0,0.0


## ST001849
- we should already have age and gender information
- we can add BMI
- we can optionally add smoking status
- we have some cancer labels, but since patients have covid, unclear if we can use all of them


for cancer labels
- cancer without covid : cancer label
- no-cancer w/wo covid: no-cancer label
- cancer with covid: NA label

In [209]:
existing_ST001849 = metadata[metadata['Study ID'] == 'ST001849'].copy()
existing_ST001849.dropna(axis=1, how='all', inplace=True)
existing_ST001849.head()

Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Age,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),is Female
B8_WU350-355_d0_polar_pos.mzML,504,ST001849,adult_other,504,47.9,M,83.035389,Pretrain,Test,False,1,10,B8_WU350-355_d0_polar_pos.mzML,47.9,47.9,0.0
B9_WU350-347_d7_polar_pos.mzML,504,ST001849,adult_other,504,46.1,M,82.925939,Pretrain,Test,False,1,10,B9_WU350-347_d7_polar_pos.mzML,46.1,46.1,0.0
B8_WU350-323_d0_polar_pos.mzML,504,ST001849,adult_other,504,30.4,F,82.81649,Pretrain,Test,False,1,10,B8_WU350-323_d0_polar_pos.mzML,30.4,30.4,1.0
B8_WU350-345_d0_polar_pos.mzML,504,ST001849,adult_other,504,68.2,M,82.743524,Pretrain,Test,False,1,10,B8_WU350-345_d0_polar_pos.mzML,68.2,68.2,0.0
B8_WU350-311_d0_polar_pos.mzML,504,ST001849,adult_other,504,76.8,M,82.196279,Pretrain,Test,False,1,10,B8_WU350-311_d0_polar_pos.mzML,76.8,76.8,0.0


In [216]:
existing_ST001849.shape

(691, 16)

In [210]:
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST001849_AN002993.json'

with open(json_file) as f:
    data = json.load(f)

new_ST001849 = extract_subject_factors_dataframe(data)    

In [220]:
new_ST001849

Unnamed: 0,Subject ID,Sample ID,batch,WU day of presentation,SARS-CoV-2 Positive,Admitted to the ICU,REMDESIVIR on,DEXAMETHOSONE on,RAW_FILE_NAME,Days post symptom onset,...,MIP1a,GMCSF,MCP1,IL15,HGF,VEGF,IL1Ra,IL2R,MIG,IL8
0,WU350-013,B1_WU350-013_d0,1,d0,Yes,Yes,-,-,B1_WU350-013_d0_polar_pos.mzML B1_WU350-013_d0...,11,...,10.0981817,10.04841148,1303.844512,22.82324687,726.5621765,6.876694719,1048.239677,124.3491115,84.900486,25.72906886
1,WU350-014,B1_WU350-014_d0,1,d0,Yes,Yes,-,-,B1_WU350-014_d0_polar_pos.mzML B1_WU350-014_d0...,3,...,27.25334281,1.497099241,505.525744,61.77101181,66.59597601,0.975492592,54.78235533,35.01993895,43.56103392,26.27460047
2,WU350-021,B1_WU350-021_d0,1,d0,Yes,Yes,-,-,B1_WU350-021_d0_polar_pos.mzML B1_WU350-021_d0...,15,...,17.92951592,1.306346239,318.8497179,82.76911121,1851.039214,5.260831608,1464.274665,71.55218583,147.4940306,20.04562796
3,WU350-029,B1_WU350-029_d0,1,d0,Yes,Yes,-,-,B1_WU350-029_d0_polar_pos.mzML B1_WU350-029_d0...,2,...,12.36174818,18.83099743,570.1185771,52.50121538,119.1798849,1.87128378,62.29341051,93.5288825,37.41580246,17.5441893
4,WU350-031,B1_WU350-031_d0,1,d0,Yes,Yes,-,-,B1_WU350-031_d0_polar_pos.mzML B1_WU350-031_d0...,2,...,1.00684193,0.950151926,803.1537554,<8.3,218.3478923,0.466090286,68.23701628,107.2469566,45.53312978,47.66419023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,WU350-376,B9_WU350-376_d7,9,d7,Yes,Yes,-,1,B9_WU350-376_d7_polar_pos.mzML B9_WU350-376_d7...,10,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
696,WU350-377,B9_WU350-377_d7,9,d7,Yes,Yes,-,1,B9_WU350-377_d7_polar_pos.mzML B9_WU350-377_d7...,10,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
697,WU350-393,B9_WU350-393_d7,9,d7,Yes,Yes,-,1,B9_WU350-393_d7_polar_pos.mzML B9_WU350-393_d7...,-,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
698,WU350-398,B9_WU350-398_d7,9,d7,Yes,Yes,-,1,B9_WU350-398_d7_polar_pos.mzML B9_WU350-398_d7...,11,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [223]:
new_ST001849_polar_pos = new_ST001849.copy()
new_ST001849_polar_pos['RAW_FILE_NAME'] = new_ST001849_polar_pos['RAW_FILE_NAME'].apply(lambda x: x.split(' ')[0])
# new_ST001849_polar_pos['mzml file'] = new_ST001849_polar_pos['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001849_polar_pos['Column'] = 'Hilic'
new_ST001849_polar_pos['Polarity'] = 'Positive'
new_ST001849_polar_pos.index = new_ST001849_polar_pos['RAW_FILE_NAME'].tolist()

new_ST001849_polar_neg = new_ST001849.copy()
new_ST001849_polar_neg['RAW_FILE_NAME'] = new_ST001849_polar_neg['RAW_FILE_NAME'].apply(lambda x: x.split(' ')[1])
# new_ST001849_polar_neg['mzml file'] = new_ST001849_polar_neg['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001849_polar_neg['Column'] = 'Hilic'
new_ST001849_polar_neg['Polarity'] = 'Negative'
new_ST001849_polar_neg.index = new_ST001849_polar_neg['RAW_FILE_NAME'].tolist()


new_ST001849_lipid_pos = new_ST001849.copy()
new_ST001849_lipid_pos['RAW_FILE_NAME'] = new_ST001849_lipid_pos['RAW_FILE_NAME'].apply(lambda x: x.split(' ')[2])
# new_ST001849_lipid_pos['mzml file'] = new_ST001849_lipid_pos['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001849_lipid_pos['Column'] = 'RP'
new_ST001849_lipid_pos['Polarity'] = 'Positive'
new_ST001849_lipid_pos.index = new_ST001849_lipid_pos['RAW_FILE_NAME'].tolist()


new_ST001849_lipid_neg = new_ST001849.copy()
new_ST001849_lipid_neg['RAW_FILE_NAME'] = new_ST001849_lipid_neg['RAW_FILE_NAME'].apply(lambda x: x.split(' ')[3])
# new_ST001849_lipid_neg['mzml file'] = new_ST001849_lipid_neg['RAW_FILE_NAME'].apply(lambda x: x+'.mzML')
new_ST001849_lipid_neg['Column'] = 'RP'
new_ST001849_lipid_neg['Polarity'] = 'Negative'
new_ST001849_lipid_neg.index = new_ST001849_lipid_neg['RAW_FILE_NAME'].tolist()


new_ST001849_combined = pd.concat([new_ST001849_polar_pos, new_ST001849_polar_neg, new_ST001849_lipid_pos, new_ST001849_lipid_neg], axis=0)

In [226]:
new_ST001849_combined['Sex'] = new_ST001849_combined['Sex'].map(sex_label_matching_dict)

In [228]:
new_ST001849_combined[['SARS-CoV-2 Positive', 'Diabetes','Acute renal failure','Chronic kidney disease','Cancer','Age','Smoker']]

Unnamed: 0,SARS-CoV-2 Positive,Diabetes,Acute renal failure,Chronic kidney disease,Cancer,Age,Smoker
B1_WU350-013_d0_polar_pos.mzML,Yes,-,1,1,-,80.8,-1
B1_WU350-014_d0_polar_pos.mzML,Yes,1,1,-,-,66.9,-1
B1_WU350-021_d0_polar_pos.mzML,Yes,1,1,-,-,68,-1
B1_WU350-029_d0_polar_pos.mzML,Yes,1,1,1,-,88.3,-
B1_WU350-031_d0_polar_pos.mzML,Yes,1,1,1,-,88.3,-
...,...,...,...,...,...,...,...
B9_WU350-376_d7_lipid_neg.mzML,Yes,1,-,1,-,70.3,1
B9_WU350-377_d7_lipid_neg.mzML,Yes,1,-,-,-,63.9,-1
B9_WU350-393_d7_lipid_neg.mzML,Yes,1,1,-,-,83.5,-
B9_WU350-398_d7_lipid_neg.mzML,Yes,-,-,-,-,53.7,-1


In [218]:
new_ST001849['RAW_FILE_NAME'].iloc[0]

'B1_WU350-013_d0_polar_pos.mzML B1_WU350-013_d0_polar_neg.mzML B1_WU350-013_d0_lipid_pos.mzML B1_WU350-013_d0_lipid_neg.mzML'

In [225]:
new_ST001849['Sex'].value_counts()

Sex
Male      405
Female    295
Name: count, dtype: int64

In [212]:
new_ST001849.columns.tolist()

['Subject ID',
 'Sample ID',
 'batch',
 'WU day of presentation',
 'SARS-CoV-2 Positive',
 'Admitted to the ICU',
 'REMDESIVIR on',
 'DEXAMETHOSONE on',
 'RAW_FILE_NAME',
 'Days post symptom onset',
 'Sex',
 'BMI',
 'Age at Symptom onset (years)',
 'Hospitalized?',
 'Time from symptom onset to ICU (days)',
 'Subject on ventilation at any point after d0',
 'Mortality Status',
 'Symptom Fever',
 'Symptom Headache',
 'Symptom Cough',
 'Symptom Shortness of breath',
 'Symptom: Sore throat',
 'Asymptomatic',
 'Death due to COVID-19?',
 '30 day mortality',
 '60 day mortality',
 '90 day mortality',
 'CRP',
 'D-dimer',
 'Neutrophil %',
 'CO2',
 'Acute respiratory failure',
 'Diabetes',
 'Acute renal failure',
 'Chronic kidney disease',
 'Cancer',
 'Age',
 'Smoker',
 'High/Low arterial pH',
 'Neutrophil absolute',
 'Lymphocyte absolute',
 'Lymphocyte %',
 'IL1b',
 'IL10',
 'IL6',
 'RANTES',
 'MIP1a',
 'GMCSF',
 'MCP1',
 'IL15',
 'HGF',
 'VEGF',
 'IL1Ra',
 'IL2R',
 'MIG',
 'IL8']

In [None]:
metadata_ST001849

In [214]:
new_ST001849.shape

(700, 56)

In [215]:
new_ST001849['WU day of presentation'].value_counts()

WU day of presentation
d0     322
d3     164
d7     110
d14     54
d28     31
d84     19
Name: count, dtype: int64

In [213]:
new_ST001849['Subject ID'].nunique()

339

## ST001519
- we should already have age and gender labels
- add BMI
- all patients are non-cancer

In [229]:
existing_ST001519 = metadata[metadata['Study ID'] == 'ST001519'].copy()
existing_ST001519.dropna(axis=1, how='all', inplace=True)
existing_ST001519.head()


Unnamed: 0,cohort_id,Study ID,Cohort Label,Cohort ID,Age,Sex,MV,Set,Pretrain,is Pediatric,Cohort Label ENC,Study ID ENC,file id,Age Range (min),Age Range (max),is Female
HP_mzml/Plasma/0133_WUG_FARMM_HIL-9009-2-PC.mzML,605,ST001519,adult_other,605,23.0,M,41.116381,Pretrain,Test,False,1,9,HP_mzml/Plasma/0133_WUG_FARMM_HIL-9009-2-PC.mzML,23.0,23.0,0.0
HP_mzml/Plasma/0088_WUG_FARMM_HIL-9038-2-PC.mzML,605,ST001519,adult_other,605,59.0,F,40.788034,Pretrain,Test,False,1,9,HP_mzml/Plasma/0088_WUG_FARMM_HIL-9038-2-PC.mzML,59.0,59.0,1.0
HP_mzml/Plasma/0119_WUG_FARMM_HIL-9040-1-PC.mzML,605,ST001519,adult_other,605,22.0,M,40.642101,Pretrain,Test,False,1,9,HP_mzml/Plasma/0119_WUG_FARMM_HIL-9040-1-PC.mzML,22.0,22.0,0.0
HP_mzml/Plasma/0127_WUG_FARMM_HIL-9029-2-PD.mzML,605,ST001519,adult_other,605,26.0,M,40.423203,Pretrain,Test,False,1,9,HP_mzml/Plasma/0127_WUG_FARMM_HIL-9029-2-PD.mzML,26.0,26.0,0.0
HP_mzml/Plasma/0074_WUG_FARMM_HIL-9013-2-PB.mzML,605,ST001519,adult_other,605,40.0,M,40.423203,Pretrain,Test,False,1,9,HP_mzml/Plasma/0074_WUG_FARMM_HIL-9013-2-PB.mzML,40.0,40.0,0.0


In [230]:
# All of these are Plasma Samples
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST001521_AN002533.json'

with open(json_file) as f:
    data = json.load(f)

new_ST001519 = extract_subject_factors_dataframe(data)

In [261]:
new_ST001519['RAW_FILE_NAME'].iloc[0]
new_ST001519['Sex'] = new_ST001519['Sex'].map(sex_label_matching_dict)

In [262]:
# All of these are Plasma Samples

new_ST001519_combined = None
col_id_list = ['Hilic','RP','Hilic','RP']
pol_id_list = ['Positive','Positive','Negative','Negative']
exp_desc_list = ['HP','CP','HN','CN']
for i in range(4):
    print(i)
    exp_desc = exp_desc_list[i]
    new_ST001519_part = new_ST001519.copy()
    # new_ST001519_part['RAW_FILE_NAME']  = new_ST001519_part['RAW_FILE_NAME'].apply(lambda x: x.split(';')[i])
    new_ST001519_part['RAW_FILE'] = ''
    for ii, x in enumerate(new_ST001519_part['RAW_FILE_NAME'].tolist()):
        x_splits = x.split(';')
        if len(x_splits) > 2:
            x_new = x_splits[i]
            new_ST001519_part.iloc[ii,-1] = x_new

    new_ST001519_part['mzml file'] = new_ST001519_part['RAW_FILE'].apply(lambda x: x.replace('.raw','.mzML'))
    new_ST001519_part['mzml path'] = new_ST001519_part['mzml file'].apply(lambda x: f'{exp_desc}_mzml/Plasma/'+x)
    new_ST001519_part['Column'] = col_id_list[i]
    new_ST001519_part['Polarity'] = pol_id_list[i]
    new_ST001519_part = new_ST001519_part[~(new_ST001519_part['mzml file'] == 'NA')]
    new_ST001519_part.index = new_ST001519_part['mzml file']

    if new_ST001519_combined is None:
        new_ST001519_combined = new_ST001519_part.copy()
    else:
        new_ST001519_combined = pd.concat([new_ST001519_combined,new_ST001519_part], axis=0)

0
1
2
3


In [256]:
new_ST001519_part['Type'].value_counts()

Type
Plasma    157
Name: count, dtype: int64

In [258]:
new_ST001519_combined

Unnamed: 0_level_0,Subject ID,Sample ID,Study_Diet,Age,Sex,Race,Time,Type,BMI,Ethnicity,RAW_FILE_NAME,RAW_FILE,mzml file,mzml path,Column,Polarity
mzml file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0078_WUG_FARMM_HIL-9002-3-PA.mzML,9002,9002-3-PA,Vegan,20,Male,White,Day 15,Plasma,21.3,Not Hispanic or Latino,0078_WUG_FARMM_HIL-9002-3-PA.raw;0078_WUG_FARM...,0078_WUG_FARMM_HIL-9002-3-PA.raw,0078_WUG_FARMM_HIL-9002-3-PA.mzML,HP_mzml/Plasma/0078_WUG_FARMM_HIL-9002-3-PA.mzML,Hilic,Positive
0076_WUG_FARMM_HIL-9002-3-PB.mzML,9002,9002-3-PB,Vegan,20,Male,White,Day 12,Plasma,21.3,Not Hispanic or Latino,0076_WUG_FARMM_HIL-9002-3-PB.raw;0076_WUG_FARM...,0076_WUG_FARMM_HIL-9002-3-PB.raw,0076_WUG_FARMM_HIL-9002-3-PB.mzML,HP_mzml/Plasma/0076_WUG_FARMM_HIL-9002-3-PB.mzML,Hilic,Positive
0079_WUG_FARMM_HIL-9002-3-PC.mzML,9002,9002-3-PC,Vegan,20,Male,White,Day 9,Plasma,21.3,Not Hispanic or Latino,0079_WUG_FARMM_HIL-9002-3-PC.raw;0079_WUG_FARM...,0079_WUG_FARMM_HIL-9002-3-PC.raw,0079_WUG_FARMM_HIL-9002-3-PC.mzML,HP_mzml/Plasma/0079_WUG_FARMM_HIL-9002-3-PC.mzML,Hilic,Positive
0080_WUG_FARMM_HIL-9002-3-PD.mzML,9002,9002-3-PD,Vegan,20,Male,White,Day 5,Plasma,21.3,Not Hispanic or Latino,0080_WUG_FARMM_HIL-9002-3-PD.raw;0080_WUG_FARM...,0080_WUG_FARMM_HIL-9002-3-PD.raw,0080_WUG_FARMM_HIL-9002-3-PD.mzML,HP_mzml/Plasma/0080_WUG_FARMM_HIL-9002-3-PD.mzML,Hilic,Positive
0077_WUG_FARMM_HIL-9002-3-PE.mzML,9002,9002-3-PE,Vegan,20,Male,White,Baseline,Plasma,21.3,Not Hispanic or Latino,0077_WUG_FARMM_HIL-9002-3-PE.raw;0077_WUG_FARM...,0077_WUG_FARMM_HIL-9002-3-PE.raw,0077_WUG_FARMM_HIL-9002-3-PE.mzML,HP_mzml/Plasma/0077_WUG_FARMM_HIL-9002-3-PE.mzML,Hilic,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0060a_WUG_FARMM_FFA-QCPP04.mzML,QC-pooled_plasma,QPP04,,,,,,Plasma,,,0060a_WUG_FARMM_HIL-QPP04.raw;0060a_WUG_FARMM_...,0060a_WUG_FARMM_FFA-QCPP04.raw,0060a_WUG_FARMM_FFA-QCPP04.mzML,CN_mzml/Plasma/0060a_WUG_FARMM_FFA-QCPP04.mzML,RP,Negative
0080a_WUG_FARMM_FFA-QCPP05.mzML,QC-pooled_plasma,QPP05,,,,,,Plasma,,,0080a_WUG_FARMM_HIL-QPP05.raw;0080a_WUG_FARMM_...,0080a_WUG_FARMM_FFA-QCPP05.raw,0080a_WUG_FARMM_FFA-QCPP05.mzML,CN_mzml/Plasma/0080a_WUG_FARMM_FFA-QCPP05.mzML,RP,Negative
0100a_WUG_FARMM_FFA-QCPP06.mzML,QC-pooled_plasma,QPP06,,,,,,Plasma,,,0100a_WUG_FARMM_HIL-QPP06.raw;0100a_WUG_FARMM_...,0100a_WUG_FARMM_FFA-QCPP06.raw,0100a_WUG_FARMM_FFA-QCPP06.mzML,CN_mzml/Plasma/0100a_WUG_FARMM_FFA-QCPP06.mzML,RP,Negative
0120a_WUG_FARMM_FFA-QCPP07.mzML,QC-pooled_plasma,QPP07,,,,,,Plasma,,,0120a_WUG_FARMM_HIL-QPP07.raw;NA;0100a_WU_FARM...,0120a_WUG_FARMM_FFA-QCPP07.raw,0120a_WUG_FARMM_FFA-QCPP07.mzML,CN_mzml/Plasma/0120a_WUG_FARMM_FFA-QCPP07.mzML,RP,Negative


In [260]:
new_ST001519_combined['Cancer Risk'] = 'NA'
new_ST001519_combined.loc[~(new_ST001519_combined['Subject ID'].str.contains('QC')),'Cancer Risk'] = False

In [None]:
# metadata_ST001519 = new_ST001519_combined[['Column','Polarity', 'Age','Sex'

## ST001932
- all patients are non-cancer

In [None]:
existing_

## ST001428
- all patients are non cancer

## ST000909
- all patients are non cancer

## ST002331
- all patients are non cancer
- we should already have gender and age information

## ST002251
- all patients are non cancer
-  we should already have age and gender information
-  add BMI data

In [263]:
json_file = '/Users/jonaheaton/ReviveMed Dropbox/Jonah Eaton/mzLearn_preTraining/source_metadata/JSON metadata/ST002251_AN003677.json'

with open(json_file) as f:
    data = json.load(f)

new_ST002251 = extract_subject_factors_dataframe(data)    

In [264]:
new_ST002251

Unnamed: 0,Subject ID,Sample ID,ThreeorMoreBursts,ICShighdose,Study,AgeatEnrollment,AsthmaDurationYears,Sex,Ethnicity,Race,Height,Weight,BMI,LABA,Montelukast,Batch,RAW_FILE_NAME
0,Blank,20200715_001_Blank-pos,,,,,,,,,,,,,,StartBatch,20200715_001_Blank-pos.raw
1,Blank,20200715_002_Blank-neg,,,,,,,,,,,,,,StartBatch,20200715_002_Blank-neg.raw
2,SRM1950,20200715_003_srm1950,,,,,,,,,,,,,,StartBatch,20200715_003_srm1950.raw
3,SRM1950,20200715_004_srm1950,,,,,,,,,,,,,,StartBatch,20200715_004_srm1950.raw
4,QC,20200715_005_qc,,,,,,,,,,,,,,StartBatch,20200715_005_qc.raw
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
541,FIT01-171,FIT01-171-neg,Yes,Yes,Dietary SAA,14,13.5,M,Not Hispanic,Black,172,111.1,37.55,Yes,Yes,Batch11,20200725_040_FIT01-171-neg.raw
542,FIT01-008,FIT01-008-pos,No,No,SARP3,9.25,7.5,M,Not Hispanic,Black,130,34.9,20.65,No,No,Batch11,20200725_041_FIT01-008-pos.raw
543,FIT01-008,FIT01-008-neg,No,No,SARP3,9.25,7.5,M,Not Hispanic,Black,130,34.9,20.65,No,No,Batch11,20200725_042_FIT01-008-neg.raw
544,QC,20200725_043_QC-pos,,,,,,,,,,,,,,Batch11,20200725_043_QC-pos.raw


## ST001931
- all patients are non cancer