In [1]:
import os
import re

def parse_spectrum_file(file_path):
    """Build a positive-mode MSP record from the ``energy1`` block of one log file.

    The file is scanned line by line (each line stripped first). Lines
    containing ``#ID=``, ``#SMILES=``, ``#InChiKey=``, ``#Formula=`` or
    ``#PMass=`` supply the metadata; a line beginning with ``energy<digits>``
    opens a new spectrum section; digit-led lines seen while the ``energy1``
    section is open are read as ``m/z intensity`` pairs. Scanning stops at the
    first empty line met while the ``energy2`` section is open.

    Args:
        file_path: Path of the log file to read.

    Returns:
        The MSP record as one newline-joined string (ending with a newline),
        or ``None`` when any metadata field is missing/empty or when no
        ``energy1`` peak was collected.
    """
    # Tag -> metadata key. Order matters: the first tag found in a line wins.
    header_tags = (
        ("#ID=", "name"),
        ("#SMILES=", "smiles"),
        ("#InChiKey=", "inchikey"),
        ("#Formula=", "formula"),
        ("#PMass=", "pmass"),
    )
    metadata = dict.fromkeys(key for _, key in header_tags)
    peak_rows = []
    section = None

    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()

    for raw in raw_lines:
        text = raw.strip()

        hit = next((pair for pair in header_tags if pair[0] in text), None)
        if hit is not None:
            tag, key = hit
            metadata[key] = text.split(tag)[1]
            continue

        if re.match(r'^energy\d+', text):
            section = text
            continue

        if re.match(r'^[0-9]', text) and section == "energy1":
            tokens = text.split()
            if len(tokens) >= 2:
                # Intensity is scaled by 10 and truncated to an integer.
                peak_rows.append((tokens[0], int(float(tokens[1]) * 10)))
            continue

        if section == "energy2" and text == "":
            # An empty line after the energy2 section ends the spectra part.
            break

    # Every metadata field is required, and so is at least one peak.
    if not all(metadata.values()):
        return None
    if not peak_rows:
        return None

    record = [
        f"NAME: {metadata['name']}",
        f"PRECURSORMZ: {metadata['pmass']}",
        "PRECURSORTYPE: [M+H]+",
        "IONMODE: Positive",
        "RETENTIONTIME: NA",
        "CCS: NA",
        f"FORMULA: {metadata['formula']}",
        "ONTOLOGY: NA",
        f"SMILES: {metadata['smiles']}",
        f"INCHIKEY: {metadata['inchikey']}",
        "INSTRUMENTTYPE: CFM-ID",
        "COLLISIONENERGY: 20 eV",
        "COMMENT: Generated from CFM-ID prediction",
        f"Num Peaks: {len(peak_rows)}",
    ]
    record.extend(f"{mz}\t{intensity}" for mz, intensity in peak_rows)
    record.append("")  # trailing empty item -> record ends with a newline
    return "\n".join(record)

def compile_msp(directory_path, output_file):
    """Collect MSP records from every ``.log`` file in a folder into one file.

    Each file whose name ends with ``.log`` is announced on stdout and handed
    to ``parse_spectrum_file``; files for which that returns a falsy value are
    reported and skipped.

    Args:
        directory_path: Folder to scan (non-recursive).
        output_file: Path of the MSP file to write. It is only created when at
            least one record was produced.
    """
    collected = []
    for entry_name in os.listdir(directory_path):
        if not entry_name.endswith(".log"):
            continue
        log_path = os.path.join(directory_path, entry_name)
        print(f"Reading file: {log_path}")
        record = parse_spectrum_file(log_path)
        if not record:
            print("No valid data found in file.")
            continue
        collected.append(record)

    # Nothing parsed: report it and leave the output file untouched.
    if not collected:
        print("No valid entries found to write to the output file.")
        return

    with open(output_file, 'w') as sink:
        sink.write("\n".join(collected))

In [2]:
import os
import re

def parse_spectrum_file(file_path, celevel):
    """Build a positive-mode MSP record from one energy section of a log file.

    The file is scanned line by line (each line stripped first). Lines
    containing ``#ID=``, ``#SMILES=``, ``#InChiKey=``, ``#Formula=`` or
    ``#PMass=`` supply the metadata; a line beginning with ``energy<digits>``
    opens a new spectrum section; digit-led lines seen while the section named
    by ``celevel`` is open are read as ``m/z intensity`` pairs. Scanning stops
    at the first empty line met while the ``energy2`` section is open.

    Args:
        file_path: Path of the log file to read.
        celevel: Section to extract: ``'energy0'``, ``'energy1'`` or
            ``'energy2'`` (labelled 10eV, 20eV and 40eV in the record).

    Returns:
        The MSP record as one newline-joined string (ending with a newline),
        or ``None`` when any metadata field is missing/empty or when the
        requested section yielded no peaks.

    Raises:
        ValueError: If ``celevel`` is not one of the supported sections, or if
            a peak intensity in the requested section is not numeric.
    """
    # Validate up front. Previously an unknown level left `collision_energy`
    # unbound, so the call either silently returned None for every file or
    # failed with UnboundLocalError while formatting the record.
    collision_energies = {'energy0': '10eV', 'energy1': '20eV', 'energy2': '40eV'}
    if celevel not in collision_energies:
        raise ValueError(
            f"Unsupported celevel {celevel!r}; expected one of {sorted(collision_energies)}"
        )
    collision_energy = collision_energies[celevel]

    ion_mode = "Positive"
    precursor_type = "[M+H]+"
    instrument_type = "CFM-ID"
    retention_time = "NA"
    ccs = "NA"

    name = smiles = inchikey = formula = pmass = None
    peaks = []
    current_energy = None

    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if "#ID=" in line:
                name = line.split("#ID=")[1]
            elif "#SMILES=" in line:
                smiles = line.split("#SMILES=")[1]
            elif "#InChiKey=" in line:
                inchikey = line.split("#InChiKey=")[1]
            elif "#Formula=" in line:
                formula = line.split("#Formula=")[1]
            elif "#PMass=" in line:
                pmass = line.split("#PMass=")[1]
            elif re.match(r'^energy\d+', line):
                current_energy = line
            elif re.match(r'^[0-9]', line) and current_energy == celevel:
                peak_info = line.split()
                if len(peak_info) >= 2:
                    # Intensity is scaled by 10 and truncated to an integer.
                    peaks.append((peak_info[0], int(float(peak_info[1]) * 10)))
            elif current_energy == "energy2" and line == "":
                # An empty line after the energy2 section ends the spectra
                # part; stopping here keeps later digit-led lines out of the
                # peak list.
                break

    # Every metadata field is required, and so is at least one peak.
    if not (name and pmass and formula and smiles and inchikey):
        return None
    if not peaks:
        return None

    msp_entry = [
        f"NAME: {name}",
        f"PRECURSORMZ: {pmass}",
        f"PRECURSORTYPE: {precursor_type}",
        f"IONMODE: {ion_mode}",
        f"RETENTIONTIME: {retention_time}",
        f"CCS: {ccs}",
        f"FORMULA: {formula}",
        "ONTOLOGY: NA",
        f"SMILES: {smiles}",
        f"INCHIKEY: {inchikey}",
        f"INSTRUMENTTYPE: {instrument_type}",
        f"COLLISIONENERGY: {collision_energy}",
        "COMMENT: Generated from CFM-ID prediction",
        f"Num Peaks: {len(peaks)}",
    ]
    # Explicit "\t" instead of an invisible literal tab inside the string.
    msp_entry.extend(f"{mz}\t{intensity}" for mz, intensity in peaks)
    msp_entry.append("")  # trailing empty item -> record ends with a newline
    return "\n".join(msp_entry)

def compile_msp(directory_path, output_file, celevel):
    """Collect the ``celevel`` spectra of every ``.log`` file into one MSP file.

    Each file whose name ends with ``.log`` is announced on stdout and handed
    to ``parse_spectrum_file(file_path, celevel)``; files for which that
    returns a falsy value are reported and skipped. The library is written to
    ``<output_file without a trailing '.msp'>_<celevel>.msp``.

    Args:
        directory_path: Folder to scan (non-recursive).
        output_file: Base path of the MSP file; the energy level is inserted
            before the ``.msp`` extension.
        celevel: Energy section to extract, passed through to the parser.
    """
    msp_entries = []
    # os.listdir returns entries in arbitrary order; sort so the library
    # content is reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".log"):
            continue
        file_path = os.path.join(directory_path, filename)
        print(f"Reading file: {file_path}")
        msp_entry = parse_spectrum_file(file_path, celevel)
        if msp_entry:
            msp_entries.append(msp_entry)
        else:
            print("No valid data found in file.")

    # Tag the output name with the energy level. Only a trailing ".msp" is
    # removed: splitting on the first ".msp" would truncate the path when a
    # directory or earlier part of the name also contains ".msp".
    base = output_file.strip()
    if base.endswith(".msp"):
        base = base[:-len(".msp")]
    new_output_file = base + '_' + celevel + '.msp'

    if msp_entries:
        with open(new_output_file, 'w') as f:
            f.write("\n".join(msp_entries))
    else:
        print("No valid entries found to write to the output file.")

In [None]:
# Define your folder path and output file path.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running on another machine.
directory_path = "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/CFM_prediction/pos_total_out"
output_file = os.path.join(directory_path, "compiled_spectrum_library_pos.msp")

# Compile the MSP file, once per energy section (energy0, energy1, energy2).
# NOTE(review): these calls need a compile_msp that accepts `celevel`; several
# cells define compile_msp with different signatures, so confirm which
# definition is live in the kernel before running this cell.
compile_msp(directory_path, output_file, celevel = 'energy0')
compile_msp(directory_path, output_file, celevel = 'energy1')
compile_msp(directory_path, output_file, celevel = 'energy2')

In [3]:
# directory_path = "D:/UCSF_postdoc_topic/ECHO_project/suspect_screeing_plastic_related_compounds/cfmid_spectra_match/myout"
# output_file = os.path.join(directory_path, "compiled_spectrum_library_pos_urine.msp")
# compile_msp(directory_path, output_file)

Reading file: D:/UCSF_postdoc_topic/ECHO_project/suspect_screeing_plastic_related_compounds/cfmid_spectra_match/myout\Molecule1.log
Reading file: D:/UCSF_postdoc_topic/ECHO_project/suspect_screeing_plastic_related_compounds/cfmid_spectra_match/myout\Molecule10.log
Reading file: D:/UCSF_postdoc_topic/ECHO_project/suspect_screeing_plastic_related_compounds/cfmid_spectra_match/myout\Molecule100.log
Reading file: D:/UCSF_postdoc_topic/ECHO_project/suspect_screeing_plastic_related_compounds/cfmid_spectra_match/myout\Molecule1000.log
Reading file: D:/UCSF_postdoc_topic/ECHO_project/suspect_screeing_plastic_related_compounds/cfmid_spectra_match/myout\Molecule1001.log
Reading file: D:/UCSF_postdoc_topic/ECHO_project/suspect_screeing_plastic_related_compounds/cfmid_spectra_match/myout\Molecule1002.log
Reading file: D:/UCSF_postdoc_topic/ECHO_project/suspect_screeing_plastic_related_compounds/cfmid_spectra_match/myout\Molecule1003.log
Reading file: D:/UCSF_postdoc_topic/ECHO_project/suspect_scre

In [None]:
import os
import re

def parse_spectrum_file(file_path):
    """Build a negative-mode MSP record from the ``energy1`` block of one log file.

    The file is scanned line by line (each line stripped first). Lines
    containing ``#ID=``, ``#SMILES=``, ``#InChiKey=``, ``#Formula=`` or
    ``#PMass=`` supply the metadata; a line beginning with ``energy<digits>``
    opens a new spectrum section; digit-led lines seen while the ``energy1``
    section is open are read as ``m/z intensity`` pairs. Scanning stops at the
    first empty line met while the ``energy2`` section is open.

    Args:
        file_path: Path of the log file to read.

    Returns:
        The MSP record as one newline-joined string (ending with a newline),
        or ``None`` when any metadata field is missing/empty or when no
        ``energy1`` peak was collected.
    """
    # Tag -> metadata key. Order matters: the first tag found in a line wins.
    tag_to_field = (
        ("#ID=", "name"),
        ("#SMILES=", "smiles"),
        ("#InChiKey=", "inchikey"),
        ("#Formula=", "formula"),
        ("#PMass=", "pmass"),
    )
    fields = {field: None for _, field in tag_to_field}
    collected_peaks = []
    open_section = None

    with open(file_path, 'r') as source:
        stripped_lines = [raw.strip() for raw in source.readlines()]

    for text in stripped_lines:
        found = next((item for item in tag_to_field if item[0] in text), None)
        if found is not None:
            marker, field = found
            fields[field] = text.split(marker)[1]
            continue

        if re.match(r'^energy\d+', text):
            open_section = text
            continue

        if re.match(r'^[0-9]', text) and open_section == "energy1":
            parts = text.split()
            if len(parts) >= 2:
                # Intensity is scaled by 10 and truncated to an integer.
                scaled = int(float(parts[1]) * 10)
                collected_peaks.append((parts[0], scaled))
            continue

        if open_section == "energy2" and text == "":
            # An empty line after the energy2 section ends the spectra part.
            break

    # Every metadata field is required, and so is at least one peak.
    if not all(fields.values()) or not collected_peaks:
        return None

    record = [
        f"NAME: {fields['name']}",
        f"PRECURSORMZ: {fields['pmass']}",
        "PRECURSORTYPE: [M-H]-",
        "IONMODE: Negative",
        "RETENTIONTIME: NA",
        "CCS: NA",
        f"FORMULA: {fields['formula']}",
        "ONTOLOGY: NA",
        f"SMILES: {fields['smiles']}",
        f"INCHIKEY: {fields['inchikey']}",
        "INSTRUMENTTYPE: CFM-ID",
        "COLLISIONENERGY: 20 eV",
        "COMMENT: Generated from CFM-ID prediction",
        f"Num Peaks: {len(collected_peaks)}",
    ]
    for mz, intensity in collected_peaks:
        record.append(f"{mz}\t{intensity}")
    record.append("")  # trailing empty item -> record ends with a newline
    return "\n".join(record)

def compile_msp(directory_path, output_file):
    """Gather MSP records from all ``.log`` files of a folder into one file.

    Every file whose name ends with ``.log`` is announced on stdout and parsed
    with ``parse_spectrum_file``; a falsy parse result is reported and the
    file is skipped.

    Args:
        directory_path: Folder to scan (non-recursive).
        output_file: Path of the MSP file to write. It is only created when at
            least one record was produced.
    """
    log_names = [item for item in os.listdir(directory_path) if item.endswith(".log")]

    records = []
    for log_name in log_names:
        full_path = os.path.join(directory_path, log_name)
        print(f"Reading file: {full_path}")
        parsed = parse_spectrum_file(full_path)
        if parsed:
            records.append(parsed)
            continue
        print("No valid data found in file.")

    # Nothing parsed: report it and leave the output file untouched.
    if not records:
        print("No valid entries found to write to the output file.")
        return

    with open(output_file, 'w') as out_handle:
        out_handle.write("\n".join(records))

# Define your folder path and output file path
# directory_path = "D:/UCSF_postdoc_topic/REVEAL_topics/First100_batch/CFMID_predicted_v1/myout_neg"
# directory_path = "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase1_neg_out"
# output_file = os.path.join(directory_path, "compiled_spectrum_library_neg.msp")

# Compile the MSP file
# compile_msp(directory_path, output_file)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'D:/UCSF_postdoc_topic/REVEAL_topics/First100_batch/CFMID_predicted_v1/myout_neg'

In [4]:
import os
import re

def _parse_cfmid_log(file_path, celevel, ion_mode, precursor_type):
    """Shared parser behind ``parse_spectrum_file_neg`` / ``parse_spectrum_file_pos``.

    The file is scanned line by line (each line stripped first). Lines
    containing ``#ID=``, ``#SMILES=``, ``#InChiKey=``, ``#Formula=`` or
    ``#PMass=`` supply the metadata; a line beginning with ``energy<digits>``
    opens a new spectrum section; digit-led lines seen while the section named
    by ``celevel`` is open are read as ``m/z intensity`` pairs. Scanning stops
    at the first empty line met while the ``energy2`` section is open.

    Args:
        file_path: Path of the log file to read.
        celevel: Section to extract: ``'energy0'``, ``'energy1'`` or
            ``'energy2'`` (labelled 10eV, 20eV and 40eV in the record).
        ion_mode: Text written to the ``IONMODE`` field.
        precursor_type: Text written to the ``PRECURSORTYPE`` field.

    Returns:
        The MSP record as one newline-joined string (ending with a newline),
        or ``None`` when any metadata field is missing/empty or when the
        requested section yielded no peaks.

    Raises:
        ValueError: If ``celevel`` is not one of the supported sections, or if
            a peak intensity in the requested section is not numeric.
    """
    # Validate up front. Previously an unknown level left `collision_energy`
    # unbound, so the call either silently returned None for every file or
    # failed with UnboundLocalError while formatting the record.
    collision_energies = {'energy0': '10eV', 'energy1': '20eV', 'energy2': '40eV'}
    if celevel not in collision_energies:
        raise ValueError(
            f"Unsupported celevel {celevel!r}; expected one of {sorted(collision_energies)}"
        )
    collision_energy = collision_energies[celevel]

    instrument_type = "CFM-ID"
    retention_time = "NA"
    ccs = "NA"

    name = smiles = inchikey = formula = pmass = None
    peaks = []
    current_energy = None

    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if "#ID=" in line:
                name = line.split("#ID=")[1]
            elif "#SMILES=" in line:
                smiles = line.split("#SMILES=")[1]
            elif "#InChiKey=" in line:
                inchikey = line.split("#InChiKey=")[1]
            elif "#Formula=" in line:
                formula = line.split("#Formula=")[1]
            elif "#PMass=" in line:
                pmass = line.split("#PMass=")[1]
            elif re.match(r'^energy\d+', line):
                current_energy = line
            elif re.match(r'^[0-9]', line) and current_energy == celevel:
                peak_info = line.split()
                if len(peak_info) >= 2:
                    # Intensity is scaled by 10 and truncated to an integer.
                    peaks.append((peak_info[0], int(float(peak_info[1]) * 10)))
            elif current_energy == "energy2" and line == "":
                # An empty line after the energy2 section ends the spectra
                # part; stopping here keeps later digit-led lines out of the
                # peak list.
                break

    # Every metadata field is required, and so is at least one peak.
    if not (name and pmass and formula and smiles and inchikey):
        return None
    if not peaks:
        return None

    msp_entry = [
        f"NAME: {name}",
        f"PRECURSORMZ: {pmass}",
        f"PRECURSORTYPE: {precursor_type}",
        f"IONMODE: {ion_mode}",
        f"RETENTIONTIME: {retention_time}",
        f"CCS: {ccs}",
        f"FORMULA: {formula}",
        "ONTOLOGY: NA",
        f"SMILES: {smiles}",
        f"INCHIKEY: {inchikey}",
        f"INSTRUMENTTYPE: {instrument_type}",
        f"COLLISIONENERGY: {collision_energy}",
        "COMMENT: Generated from CFM-ID prediction",
        f"Num Peaks: {len(peaks)}",
    ]
    # Explicit "\t" instead of an invisible literal tab inside the string.
    msp_entry.extend(f"{mz}\t{intensity}" for mz, intensity in peaks)
    msp_entry.append("")  # trailing empty item -> record ends with a newline
    return "\n".join(msp_entry)


def parse_spectrum_file_neg(file_path, celevel):
    """Build a negative-mode (``[M-H]-``) MSP record from one energy section.

    Args:
        file_path: Path of the log file to read.
        celevel: ``'energy0'``, ``'energy1'`` or ``'energy2'``.

    Returns:
        The MSP record string, or ``None`` when metadata is incomplete or the
        section has no peaks.

    Raises:
        ValueError: If ``celevel`` is not a supported section name.
    """
    return _parse_cfmid_log(file_path, celevel, "Negative", "[M-H]-")


def parse_spectrum_file_pos(file_path, celevel):
    """Build a positive-mode (``[M+H]+``) MSP record from one energy section.

    Args:
        file_path: Path of the log file to read.
        celevel: ``'energy0'``, ``'energy1'`` or ``'energy2'``.

    Returns:
        The MSP record string, or ``None`` when metadata is incomplete or the
        section has no peaks.

    Raises:
        ValueError: If ``celevel`` is not a supported section name.
    """
    return _parse_cfmid_log(file_path, celevel, "Positive", "[M+H]+")

def compile_msp(directory_path, output_file, celevel, polarity = 'pos'):
    """Collect the ``celevel`` spectra of every ``.log`` file into one MSP file.

    Each file whose name ends with ``.log`` is announced on stdout and parsed
    with ``parse_spectrum_file_pos`` or ``parse_spectrum_file_neg`` depending
    on ``polarity``; files for which the parser returns a falsy value are
    reported and skipped. The library is written to
    ``<output_file without a trailing '.msp'>_<celevel>.msp``.

    Args:
        directory_path: Folder to scan (non-recursive).
        output_file: Base path of the MSP file; the energy level is inserted
            before the ``.msp`` extension.
        celevel: Energy section to extract, passed through to the parser.
        polarity: ``'pos'`` (default) or ``'neg'``.

    Raises:
        ValueError: If ``polarity`` is neither ``'pos'`` nor ``'neg'``.
    """
    # Reject unknown polarities before touching the file system. Previously
    # neither loop ran and the function failed later with UnboundLocalError
    # on `msp_entries`.
    if polarity not in ('pos', 'neg'):
        raise ValueError(f"Unsupported polarity {polarity!r}; expected 'pos' or 'neg'")
    parse_log = parse_spectrum_file_pos if polarity == 'pos' else parse_spectrum_file_neg

    msp_entries = []
    # os.listdir returns entries in arbitrary order; sort so the library
    # content is reproducible across runs and platforms.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".log"):
            continue
        file_path = os.path.join(directory_path, filename)
        print(f"Reading file: {file_path}")
        msp_entry = parse_log(file_path, celevel)
        if msp_entry:
            msp_entries.append(msp_entry)
        else:
            print("No valid data found in file.")

    # Tag the output name with the energy level. Only a trailing ".msp" is
    # removed: splitting on the first ".msp" would truncate the path when a
    # directory or earlier part of the name also contains ".msp".
    base = output_file.strip()
    if base.endswith(".msp"):
        base = base[:-len(".msp")]
    new_output_file = base + '_' + celevel + '.msp'

    if msp_entries:
        with open(new_output_file, 'w') as f:
            f.write("\n".join(msp_entries))
    else:
        print("No valid entries found to write to the output file.")

In [None]:
# Define your folder path and output file path (negative-mode run).
# The commented-out assignments below are alternative input folders; only the
# last, uncommented one is active.
# directory_path1 = "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/CFM_prediction/neg_total_out"
# directory_path1 = "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase1_neg_out"
directory_path1 = 'D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_neg_out'

# Base name of the library; compile_msp inserts "_<celevel>" before ".msp".
output_file1 = os.path.join(directory_path1, "phase2_spectrum_library_neg.msp")

# Compile the MSP file
# (earlier one-call-per-level form, kept for reference)
# compile_msp(directory_path, output_file, celevel = 'energy0')
# compile_msp(directory_path, output_file, celevel = 'energy1')
# compile_msp(directory_path, output_file, celevel = 'energy2')
# Run once per energy section with the negative-mode parser.
cel =['energy0', 'energy1', 'energy2']
for i in cel:
    compile_msp(directory_path1, output_file1, celevel = i, polarity='neg')

Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_neg_out\Molecule1.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_neg_out\Molecule10.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_neg_out\Molecule100.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_neg_out\Molecule101.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_neg_out\Molecule1015.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_neg_out\Molecule1016.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_neg_out\Molecule1017.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200sa

In [10]:
# Combine the CFM-ID predicted spectra into MSP libraries (positive-mode run).
# NOTE(review): the original heading said "5k plastic related compounds", but
# the active path below points at phase2_pos_out — confirm which data set this
# cell is meant for.
# directory_path2 = "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/CFM_prediction/pos_total_out"
# directory_path2 = 'D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase1_pos_out'
directory_path2 = "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_pos_out"
# Base name of the library; compile_msp inserts "_<celevel>" before ".msp".
output_file2 = os.path.join(directory_path2, "phase2_spectrum_library_pos.msp")

# Compile the MSP file
# (earlier one-call-per-level form, kept for reference)
# compile_msp(directory_path, output_file, celevel = 'energy0')
# compile_msp(directory_path, output_file, celevel = 'energy1')
# compile_msp(directory_path, output_file, celevel = 'energy2')
# Run once per energy section with the positive-mode parser.
cel =['energy0', 'energy1', 'energy2']
for i in cel:
    compile_msp(directory_path2, output_file2, celevel = i, polarity='pos')

Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_pos_out\Molecule1.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_pos_out\Molecule10.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_pos_out\Molecule100.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_pos_out\Molecule101.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_pos_out\Molecule1015.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_pos_out\Molecule1016.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/biotransformation_product/phase2_pos_out\Molecule1017.log
Reading file: D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200sa