In [137]:
#final update 2024-12-11
import sys
import os
import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sb
import random as rd
import matplotlib
import warnings
import tqdm

In [138]:
# Load peak area data for positive ion mode after batch correction
import pandas as pd
from tqdm import tqdm

peakarea_path = ["D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/raw_peak_area_before_bc.csv", #raw peak area before batch correction (40004, 403)
                 "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/filtered_peak_area_after_bc_70.csv", #filtered peak area after batch correction, with DF >=70%, s/n >=3,
                 "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/filtered_peak_area_for_bc_70.csv"] #filtered peak area before batch correction, with DF >=70%, s/n >=3,
targetmz_path = ["D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/plastic_map_chemlist/Plastic_Chemical_withRT_CFMID_spectrumdatabase_chemical_annotation_20241228.csv", #plastic chemical list with predicted/library RT, CRMID, and spectrum ID match record,
                 'D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/plastic_map_chemlist/Plastic_Chemical_withRT_CFMID_annotation_20241228.csv'] #plastic chemical list with RT and CFMID

#load peak area data
pos_peak_area = pd.read_csv(peakarea_path[1])
# print(f'dimention of the peak area data: {pos_peak_area_200.shape}')
# Load target mz data
targetmzdat_pos = pd.read_csv(targetmz_path[0])
targetmzdat_pos['mz'] = targetmzdat_pos['Monoisotopic_Mass_ready'] + 1.007825

# Select relevant columns for inquiry
peak_area_inquiry = pos_peak_area[['Average Rt(min)', 'Average Mz'] + [col for col in pos_peak_area.columns if col.startswith('BH')]]

# Function to calculate ppm difference
def ppm_difference(value1, value2):
    return abs(value1 - value2) / value1 * 1e6

ppm_threshold = 5
matched_rows_pos = []
matched_summary = []

# MS1 search and matching
for _, chem_row in tqdm(targetmzdat_pos.iterrows(), total=targetmzdat_pos.shape[0], desc="Matching compounds"):
    mw = chem_row['mz']
    matches = peak_area_inquiry[peak_area_inquiry['Average Mz'].apply(lambda mz: ppm_difference(mz, mw) <= ppm_threshold)]
    if not matches.empty:
        # Append the matches along with the corresponding compound information
        for _, match_row in matches.iterrows():
            matched_rows_pos.append({
                **match_row,
                'Matched Compound': chem_row['SMILES'],
                'PREFERRED_NAME': chem_row.get('PREFERRED_NAME', None),
                'DTXSID_Hits': chem_row.get('DTXSID', None),
                'DTXSID': chem_row.get('DTXSID', None),
                'InChiKey_origin': chem_row.get('INCHIKEY', None),
                'MOLECULAR_FORMULA_original': chem_row.get('MOLECULAR_FORMULA', None),
                'Compound_comment': chem_row.get('QC_NOTES', None),
                'SMILES_STD': chem_row.get('SMILES_ready', None),
                "Monoisotopic_mass_ready": chem_row.get('Monoisotopic_Mass_ready', None),
                "PredictedRT": chem_row.get('PredRT_Retip', None),
                "PredictedRT_lower": chem_row.get('predRT_lower', None),
                "PredictedRT_upper": chem_row.get('predRT_higher', None),
                'RT_library' : chem_row.get('RT_lib', None),
                'total_spectra_pos': chem_row.get('total_spectra_pos', None),
                'total_pred_spectra_pos': chem_row.get('total_pred_spectra_pos', None),
                'BloodExpo_check': chem_row.get('BloodExpo_check', None),
                'ExposomeExplorer_check': chem_row.get('ExposomeExplorer_check', None),
                'Function': chem_row.get('Function', None),
                'Polymer': chem_row.get('Polymer', None),
                'US_production': chem_row.get('US_production', None),
                'Total_production': chem_row.get('Total_production', None),
                'Industrial_Sector': chem_row.get('Industrial_Sector', None)

            })
        # Record summary statistics for each targeted mz, preserving multiple hits
        matched_summary.append({
            'Targeted MZ': mw,
            'Matched Count': len(matches),
            'Average Rt(min)': matches['Average Rt(min)'].tolist(),
            'Average Mz': matches['Average Mz'].tolist(),
            'Mean Value': matches.iloc[:, 3:].mean(axis=1).tolist(),
            'Max Value': matches.iloc[:, 3:].max(axis=1).tolist(),
            'Min Value': matches.iloc[:, 3:].min(axis=1).tolist(),
            'Std Dev': matches.iloc[:, 3:].std(axis=1).tolist(),
            'RSD (%)': ((matches.iloc[:, 3:].std(axis=1) / matches.iloc[:, 3:].mean(axis=1)) * 100).tolist(),
            'SMILES_STD': chem_row.get('SMILES_ready', None),
            'DTXSID': chem_row.get('DTXSID', None),
            'CASRN': chem_row.get('CASRN', None),
            'Compound Name': chem_row.get('PREFERRED_NAME', None),
            'QC_NOTES': chem_row.get('QC_NOTES', None),
            'InChiKey_origin': chem_row.get('INCHIKEY', None)
        })

# Create a DataFrame for matched MSDIAL result rows
matched_peak_area_df_pos = pd.DataFrame(matched_rows_pos)

# drop rows with duplicated column values at Average Rt(min) and Average Mz, retaining the first occurrence
matched_peak_area_df_pos_uniquemzrt = matched_peak_area_df_pos.drop_duplicates(subset=['Average Rt(min)', 'Average Mz'], keep='first')
print(f"Number of matched feature from peak area table: {matched_peak_area_df_pos_uniquemzrt.shape[0]}")
matched_peak_area_df_pos_uniquechemical = matched_peak_area_df_pos.drop_duplicates(subset=['InChiKey_origin'], keep='first')
print(f'Number of unique chemical matched: {matched_peak_area_df_pos_uniquechemical.shape[0]}')

# Create a DataFrame for matched summary statistics
matched_summary_df_pos = pd.DataFrame(matched_summary)

# Step 3: Handling technical duplicates and calculating mean values
bh_columns = [col for col in matched_peak_area_df_pos.columns if col.startswith('BH')]
mean_columns = {}
for col in bh_columns:
    base_col = col.split('_')[0]
    if base_col not in mean_columns:
        duplicate_cols = [c for c in bh_columns if c.startswith(base_col)]
        mean_columns[base_col] = matched_peak_area_df_pos[duplicate_cols].mean(axis=1)

# Retain only the columns with calculated mean values
matched_peak_area_df_pos = pd.concat([matched_peak_area_df_pos.drop(columns=bh_columns), pd.DataFrame(mean_columns)], axis=1)

#output the matched peak area data
#rank the feature by rt AND mz
matched_peak_pos = matched_peak_area_df_pos.sort_values(by=['Average Rt(min)', 'Average Mz'])

#filerted features with intensity lower than 5000 
peak_areapos  = matched_peak_pos[[col for col in matched_peak_pos.columns if col.startswith('BH')]]
row_medianspos  = peak_areapos.median(axis=1)
peak_indexpos  = row_medianspos[row_medianspos >= 5000].index
filter_resultspos  = matched_peak_pos.loc[peak_indexpos]
print(f'dimension of feature table filtered by intensity: {filter_resultspos.shape}')

# Step 4: Retain columns with median values > 5000, filter non-numeric values, and perform DataFrame transpose
filtered_df = matched_peak_area_df_pos.apply(pd.to_numeric, errors='coerce')  # Convert non-numeric values to NaN
filtered_df = filtered_df.loc[:, (filtered_df.median(axis=0) >= 5000)].dropna(how='all', axis=1)  # Retain columns with median value > 5000

# Count the number of retained columns after intensity filtering
num_retained_columns = filtered_df.shape[1] 
print(f"Number of columns retained after intensity filtering: {num_retained_columns}")
print(f'dimension of the filtered data: {filtered_df.shape}')

# Calculate summary statistics for each sample column
summary_stats = {
    'Sum': filtered_df.sum(),
    'Median': filtered_df.median(),
    'Mean': filtered_df.mean(),
    'Max': filtered_df.max(),
    'Min': filtered_df.min(),
    '95th Percentile': filtered_df.quantile(0.95),
    '5th Percentile': filtered_df.quantile(0.05)
}

# Create a DataFrame from the summary statistics
summary_stats_df_pos = pd.DataFrame(summary_stats)
# summary_stats_df_pos.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/mathced_peaks_summary_forallsamples_pos_woRT.csv")

# Transpose the summary statistics DataFrame
# transposed_summary_df = summary_stats_df_pos.transpose()
# Display the final DataFrame
# print(summary_stats_df_pos)

Matching compounds: 100%|██████████| 5278/5278 [00:48<00:00, 108.97it/s]


Number of matched feature from peak area table: 2159
Number of unique chemical matched: 1127
dimension of feature table filtered by intensity: (2050, 224)
Number of columns retained after intensity filtering: 133
dimension of the filtered data: (4039, 133)


In [139]:
#annotate the features according to RT_library and predictedRT
for iter, row in filter_resultspos.iterrows():
    if pd.notnull(row['RT_library']):
        if (row['Average Rt(min)'] >= row['RT_library']-0.5) & (row['Average Rt(min)'] <= row['RT_library']+0.5):
            filter_resultspos.loc[iter, 'annotation_RTMS'] = '2'
        else:
            filter_resultspos.loc[iter, 'annotation_RTMS'] = 'NotMatched'
    else:
        if (row['Average Rt(min)'] >= row['PredictedRT_lower']) & (row['Average Rt(min)'] <= row['PredictedRT_upper']):
            filter_resultspos.loc[iter, 'annotation_RTMS'] = '3'
        else:
            filter_resultspos.loc[iter, 'annotation_RTMS'] = '4'

#output the datafile for spectrum matching
filter_resultspos.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/feature_annotation_RTMS_pos_peakover5000_20241228.csv")

In [140]:
print(filter_resultspos[filter_resultspos['annotation_RTMS'] == '2'].shape)
print(filter_resultspos[filter_resultspos['annotation_RTMS'] == '3'].shape)

(10, 225)
(1311, 225)


In [141]:
#rank the feature by rt AND mz
matched_peak_pos = matched_peak_area_df_pos.sort_values(by=['Average Rt(min)', 'Average Mz'])
#filerted features with intensity lower than 5000 
peak_area = matched_peak_pos[[col for col in matched_peak_pos.columns if col.startswith('BH')]]
row_medians = peak_area.median(axis=1)
peak_index = row_medians[row_medians >= 5000].index
filter_results = matched_peak_pos.loc[peak_index]
filter_results.shape
filter_results.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/matched_peak_pos_over5000.csv")
matched_peak_pos.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/matched_peak_pos.csv")

In [142]:
# Importing negative alignment results
# Negative alignment results
peakarea_path = ["D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/raw_peak_area_before_bc.csv", #raw peak area before batch correction
                 "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/filtered_peak_area_after_bc_70.csv", #filtered peak area after batch correction, with DF >=70%, s/n >=3,
                 "D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/filtered_peak_area_for_bc_70.csv"] #filtered peak area before batch correction, with DF >=70%, s/n >=3,
targetmz_path = ["D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/plastic_map_chemlist/Plastic_Chemical_withRT_CFMID_spectrumdatabase_chemical_annotation_20241228.csv", #plastic chemical list with predicted/library RT, CRMID, and spectrum ID match record,
                 'D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/plastic_map_chemlist/Plastic_Chemical_withRT_CFMID_annotation_20241228.csv'] #plastic chemical list with RT and CFMID

#load peak area data
neg_peak_area = pd.read_csv(peakarea_path[1])
# print(f'dimention of the peak area data: {pos_peak_area_200.shape}')
# Load target mz data
targetmzdat_neg = pd.read_csv(targetmz_path[0])
targetmzdat_neg['mz'] = targetmzdat_neg['MONOISOTOPIC_MASS'] - 1.007825

# Select relevant columns for inquiry
peak_area_inquiry = neg_peak_area[['Average Rt(min)', 'Average Mz'] + [col for col in neg_peak_area.columns if col.startswith('BH')]]

# Function to calculate ppm difference
def ppm_difference(value1, value2):
    return abs(value1 - value2) / value1 * 1e6

ppm_threshold = 5
matched_rows_neg = []
matched_summary = []

# MS1 search and matching
for _, chem_row in tqdm(targetmzdat_neg.iterrows(), total=targetmzdat_neg.shape[0], desc="Matching compounds"):
    mw = chem_row['mz']
    matches = peak_area_inquiry[peak_area_inquiry['Average Mz'].apply(lambda mz: ppm_difference(mz, mw) <= ppm_threshold)]
    #print(f"Matches after ppm filtering: {matches.shape}")  # Debugging
    
    if not matches.empty:
        # Append the matches along with the corresponding compound information
        for _, match_row in matches.iterrows():
            matched_rows_neg.append({
                **match_row,
                'Matched Compound': chem_row['SMILES'],
                'PREFERRED_NAME': chem_row.get('PREFERRED_NAME', None),
                'DTXSID_Hits': chem_row.get('DTXSID', None),
                'DTXSID': chem_row.get('DTXSID', None),
                'InChiKey_origin': chem_row.get('INCHIKEY', None),
                'MOLECULAR_FORMULA_original': chem_row.get('MOLECULAR_FORMULA', None),
                'Compound_comment': chem_row.get('QC_NOTES', None),
                'SMILES_STD': chem_row.get('SMILES_ready', None),
                "Monoisotopic_mass_ready": chem_row.get('Monoisotopic_Mass_ready', None),
                "PredictedRT": chem_row.get('PredRT_Retip', None),
                "PredictedRT_lower": chem_row.get('predRT_lower', None),
                "PredictedRT_upper": chem_row.get('predRT_higher', None),
                'RT_library' : chem_row.get('RT_lib ', None),
                'total_spectra_neg': chem_row.get('total_spectra_neg', None),
                'total_pred_spectra_neg': chem_row.get('total_pred_spectra_neg', None),
                'BloodExpo_check': chem_row.get('BloodExpo_check', None),
                'ExposomeExplorer_check': chem_row.get('ExposomeExplorer_check', None),
                'Function': chem_row.get('Function', None),
                'Polymer': chem_row.get('Polymer', None),
                'US_production': chem_row.get('US_production', None),
                'Total_production': chem_row.get('Total_production', None),
                'Industrial_Sector': chem_row.get('Industrial_Sector', None)
            })
        # Record summary statistics for each targeted mz, preserving multiple hits
        matched_summary.append({
            'Targeted MZ': mw,
            'Matched Count': len(matches),
            'Average Rt(min)': matches['Average Rt(min)'].tolist(),
            'Average Mz': matches['Average Mz'].tolist(),
            'Mean Value': matches.iloc[:, 3:].mean(axis=1).tolist(),
            'Max Value': matches.iloc[:, 3:].max(axis=1).tolist(),
            'Min Value': matches.iloc[:, 3:].min(axis=1).tolist(),
            'Std Dev': matches.iloc[:, 3:].std(axis=1).tolist(),
            'RSD (%)': ((matches.iloc[:, 3:].std(axis=1) / matches.iloc[:, 3:].mean(axis=1)) * 100).tolist(),
            'SMILES_STD': chem_row.get('SMILES_ready', None),
            'DTXSID': chem_row.get('DTXSID', None),
            'CASRN': chem_row.get('CASRN', None),
            'Compound Name': chem_row.get('PREFERRED_NAME', None),
            'QC_NOTES': chem_row.get('QC_NOTES', None),
            'InChiKey_origin': chem_row.get('INCHIKEY', None)
        })

# Create a DataFrame for matched MSDIAL result rows
matched_peak_area_df_neg = pd.DataFrame(matched_rows_neg)

# drop rows with duplicated column values at Average Rt(min) and Average Mz, retaining the first occurrence
matched_peak_area_df_neg_uniquemzrt = matched_peak_area_df_neg.drop_duplicates(subset=['Average Rt(min)', 'Average Mz'], keep='first')
matched_peak_area_df_neg_uniquechemical = matched_peak_area_df_neg.drop_duplicates(subset=['InChiKey_origin'], keep='first')
print(f"Number of matched unique feature from peak area table: {matched_peak_area_df_neg_uniquemzrt.shape[0]}")
print(f"Number of matched unique chemicals from peak area table: {matched_peak_area_df_neg_uniquechemical.shape[0]}")

# Create a DataFrame for matched summary statistics
matched_summary_df_neg = pd.DataFrame(matched_summary)
print(f"#of targetedmz with hits:{matched_summary_df_neg.shape}")
# matched_summary_df_neg.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/mathced_targetedmz_summary_forallsamples_Neg_woRT.csv")
print(matched_summary_df_neg.shape)

# Step 3: Handling technical duplicates and calculating mean values
bh_columns = [col for col in matched_peak_area_df_neg.columns if col.startswith('BH')]
mean_columns = {}
for col in bh_columns:
    base_col = col.split('_')[0]
    if base_col not in mean_columns:
        duplicate_cols = [c for c in bh_columns if c.startswith(base_col)]
        mean_columns[base_col] = matched_peak_area_df_neg[duplicate_cols].mean(axis=1)

# Retain only the columns with calculated mean values
matched_peak_area_df_neg = pd.concat([matched_peak_area_df_neg.drop(columns=bh_columns), pd.DataFrame(mean_columns)], axis=1)

#output the matched peak area data
#rank the feature by rt AND mz
matched_peak_neg = matched_peak_area_df_neg.sort_values(by=['Average Rt(min)', 'Average Mz'])

#filerted features with intensity lower than 5000 
peak_areaneg = matched_peak_neg[[col for col in matched_peak_neg.columns if col.startswith('BH')]]
row_mediansneg = peak_areaneg.median(axis=1)
peak_indexneg = row_mediansneg[row_mediansneg >= 5000].index
filter_resultsneg = matched_peak_neg.loc[peak_indexneg]
print(filter_resultsneg.shape)
# filter_resultsneg.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/matched_peak_neg_over5000_wopredRT.csv")

# Step 4: Retain columns with median values > 5000, filter non-numeric values, and perform DataFrame transpose
filtered_df = matched_peak_area_df_neg.apply(pd.to_numeric, errors='coerce')  # Convert non-numeric values to NaN
filtered_df = filtered_df.loc[:, (filtered_df.median(axis=0) >= 5000)].dropna(how='all', axis=1)  # Retain columns with median value > 5000

# Count the number of retained columns
num_retained_columns = filtered_df.shape[1]
print(f"Number of columns retained after filtering: {num_retained_columns}")

# Calculate summary statistics for each sample column for each individual samples
#
summary_stats = {
    'Sum': filtered_df.sum(),
    'Median': filtered_df.median(),
    'Mean': filtered_df.mean(),
    'Max': filtered_df.max(),
    'Min': filtered_df.min(),
    '95th Percentile': filtered_df.quantile(0.95),
    '5th Percentile': filtered_df.quantile(0.05)
}

# Create a DataFrame from the summary statistics
summary_stats_df_neg = pd.DataFrame(summary_stats)
# summary_stats_df_neg.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/mathced_peaks_summary_forallsamples_Neg_woRT.csv")

# Transpose the summary statistics DataFrame
# transposed_summary_df_neg = summary_stats_df_neg.transpose()
# Display the final DataFrame
# print(summary_stats_df_neg)

Matching compounds: 100%|██████████| 5278/5278 [00:53<00:00, 98.63it/s] 


Number of matched unique feature from peak area table: 3609
Number of matched unique chemicals from peak area table: 1530
#of targetedmz with hits:(1530, 15)
(1530, 15)
(4250, 225)
Number of columns retained after filtering: 151


In [143]:
print(tabulate.tabulate(filter_resultsneg.head(), headers='keys', tablefmt='psql'))

+------+-------------------+--------------+------------------------------------------------------------------+----------------------------------------------------------------+-----------------+-----------------+-----------------------------+------------------------------+-------------------------------------------------------------------+-------------------------------------------------+---------------------------+---------------+---------------------+---------------------+--------------+---------------------+--------------------------+-------------------+--------------------------+-----------------------+--------------------+-----------------+--------------------+---------------------+-----------+-----------+------------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+---------

In [144]:
# filter_resultsneg = pd.read_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/matched_peak_neg_over5000_wopredRT.csv")
#annotate the features according to RT_library and predictedRT
for iter, row in filter_resultsneg.iterrows():
    if pd.notnull(row['RT_library']):
        if (row['Average Rt(min)'] >= row['RT_library']-0.5) & (row['Average Rt(min)'] <= row['RT_library']+0.5):
            filter_resultsneg.loc[iter, 'annotation_RTMS'] = '2'
        else:
            filter_resultsneg.loc[iter, 'annotation_RTMS'] = 'NotMatched'
    else:
        if (row['Average Rt(min)'] >= row['PredictedRT_lower']) & (row['Average Rt(min)'] <= row['PredictedRT_upper']):
            filter_resultsneg.loc[iter, 'annotation_RTMS'] = '3'
        else:
            filter_resultsneg.loc[iter, 'annotation_RTMS'] = '4'

#output the datafile for spectrum matching
filter_resultsneg.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/feature_annotation_RTMS_neg_peakover5000_20241228.csv")

#forward to the next step for MSMS insilico or experimental matching and validation

##retreive msms injection patch for library matching

In [145]:
#acquired msms data 
neg_msmspath = "D:/UCSF_postdoc_topic/REVEAL_topics/R200_MSMS/targeted_msms_check_spectrumsearch_neg.csv"
pos_msmspath = "D:/UCSF_postdoc_topic/REVEAL_topics/R200_MSMS/targeted_msms_check_spectrumsearch_pos.csv"

neg_msms = pd.read_csv(neg_msmspath)
pos_msms = pd.read_csv(pos_msmspath)

#match the targeted msms list with the new feature list 
for iter, row in filter_resultsneg.iterrows():
    mz = row['Average Mz']
    rt = row['Average Rt(min)']
    matches = neg_msms[(neg_msms['Average Mz'] == mz) & (neg_msms['Average Rt(min)'] == rt)]
    if not matches.empty:
        filter_resultsneg.loc[iter, 'MSMS_injection'] = matches['MSMS_injection'].values[0]
        filter_resultsneg.loc[iter, 'AlignmentID'] = matches['AlignmentID'].values[0]
    else:
        filter_resultsneg.loc[iter, 'MSMS_injection'] = 'NotMatched'
        filter_resultsneg.loc[iter, 'AlignmentID'] = 'NotMatched'


for iter, row in filter_resultspos.iterrows():
    mz = row['Average Mz']
    rt = row['Average Rt(min)']
    matches = pos_msms[(pos_msms['Average Mz'] == mz) & (pos_msms['Average Rt(min)'] == rt)]
    if not matches.empty:
        filter_resultspos.loc[iter, 'MSMS_injection'] = matches['MSMS_injection'].values[0]
        filter_resultspos.loc[iter, 'AlignmentID'] = matches['AlignmentID'].values[0]
    else:
        filter_resultspos.loc[iter, 'MSMS_injection'] = 'NotMatched'
        filter_resultspos.loc[iter, 'AlignmentID'] = 'NotMatched'

#output the datafile for spectrum matching
filter_resultspos.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/feature_annotation_RTMS_pos_peakover5000_MSMS_injection.csv")
filter_resultsneg.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/feature_annotation_RTMS_neg_peakover5000_MSMS_injection.csv")

In [146]:
print(tabulate.tabulate(filter_resultsneg[filter_resultsneg['PREFERRED_NAME']=='Perfluorooctanesulfonic acid'], headers='keys', tablefmt='psql'))

+------+-------------------+--------------+---------------------------------------------------------------------+------------------------------+---------------+---------------+-----------------------------+------------------------------+--------------------+---------------------------------------------------------------------+---------------------------+---------------+---------------------+---------------------+--------------+---------------------+--------------------------+-------------------+--------------------------+------------+-----------+-----------------+--------------------+---------------------+-----------+-----------+------------+-----------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+------------+

In [147]:
# #found previous MSMS acquisition data

# #filter_resultsneg = pd.read_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/matched_peak_neg_over5000_wopredRT.csv")
# filter_resultsneg_previousfound = filter_resultsneg
# #update the first round with the experimental Record from previous results
# previous_first_round = pd.read_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/matched_peak_neg_over5000_firstround_msms.csv")
# # # Frist_round_amend 
# for iter, row in filter_resultsneg_previousfound.iterrows():
#     match_mz = filter_resultsneg_previousfound['Average Mz']
#     match_rt = filter_resultsneg_previousfound['Average Rt(min)']

#     if row['DTXSID'] in previous_first_round['DTXSID'].tolist():
#         filter_resultsneg_previousfound.loc[iter, 'Previous_MSMS'] = 'Yes'
#     else:
#         filter_resultsneg_previousfound.loc[iter, 'Previous_MSMS'] = 'No'

# filter_resultsneg_previousfound.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/featureannotation_firstround_msms_record.csv")


In [148]:
from rdkit import Chem
from rdkit.Chem import MolToSmiles
# ##cross refrence, cross check with blood exposome database
# bloodexpo_dat = pd.read_csv("D:/UCSF_postdoc_topic/REVEAL_topics/references/blood_exposome_database/blood_exposome_chemicals_july_2023.csv")

# # Step 1: Filter rows with available MW (MONOISOTOPIC MASS) between 100 and 1000 and classified as organic
# bloodexpo_modified = []
# for index, row in bloodexpo_dat.iterrows():
#     smiles = row['CanonicalSMILES']  # Assuming the SMILES column is named 'SMILES'
#     mol = Chem.MolFromSmiles(smiles)
#     canonical_smiles = Chem.MolToSmiles(mol, canonical=True) if mol else None
#     bloodexpo_modified.append({
#         **row,
#         "SMILES_ready": canonical_smiles})

# # Create a new DataFrame from the filtered rows
# bloodexpo_modified = pd.DataFrame(bloodexpo_modified)

# # Match SMILES from match_summary_df with bloodexpo_modified and retain hits
# # Match inchikey original between hits and the blood exposome database
# # matched_hits_SMILES = pd.merge(matched_summary_df_neg, bloodexpo_modified, how='inner', left_on='SMILES_STD', right_on='SMILES_ready')
# matched_hits_inchi = pd.merge(matched_summary_df_neg, bloodexpo_modified, how='inner', left_on='InChiKey_origin', right_on='InChIKey')
# #print(matched_hits_SMILES.shape)
# print(matched_hits_inchi.shape)
# # Match inchikey original between hits and the blood exposome database
# print(matched_summary_df_neg.shape)

# #title_dict = matched_hits_SMILES.groupby('SMILES_STD')['Title'].apply(list).to_dict()
# title_dict2 = matched_hits_inchi.groupby('InChiKey_origin')['Title'].apply(list).to_dict()
# papercount_dict = matched_hits_inchi.groupby('InChiKey_origin')['BloodPaperCount'].apply(list).to_dict()
# #matched_summary_df_neg['matched_with_bloodexpo_smi'] = matched_summary_df_neg['SMILES_STD'].map(title_dict).apply(lambda x: x if isinstance(x,list) else[])
# matched_summary_df_neg['matched_with_bloodexpo_inchi'] = matched_summary_df_neg['InChiKey_origin'].map(title_dict2).apply(lambda x: x if isinstance(x,list) else[])
# matched_summary_df_neg['BloodPaperCount'] = matched_summary_df_neg['InChiKey_origin'].map(papercount_dict).apply(lambda x: x if isinstance(x,list) else[])
# matched_summary_df_neg.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/mathced_targetedmz_summary_forallsamples_Neg.csv")
# ##annotate the targeted mz with the function information and the production volume information from the EST paper
# ##when matching the plastic paper, some compounds are missing
# # plasticmap_est = pd.read_csv('D:/UCSF_postdoc_topic/REVEAL_topics/plastic_related_chemicals/plasticmap_from_ESTpaper.csv',encoding='ISO-8859-1')
# # # Match Inchikey from match_summary_df with bloodexpo_modified and retain hits
# matched_summary_df_neg_plastic_note = pd.merge(matched_summary_df_neg, plasticmap_est, how='inner', left_on='InChiKey_origin', right_on='InChI_key')
# matched_summary_df_neg_plastic_note.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/mathced_peaks_summary_withplasticannotation_Neg.csv")

In [149]:
# # Match SMILES from match_summary_df with bloodexpo_modified and retain hits
# #matched_hits_SMILES = pd.merge(matched_summary_df_pos, bloodexpo_modified, how='inner', left_on='SMILES_STD', right_on='SMILES_ready')
# matched_hits_inchi = pd.merge(matched_summary_df_pos, bloodexpo_modified, how='inner', left_on='InChiKey_origin', right_on='InChIKey')

# # print(matched_hits_SMILES.shape)
# print(matched_hits_inchi.shape)
# # Match inchikey original between hits and the blood exposome database
# print(matched_summary_df_pos.shape)

# # title_dict = matched_hits_SMILES.groupby('SMILES_STD')['Title'].apply(list).to_dict()
# title_dict2 = matched_hits_inchi.groupby('InChiKey_origin')['Title'].apply(list).to_dict()
# papercount_dict = matched_hits_inchi.groupby('InChiKey_origin')['BloodPaperCount'].apply(list).to_dict()
# matched_summary_df_pos['matched_with_bloodexpo_inchi'] = matched_summary_df_pos['InChiKey_origin'].map(title_dict2).apply(lambda x: x if isinstance(x,list) else[])
# matched_summary_df_pos['BloodPaperCount'] = matched_summary_df_pos['InChiKey_origin'].map(papercount_dict).apply(lambda x: x if isinstance(x,list) else[])
# matched_summary_df_pos.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/mathced_targetedmz_summary_forallsamples_Pos.csv")

# ##annotate the targeted mz with the function information and the production volume information from the EST paper
# ##when matching the plastic paper, some compounds are missing
# # plasticmap_est = pd.read_csv('D:/UCSF_postdoc_topic/REVEAL_topics/plastic_related_chemicals/plasticmap_from_ESTpaper.csv',encoding='ISO-8859-1')
# # # Match Inchikey from match_summary_df with bloodexpo_modified and retain hits
# # matched_summary_df_pos_plastic_note = pd.merge(matched_summary_df_pos, plasticmap_est, how='inner', left_on='InChiKey_origin', right_on='InChI_key')
# # matched_summary_df_pos_plastic_note.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/mathced_peaks_summary_withplasticannotation_pos.csv")

In [150]:
######
##generate the files for CFM-ID spectra prediciton
# CFMID_ready_df = matched_summary_df_neg[['SMILES_STD']].copy()
# CFMID_ready_df['ID'] = ['Molecule' + str(i + 1) for i in range(len(CFMID_ready_df))]
# CFMID_ready_df = CFMID_ready_df[['ID','SMILES_STD']]
# ##CFMID_ready_df.to_csv('D:/UCSF_postdoc_topic/REVEAL_topics/First100_batch/neg_CFMID_MSready.txt', sep='\t', index=False, header=False)
# CFMID_ready_df.to_csv('D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/neg_CFMID_MSready.txt', sep='\t', index=False, header=False)

# #####
# #generate the files for CFM-ID spectra prediciton
# CFMID_ready_df = matched_summary_df_pos[['SMILES_STD']].copy()
# CFMID_ready_df['ID'] = ['Molecule' + str(i + 1) for i in range(len(CFMID_ready_df))]
# CFMID_ready_df = CFMID_ready_df[['ID','SMILES_STD']]
# # CFMID_ready_df.to_csv('D:/UCSF_postdoc_topic/REVEAL_topics/First100_batch/neg_CFMID_MSready.txt', sep='\t', index=False, header=False)
# CFMID_ready_df.to_csv('D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Pos_AlignmentResults/Pos_CFMID_MSready.txt', sep='\t', index=False, header=False)

In [151]:
#Update on 20241211
#check stock in ENTACKT mixture
# ENTACTmix = pd.read_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/plastic_map_chemlist/ENTACT_Mixture_chemlist.csv")
# #add experimental RT to the matched peaks

# #check std availability and make annotation
# for iter, row in filter_resultsneg.iterrows():
#     if row['DTXSID_Hits'] in ENTACTmix['DTXSID'].tolist():
#         filter_resultsneg.loc[iter, 'ENTACT_std_availability'] = 'Yes'
#         # filter_resultsneg.loc[iter, 'ENTACT_std_RT'] = 'ENTACT_std_RT'
#     else:
#         filter_resultsneg.loc[iter, 'ENTACT_std_availability'] = 'No'

# filter_resultsneg.to_csv("D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Neg_AlignmentResults/matched_peak_neg_over5000_wopredRT.csv")

In [152]:
import pickle
import os 

# Save all variables in the environment, excluding un-picklable ones
with open('D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/scripts/saved_environment.pkl', 'wb') as f:
    save_dict = {}
    for var_name, var_value in globals().items():
        if not var_name.startswith('__') and var_name != 'f':
            try:
                pickle.dumps(var_value)
                save_dict[var_name] = var_value
            except (pickle.PicklingError, AttributeError, TypeError):
                # Ignore un-picklable variables
                print(f"Variable '{var_name}' could not be pickled and was ignored.")
    pickle.dump(save_dict, f)

print("Environment saved successfully.")

Variable 'get_ipython' could not be pickled and was ignored.
Variable 'exit' could not be pickled and was ignored.
Variable 'quit' could not be pickled and was ignored.
Variable 'open' could not be pickled and was ignored.
Variable 'pd' could not be pickled and was ignored.
Variable 'sys' could not be pickled and was ignored.
Variable 'os' could not be pickled and was ignored.
Variable 'np' could not be pickled and was ignored.
Variable 'stats' could not be pickled and was ignored.
Variable 'plt' could not be pickled and was ignored.
Variable 'sb' could not be pickled and was ignored.
Variable 'rd' could not be pickled and was ignored.
Variable 'matplotlib' could not be pickled and was ignored.
Variable 'tabulate' could not be pickled and was ignored.
Variable 'pickle' could not be pickled and was ignored.


MemoryError: 