In [11]:
import sys
import os
import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sb
import random as rd
import matplotlib
import warnings

In [3]:
!pip install rdkit



You should consider upgrading via the 'D:\UCSF_postdoc_topic\ECHO_project\Data_analysis\covid_19_analysis\echoenv\Scripts\python.exe -m pip install --upgrade pip' command.


In [44]:
## Filter the chemical list: select the organic compounds from the plasticmap
## chemical list. This cell loads the list and keeps rows that have a structure.
path3 = 'D:/UCSF_postdoc_topic/ECHO_project/Reprocessed_MSDIAL/CCD-Batch-Search_2024-10-21_07_17_40.csv'

# Load the targeted CSV file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the recorded output shows a pandas
# warning pointing at this read_csv call; its text is not preserved, so the
# assumption that it is a mixed-dtype warning should be confirmed.
chemical_df = pd.read_csv(path3, encoding='ISO-8859-1', low_memory=False)
print(chemical_df.shape)
# Retain rows with an available value in the MS_READY_SMILES column.
# .copy() gives an independent frame, so adding columns to it later does not
# operate on a slice of the raw table.
# NOTE(review): the standardisation step reads the 'SMILES' column, not
# 'MS_READY_SMILES' - confirm which column the filter should use.
chemical_df = chemical_df[chemical_df['MS_READY_SMILES'].notna()].copy()
# chemical_df = chemical_df[chemical_df['SMILES'].notna()]
print(chemical_df.shape)

(10547, 794)
(6375, 794)


  chemical_df = pd.read_csv(path3, encoding='ISO-8859-1')


In [45]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdmolops import RemoveHs

# Function to sanitize the SMILES
def sanitize_smiles(smiles):
    """Standardise one SMILES string and compute its exact mass.

    The SMILES is parsed with RDKit, reduced to its largest fragment (the one
    with the most atoms), sanitised, and written back out as an isomeric
    SMILES.

    Parameters
    ----------
    smiles : str
        Input SMILES. Missing values (None / NaN), other non-string objects
        and blank strings are accepted and reported as a failure.

    Returns
    -------
    tuple
        ``(standardized_smiles, monoisotopic_mass)`` on success, where the mass
        is ``Descriptors.ExactMolWt`` of the retained fragment;
        ``(None, None)`` when the input is missing, cannot be parsed, has no
        atoms, or fails sanitisation.
    """
    # Missing cells arrive from pandas as float NaN (or None). Reject them up
    # front instead of relying on the parser raising and the broad `except`
    # below printing a spurious error for every empty cell.
    if not isinstance(smiles, str) or not smiles.strip():
        return None, None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None, None

        # Keep only the largest fragment (by atom count); smaller disconnected
        # pieces such as counter-ions are discarded. Fragments are left
        # unsanitised here and sanitised once below.
        frags = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)
        mol = max(frags, default=None, key=lambda m: m.GetNumAtoms())
        if mol is None:
            return None, None

        # Attempt to sanitize molecule (raises on chemically invalid structures)
        Chem.SanitizeMol(mol)

        # Standardize and calculate monoisotopic mass
        standardized_smiles = Chem.MolToSmiles(mol, isomericSmiles=True)
        monoisotopic_mass = Descriptors.ExactMolWt(mol)

        return standardized_smiles, monoisotopic_mass
    except Exception as e:
        # Best effort: one bad structure must not abort the whole batch.
        print(f"Error sanitizing SMILES {smiles}: {e}")
        return None, None

# Standardise every structure once, then split the (smiles, mass) pairs into
# the two result columns.
# Failures are stored as real missing values (None) rather than the string
# 'NA': a string placeholder is invisible to .notna()/.dropna() and forces the
# mass column to object dtype.
# NOTE(review): this reads the 'SMILES' column while the load step filters on
# 'MS_READY_SMILES' - confirm which column is intended.
sanitized_pairs = [sanitize_smiles(smiles) for smiles in chemical_df['SMILES']]

SMILES_ready = [std_smiles if std_smiles else None for std_smiles, _ in sanitized_pairs]
Monoisotopic_Mass_ready = [mass if std_smiles else None for std_smiles, mass in sanitized_pairs]

# assign() returns a new frame, so this never writes into a slice of another
# DataFrame.
chemical_df = chemical_df.assign(
    SMILES_ready=SMILES_ready,
    Monoisotopic_Mass_ready=Monoisotopic_Mass_ready,
)

[14:47:38] Explicit valence for atom # 16 O, 3, is greater than permitted
[14:47:38] Explicit valence for atom # 4 Al, 7, is greater than permitted
[14:47:38] Explicit valence for atom # 4 Sn, 6, is greater than permitted
[14:47:39] Explicit valence for atom # 8 Al, 9, is greater than permitted
[14:47:39] Explicit valence for atom # 0 B, 6, is greater than permitted


In [46]:
#clean up targeted molecule list, retain organic molecules only
import pandas as pd
from rdkit import Chem

# Retain rows whose SMILES could be standardised. A failed row may carry a
# missing value or the literal placeholder string 'NA'; .notna() alone does not
# catch the placeholder, so test for both.
has_ready_smiles = chemical_df['SMILES_ready'].notna() & chemical_df['SMILES_ready'].ne('NA')
# .copy() so the column assignment below works on an independent frame.
chemical_df = chemical_df[has_ready_smiles].copy()

# Convert 'Monoisotopic_Mass_ready' to numeric, forcing unparseable values to NaN
chemical_df['Monoisotopic_Mass_ready'] = pd.to_numeric(chemical_df['Monoisotopic_Mass_ready'], errors='coerce')

# Drop rows with NaN in 'Monoisotopic_Mass_ready'
chemical_df = chemical_df[chemical_df['Monoisotopic_Mass_ready'].notna()].copy()

# Function to strictly determine if a molecule is organic
def is_organic(smiles):
    """Return True when ``smiles`` describes an organic molecule.

    A molecule counts as organic here when it contains at least one carbon
    atom and every atom belongs to the whitelist C, H, N, O, P, S, F, Cl, Br, I.

    Parameters
    ----------
    smiles : str
        SMILES string to classify.

    Returns
    -------
    bool
        False for missing / non-string / empty input, for SMILES that RDKit
        cannot parse, for molecules without carbon, and for molecules with any
        atom outside the whitelist; True otherwise.
    """
    # Missing values (None / NaN) are not parseable structures.
    if not isinstance(smiles, str) or not smiles:
        return False

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return False  # invalid SMILES

    organic_elements = {'C', 'H', 'N', 'O', 'P', 'S', 'F', 'Cl', 'Br', 'I'}  # Common organic elements

    # The whitelist alone is enough: metals and every other element outside it
    # are rejected by the subset test, so no separate blacklist is needed.
    symbols = {atom.GetSymbol() for atom in mol.GetAtoms()}
    return 'C' in symbols and symbols <= organic_elements

# Step 1: keep rows whose monoisotopic mass lies in [100, 1000] (inclusive) and
# whose standardised SMILES is classified as organic.
# Boolean masks replace the row-by-row rebuild: each SMILES is parsed once
# (inside is_organic) instead of twice, column dtypes are preserved, and an
# empty result still has all its columns.
in_mass_range = chemical_df['Monoisotopic_Mass_ready'].between(100, 1000)
mass_candidates = chemical_df[in_mass_range]
organic_mask = mass_candidates['SMILES_ready'].map(is_organic).astype(bool)

# One row per unique standardised structure (first occurrence is kept).
filtered_df = mass_candidates[organic_mask].drop_duplicates(subset='SMILES_ready')

# Nothing is written to disk here; the exports below are kept commented out.
print(f"Filtering complete. {len(filtered_df)} unique organic compounds retained (not saved to disk).")

# Save the filtered DataFrame to a new CSV file
# filtered_df.to_csv('D:/UCSF_postdoc_topic/ECHO_project/Reprocessed_MSDIAL/Filtered_Chemical_List.csv', index=False)
# filtered_df.to_csv('D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Plastic_map_chemicallist_match/Plastic_Chemical_List_organic.csv', index=False)
# Export for biotransformation using BioTransformer3
# Export_Transformation = filtered_df[['SMILES_ready']]
# Export_Transformation.to_csv('D:/UCSF_postdoc_topic/REVEAL_topics/First100_batch/Plastic_Chemical_List_organic_forEPA_CTS.txt', index=False)

Filtering complete. The filtered list has been saved as 'Filtered_Chemical_List.csv'.


In [47]:
# (rows, columns) of the filtered chemical list
filtered_df.shape

(5144, 796)

In [None]:
## Notes (not implemented in this cell):
##   - make a plot for the classification of the plastic chemicals
##   - make a Venn diagram for the suspect list
## Classify the filtered plastic map, or combine it with the annotation from the EST paper:
## annotate the chemicals with the function information and the production
## volume information from the EST paper.
plasticmap_est = pd.read_csv('D:/UCSF_postdoc_topic/REVEAL_topics/plastic_related_chemicals/plasticmap_from_ESTpaper.csv',encoding='ISO-8859-1')
# Inner join on the raw SMILES string: only chemicals present in both tables are kept.
# NOTE(review): this joins `chemical_df`, while the output file name says
# "organic" - confirm whether `filtered_df` was intended.
plastic_with_orig_annot= pd.merge(chemical_df, plasticmap_est, how='inner', left_on='SMILES', right_on='SMILES')
plastic_with_orig_annot.to_csv('D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/Plastic_map_chemicallist_match/Plastic_Chemical_List_organic_withESTannotation.csv', index=False)
# Last expression of the cell, so the notebook displays it.
plastic_with_orig_annot.shape

(5628, 828)

In [41]:
# Summary (count / unique / top / freq) of the QC_NOTES column of the filtered list
filtered_df['QC_NOTES'].describe()

count                                      1330
unique                                      198
top       SRS/ChemID matched; SRS trust index 3
freq                                        375
Name: QC_NOTES, dtype: object

################ Searching for chemicals within the Blood Exposome Database and the WHO Exposome Explorer database

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import MolToSmiles
## Cross-reference: cross-check with the Blood Exposome Database
bloodexpo_dat = pd.read_csv("D:/UCSF_postdoc_topic/REVEAL_topics/references/blood_exposome_database/blood_exposome_chemicals_july_2023.csv")

# Add an RDKit canonical SMILES column ('SMILES_ready') to every row.
# No rows are removed here: entries without a usable structure keep
# SMILES_ready = None so they can still be matched on other identifiers.
bloodexpo_canonical = []
for raw_smiles in bloodexpo_dat['CanonicalSMILES']:
    # Missing cells are float NaN, not strings; skip the parser for those
    # instead of letting it raise and abort the whole cell.
    mol = Chem.MolFromSmiles(raw_smiles) if isinstance(raw_smiles, str) else None
    bloodexpo_canonical.append(Chem.MolToSmiles(mol, canonical=True) if mol else None)

# assign() keeps the original columns and dtypes and appends the new column.
bloodexpo_modified = bloodexpo_dat.assign(SMILES_ready=bloodexpo_canonical).reset_index(drop=True)

[08:57:11] Explicit valence for atom # 5 Cl, 3, is greater than permitted
[08:57:12] Explicit valence for atom # 4 Cl, 3, is greater than permitted
[08:57:13] Explicit valence for atom # 7 Br, 3, is greater than permitted
[08:57:13] Explicit valence for atom # 20 Cl, 3, is greater than permitted
[08:57:15] Explicit valence for atom # 6 Cl, 5, is greater than permitted
[08:57:16] Explicit valence for atom # 3 Cl, 3, is greater than permitted
[08:57:16] Explicit valence for atom # 5 Br, 3, is greater than permitted
[08:57:17] Explicit valence for atom # 9 Cl, 3, is greater than permitted


In [2]:
Explorer = pd.read_csv('D:/UCSF_postdoc_topic/REVEAL_topics/references/WHO_exposome_explorer/biomarkers_exposome_explorer.csv')
Explorer.dropna(subset=['SMILES'], inplace=True)

# Rebuild the table row by row, adding the RDKit canonical SMILES as
# 'SMILES_ready' (None when RDKit cannot parse the entry).
explorer_records = []
for _, entry in Explorer.iterrows():
    parsed = Chem.MolFromSmiles(entry['SMILES'])
    record = dict(entry)
    record["SMILES_ready"] = Chem.MolToSmiles(parsed, canonical=True) if parsed else None
    explorer_records.append(record)

# Create a new DataFrame from the collected rows
expo_modified = pd.DataFrame(explorer_records)

In [3]:
Explorer_bac = pd.read_csv('D:/UCSF_postdoc_topic/REVEAL_topics/references/WHO_exposome_explorer/microbial_metabolites_explorer.csv')
Explorer_bac.dropna(subset=['SMILES'], inplace=True)

# Rebuild the table row by row, adding the RDKit canonical SMILES as
# 'SMILES_ready' (None when RDKit cannot parse the entry).
explorer_bac_records = []
for _, entry in Explorer_bac.iterrows():
    parsed = Chem.MolFromSmiles(entry['SMILES'])
    record = dict(entry)
    record["SMILES_ready"] = Chem.MolToSmiles(parsed, canonical=True) if parsed else None
    explorer_bac_records.append(record)

# Create a new DataFrame from the collected rows
expobac_modified = pd.DataFrame(explorer_bac_records)

In [4]:
# Stack the microbial-metabolite and biomarker tables, then keep one row per
# canonical structure.
explorer_all = pd.concat([expobac_modified, expo_modified])

# duplicated() treats missing values as equal to each other, so a plain
# drop_duplicates would collapse every entry without a canonical SMILES into a
# single row. Those entries are not known to be the same compound, so keep all.
repeated_structure = explorer_all.duplicated(subset='SMILES_ready')
lacks_structure = explorer_all['SMILES_ready'].isna()
# .values: the concatenated index contains repeated labels, so index positionally.
explorer_comb = explorer_all[(~repeated_structure | lacks_structure).values]
explorer_comb.shape

(1063, 29)

In [15]:
# Input tables for the annotation step: the plastic map from the EST paper and
# the organic plastic chemical list.
est_paper_csv = 'D:/UCSF_postdoc_topic/REVEAL_topics/plastic_related_chemicals/plasticmap_from_ESTpaper.csv'
plastic_chem_csv = 'D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/plastic_map_chemlist/Plastic_Chemical_List_organic_withRTprediction_new_SMILES.csv'

plastic_est_paper = pd.read_csv(est_paper_csv, encoding='ISO-8859-1')
plastic_chem = pd.read_csv(plastic_chem_csv)

In [24]:
# Add the results of the cross-check against the Blood Exposome Database, the
# Exposome Explorer and plastic_est_paper to the original plastic_chem list.
#
# Columns written:
#   BloodExpo_check / ExposomeExplorer_check : 'Y' when the row's INCHIKEY occurs
#       in the respective reference table, otherwise 'N'.
#   Function, Polymer, EU_production, US_production, OECD_production,
#   Total_production : value from the first plastic_est_paper row with the same
#       InChI key, or the string 'NA' when there is no such row.
#
# Everything is done with set membership / keyed lookups instead of rebuilding
# and scanning a Python list for every row.
plastic_keys = plastic_chem['INCHIKEY']

membership_checks = [
    ('BloodExpo_check', bloodexpo_modified['InChIKey']),
    ('ExposomeExplorer_check', explorer_comb['InChIKey']),
]
for flag_column, reference_keys in membership_checks:
    # dropna(): a missing key must never count as a hit (isin would otherwise
    # match a missing INCHIKEY against a missing reference key).
    is_listed = plastic_keys.isin(set(reference_keys.dropna()))
    plastic_chem[flag_column] = is_listed.map({True: 'Y', False: 'N'})

# First plastic_est_paper row per InChI key, indexed by that key for lookup.
est_first_hit = (
    plastic_est_paper
    .dropna(subset=['InChI_key'])
    .drop_duplicates(subset='InChI_key', keep='first')
    .set_index('InChI_key')
)
has_est_entry = plastic_keys.isin(est_first_hit.index)

# (column written to plastic_chem, column read from plastic_est_paper)
est_annotations = [
    ('Function', 'Function'),
    ('Polymer', 'Polymer'),
    ('EU_production', 'EU'),
    ('US_production', 'USA'),
    ('OECD_production', 'OECD'),
    ('Total_production', 'Total'),
]
for target_column, source_column in est_annotations:
    looked_up = plastic_keys.map(est_first_hit[source_column])
    # Only rows without any EST entry get 'NA'; a matched row whose EST value is
    # itself empty stays empty. Cast to object first so numeric values and the
    # 'NA' string can share one column.
    plastic_chem[target_column] = looked_up.astype(object).where(has_est_entry, 'NA')

# Drop descriptor columns whose name starts with "ring:", "group:", "chain:",
# "bond:" or "atom:"
descriptor_prefixes = ('ring:', 'group:', 'chain:', 'bond:', 'atom:')
columns_to_drop = [col for col in plastic_chem.columns if col.startswith(descriptor_prefixes)]
plastic_chem.drop(columns=columns_to_drop, inplace=True)
plastic_chem.to_csv('D:/UCSF_postdoc_topic/REVEAL_topics/REVEAL_200samples_analysis/plastic_map_chemlist/Plastic_Chemical_withupated_annotation.csv', index=False)

In [None]:
# Select the row(s) of plastic_chem whose DTXSID equals 'DTXSID5020730'.
is_target_dtxsid = plastic_chem['DTXSID'] == 'DTXSID5020730'
subset_row = plastic_chem[is_target_dtxsid]

Unnamed: 0,INPUT,FOUND_BY,DTXSID,PREFERRED_NAME,DTXCID,INCHIKEY,CASRN,SMILES,MS_READY_SMILES,MONOISOTOPIC_MASS,...,TOXCAST_NUMBER_OF_ASSAYS/TOTAL,TOXCAST_PERCENT_ACTIVE,ECHAPLASTICS,CPPDBLISTA,CPPDBLISTB,SMILES_ready,Monoisotopic_Mass_ready,CFMID_ID,BloodExpo_check,ExposomeExplorer_check
2702,DTXSID5020730,DSSTox_Substance_Id,DTXSID5020730,8-Hydroxyquinoline,DTXCID30730,MCJGNVYPOGVAJF-UHFFFAOYSA-N,148-24-3,OC1=CC=CC2=CC=CN=C12,OC1=CC=CC2=CC=CN=C12,145.052764,...,163/598,27.26,-,-,-,Oc1cccc2cccnc12,145.052764,Molecule2703,Y,N
