In [None]:
## Requires an installed and activated RDKit conda environment

import os

import numpy as np
import pandas as pd
import requests

import rdkit
from rdkit import RDConfig
from rdkit import rdBase
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem import rdRGroupDecomposition

In [None]:
# Read the compound IDs into a one-column frame named 'CID'.
# The file has no header row: its first line is treated as data.
CIDS = pd.read_csv('chem_id_pubmed.txt', header=None, names=['CID'])

In [None]:
# Keep only the 'CID' column, with every value converted to int
CIDS = CIDS['CID'].map(int)

### Get data from PubChem

In [None]:
pip install pubchempy

In [None]:
import pubchempy as pcp

In [None]:
# Empty frame that will hold the per-compound features
feature_columns = ['CID', 'CanonicalSMILES', 'Fingerprint2D', 'MolecularFormula']
chem_data = pd.DataFrame(columns=feature_columns)

In [None]:
# Look up every CID through the pubchempy API and collect the compound's
# SMILES, 2D fingerprint and molecular formula.
# Rows are gathered in a plain list and converted to a DataFrame once:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every iteration.
records = []

for CID in CIDS:
    comp = pcp.Compound.from_cid(CID)
    records.append({
        'CID': comp.cid,
        'CanonicalSMILES': comp.canonical_smiles,
        'Fingerprint2D': comp.fingerprint,
        'MolecularFormula': comp.molecular_formula,
    })

# Passing the columns explicitly keeps the expected layout even when CIDS is empty
chem_data = pd.DataFrame(
    records,
    columns=['CID', 'CanonicalSMILES', 'Fingerprint2D', 'MolecularFormula'],
)


In [None]:
# Save without the row index; otherwise reloading the file with read_csv
# produces an extra 'Unnamed: 0' column.
chem_data.to_csv('chem_data.csv', index=False)

### SMILES to RDKit Molecules, to SDF

In [None]:
# Reload the compound table (SMILES and fingerprints) from chem_data.csv
chem_data = pd.read_csv(filepath_or_buffer='chem_data.csv')

In [None]:
# Build RDKit molecule objects from the 'CanonicalSMILES' column and store
# them in a new 'Molecule' column (modifies chem_data in place)
PandasTools.AddMoleculeColumnToFrame(
    chem_data,
    smilesCol='CanonicalSMILES',
    molCol='Molecule',
)

In [None]:
# Display the molecule column
chem_data.loc[:, 'Molecule']

In [None]:
# Write the molecules to an SDF file, passing every column name of the frame
# as a property to export
sdf_properties = list(chem_data.columns)
PandasTools.WriteSDF(
    chem_data,
    'chem_data.sdf',
    molColName='Molecule',
    properties=sdf_properties,
)

### Molecule Visualisation

In [None]:
# Plain Python list holding every molecule from the 'Molecule' column, in row order
ms = list(chem_data['Molecule'])


In [None]:
# Render all compounds as a grid image through the RDKit pandas helpers:
# four molecules per row, 300x300 sub-images, each labelled with its CID
PandasTools.FrameToGridImage(
    chem_data,
    column='Molecule',
    legendsCol='CID',
    molsPerRow=4,
    subImgSize=(300, 300),
)

### Get compounds' fingerprints

#### Morgan fingerprints

In [None]:
from rdkit import DataStructs

In [None]:
# Morgan fingerprint for every molecule; the extra positional argument 2 is
# passed to GetMorganFingerprint after the molecule
molecules = chem_data['Molecule']
chem_data['Morgan'] = molecules.apply(AllChem.GetMorganFingerprint, args=(2,))

In [None]:
# Dice similarity for every ordered pair of Morgan fingerprints
# (self-pairs and both orderings of each pair are included)
morgan_fps = chem_data['Morgan']
morgan_Similarity = [
    DataStructs.DiceSimilarity(fp_a, fp_b)
    for fp_a in morgan_fps
    for fp_b in morgan_fps
]


#### MACCS keys

In [None]:
from rdkit.Chem import MACCSkeys

In [None]:
# MACCS keys fingerprint for every molecule
molecules = chem_data['Molecule']
chem_data['MACCS keys'] = molecules.apply(MACCSkeys.GenMACCSKeys)

In [None]:
# Fingerprint similarity for every ordered pair of MACCS keys
# (self-pairs and both orderings of each pair are included)
maccs_fps = chem_data['MACCS keys']
MACCS_Similarity = [
    DataStructs.FingerprintSimilarity(fp_a, fp_b)
    for fp_a in maccs_fps
    for fp_b in maccs_fps
]
        


#### RDKit fingerprints

In [None]:
from rdkit.Chem.Fingerprints import FingerprintMols

In [None]:
# RDKit fingerprint (FingerprintMols defaults) for every molecule
molecules = chem_data['Molecule']
chem_data['RDKit fingerprints'] = molecules.apply(FingerprintMols.FingerprintMol)

In [None]:
# Fingerprint similarity for every ordered pair of RDKit fingerprints
# (self-pairs and both orderings of each pair are included)
rdkit_fps = chem_data['RDKit fingerprints']
RDKit_Similarity = [
    DataStructs.FingerprintSimilarity(fp_a, fp_b)
    for fp_a in rdkit_fps
    for fp_b in rdkit_fps
]

In [None]:
# Pairwise Tanimoto similarity of RDKit fingerprints, each unordered pair once,
# written to RDKit_fps.csv sorted from most to least similar.

# `mols` and `smiles` are not defined by any earlier cell, so derive them here
# to keep the cell runnable on a fresh kernel.
# NOTE(review): assumes they were meant to be the chem_data molecules and
# their CanonicalSMILES strings - confirm.
mols = list(chem_data['Molecule'])
smiles = list(chem_data['CanonicalSMILES'])

# list of fingerprints
fps = [FingerprintMols.FingerprintMol(x, minPath=1, maxPath=7, fpSize=2048,
                               bitsPerHash=2, useHs=True, tgtDensity=0.0,
                               minSize=128) for x in mols]

# lists for the variables
qu, ta, sim = [], [], []

# compare all fp pairwise without duplicates
for n in range(len(fps) - 1):  # the last fp has no later fp to be compared with
    # slice the targets once per query instead of once per collected value
    targets = smiles[n + 1:]
    s = DataStructs.BulkTanimotoSimilarity(fps[n], fps[n + 1:])  # fp n against all later fps
    print(smiles[n], targets)  # which mol is compared with which group
    # collect the SMILES and values; zip pairs each target with its similarity
    for target, value in zip(targets, s):
        qu.append(smiles[n])
        ta.append(target)
        sim.append(value)


# build the dataframe and sort it
d = {'query': qu, 'target': ta, 'Similarity': sim}
df_final = pd.DataFrame(data=d)
df_final = df_final.sort_values('Similarity', ascending=False)


# save as csv
df_final.to_csv('RDKit_fps.csv', index=False, sep=',')




### Plot distribution of similarity

In [None]:
import seaborn as sns

# make similarity distribution histograms, one figure per fingerprint type
# displot returns a FacetGrid, which has no set_title(); FacetGrid.set(title=...)
# applies the title to the axes of the grid instead.
sns.displot(RDKit_Similarity).set(title='RDKit_Similarity')
sns.displot(morgan_Similarity).set(title='morgan_Similarity')
sns.displot(MACCS_Similarity).set(title='MACCS_Similarity')
