In [11]:
import pandas as pd
from rdkit import Chem

In [23]:
def canonicalize(Dataframe: pd.DataFrame):
    """Canonicalize the SMILES strings of a DataFrame in place.

    Every entry of the 'SMILES' column is parsed with RDKit
    (``Chem.MolFromSmiles``) and replaced by the SMILES string RDKit writes
    back for that molecule (``Chem.MolToSmiles``), so that the same molecule
    is spelled the same way regardless of how it was written originally.

    Args:
        Dataframe: DataFrame with a column called 'SMILES' containing SMILES
            strings. The column is overwritten in place.

    Returns:
        None. The result is stored in ``Dataframe['SMILES']``.

    Raises:
        KeyError: if ``Dataframe`` has no 'SMILES' column.
        ValueError: if at least one entry is not a string (e.g. a missing
            value) or cannot be parsed by RDKit. ``Dataframe`` is left
            unmodified in that case.
    """

    def _canonical_or_none(smiles):
        # Missing CSV cells arrive as NaN (a float); only strings can be parsed.
        if not isinstance(smiles, str):
            return None
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles reports an unparsable string by returning None;
        # passing that None on to MolToSmiles would fail with an opaque error.
        return None if mol is None else Chem.MolToSmiles(mol)

    canonical = Dataframe['SMILES'].apply(_canonical_or_none)

    # Validate before assigning so a failure never leaves a half-converted column.
    failed = canonical.isna()
    if failed.any():
        examples = Dataframe.loc[failed, 'SMILES'].head(5).tolist()
        raise ValueError(
            f"{int(failed.sum())} entries in the 'SMILES' column could not be "
            f"canonicalized (first offenders: {examples!r})"
        )

    Dataframe['SMILES'] = canonical
    

In [27]:
def check_overlapp_Smiles(Dataframe_1: pd.DataFrame, Dataframe_2: pd.DataFrame):
    """Return the rows whose SMILES occur in both DataFrames.

    The two DataFrames are inner-joined on their 'SMILES' columns, so each
    row of the result carries the data of both inputs for one shared SMILES
    string. SMILES are compared as plain strings; canonicalize both frames
    first if the same molecule may be written differently.

    Args:
        Dataframe_1: DataFrame with a 'SMILES' column (left side of the join;
            its row order is kept in the result).
        Dataframe_2: DataFrame with a 'SMILES' column (right side of the join).

    Returns:
        A new DataFrame containing the overlapping SMILES together with the
        remaining columns of both inputs. It is empty when nothing overlaps.
        Neither input is modified.

    Raises:
        KeyError: if either DataFrame has no column called 'SMILES'.
    """
    for position, frame in enumerate((Dataframe_1, Dataframe_2), start=1):
        if 'SMILES' not in frame.columns:
            raise KeyError(f"Dataframe_{position} has no column called 'SMILES'")

    # pandas matches missing join keys (NaN/None) with each other, which would
    # report rows without any SMILES as "overlapping"; drop them beforehand.
    left = Dataframe_1.dropna(subset=['SMILES'])
    right = Dataframe_2.dropna(subset=['SMILES'])

    df_overlapp = pd.merge(left, right, on='SMILES', how='inner')
    return df_overlapp
    
    
    

In [16]:
# Load the ADME data set (the one without bioavailability data) from CSV.
# NOTE(review): absolute, machine-specific path - this only runs on the author's machine.
df_noBIO = pd.read_csv(
    filepath_or_buffer=r'/Users/matthiasgalka/git/AI-for-Chemistry/ADME_public_set_3521.csv',
)

In [18]:
# Load the second data set from CSV; this file is parsed with ';' as the field separator.
# NOTE(review): absolute, machine-specific path - this only runs on the author's machine.
df_BIO = pd.read_csv(
    filepath_or_buffer=r'/Users/matthiasgalka/git/AI-for-Chemistry/11095_2013_1222_MOESM2_ESM.csv',
    sep=';',
)

In [38]:
# Give both frames the same name for the structure column: 'Updated SMILES' -> 'SMILES'.
# rename() silently skips a frame that has no 'Updated SMILES' column.
df_noBIO.rename(mapper={'Updated SMILES': 'SMILES'}, axis='columns', inplace=True)
df_BIO.rename(mapper={'Updated SMILES': 'SMILES'}, axis='columns', inplace=True)

In [39]:
# Preview the first rows of df_noBIO to check the renamed 'SMILES' column.
df_noBIO.head()

Unnamed: 0,Internal ID,Vendor ID,SMILES,CollectionName,LOG HLM_CLint (mL/min/kg),LOG MDR1-MDCK ER (B-A/A-B),LOG SOLUBILITY PH 6.8 (ug/mL),LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound),LOG PLASMA PROTEIN BINDING (RAT) (% unbound),LOG RLM_CLint (mL/min/kg)
0,Mol1,317714313,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,emolecules,0.675687,1.493167,0.089905,0.991226,0.518514,1.392169
1,Mol2,324056965,CCOc1cc2nn(CCC(C)(C)O)cc2cc1NC(=O)c1cccc(C(F)F)n1,emolecules,0.675687,1.04078,0.550228,0.099681,0.268344,1.02792
2,Mol3,304005766,CN(c1ncc(F)cn1)[C@H]1CCCNC1,emolecules,0.675687,-0.358806,,2.0,2.0,1.02792
3,Mol4,194963090,CC(C)(Oc1ccc(-c2cnc(N)c(-c3ccc(Cl)cc3)c2)cc1)C...,emolecules,0.675687,1.026662,1.657056,-1.158015,-1.403403,1.02792
4,Mol5,324059015,CC(C)(O)CCn1cc2cc(NC(=O)c3cccc(C(F)(F)F)n3)c(C...,emolecules,0.99638,1.010597,,1.015611,1.092264,1.629093


In [40]:
# Preview the first rows of df_BIO to check that it exposes a 'SMILES' column.
df_BIO.head()

Unnamed: 0,No,Name,SMILES,%F,logK(%F),Category,Source
0,1,3-Ketodesogestrel,OC1(CCC2C3C(C4C(=CC(=O)CC4)CC3)C(CC12CC)=C)C#C,76.0,0.500602,1,58131416
1,2,Abacavir,OCC1CC(n2c3nc(nc(NC4CC4)c3nc2)N)C=C1,83.0,0.688629,1,35812131416
2,3,Abecarnil,O(Cc1ccccc1)C=1C=CC2=NC=3C(=C2C=1)C(COC)=C(NC=...,92.0,1.060698,1,1516
3,4,Acadesine,NC(=O)c1ncn(C2OC(CO)C(O)C2O)c1N,10.0,-0.954243,0,1516
4,5,Acamprosate,S(O)(=O)(=O)CCCNC(=O)C,11.0,-0.907997,0,58131416


In [42]:
# Canonicalize the 'SMILES' column of both frames in place so that identical
# molecules are written identically before the two data sets are compared.
for _frame in (df_noBIO, df_BIO):
    canonicalize(_frame)

In [43]:
# Pull the 'SMILES' column of each frame out as a plain Python list.
noBIO_SMILES, BIO_SMILES = (
    frame['SMILES'].to_list() for frame in (df_noBIO, df_BIO)
)

In [45]:
# Number of SMILES entries taken from df_BIO.
len(BIO_SMILES)

995

In [47]:
def find_duplicates(list1, list2):
    """Find the elements that occur in both lists.

    Args:
        list1: first list; determines the order of the result.
        list2: second list; only used for membership tests.

    Returns:
        A list with every element of ``list1`` that also occurs in ``list2``.
        Each element is reported once, in the order of its first occurrence
        in ``list1``. The result is empty if either list is empty.

    Raises:
        TypeError: if an element shared by both lists is unhashable.
    """
    try:
        # Hash list2 once so each membership test is O(1) instead of a full
        # scan of list2 for every element of list1.
        lookup = set(list2)
    except TypeError:
        # list2 contains unhashable elements: keep scanning the list itself.
        lookup = list2

    duplicates = []  # overlapping elements, in list1 order
    seen = set()     # overlapping elements already reported

    for item in list1:
        try:
            in_both = item in lookup
        except TypeError:
            # An unhashable item cannot be looked up in a set; compare it
            # against list2 element by element instead.
            in_both = item in list2
        if in_both and item not in seen:
            duplicates.append(item)
            seen.add(item)

    return duplicates

Überlappende Elemente: [4, 5]


In [56]:
# SMILES strings that occur in both data sets (each reported once).
overlap = find_duplicates(list1=noBIO_SMILES, list2=BIO_SMILES)
# Display how many SMILES the two data sets share.
len(overlap)

24

In [61]:
# Inner join on 'SMILES': keep only molecules present in both data sets and
# put the columns of df_noBIO and df_BIO side by side for each of them.
overlapping_data = df_noBIO.merge(right=df_BIO, how='inner', on='SMILES')

In [62]:
# Preview the merged rows: columns of df_noBIO followed by those of df_BIO.
overlapping_data.head()

Unnamed: 0,Internal ID,Vendor ID,SMILES,CollectionName,LOG HLM_CLint (mL/min/kg),LOG MDR1-MDCK ER (B-A/A-B),LOG SOLUBILITY PH 6.8 (ug/mL),LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound),LOG PLASMA PROTEIN BINDING (RAT) (% unbound),LOG RLM_CLint (mL/min/kg),No,Name,%F,logK(%F),Category,Source
0,Mol7,299985775,CC(C)NCC(O)COc1cccc2ccccc12,emolecules,1.327232,-0.162401,,,1.347759,3.318276,770,Propranolol,26.0,-0.454258,0,58101213141516
1,Mol8,538570,COc1ccc(Cl)cc1C(=O)NCCc1ccc(S(=O)(=O)NC(=O)NC2...,emolecules,1.458063,2.091561,,-0.917215,-0.943095,2.728062,417,Glyburide (Glibenclamide),82.0,0.658541,1,121516
2,Mol106,496510,Cc1cnc(C(=O)NCCc2ccc(S(=O)(=O)NC(=O)NC3CCCCC3)...,emolecules,0.675687,2.281583,,0.017451,-0.20412,1.02792,415,Glipizide,95.0,1.278754,1,358101213141516
3,Mol122,511449,CCCCNC(=O)NS(=O)(=O)c1ccc(C)cc1,emolecules,0.675687,0.25098,,0.439333,0.440122,1.02792,917,Tolbutamide,85.0,0.753328,1,358101213141516
4,Mol130,535781,CCCNC(=O)NS(=O)(=O)c1ccc(Cl)cc1,emolecules,0.675687,,,0.969835,0.628287,1.180527,186,CHLORPROPAMIDE,95.0,1.278754,1,581013141516


In [63]:
# (rows, columns) of the merged frame; the row count should agree with len(overlap).
overlapping_data.shape

(24, 16)

In [66]:
# Persist the merged overlap table as CSV. The row index is written as well,
# because to_csv is called without index=False.
# NOTE(review): absolute, machine-specific path - this only runs on the author's machine.
overlapping_data.to_csv(
    path_or_buf=r'/Users/matthiasgalka/git/AI-for-Chemistry/overlapping_data.csv',
)