In [1]:
import pandas as pd
from rdkit import Chem
import seaborn as sns
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt


In [2]:
pubchem_mapping_file = "./map_to_common_names//actives_common_name_SMILES_mapping.txt"
# Each line of the mapping file is whitespace-delimited: the first token is the
# SMILES and every remaining token belongs to the common name. pandas doesn't
# read the nasty line splitting well, so the file is parsed by hand into a
# SMILES -> common-name dict.
lookup = {}
with open(pubchem_mapping_file, "r") as f:
    # Iterate the handle directly instead of materialising every line with
    # readlines(); the lines visited are the same.
    for line in f:
        toks = line.split()
        if not toks:
            # Blank / whitespace-only line (e.g. a trailing newline at EOF):
            # split() yields [] and toks[0] would raise IndexError, so skip it.
            continue
        smiles = toks[0]
        rest = toks[1:]
        # Name tokens are concatenated with no separator; a line that holds
        # only a SMILES therefore maps to the empty string.
        name = "".join(rest)
        # A SMILES that appears on several lines keeps the last name seen.
        lookup[smiles] = name



In [3]:
# Load the combined actives table whose missing common names get backfilled below.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# previously saved index being read back as data — confirm whether it is wanted.
actives_combined = pd.read_csv("final/actives_combined.csv")

In [4]:
# Preview the first rows of the freshly loaded table (before any name backfill).
actives_combined.head()

Unnamed: 0,CANONICAL_SMILES,INCHIKEY,common_name,appears_in_N_ds
0,CC#C[C@]1(O)CC[C@H]2C3=C(C4=C(CC3)CC(=O)CC4)C(...,BHZCGCMWEVHUBA-ZLMDOPAYSA-N,mifepristone,4
1,COc1ccc([C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)...,HSUGRBWQSSZJOP-LBAQZLPGSA-N,diltiazem,4
2,COCC(=O)O[C@]1(CCN(C)CCCc2nc3ccccc3[nH]2)CCc2c...,HBNPJJILLOYFJU-VMPREFPWSA-N,mibefradil,4
3,Cc1ccc(-c2nc3ccc(C)cn3c2CC(=O)N(C)C)cc1,ZAFYATHCZYHLPB-UHFFFAOYSA-N,zolpidem,3
4,COc1ccc(CCN(C)CCC[C@@](C#N)(c2cc(OC)c(OC)c(OC)...,XQLWNAFCTODIRK-NDEPHWFRSA-N,gallopamil,3


In [5]:
def assign_common_name_if_not_exists(mapping, SMILES, common_name):
    """Return ``common_name``, falling back to ``mapping[SMILES]`` when it is missing.

    Args:
        mapping: Subscriptable lookup keyed by SMILES.
        SMILES: Key used for the fallback lookup.
        common_name: Existing name; treated as missing when ``pd.isna`` is true
            for it (e.g. ``NaN`` or ``None``).

    Returns:
        ``common_name`` unchanged when it is present, otherwise ``mapping[SMILES]``.

    Raises:
        KeyError: If the name is missing and ``SMILES`` is not a key of ``mapping``.
    """
    # Guard clause: a name that is already present always wins over the lookup.
    if not pd.isna(common_name):
        return common_name
    return mapping[SMILES]

In [6]:
# Backfill missing common names row by row: rows that already carry a name keep
# it, the others take the name stored for their CANONICAL_SMILES in `lookup`.
actives_combined['common_name'] = actives_combined.apply(
    lambda record: assign_common_name_if_not_exists(
        lookup,
        record["CANONICAL_SMILES"],
        record["common_name"],
    ),
    axis=1,
)


In [7]:
# Display the table after the common_name backfill (the repr is truncated for long frames).
actives_combined

Unnamed: 0,CANONICAL_SMILES,INCHIKEY,common_name,appears_in_N_ds
0,CC#C[C@]1(O)CC[C@H]2C3=C(C4=C(CC3)CC(=O)CC4)C(...,BHZCGCMWEVHUBA-ZLMDOPAYSA-N,mifepristone,4
1,COc1ccc([C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)...,HSUGRBWQSSZJOP-LBAQZLPGSA-N,diltiazem,4
2,COCC(=O)O[C@]1(CCN(C)CCCc2nc3ccccc3[nH]2)CCc2c...,HBNPJJILLOYFJU-VMPREFPWSA-N,mibefradil,4
3,Cc1ccc(-c2nc3ccc(C)cn3c2CC(=O)N(C)C)cc1,ZAFYATHCZYHLPB-UHFFFAOYSA-N,zolpidem,3
4,COc1ccc(CCN(C)CCC[C@@](C#N)(c2cc(OC)c(OC)c(OC)...,XQLWNAFCTODIRK-NDEPHWFRSA-N,gallopamil,3
...,...,...,...,...
9274,Cc1nc2c3ccccc3nc(NNC(=O)CCC(=O)N3CCN(c4ccccc4F...,KQEZZZZKWOPGAX-UHFFFAOYSA-N,C600-0088,1
9275,Cc1cccc(N2CCN(C(=O)CCC(=O)NNc3nc4ccccc4c4nc(C)...,SWUYLFQSAWAMFM-UHFFFAOYSA-N,C600-0098,1
9276,Cc1nc2c3ccccc3nc(NNC(=O)CCC(=O)N3CCN(c4cccc(Cl...,SKCLDYRCRQAAMC-UHFFFAOYSA-N,C600-0103,1
9277,Cc1nc2c3ccccc3nc(NNC(=O)CCC(=O)N3CCN(c4cc(Cl)c...,DJFZKZSOMMIQLN-UHFFFAOYSA-N,C600-0115,1


In [8]:
# Preview the first rows again, this time after the common_name backfill.
actives_combined.head()

Unnamed: 0,CANONICAL_SMILES,INCHIKEY,common_name,appears_in_N_ds
0,CC#C[C@]1(O)CC[C@H]2C3=C(C4=C(CC3)CC(=O)CC4)C(...,BHZCGCMWEVHUBA-ZLMDOPAYSA-N,mifepristone,4
1,COc1ccc([C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)C2OC(C)...,HSUGRBWQSSZJOP-LBAQZLPGSA-N,diltiazem,4
2,COCC(=O)O[C@]1(CCN(C)CCCc2nc3ccccc3[nH]2)CCc2c...,HBNPJJILLOYFJU-VMPREFPWSA-N,mibefradil,4
3,Cc1ccc(-c2nc3ccc(C)cn3c2CC(=O)N(C)C)cc1,ZAFYATHCZYHLPB-UHFFFAOYSA-N,zolpidem,3
4,COc1ccc(CCN(C)CCC[C@@](C#N)(c2cc(OC)c(OC)c(OC)...,XQLWNAFCTODIRK-NDEPHWFRSA-N,gallopamil,3


In [9]:
# Count how many rows share each common name.
# NOTE(review): a blank entry in this output is the empty-string name; it can
# arise when a mapping-file line holds only a SMILES and no name tokens —
# confirm whether those rows should be left unnamed.
actives_combined.common_name.value_counts()

common_name
                                                                                                                                               1027
N-(2,6-dimethylbenzyl)-1,2-dimethyl-1H-pyrrolo[3,2-c]pyridin-4-amine                                                                              1
4-(2-(4-chloro-1H-pyrazol-1-yl)ethylamino)-3-(4-methyl-6-morpholino-1H-benzo[d]imidazol-2-yl)pyridin-2(1H)-one                                    1
4-[(S)-2-(3-Chloro-phenyl)-2-hydroxy-ethylamino]-3-[6-(1,4-dioxa-8-aza-spiro[4.5]dec-8-yl)-4-methyl-1H-benzoimidazol-2-yl]-1H-pyridin-2-one       1
PD189076                                                                                                                                          1
                                                                                                                                               ... 
N-methyl-N-[2-[4-[1-(2,4,6-trimethylphenyl)sulfonyl-3-pyrazolyl]phenoxy]ethyl]-2-pyridinamine       

In [10]:
# Persist the table with backfilled names; index=False leaves the DataFrame index out of the file.
actives_combined.to_csv("./map_to_common_names/actives_combined_common_name_mapping.csv", index=False)

In [11]:
# Load the second actives table (file name: "more than one dataset") for the same name backfill.
actives_multi_ds_combined = pd.read_csv("final/actives_combined_more_than_one_dataset.csv")

In [12]:
# Same backfill as for the combined table: keep an existing common name,
# otherwise take the one stored for the row's CANONICAL_SMILES in `lookup`.
actives_multi_ds_combined['common_name'] = actives_multi_ds_combined.apply(
    lambda record: assign_common_name_if_not_exists(
        lookup,
        record["CANONICAL_SMILES"],
        record["common_name"],
    ),
    axis=1,
)


In [13]:
# Persist the multi-dataset table with backfilled names; index=False leaves the DataFrame index out of the file.
actives_multi_ds_combined.to_csv("./map_to_common_names/actives_more_than_one_ds_common_name_mapping.csv", index=False)