# eMolecule inch_key 변환

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from tqdm import tqdm
import tensorflow as tf

In [2]:
eMol = pd.read_csv('version.smi', sep=' ')
eMol.head()

Unnamed: 0,isosmiles,version_id,parent_id
0,CC(C)CCC[C@@H](C)[C@H]1CC[C@H]2[C@@H]3CC=C4CC(...,9988,9988
1,CC(C)CCC[C@@H](C)[C@H]1CCC2C3CC=C4CC(CC[C@]4(C...,9990,9990
2,CCCCCCCCCCCCCC(=O)OC1CC[C@]2(C)C3CC[C@]4(C)[C@...,10004,10004
3,Br.Oc1cc(on1)C1CCNCC1,10024,299959306
4,Cn1ncc2cc(CN)ccc12,10025,10025


In [8]:
eMol['isosmiles_length'] = eMol['isosmiles'].apply(len)


In [5]:
min_length_data = eMol[eMol['isosmiles_length'] == eMol['isosmiles_length'].min()]

In [9]:
eMol[eMol['isosmiles_length'] <=3]

Unnamed: 0,isosmiles,version_id,parent_id,isosmiles_length
10044,CON,33896,33896,3
127545,BrI,475210,475210,3
127546,ClI,475212,475212,3
127689,[V],475688,475688,3
127747,CCI,475832,475832,3
128952,B#N,479170,479170,3
129956,[W],481713,481713,3
130871,ICI,484057,484057,3
132507,[Y],488368,488368,3
134463,CSC,493755,493755,3


크기가 작은 분자가 많지 않음

In [4]:
len(eMol)

17798589

In [5]:
count_dot = eMol['isosmiles'].apply(lambda x: '.' in x).sum()

print("Dot count in 'isosmiles' column:", count_dot) # 344317

Dot count in 'isosmiles' column: 344317


마침표(.)가 포함되지 않은 데이타만 필터

In [6]:
df_filtered = eMol[~eMol['isosmiles'].str.contains('\.')]

In [7]:
len(df_filtered)

17454272

In [33]:
def smiles_to_inchi(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        inchi_key = Chem.inchi.MolToInchiKey(mol)
        return inchi_key
    except:
        print(smiles)
        return None

In [46]:
def smiles_to_inchi(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        inchi_key = Chem.inchi.MolToInchiKey(mol)
        return inchi_key
    except:
        return None

def batch_process(df, batch_size):
    
    num_batches = len(df) // batch_size
    e_mol_inchi = []  # Create an empty list to store the results
   
    for i in tqdm(range(num_batches)):
        
        batch_df = df.iloc[i * batch_size:(i + 1) * batch_size]
        
        with tf.device('/GPU:0'):
            inchi_batch = [smiles_to_inchi(smi) for smi in batch_df['isosmiles']]
        e_mol_inchi.extend(inchi_batch)  # Extend the list with the inchi values for the current batch
        print("Processing Batch:", i + 1, "/", num_batches)
    return e_mol_inchi

# Assuming you have a DataFrame called df_filtered containing the 'isosmiles' column
batch_size = 1000000
e_mol_inchi = batch_process(df_filtered, batch_size)

파일 저장

In [45]:
df = pd.DataFrame({'inchl_key': e_mol_inchi})
output_file = 'e_mol_inchi.txt'
df.drop_duplicates().to_csv(output_file, index=False, columns=['inchl_key'], sep='\t')

print("DataFrame has been saved as a text file:", output_file)

DataFrame has been saved as a text file: e_mol_inchi.txt


PubChemQC데이타 추가

In [13]:
pub = pd.read_csv('PubChemQC_105052.csv', sep=',')
pub.head()

Unnamed: 0.4,Unnamed: 0,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0.1.1,i,molecular_formula,molecular_weight,atom_count,heavy_atom_count,...,gap,Isomeric_SMILES,Can_SMILES,delocal_charged,protic_N,alcohol,thiol,PH,small_ring,C_only
0,0,0.0,0.0,0.0,24,46225301,C16H12N4O2,292.29208,34,22,...,3.425913,CC1=NN(C2=NC=C3C(=C12)C(=O)N(C3=O)C4=CC=CC=C4)C,Cc1nn(C)c2ncc3c(c12)C(=O)N(c1ccccc1)C3=O,0,0,0,0,0,0,0
1,1,1.0,1.0,1.0,39,46225638,C18H18O,250.33492,37,19,...,4.266745,CC1=CC(=CC(=C1C)C)/C=C/C(=O)C2=CC=CC=C2,Cc1cc(/C=C/C(=O)c2ccccc2)cc(C)c1C,0,0,0,0,0,0,0
2,2,2.0,2.0,2.0,48,46225722,C15H11N3O4,297.26554,33,22,...,3.904834,CC1=NC2=CC3=C(C=C2C(=O)N1/N=C/C4=CC=CO4)OCO3,Cc1nc2cc3c(cc2c(=O)n1/N=C/c1ccco1)OCO3,0,0,0,0,0,0,0
3,3,3.0,3.0,3.0,49,46225741,C13H8N2O,208.21542,24,16,...,4.571513,C1=CC=C2C(=C1)C3=C(C4=C2OC=C4)N=CN3,c1ccc2c(c1)c1[nH]cnc1c1ccoc21,0,0,0,0,0,0,0
4,4,4.0,4.0,4.0,52,46225750,C15H11N3O,249.26734,30,19,...,3.719796,CC1=CC=CC2=NC3=C(N12)NC(=O)C4=CC=CC=C43,Cc1cccc2nc3c4ccccc4c(=O)[nH]c3n12,0,0,0,0,0,0,0


In [14]:
pub = pub['Can_SMILES']

In [17]:
pub

0         Cc1nn(C)c2ncc3c(c12)C(=O)N(c1ccccc1)C3=O
1                Cc1cc(/C=C/C(=O)c2ccccc2)cc(C)c1C
2           Cc1nc2cc3c(cc2c(=O)n1/N=C/c1ccco1)OCO3
3                    c1ccc2c(c1)c1[nH]cnc1c1ccoc21
4                Cc1cccc2nc3c4ccccc4c(=O)[nH]c3n12
                            ...                   
105047                              COc1cncc(OC)n1
105048                         CC12C=CC=CN1C=CC=C2
105049                              CN(C)C(=O)CCCl
105050                              CN1CCN(C=O)CC1
105051                                 CC=Nn1cnnc1
Name: Can_SMILES, Length: 105052, dtype: object

In [20]:
pub_inchi = [smiles_to_inchi(smi) for smi in pub]



In [19]:
e_mol_inchi

['SDKNRANMZNPJQT-UHFFFAOYSA-N',
 'VHUCZMOFPPWAMU-MDZDMXLPSA-N',
 'FHODSCYHLOAPJS-FRKPEAEDSA-N',
 'JLZQEUBSMVOXES-UHFFFAOYSA-N',
 'HTKOKKOVQKATLA-UHFFFAOYSA-N',
 'JPTNPSZJDHEUFK-UHFFFAOYSA-N',
 'SOOIBVIVGQQXKC-UHFFFAOYSA-N',
 'ZDABXFUNGSDQRN-UHFFFAOYSA-N',
 'DJVNIZRUROIYHP-UHFFFAOYSA-N',
 'WMNRJMXQVUQNAM-UHFFFAOYSA-N',
 'DYAZPVTUJDWMKF-UHFFFAOYSA-N',
 'ZREWYLZFFAQUQO-UHFFFAOYSA-N',
 'RARIGSUTWNWABP-UHFFFAOYSA-N',
 'FTMDCIZRWZYMNU-UHFFFAOYSA-N',
 'CRVJPGRMLNHQLC-UHFFFAOYSA-N',
 'NESVGLWWFBOVIY-UHFFFAOYSA-N',
 'DPJGQPSYBWEMHW-UHFFFAOYSA-N',
 'GYKBNNSJFMLXTA-UHFFFAOYSA-N',
 'NYLCTMQHPFIJCL-UHFFFAOYSA-N',
 'XUFJHTRVFXOZOI-UHFFFAOYSA-N',
 'QNULDXGNJUBSID-UHFFFAOYSA-N',
 'SQQOXRNSADVEKZ-UHFFFAOYSA-N',
 'XMHWVYKLRBMJEF-UHFFFAOYSA-N',
 'LVDCFTVFEPEBSA-UHFFFAOYSA-N',
 'LDCAQGBKMDSYGC-UHFFFAOYSA-N',
 'ODXOOZPEIBOMFV-UHFFFAOYSA-N',
 'FCNAOJXSMDJGML-UHFFFAOYSA-N',
 'VIWUZDDDWGXKHU-UHFFFAOYSA-N',
 'KTAAQEHLGMUIRW-UHFFFAOYSA-N',
 'TWYFKGGOHHQTDL-UHFFFAOYSA-N',
 'JVAVTGGGYFPSIB-UHFFFAOYSA-N',
 'SXNYQF

In [1]:
pub_file='e_mol_inchi.txt'
df_existing = pd.read_csv(pub_file, sep='\t')

    

NameError: name 'pd' is not defined

In [27]:
df_existing.head()

Unnamed: 0,inchl_key
0,OTVRYZXVVMZHHW-FNOPAARDSA-N
1,QNEPTKZEXBPDLF-AWEQCRHCSA-N
2,SJDMTGSQPOFVLR-ZVMQTDSHSA-N
3,ILYZCZGIDUYDOI-UHFFFAOYSA-N
4,JWQMRBBWZUKWFJ-UHFFFAOYSA-N


In [29]:
df_appended = df_existing.append(pub_inchi, ignore_index=True)

In [31]:
len(df_existing)

16859199

In [30]:
len(df_appended)

16964251

In [32]:
df_appended.drop_duplicates().to_csv('e_mol_inchi.txt', index=False, columns=['inchl_key'], sep='\t')