In [11]:
import pandas as pd
import numpy as np

# Load the QM table and report how many distinct SMILES strings appear on more than one row.
data = pd.read_csv('../data/QM_137k.csv')
duplicate_counts = data['smiles'].value_counts()
# value_counts() never yields missing counts, so summing the boolean mask
# gives the same number as counting the filtered entries.
occurs_more_than_once = duplicate_counts > 1
print("Количество одинаковых значений в столбце 'smiles':")
print(occurs_more_than_once.sum())


Количество одинаковых значений в столбце 'smiles':
2587


In [12]:
import pandas as pd
from rdkit import Chem
from tqdm import tqdm
import os

def canonical_smiles(smiles):
    """Round-trip a SMILES string through RDKit and report how it went.

    Parameters
    ----------
    smiles : str or missing value
        Input SMILES. Any value for which ``pd.isna`` is true is treated as
        missing and is not passed to RDKit.

    Returns
    -------
    tuple
        ``(canonical, success, is_nan)`` where

        * ``canonical`` is the string returned by ``Chem.MolToSmiles`` or
          ``None`` when no molecule could be built,
        * ``success`` is ``True`` only when a molecule was parsed and written
          back,
        * ``is_nan`` is ``True`` only when the input was missing.
    """
    if pd.isna(smiles):
        return None, False, True
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # The parser signalled failure by returning None instead of raising.
            return None, False, False
        return Chem.MolToSmiles(mol), True, False
    except Exception:
        # Deliberately best-effort: any conversion error marks the row as failed.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt / SystemExit
        # propagate, so a long batch run can still be interrupted.
        return None, False, False

# Run every SMILES of the QM table through canonical_smiles; each call returns a
# (canonical, success, is_nan) triple, which is transposed into three columns.
qm_triples = [canonical_smiles(smile) for smile in tqdm(data['smiles'], desc='Processing data')]
qm_canonical, qm_success, qm_is_nan = zip(*qm_triples)
data['Canonical_smiles'] = qm_canonical
data['Conversion_Success'] = qm_success
data['Is_NaN'] = qm_is_nan

# Directory listing of the folder holding the reference CSV files.
folder_path = './data'
files = os.listdir(folder_path)


Processing data: 100%|██████████| 136219/136219 [00:22<00:00, 6149.13it/s]


In [13]:

# Canonicalise the SMILES of every ';'-separated CSV found in `folder_path`
# and stack the results into one reference table.
# Frames are collected in a list and concatenated once: concatenating inside
# the loop re-copies all accumulated rows on every iteration, and seeding the
# accumulator with an empty DataFrame relies on concat behaviour that newer
# pandas releases deprecate.
reference_frames = []
for file in files:
    if file.endswith('.csv'):
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path, sep=';')
        df['Canonical_Smiles'], df['Conversion_Success'], df['Is_NaN'] = zip(*[canonical_smiles(smile) for smile in tqdm(df['Smiles'], desc=f'Processing {file}')])
        reference_frames.append(df)

# With no CSV files this stays an empty frame, exactly as before.
combined_df = pd.concat(reference_frames) if reference_frames else pd.DataFrame()

# Keep only the rows whose SMILES could be parsed ('Conversion_Success' holds booleans).
combined_df_sorted = combined_df[combined_df['Conversion_Success']]

# Only the last expression of a cell is rendered, so the former bare
# `combined_df_sorted['Canonical_Smiles']` statement had no effect and is dropped.
data['Canonical_smiles']

Processing approved_drug_chembl.csv:   0%|          | 0/4192 [00:00<?, ?it/s]

Processing approved_drug_chembl.csv: 100%|██████████| 4192/4192 [00:01<00:00, 3496.22it/s]
Processing phase_2.csv: 100%|██████████| 2474/2474 [00:00<00:00, 4826.42it/s]
Processing phase_3.csv: 100%|██████████| 1735/1735 [00:00<00:00, 4592.10it/s]
Processing phase_1.csv: 100%|██████████| 1595/1595 [00:00<00:00, 5674.79it/s]


0                            CNC(=S)N/N=C/c1c(O)ccc2ccccc12
1                             O=C(NCCn1cccc1)c1cccc2ccccc12
2                         C=C(C)[C@H]1C[C@@H]2OO[C@H]1C=C2C
3                                            OCCCc1cc[nH]n1
4                             CC(=N)NCc1cccc(CNCc2ccncc2)c1
                                ...                        
136214    CC(C)(O)C#Cc1ccc(B(O)O)c([C@H](Cc2cc(F)cc(F)c2...
136215    CC(C)(C)OC(=O)N1CC(CC#N)(n2cc(B3OC(C)(C)C(C)(C...
136216                           CC1(C)OB(C2=CCNCC2)OC1(C)C
136217         CC(C)(C)OC(=O)Nc1cc(B2OC(C)(C)C(C)(C)O2)ccn1
136218              CC1(C)OB(c2cc(Br)cc(C(F)(F)F)c2)OC1(C)C
Name: Canonical_smiles, Length: 136219, dtype: object

In [14]:
# Keep the QM rows whose canonical SMILES also occurs among the successfully
# parsed reference compounds, then drop the helper columns added for matching.
in_reference = data['Canonical_smiles'].isin(combined_df_sorted['Canonical_Smiles'])
data_intersection = (
    data[in_reference]
    .drop(['Canonical_smiles', 'Conversion_Success', 'Is_NaN'], axis=1)
    # A single reset moves the original row labels of `data` into an 'index'
    # column. The previous second reset_index call only added a redundant
    # 'level_0' column (0..n-1) and is removed.
    .reset_index()
)

# Export the mapping from original row label to SMILES; the CSV content is unchanged.
df = data_intersection[['index', 'smiles']].copy()
df.to_csv('smiles_id.csv')


In [15]:
# Show (rows, columns) of the exported 'index'/'smiles' table.
df.shape

(994, 2)