In [26]:
import os
import sys
import pathlib

# Absolute path three levels above the current working directory
# (parents[0] is the cwd's parent, so parents[2] is its great-grandparent).
# NOTE(review): presumably the repository root, so that the `KGGraph` package
# imported further down resolves — confirm against where this notebook lives.
root_dir = str(pathlib.Path().resolve().parents[2])

# Extend the import search path only once: re-running this cell in a live
# kernel would otherwise append the same entry again on every execution.
if root_dir not in sys.path:
    sys.path.append(root_dir)


In [27]:
from KGGraph.KGGChem.standardize import SmileStandardizer
# Benchmark datasets whose test-split SMILES files are processed below.
dataset_name = [
    "bace", "bbbp", "sider", "clintox", "tox21", "toxcast",
    "esol", "freesolv", "lipo", "qm7", "qm8", "qm9",
]

# One call per dataset: input comes from Data/contamination/, output goes to
# Data/cleanup/. NOTE(review): presumably `standardize` reads SMILES from the
# first path and writes their standardized form to the second — confirm
# against KGGraph.KGGChem.standardize.
for dataset in dataset_name:
    SmileStandardizer.standardize(
        f"Data/contamination/test_{dataset}.txt",
        f"Data/cleanup/standsmi_test_{dataset}.txt",
    )

Standardizing SMILES: 100%|██████████| 152/152 [00:00<00:00, 20578.23it/s]
Standardizing SMILES: 100%|██████████| 204/204 [00:00<00:00, 24282.37it/s]
Standardizing SMILES: 100%|██████████| 143/143 [00:00<00:00, 22583.13it/s]
Standardizing SMILES: 100%|██████████| 148/148 [00:00<00:00, 23122.02it/s]
Standardizing SMILES:   0%|          | 0/784 [00:00<?, ?it/s][19:42:42] Explicit valence for atom # 8 Al, 6, is greater than permitted
Standardizing SMILES: 100%|██████████| 784/784 [00:00<00:00, 26850.56it/s]


Error processing NC(=O)NC1N=C(O[AlH3](O)O)NC1=O: Explicit valence for atom # 8 Al, 6, is greater than permitted


Standardizing SMILES: 100%|██████████| 858/858 [00:00<00:00, 25256.43it/s]
Standardizing SMILES: 100%|██████████| 113/113 [00:00<00:00, 22898.65it/s]
Standardizing SMILES: 100%|██████████| 65/65 [00:00<00:00, 20838.47it/s]
Standardizing SMILES: 100%|██████████| 420/420 [00:00<00:00, 23814.20it/s]
Standardizing SMILES: 100%|██████████| 683/683 [00:00<00:00, 44376.26it/s]
Standardizing SMILES: 100%|██████████| 2179/2179 [00:00<00:00, 37591.31it/s]
Standardizing SMILES:   0%|          | 0/13389 [00:00<?, ?it/s][19:42:42] Can't kekulize mol.  Unkekulized atoms: 1 5
Standardizing SMILES: 100%|██████████| 13389/13389 [00:00<00:00, 44698.46it/s]
[19:42:43] Can't kekulize mol.  Unkekulized atoms: 1 5


In [45]:
def load_smiles(filepath):
    """Collect the first whitespace-separated token of every non-blank line.

    Anything after the first token on a line (extra columns, labels) is
    ignored, and lines containing only whitespace are skipped. Because the
    result is a set, repeated entries in the file collapse to one.

    Args:
        filepath: Path of the text file to read.

    Returns:
        set[str]: The distinct leading tokens found in the file.
    """
    unique_smiles = set()
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            # split() with no argument discards surrounding whitespace, so a
            # blank line yields an empty token list and is skipped.
            tokens = raw_line.split()
            if tokens:
                unique_smiles.add(tokens[0])
    return unique_smiles

def check_overlap(test_smiles, pretrain_smiles):
    """Find the test-set entries that also appear in the pretraining collection.

    Args:
        test_smiles: Set of test SMILES strings.
        pretrain_smiles: Collection of pretraining SMILES strings.

    Returns:
        The members of ``test_smiles`` that are also in ``pretrain_smiles``.
    """
    shared = test_smiles.intersection(pretrain_smiles)
    return shared

def analyze_dataset_overlap(dataset_names, pretrain_path, regression_datasets,
                            test_path_template="Data/cleanup/standsmi_test_{dataset}.txt"):
    """Print how many test-set SMILES of each dataset also occur in the pretraining set.

    For every dataset the test file is loaded with ``load_smiles`` and
    intersected with the pretraining SMILES via ``check_overlap``. Counts are
    over distinct strings, since both inputs are sets. A per-dataset line is
    printed, followed by a summary split into regression and classification
    datasets and an overall contamination percentage.

    Args:
        dataset_names: Iterable of dataset identifiers to analyse.
        pretrain_path: Path of the file holding the pretraining SMILES.
        regression_datasets: Container of dataset names counted as regression;
            every other dataset is counted as classification.
        test_path_template: ``str.format`` template with a ``{dataset}``
            placeholder giving the location of each test file. Defaults to
            the standardized test files under ``Data/cleanup/``.

    Returns:
        None. All results are written to standard output.
    """
    pretrain_smiles = load_smiles(pretrain_path)

    reg_contaminated = 0
    reg_total = 0
    cls_contaminated = 0
    cls_total = 0

    for dataset in dataset_names:
        test_path = test_path_template.format(dataset=dataset)
        test_smiles = load_smiles(test_path)
        intersect = check_overlap(test_smiles, pretrain_smiles)

        n_overlap = len(intersect)
        n_total = len(test_smiles)

        if dataset in regression_datasets:
            reg_contaminated += n_overlap
            reg_total += n_total
        else:
            cls_contaminated += n_overlap
            cls_total += n_total

        print(f"[{dataset}] Overlap: {n_overlap} / {n_total}")

    contaminated = reg_contaminated + cls_contaminated
    total = reg_contaminated * 0 + reg_total + cls_total

    # With no datasets (or only empty test files) the denominator is zero;
    # report "n/a" instead of raising ZeroDivisionError after the per-dataset
    # lines have already been printed.
    if total:
        ratio_text = f"{contaminated / total * 100:.2f}"
    else:
        ratio_text = "n/a"

    print("\n===== Summary =====")
    print(f"Regression datasets contaminated: {reg_contaminated} / {reg_total}")
    print(f"Classification datasets contaminated: {cls_contaminated} / {cls_total}")
    print(f"Contamination ratio: {contaminated} / {total} = {ratio_text}")


In [75]:
# Datasets included in the overlap analysis.
dataset_name = [
    "bace", "bbbp", "sider", "clintox", "tox21", "toxcast",
    "esol", "freesolv", "lipo",
]
# Datasets tallied under "Regression" in the summary; the rest are tallied
# under "Classification".
regression_datasets = ["esol", "freesolv", "lipo"]
# File of pretraining SMILES that each test split is compared against.
pretrain_path = "Data/cleanup/standsmi_chembl29.txt"

analyze_dataset_overlap(
    dataset_names=dataset_name,
    pretrain_path=pretrain_path,
    regression_datasets=regression_datasets,
)

[bace] Overlap: 30 / 152
[bbbp] Overlap: 145 / 204
[sider] Overlap: 88 / 143
[clintox] Overlap: 56 / 148
[tox21] Overlap: 692 / 783
[toxcast] Overlap: 787 / 858
[esol] Overlap: 88 / 113
[freesolv] Overlap: 53 / 65
[lipo] Overlap: 418 / 420

===== Summary =====
Regression datasets contaminated: 559 / 598
Classification datasets contaminated: 1798 / 2288
Contamination ratio: 2357 / 2886 = 81.67


In [63]:
import pandas as pd

# Load the raw QM9 CSV into a DataFrame.
# NOTE(review): the variable name suggests a ZINC15 pretraining set, but the
# file read here is QM9 — consider renaming (later cells reference this name).
pre_2mzinc15 = pd.read_csv("Data/regression/qm9/raw/qm9.csv")

In [65]:
# Display the loaded frame as the cell output (the rendering is truncated
# to the first and last rows).
pre_2mzinc15

Unnamed: 0,mol_id,smiles,A,B,C,mu,alpha,homo,lumo,gap,...,zpve,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom
0,gdb_1,C,157.71180,157.709970,157.706990,0.0000,13.21,-0.3877,0.1171,0.5048,...,0.044749,-40.478930,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.643290,-401.014647,-372.471772
1,gdb_2,N,293.60975,293.541110,191.393970,1.6256,9.46,-0.2570,0.0829,0.3399,...,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802
2,gdb_3,O,799.58812,437.903860,282.945450,1.8511,6.31,-0.2928,0.0687,0.3615,...,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171
3,gdb_4,C#C,0.00000,35.610036,35.610036,0.0000,16.28,-0.2845,0.0506,0.3351,...,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724
4,gdb_5,C#N,0.00000,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,...,0.016601,-93.411888,-93.409370,-93.408425,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133880,gdb_133881,C1C2C3C4C5OC14C5N23,3.59483,2.198990,1.904230,1.6637,69.37,-0.2254,0.0588,0.2842,...,0.127406,-400.633868,-400.628599,-400.627654,-400.663098,23.658,-1603.983913,-1614.898804,-1623.788097,-1492.819438
133881,gdb_133882,C1N2C3C2C2C4OC12C34,3.65648,2.142370,1.904390,1.2976,69.52,-0.2393,0.0608,0.3002,...,0.127495,-400.629713,-400.624444,-400.623500,-400.658942,23.697,-1601.376613,-1612.291504,-1621.181424,-1490.211511
133882,gdb_133883,C1N2C3C4C5C2C13CN45,3.67118,2.143140,1.895010,1.2480,73.60,-0.2233,0.0720,0.2953,...,0.140458,-380.753918,-380.748619,-380.747675,-380.783148,23.972,-1667.045429,-1678.830048,-1688.312964,-1549.143391
133883,gdb_133884,C1N2C3C4C5CC13C2C45,3.52845,2.151310,1.865820,1.9576,77.40,-0.2122,0.0881,0.3003,...,0.152222,-364.720374,-364.714974,-364.714030,-364.749650,24.796,-1794.600439,-1807.210860,-1817.286772,-1670.349892


In [66]:
# Export only the 'smiles' column to a text file, without the index and
# without a header row.
pre_2mzinc15['smiles'].to_csv("Data/contamination/qm9.txt", index=False, header=False)

In [67]:
# Run the standardizer over the exported QM9 SMILES file.
# NOTE(review): presumably reads from the first path and writes standardized
# SMILES to the second — confirm against KGGraph.KGGChem.standardize.
# Both paths are fixed, so plain string literals are used.
SmileStandardizer.standardize("Data/contamination/qm9.txt", "Data/cleanup/standsmi_qm9.txt")

Standardizing SMILES:  24%|██▍       | 32752/133885 [00:01<00:04, 25190.72it/s][20:27:37] Can't kekulize mol.  Unkekulized atoms: 1 5
[20:27:37] Can't kekulize mol.  Unkekulized atoms: 1 5
Standardizing SMILES: 100%|██████████| 133885/133885 [00:05<00:00, 25636.89it/s]
