In [1]:
!pip install transformers torch
!pip install ipywidgets --upgrade
!pip install transformers[torch]
!pip install accelerate -U

import torch
print(torch.__version__)
print(torch.version.cuda)

print(torch.cuda.is_available())

2.0.1+cu117
11.7
True


In [14]:
# Load data
with open('smiles_train.txt', 'r') as file:
    smiles_data = file.read().splitlines()

# You might want to preprocess or shuffle the data here
import random
random.shuffle(smiles_data)


In [15]:
import torch
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

1
NVIDIA GeForce RTX 3060 Laptop GPU


In [16]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)
# Additionally, add the EOS token as PAD token to ensure the model does not generate past the maximum length.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
model.config.pad_token_id = model.config.eos_token_id

cuda


In [17]:
from transformers import TextDataset, DataCollatorForLanguageModeling

# Use the TextDataset and DataCollator
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='smiles_train.txt',
    block_size=128
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)




In [18]:
dataset

<transformers.data.datasets.language_modeling.TextDataset at 0x289a3520390>

In [19]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=16,  # Smaller batch size
    gradient_accumulation_steps=4,    # Accumulate gradients over 4 steps
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True  # Enable mixed precision
)
from transformers import TrainingArguments


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

# Start training
trainer.train()


Step,Training Loss
500,1.3088
1000,1.0511
1500,1.0045
2000,0.9783
2500,0.9629
3000,0.9489
3500,0.937
4000,0.9299
4500,0.9237
5000,0.9215


TrainOutput(global_step=5386, training_loss=0.9910021986254873, metrics={'train_runtime': 22276.3634, 'train_samples_per_second': 15.473, 'train_steps_per_second': 0.242, 'total_flos': 2.2515403751424e+16, 'train_loss': 0.9910021986254873, 'epoch': 2.0})

In [20]:
# Assuming 'model' is your model instance and 'path_to_save_directory' is your directory path
model.save_pretrained('goodgptmol')
tokenizer.save_pretrained('goodgptmoltokenizer')

('goodgptmoltokenizer\\tokenizer_config.json',
 'goodgptmoltokenizer\\special_tokens_map.json',
 'goodgptmoltokenizer\\vocab.json',
 'goodgptmoltokenizer\\merges.txt',
 'goodgptmoltokenizer\\added_tokens.json')

In [24]:
model.eval()

def generate_smiles(model, tokenizer, num_generate=15000):
    generated = set()
    device = "cuda" if torch.cuda.is_available() else "cpu"  # Check if GPU is available and set device accordingly
    model = model.to(device)  # Move model to the correct device
    
    start_token = tokenizer.bos_token or tokenizer.cls_token or "<|endoftext|>"  # Ensure there is a start token

    while len(generated) < num_generate:
        # Encode with a start token and ensure it's on the right device
        inputs = tokenizer(start_token, return_tensors="pt", add_special_tokens=False).to(device)
        
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=100,
            num_return_sequences=5,
            do_sample=True,  # Enable sampling
            top_k=50,        # Top-k sampling
            temperature=0.8  # Adjust temperature to tweak diversity
        )
        
        for output in outputs:
            smile = tokenizer.decode(output, skip_special_tokens=True)
            if smile not in generated:
                generated.add(smile)
                
    return list(generated)

generated_smiles = generate_smiles(model, tokenizer)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Setting `pad_token_i

In [25]:
generated_smiles

from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor
import rdkit
print(rdkit.__version__)
print("PRINTING VALID GENERATED SMILES:")
# Function to draw and validate SMILES strings
def validate_smiles(smiles_list):
    with open('submission_gpt2_finetune.txt', 'w') as file:  # Open file in write mode
        for smiles in smiles_list:
            mol = Chem.MolFromSmiles(smiles)  # Validate SMILES string
            if mol:  # Check if the molecule is valid
                print(smiles)  # Print the valid SMILES to the console
                file.write(smiles + '\n')



validate_smiles(generated_smiles)


2023.09.6
PRINTING VALID GENERATED SMILES:
CC1=C(C)C=C(CC(=O)O)C(=O)C1
O=C(CSc1nnc(-c2ccccc2)o1)N(Cc1ccccc1)CC(CCCN1CCN(c2ccccn2)CC1)C1CCCCC1
O=C(CSc1nnc(SCC(=O)O)n
CCOC(=O)C(C)(O)C(=O)OCC
COC(=O)C(C)NC(=O)C1CCCN(S(=O)(=O)c2ccc(NC(=O)c3ccco3)cc2)C1
O=C1OC2(CCN(Cc3ccccc3OCc3ccccc3)CC2)C
c1cccc(NC(=O)CSc2nnc(COc3ccc4c(c3)OCO4)o2)c1
COc1ccc(Cn2nc(C(=O)NCc3ccco3)cc2N=C(O)c2ccccc2)cc1
COc1cc2c(cc1OC)CN1CCC(CNCc3ccc
CCN(CC)c1nc(NCC)nc2c1cnn2Cc1cccnc1
CCOC(=O)c1c(NC(=O)c2ccccc2C)sc(NC(=O)c2ccccc2)c1C#N
Cc1ccc(NC(=O)CSc2ncnc3scc(-c4ccccc4)c23
COc1ccccc1Nc1nc2nc(N3CCCN(C)CC3)cn2c(=O)n1NCc1ccccc1OC
CCN(CC)CCNC(=O)c1ccc(NC(=O)c2ccc(C(=O)NCCc3ccccc3)cc2)cc1
COC(=O)C(NC(=
O=C(O)CCCC(Cc1ccccc1)NC(=O)c1ccccc1Br
CC(C)C(NC(=O)c1ccc(NC(=O)C2C(C#N)=C(N)Oc3ccccc32)cc1)C(=O)O
CCCCC(OC(=O)CCC(=O)O)(c1cc
Nc1ccc2c(c1)nc(NCCCN2CCCCCC2)n2C1OC(CO)C(O)C1O
CC(=O)N1CCC(N(CCN2CCN(C(=O)c3ccc(Cl)c([N+](=O)[O-])c3)CC2)C2CCCCC2)CC1
C=CCn1c
Cn1cc(S(=O)(=O)Oc2ccccc2)c(=O)c2ccc(C(=O)O)cc21
COc1cc(-c2cnc(Nc3ccc(Cl)cc3[N+](=

[04:20:03] SMILES Parse Error: unclosed ring for input: 'CC(C)C1NC(=O)C(NC(=O)OCc2ccccc2)C(C)C
COc1ccc(CCc2ccc(-c3ccc4c(c3)CCN(C(C)=O)CC4)cc2)cc1
CC(=O)NCC1(CC)COC(COC(=O)c2ccc(Oc'
[04:20:03] Can't kekulize mol.  Unkekulized atoms: 16 17 18 31 32
[04:20:03] SMILES Parse Error: unclosed ring for input: 'CCOC(=O)C1CCOC1CN(CC(=O)NCc1ccccc1)C(=O)C(C)(C)C(=O)N1CC(=O)N(C)Cc1ccccc1
O=C(O)c1cccc(C23CC4CC(CC(C4)C2)C3)c1
COC(=O)c'
[04:20:03] SMILES Parse Error: unclosed ring for input: 'CN1CCN(c2cc3c(=O)n(C)cnc3cc2-3)CC1
COc1cc(C=NNC2=NCCc3ccccc32)cc(OC)c1OC
Cc1ccc(N2C(=O)C3SC(=Cc4ccco4)NC3=O)cc1
O=C1c2c(cc'
[04:20:03] SMILES Parse Error: unclosed ring for input: 'COC(=O)C12CCN(Cc3ccc(NC(=O)c4ccc(C)cc4)c(C)c3)CC2
COc1cccc(Nc2ccc(C(=O)NCc3ccc(C)cc3)cc2)c1
Cc1cc2ncnc(Nc3cc(C(=O)O)cc(C'
[04:20:03] Explicit valence for atom # 4 C, 5, is greater than permitted
[04:20:03] SMILES Parse Error: unclosed ring for input: 'Cc1c(Cl)ccc2c1CCN1C(=O)CCCC1C(=O)O
CCOC(=O)c1ccc(S(=O)(=O)N2CCN(Cc3c[nH]cn3)CC2)cc1
C

N#Cc1c(C(=O)N2CCN(c3nc(-c4ccccc4)cn4ncnc34)CC2)c(=O)n(-c2ccccc2)n1C
Cn1cnc2ccccc2c1=O
CN1CCN(c2cccc(-c3nc4ccccc4o3)c2)CC1
CC(
O=C(C=Cc1ccccc1)C1CCS(=O)(=O)C1
CCOC(=O)CCC(=O)Nc1ccccc1
CCCCCCCCCCCCCCCCCC(=O)NCc1ccccc1
O=C(Nc1nc(-c2ccccc2)cs1)c1ccc(Cl)cc1
COc
CC(=O)N1CCN(c2nc3ccccc3n2CCN(C)C)CC1
COc1cc(OC)cc(CC(C)C)c1OCC(=O)Nc1ncc(C2CCCC2)o1
CC(C)(C)OC(=O)NC(Cc1ccc(O)c2ccccc12)C(=
Cc1nnnn1-c1ccc2c(c1)CCCC2
COc1ccc(C=CC(=O)c2ccc(O)c(C)c2)cc1OCC(=O)N1CCCC1
COc1ccccc1NC(=O)CSc1nnc(C)n1N
COc1ccc(C=CC2C(=
N#Cc1cccc(N2CCN(C(=O)CSc3nnc(Cc4ccc(Cl)cc4)n3-c3ccccc3)CC2)c1
Cc1[nH]cnc1SCC(=O)N1CCN(c2nccc(NC(=O)c3ccccc3)n2)CC1
COc1
C=C1CCC2(C)C3CCC4=CC(=O)CCC4(C)C3CCC12C
CCCC(=O)Nc1ccc(C(=O)NCc2ccccc2)cc1
COc1ccc(-c2csc(Nc3cc(C)ccc3C)n2)cc1
CC(=O)Nc1ccc(S
COc1ccc(OC2CCN(CC=C(c3ccc(Cl)cc3)C(=O)NC(C)(C)C)CC2)cc1
CCn1c(NCc2ccco2)nnc1N=C(S)Nc1ccccn1
COc1ccc(N2CCN(C(=O)c3cc(-c4ccccc4
Cc1cnc2ccc(Cl)cc2c1CC(=O)N1CCC(Cn2ccnn2)CC1
COc1ccc(C2=NC(=O)C(c3ccc(Br)cc3)=C2c2cnn3c(=O)c(C)nc23)cc1
COc1ccc(CC(=O)Nc2cc(C


[04:20:03] SMILES Parse Error: extra open parentheses for input: 'CNC(=O)C(Cc1c[nH]c2ccc(OC)cc12)NC(=O)C(CCSC)NC(=O)C(CCSC)NC(=O)C(Cc1c[nH]c2ccc(OC)cc12)NC(=O)C(CCSC)NC(=O)C(Cc1c[nH]c2cc'
[04:20:03] Can't kekulize mol.  Unkekulized atoms: 1 2 4
[04:20:03] SMILES Parse Error: unclosed ring for input: 'CC(=O)OC1CCC23C(C)(C)C(OC(=O)c4ccc(OC)c(OC)c4)C2(C)C3OC(C)=O
CCCCCCCCCCCCCCCCCC(=O)Oc1ccc(S(=O)(=O)N2CCOCC2)cc1
CC(=O)OC1C(O)C('
[04:20:03] SMILES Parse Error: unclosed ring for input: 'CC(C)n1c(=O)c2ccccc2n2c3c(nc2n(CCCN4CCC(C(=O)NC(C)(C)C)CC4)c21)CCCC3
Cc1ccc(NC(=O)c2ccccc2Nc2ncccn2)cc1
CCN1CCN(CCCNC(=O)C2CCN'
[04:20:03] SMILES Parse Error: syntax error while parsing: =O)c1ccc2[nH]c(-c3ccccc3)[nH]c2c1
CC(C)CC(=O)NCC1OC(n2c(=O)[nH]c3ccccc32)C(O)C1O
Cc1nc(C=NNC(=O)c2ccc(C)o2)sc1-c1ccc2
[04:20:03] SMILES Parse Error: Failed parsing SMILES '=O)c1ccc2[nH]c(-c3ccccc3)[nH]c2c1
CC(C)CC(=O)NCC1OC(n2c(=O)[nH]c3ccccc32)C(O)C1O
Cc1nc(C=NNC(=O)c2ccc(C)o2)sc1-c1ccc2' for input: '=O)c1ccc2[nH]c(-c3cccc

CC(=O)NCCCCCCCC(=O)Nc1ccc(S(=O)(=O)N(CCCCCC(C)C)CC(=O)Nc2ccc(C(=O)O)cc2)cc1
CCCCCCCC(=O)NC(CC(C)C)C(=O)NC(CC(C)C)C(=O)O
CO
Cc1ccco1
CC(C)CC(NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccc(Cl)cc1)NC(=O)C(Cc1ccccc1)NC(=O)OC(C)(C)C)C(=O)NC(Cc1ccccc1)C(=O)O
O=C
O=C(Cc1c[nH]c2ccccc12)N1CCOCC1
COc1cc(N)c2cc(-c3cc(=O)c4c(OC)cccc4n3C)ccc2c1
CN(C)CCc1ccc(OCc2ccccc2)cc1
CC(C)(C)c1ccc(CCNP(=
Cc1nc2cc(C)c(OCCCN3CCOCC3)cc2oc1=O
O=C(C=Cc1ccccc1)NCC1CCSC1
COc1ccc(OC)c(-c2cc(O)cc(C)c2)c1
CCC(C)S(=O)(=O)N1CCC(Cc2nnc(C(=
CC(C)(C)NC(=O)C(Cc1ccccc1)NC(=O)C(Cc1ccc[nH]1)NC(=O)c1ccccc1
O=C(C1CCCN(Cc2ccccc2)C1)NC1CCN(C(=O)c2ccccc2)CC1
Cc1ccc(NC(
CC(C)(N)C1CCCC1C(=O)N1CCN(c2cccc(Nc3ncnc4ccccc34)c2)CC1
O=C(CSc1nc2ccccc2[nH]1)NCCc1ccc(Cl)cc1
COc1cc(CNCC(=O)NCC(C)(C)N2CCN(c3
O=C(C=Cc1ccccc1)n1ccc2c(C=NNc3ccccc3)cccc21
CC1(C)C(=O)OC2C(C)(CCC3(O)C4CC5C(C)(C)CCC(OC(=O)C(CC)CC)C(C)(C)C5CCC43C)C1(C)C
CC
COc1cccc(Nc2nc(NC(C)C)ccn2)c1
CNc1ccc(C(=O)Nc2nc3ccccc3s2)cc1
CC(=O)C1(C)CC2CCC1C2(C)C
CNC(=O)C(Cc1ccc(OC)cc1)NC(=O)C(CC(
CCCCNCCCC

[04:20:03] SMILES Parse Error: syntax error while parsing: 1Cc2ccccc2C1=CCC(=O)O
Cc1cc(C)c(C(=O)N2CCN(CC(=O)NCc3cc(C)no3)CC2)c(C)c1
Cc1cc(=O)oc2ccc(C=Cc3ccc(O)cc3)cc12
C=CCOC1(CC=
[04:20:03] SMILES Parse Error: Failed parsing SMILES '1Cc2ccccc2C1=CCC(=O)O
Cc1cc(C)c(C(=O)N2CCN(CC(=O)NCc3cc(C)no3)CC2)c(C)c1
Cc1cc(=O)oc2ccc(C=Cc3ccc(O)cc3)cc12
C=CCOC1(CC=' for input: '1Cc2ccccc2C1=CCC(=O)O
Cc1cc(C)c(C(=O)N2CCN(CC(=O)NCc3cc(C)no3)CC2)c(C)c1
Cc1cc(=O)oc2ccc(C=Cc3ccc(O)cc3)cc12
C=CCOC1(CC='
[04:20:03] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 9
[04:20:03] SMILES Parse Error: unclosed ring for input: 'CC(C)C(=O)OC1C(C)(C)C(=O)N2C1c1ccc(OCC(=O)Nc2c(C#N)cncc2C#N)cc1
CC(C)(C)OC(=O)CC1C(=O)NCCN1C(=O)c1ccc(S(=O)(=O)N2CC'
[04:20:03] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5
[04:20:03] non-ring atom 0 marked aromatic
[04:20:03] SMILES Parse Error: syntax error while parsing: =O)C(NCCCOc1ccc(Br)cc1)NCCCOc1ccccc1
COc1ccc(NC(=O)CSc2ccc(OC)cc2)cc1
O=c1[nH]c2cc(COc3ccccc3O)ccc2ncn1-c1ccccc1

CCC(=O)Nc1ccc(-c2ccco2)cc1NC(=O)c1ccccc1O
CCC(CC)OC(=O)C1=C(C)N(c2cccc(-c3ccc(Br)cc3)c2)CC2(CCN(CC=C(C)C)CC2)C1=O
COc1ccc(-n2c
c1ccc2c(c1)c1c(cnn1I)CCCCC2
O=C(O)c1cc(Cc2ccccc2)cc(Cc2ccccc2)c1
O=C(C=Cc1ccc(O)cc1)NCCOCc1ccccc1
CC(C)(C)c1ccc(C(=O)N2
COc1ccc(-c2nc(SCc3ccc(C(N)=O)cc3)co2)cc1
CSc1nc2ccc3c(Cc4ccc(C#N)cc4)n[nH]c3c2c(=O)[nH]1
O=C1CC2(O)c3ccccc3C3(C=C2C(=O
Cc1oc2nc(C)cc(C(=O)N(Cc3ccco3)C(C)C)c2c1C
CC(C)c1cc(CNS(=O)(=O)c2ccc(Cl)cc2)cc(C(C)C)c1O
O=C(O)c1ccc(OCCOc2ccccc2)cc1
O=C(O)CSc1nc2ccccc2c(=O)n1Cc1ccc(Cl)cc1
Cc1cccc(S(=O)(=O)NCCCC(Cc2ccccc2)N2CCCCC2)c1
COC(=O)c1ccc2c(C)c(O)cc(=O)oc2c1
CCOC
COC(=O)C(C)NC(=O)c1cc(N)on1
COc1cccc(N2CC(COc3ccc4c(c3)OCCO4)C2=O)c1
N#CC(=Cc1ccc([N+](=O)[O-])cc1)c1cc(Cl)ccc1O
CCCCCN1C(=
CCCCc1ccc(CN(CCc2cccs2)C(=O)c2ccccc2)cc1
CC(=O)OC1CC2(C)C3CCC4=C(CCC5(C)C(OC(=O)NC(=O)C(C)(C)C)CCC45C)C3(C)CCC2C1C
Cc1cc(C)n(
Nc1ncnc2c1ncn2C1OC(CO)C(O)C1O
CC(C)OC(=O)N1CCN(C(=O)C2CCS(=O)(=O)C2)CC1
COC(=O)C(Cc1ccc(OCC(O)CN(C)CCOc2ccccc2)cc1)NC(=
NC(CC(O)COc1ccc(OC(=O)N

[04:20:04] SMILES Parse Error: unclosed ring for input: 'CC(C)C(O)C(C)C1CCC2C3C=CC(C)(C)CC3(C)C3CCC12C
CC(C)(C)c1nnc(SCC(=O)Nc2ccccc2Cl)o1
CC(C)CC(NC(=O)C(N)C(CC(C)C)NC(=O)C(NC(='
[04:20:04] SMILES Parse Error: unclosed ring for input: 'CCCCCCSc1nn(CCCC)c2c1OCC(c1ccc(OC)cc1)c1ccc(OC)cc1
C=CCN1CCc2c([nH]c3c2C2C(C)CC(NC(=O)Cc4ccccc4)=C2C(=O)N(C)C2(C)CC3)C1
Cc'
[04:20:04] SMILES Parse Error: syntax error while parsing: =O)OCC1OC(n2cc(C(=O)NCCCCc3ccccc3)c3ccccc32)C(O)C(O)C1O
Cc1cccc(-c2nc(SCc3ccccc3)co2)c1
Cc1ccc(S(=O)(=O)N2CCN(c3cccc(C(=O)
[04:20:04] SMILES Parse Error: Failed parsing SMILES '=O)OCC1OC(n2cc(C(=O)NCCCCc3ccccc3)c3ccccc32)C(O)C(O)C1O
Cc1cccc(-c2nc(SCc3ccccc3)co2)c1
Cc1ccc(S(=O)(=O)N2CCN(c3cccc(C(=O)' for input: '=O)OCC1OC(n2cc(C(=O)NCCCCc3ccccc3)c3ccccc32)C(O)C(O)C1O
Cc1cccc(-c2nc(SCc3ccccc3)co2)c1
Cc1ccc(S(=O)(=O)N2CCN(c3cccc(C(=O)'
[04:20:04] SMILES Parse Error: unclosed ring for input: 'O=C1Nc2ccccc2C12CC3CC(CC(C3)C1)C2
C=C(C)C1CCC2(C)C3CCC4C(CC(C)(C)C=CC(C)(C)C4CCC23C)C1

OC1CCCN(CCCNC(=O)c2cccnc2)C1
COC(=O)Nc1ccc(S(=O)(=O)N(C)CCc2ccc3ccccc3c2)cc1
c1ccc(-c2nc3c(n2)CCC3)cc1
COC(=O)c1cccc2c1OC1C(C
CCOc1ccc(NC(=O)Nc2ccc(OC)cc2OC)cc1
CCNC(=O)C1CCN(C(=O)N2CCCC2c2ccc(NC(=O)C=Cc3ccc(C)cc3)cc2)CC1
CCn1nccc1SCC(=O)Nc1ccc(C)cc
CCN(C(=O)C1CC2(COC(=O)C(C)(C)C)CCC3(C)C(=CCC4C5(C)CCC(O)C(C)(C)C5CCC43C)C2C)C1
Cc1ccccc1C(=O)Nc1ccccc1NC(=O)c1cc(-c2ccccc2
CC(C)CC1CCC2C3CCC4=CC(=O)CCC4=CC3(C)C(C)C2(C)CC1(C)C
CCCCNc1ccc(Nc2nccc3c2C(=O)NN3C(=O)c2ccc(OC)cc2)cc1
CC1(C)Oc2ccc(Nc3n
Cc1ccc(S(=O)(=O)N2CCN(CCCC(=O)NCc3ccc(C)cc3)CC2)cc1C
CCC1(C)C(=O)NC(=O)C2(C)C(C=C(C)C)CCC3C(CCC4C(C)(C)C(=O)C(C)=C(C)C
O=C(Nc1ccc(C(c2ccco2)N2CCOCC2)cc1)c1ccc(Cl)cc1Cl
CCCN(CCC1CC(C)(C)CC(C)(C)C1)S(=O)(=O)c1ccccc1
CCOc1ccc(-c2ccc(C(=O)NCc3ccco3)cc
COc1ccc(-c2nc3ccccc3o2)cc1
COc1cc(NS(=O)(=O)CC2CC2)ccc1C(=O)c1ccccc1
CCC1(O)C(=O)OC(C)C2C3C(OC(=O)c4ccc(OC)cc4)CCC3(C)C(=CCC=CC
COc1ccc(C2c3ccccc3C(=O)N(C3CCCCCC3)C2=O)cc1OC
OC1OC(OC(=O)C2CCCC2)C(O)C(O)C1O
Cc1ccc(S(=O)(=O)N2CCCC(Nc3cccc4nccnc34)C2)cc1
CNS(=

[04:20:04] SMILES Parse Error: extra close parentheses while parsing: C#CC#CCCCCCCCCCCCCCCCC)c1ccc2c3c(c4ccc(OCC(=O)NS(C)(=O)=O)cc4oc3c13)OCO4
CC(C)(C)c1cc(C(=O)N2CC(c3ccc(Cl)c(Cl)c3)C2)no1
CCc1cc(C(
[04:20:04] SMILES Parse Error: Failed parsing SMILES 'C#CC#CCCCCCCCCCCCCCCCC)c1ccc2c3c(c4ccc(OCC(=O)NS(C)(=O)=O)cc4oc3c13)OCO4
CC(C)(C)c1cc(C(=O)N2CC(c3ccc(Cl)c(Cl)c3)C2)no1
CCc1cc(C(' for input: 'C#CC#CCCCCCCCCCCCCCCCC)c1ccc2c3c(c4ccc(OCC(=O)NS(C)(=O)=O)cc4oc3c13)OCO4
CC(C)(C)c1cc(C(=O)N2CC(c3ccc(Cl)c(Cl)c3)C2)no1
CCc1cc(C('
[04:20:04] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 19
[04:20:04] Can't kekulize mol.  Unkekulized atoms: 4 5 12 15 16 17 18 19 20
[04:20:04] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 8 26 27
[04:20:04] Can't kekulize mol.  Unkekulized atoms: 7 8 11 12 13 16 19 20 23
[04:20:04] SMILES Parse Error: unclosed ring for input: 'O=C(O)CC12CCCC1C3CC1CN3C(=O)C1=CCCC1C3
CC(C)(C)c1nc2n(c(N)c1=O)C(=O)c1ccccc1-2
CC(C)CC(NC(=O)C(N)CCCC(=O)N(C)Cc1ccc(OCC'
[04:20

Cc1nc2ccccc2n1Cc1ccc(N=C(O)CN(C)Cc2ccccc2)cc1
CCOC(=O)N1CCN(C(=O)c2cc(C)sc2C)CC1
CC(C)CC(NC(=O)C(Cc1ccccc1)NC(=O)C(N)CCNC
CC[S+](C)(C)CCCOc1ccc(C(=O)O)cc1
CC(=O)c1ccc(NC(=O)c2ccccc2Cl)cc1
Cc1ccccc1-c1cccc(C(=O)NCc2ccc(S(=O)(=O)Nc3ccccc3)cc2)c1
CO
Clc1ccccc1C1=NN(C(=O)c2ccc(-c3ccccn3)cc2)C(=O)C1
Cc1ccc(-c2cc(=O)c3ccc(C(C)=O)cc3o2)o1
CC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC(C)=CCCC1
NC(=O)c1ccccc1NC(=O)CSc1nnc(-c2ccccc2)o1
COc1ccccc1S(=O)(=O)N1CCC(n2cc(C(=O)NCc3ccccn3)nn2)CC1
Cc1cc(N2CCCCC2)c2cc(C#N)ccc2n1

CC(=O)N1CCC(CN2CCC(Oc3cncc(Cc4ccccc4)n3)CC2)CC1
CC(=O)Nc1ccc(Nc2ncccn2)cc1
COc1ccc(S(=O)(=O)N(c2ccccc2)C(C)N2CCC(Oc3ccccc3)CC2
CCN1CCN(c2nc(-c3ccco3)c3ccccc3n2)CC1
Clc1ccc(Cl)cc1C1CCCCC1
CCOC(=O)c1ccc(-c2ccc(Nc3ncnc4c3CCN(C5(c6ccc(Cl)cc6)CC5)CC4)o2)cc1
CC(=
CNC(=O)c1ccc(C)c(Nc2ccc(C(=O)NC(C)Cc3ccc4c(c3)COC4)nn2)c1
CC(C)CN1CCC2C3CCC4=CC(=O)CCC4(C)C3CCC12C
CCN(CC)C(=O)N1CCN(c2cc(
NC(CSCc1ccccc1)C(=O)NC(CC(=O)NC(CC(=O)O)C(C)C)C(=O)N1CCCC1C(=O)NC(Cc1ccccc1)C(N)=O
CC(C)CC(NC(=O)C(CO)NC(=O)C(Cc1c

[04:20:04] SMILES Parse Error: syntax error while parsing: (Cc1ccc(Cl)cc1)Nc1ccc2c(c1)CCN(C(=O)Cn1cncn1)CC2
C=C1C(OC(C)=O)CC2(C)C3=C(C(=O)c4ccc(Cl)cc43)C1C1(C)CCC(O)C21
O=C1NC2
[04:20:04] SMILES Parse Error: Failed parsing SMILES '(Cc1ccc(Cl)cc1)Nc1ccc2c(c1)CCN(C(=O)Cn1cncn1)CC2
C=C1C(OC(C)=O)CC2(C)C3=C(C(=O)c4ccc(Cl)cc43)C1C1(C)CCC(O)C21
O=C1NC2' for input: '(Cc1ccc(Cl)cc1)Nc1ccc2c(c1)CCN(C(=O)Cn1cncn1)CC2
C=C1C(OC(C)=O)CC2(C)C3=C(C(=O)c4ccc(Cl)cc43)C1C1(C)CCC(O)C21
O=C1NC2'
[04:20:04] SMILES Parse Error: unclosed ring for input: 'CC(C)C(C#N)=C(O)C(C#N)=C(O)C(C#N)=C(O)C(C#N)=C(O)C(C#N)=C(O)C(C#N)=C(O)C(C#N)=C(C#N)C1(C)O
CC(=O)Oc1ccc(OC(=O)'
[04:20:04] SMILES Parse Error: unclosed ring for input: 'CC1CN(CC(C)C)CC(=O)Nc2ccc3c(c2)C(O)CC3=O
CCOC(=O)C=Cc1cc(OC)c(OC)c(OC)c1
COc1ccc(-c2nc(-c3ccc(O)cc3)ncc2C(=O)NCc2ccc(O)cc2'
[04:20:04] non-ring atom 0 marked aromatic
[04:20:04] SMILES Parse Error: unclosed ring for input: 'CC(O)C1CC2(C)C(CCC(C)C2(C)C)C1CC(OC(=O)c1ccccc1)C2
COc1ccc(OC)c(Sc2nc

Cc1cc(C)cc(C2=NN(c3ccnc4ccccc34)C(c3ccccc3)C2)c1
CC(C)(C)c1cccnc1C(=O)Nc1ccc(-c2ccc(O)cc2)cn1
CCN(CC)C(=O)Cn1cc(C)c(-c2ccc(
O=c1cc(CSc2ncnc3c2CCN(C4CC4)CC3)oc2ccccc12
OC1CN(C(C)CO)Cc2cc(-c3nc4ccccc4[nH]3)ccc2O1
Cc1ccc(S(=O)(=O)NC(=O)c2cccc(C)c2)cc1

NS(=O)(=O)c1ccc(NC(=O)c2cc(Cl)ccc2OCc2cccnc2)cc1
Cc1cc(C)cc(NC(=O)Nc2ccc(-c3ccc(O)c(NC(=O)c4ccccc4)c3)cc2)c1
CCOC(=O)C1=C(
Nc1ccc(-c2ccccc2Nc2ncnc3ccccc23)cc1
COCc1nc2c(c(=O)n1C)-c1c(c(-c3ccccc3)nn1C)CC(O)C(CO)O2
O=C(O)CCCCc1c(C(=O)O)c2c(c(C=CC
Cc1ccc(NC(=O)CSc2nc(=O)c3ccccc3n2Cc2ccccc2)cc1
CCCC(=O)OC1CC2CCC1N(C)C(=O)C=Cc1ccc(OP(=O)(O)O)cc1
CC(=O)NCC1=CC(=O)c2cccc(OC
O=C(NCc1ccco1)c1cc2ccc(Br)cc2[nH]1
O=C(CCCCCCCCCC[n+]1ccccc1[N+](=O)[O-])NCCNCC1CC2(CCN(CCCO)CC2)OC1=O
CSc1cccc(-c2nnc(SCc3cnc[nH]3
Cc1nn(C)cc1C(=O)Nc1cccc(N(C)C)c1
NCC(O)C(CC(C)C)NC(=O)C1CCCN1S(=O)(=O)c1ccccc1
Cc1cccc(C)c1OCC(=O)Nc1cccc(-c2cccc([N+](=O
CC1CCN(C(=O)c2nccs2)CC1
O=C(O)c1ccc(Nc2ccc(-c3cnc(N4CCOCC4)nc3)cc2)cc1
Cc1ccc(S(=O)(=O)N2CCCCC2C(=O)Nc2ccccc2)cc1
COc1ccc(
Cc1

[04:20:04] SMILES Parse Error: unclosed ring for input: 'CC(=O)OC1CCC2(C)C(=CCC3C2CCC2(C)C(=C(O)C(=O)OC)CCC314)C1(C)C
NC(=O)C1(c2ccccc2)CCN(C(=O)c2ccc3c(C)n[nH]c3c2)CC1
Cc1ccccc'
[04:20:04] SMILES Parse Error: unclosed ring for input: 'C=C1C(=O)OC2CC=C1C(=O)c1ccccc1
Cc1cccc(NS(=O)(=O)c2ccc3c(c2)N(Cc2ccccc2)CCO3)c1
CC(=O)Nc1ccccc1C(=O)c1ccccc1
COc1ccccc1'
[04:20:04] SMILES Parse Error: unclosed ring for input: 'Cc1ccc2c(c1)N=C(N1CCOCC1)c1nn2c1CCCC2
CC(C)CC(NC(=O)C(Cc1ccccc1)NC(=O)C(N)CC(=O)N1CCCC1)C(=O)NC(Cc1ccccc1)C(O)CC(=O)NC('
[04:20:04] SMILES Parse Error: unclosed ring for input: 'Cc1ccc(NC(=O)CSc2nnc3n2Cc2ccc(Cl)cc2)c(C)c1
O=C(Nc1ccc(Oc2ccc(C(=O)NCCc3ccccc3)cc2)cc1)c1ccc(Br)cc1
CCOc1ccc(NC(=O)CCCCCn2'
[04:20:04] SMILES Parse Error: unclosed ring for input: 'CNC(=O)C(Cc1cccc(S(=O)(=O)N2CCc3ccccc32)c1)NC(=O)C(C(C)C)NC(=O)OC1CC(OC(C)=O)C(C)(OC(C)=O)C2C1OC(C)=O
CC(C)(C)c1cc(C(=O)N'
[04:20:04] SMILES Parse Error: unclosed ring for input: 'c1ccn2c1CN1CCN(C(=O)CCCc3ccccc3OC)CC1
Cc1cccc(C

CCNC(=O)C(Cc1ccc(O)cc1)N(C)C
CCCCCCCCCCCCOCCOCCOCCOCCOCC
COc1ccc(C(=O)Nc2ccc3c(c2)OCO3)cc1
Cn1cc(C=C2C(=O)Nc3ccc(C(=O)O)cc
CC1CN(C(=O)CCl)C(=O)c2c(Cl)cccc2N1S(=O)(=O)c1ccccc1Cl
O=[N+]([O-])c1cccs1
Cc1ccccc1NC(=O)C1CCCN1C(=O)c1ccc(C#N)cc1
Cc1oc(-c
O=C(C=Cc1ccc2c(-c3ccc(Cl)cc3)coc2c1)c1ccccc1
CC(C)(C)n1c(=O)n(-c2ccccc2)c2cc(NC(=O)CCc3ccccc3)ccc21
COc1cc(C=CC(=O)c2ccc(O)
c1cc(Cc2ccc3nc(N4CCN(C5CCOC5)CC4)sc3c2)ccn1
CN1CCN(c2ccc(Oc3ccccc3)cc2)CC1
CC(C)c1nnc(NC(=O)Cn2ccc(=O)[nH]c2=O)n1C
Cc1ccc
CC(=O)C=Cc1ccc(OCc2cccc3cc[nH]c23)cc1
CC(C)(C)c1ccc(N2CCN(C(=O)c3ccc(Cl)cc3)CC2)nc1
CCN(CC)C1CCN(c2ccc3c(c2)COC(=O)CCC3)CC1

Cc1cccc(C)c1NC(=O)c1coc2cc3c(cc12)OCO3
COc1ccc(C(=O)NC(C)C)c(OC)c1
CCCCCCCC(=O)Nc1ccc(C(=O)OCC(=O)N2CCCC2C(=O)NC(Cc2ccc(OC
NCCCOc1ccc(C(=O)OCc2cccs2)cc1
O=C(CSc1nc2ccccc2[nH]1)NCCCCCCCCCN1CCOCC1
COc1ccc(C=CC(=O)NCCC(=O)OCc2ccccc2)cc1OC
CN1CCN(c2cccc(N3CCN
COC(=O)c1cccc(C(=O)NCc2ccccc2OC)c1
CC(C)=CCCC(C)C1CCC(C)C2CCC3(C)C(CCC4C5(C)CCC(O)C(C)(C)C5CCC43C)C12
Cc1cccc(C)c1-c1nc2c(c(C)

[04:20:05] SMILES Parse Error: extra close parentheses while parsing: CCC(NC(=O)C(N)CCCCn1cccn1)C(=O)NC(CC(N)=O)C(=O)NC(C(=O)O)C(C)C)C(C)C
C=CCN1CC(C)(C=C)OC(C(=O)OC)=CC1=O
Cc1cccc(C)c1S(=O
[04:20:05] SMILES Parse Error: Failed parsing SMILES 'CCC(NC(=O)C(N)CCCCn1cccn1)C(=O)NC(CC(N)=O)C(=O)NC(C(=O)O)C(C)C)C(C)C
C=CCN1CC(C)(C=C)OC(C(=O)OC)=CC1=O
Cc1cccc(C)c1S(=O' for input: 'CCC(NC(=O)C(N)CCCCn1cccn1)C(=O)NC(CC(N)=O)C(=O)NC(C(=O)O)C(C)C)C(C)C
C=CCN1CC(C)(C=C)OC(C(=O)OC)=CC1=O
Cc1cccc(C)c1S(=O'
[04:20:05] SMILES Parse Error: syntax error while parsing: CC(C)(C)NC(=O)C1CCCN1C(=O)C(Cc1ccccc1)NC(=O)C(NC(=O)c1ccc(O)cc1)C(Cc1ccccc1)C(O)CC(=O)N1CCCC1C(=O)C(NC(=O)OCc1ccccc1)C(
[04:20:05] SMILES Parse Error: Failed parsing SMILES 'CC(C)(C)NC(=O)C1CCCN1C(=O)C(Cc1ccccc1)NC(=O)C(NC(=O)c1ccc(O)cc1)C(Cc1ccccc1)C(O)CC(=O)N1CCCC1C(=O)C(NC(=O)OCc1ccccc1)C(' for input: 'CC(C)(C)NC(=O)C1CCCN1C(=O)C(Cc1ccccc1)NC(=O)C(NC(=O)c1ccc(O)cc1)C(Cc1ccccc1)C(O)CC(=O)N1CCCC1C(=O)C(NC(=O)OCc1ccccc1)C('
[04:20:05] SMIL

Cc1ccccc1N1CCN(c2ccc(C(=O)O)cc2)CC1
CCCCNc1nc(-c2ccc(S(=O)(=O)N3CCN(C(=O)c4ccc(OC)cc4)CC3)cc2)cs1
CCCOc1ccccc1N1C(=O)N2CCCC2
O=C(c1cccc([N+](=O)[O-])c1)N1CCN(c2ccccc2)CC1
O=C(NN=Cc1cccc(C=Cc2ccc3c(c2)OCCO3)c1)c1ccco1
CC(C)(C)C(=O)Nc1nc(N)c2ccccc2
Cc1ccc(CNC(=O)Cn2c(-c3cc(C)no3)nc3ccccc32)cc1
Cc1ccc(Nc2ncc3c(NCCC#N)nc(N)nc3n2)c(C)c1
COc1ccc(C(=O)Nc2cccc3c2cnn3-c2ccc(
COc1cc(C(=O)NC2CNC(N)=NC2c2ccccc2)ccc1OC
Cc1nc(NC(=O)CCCc2ccccc2)cc(C)c1C#N
CC(C)c1ccc(-c2nc3cc(CCNCCNCCCN(C)C)ccc3[nH]2)cc1
COC
CCN1CCN(C(=S)SSCC(=O)Nc2ccc3c(c2)OCO3)CC1
CC1=NN(CC(C)=O)C(=O)C2(C)C(CCC3C4CCC(O)(C)C4CCC32)C1=O
CSc1ncnc2c1ncn2C1OC(CO)C(O)
C=CCn1nc(-c2ccc(Cl)cc2)c2ccccc2c1=O
CC1CN(C(=O)c2ccc(N3CCC(CCNC(=N)N)CC3)cc2)CCC1N
CCCCCCCCCCCCCCCC=CCC=CCC=CCC=CCC=CCCCC(=O)NCCCCC
CC(=O)c1cc
CC(C)Oc1ccc(C=NNC(=O)c2ccco2)cc1
CN(C)CCn1nc(C(=O)N2CCN(CC(O)c3ccccc3)CC2)c2sc3c(c2c1=O)CCCC3
Cc1cc(C)c(C2(c3ccccc3)CCN(
Clc1ccc(S(=O)(=O)Nc2nc(N3CCOCC3)nc3ccccc23)cc1
Cc1nc(C=Cc2cccc(C#N)c2)c(C)n1C1CCN(C(=O)C2CCCNC2)CC1
CCCCCCCCCCCCC

[04:20:05] Can't kekulize mol.  Unkekulized atoms: 3 4 5 6 10 12 13 14 15 16 17
[04:20:05] SMILES Parse Error: syntax error while parsing: =C1C(=Cc2cccc(OCc3c[nH]cn3)c2)=C(O)c2ccccc21
Cc1ccccc1-c1nnc(NC(=O)CCNC(=O)c2cc(Cl)ccc2Cl)o1
CCOC(=O)c1cc(-c2ccc(OC)cc2)c(C)
[04:20:05] SMILES Parse Error: Failed parsing SMILES '=C1C(=Cc2cccc(OCc3c[nH]cn3)c2)=C(O)c2ccccc21
Cc1ccccc1-c1nnc(NC(=O)CCNC(=O)c2cc(Cl)ccc2Cl)o1
CCOC(=O)c1cc(-c2ccc(OC)cc2)c(C)' for input: '=C1C(=Cc2cccc(OCc3c[nH]cn3)c2)=C(O)c2ccccc21
Cc1ccccc1-c1nnc(NC(=O)CCNC(=O)c2cc(Cl)ccc2Cl)o1
CCOC(=O)c1cc(-c2ccc(OC)cc2)c(C)'
[04:20:05] Can't kekulize mol.  Unkekulized atoms: 2 4 5 7 8 11 12 13 15 16 17 18 19 20 22
[04:20:05] Can't kekulize mol.  Unkekulized atoms: 10 11 12 17 18 19 20 21 22
[04:20:05] Can't kekulize mol.  Unkekulized atoms: 9 10 11 12 13 14 15 16 28
[04:20:05] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 8
[04:20:05] Can't kekulize mol.  Unkekulized atoms: 13 14 15 16 17 18 19 20 21 22 23 24 25
[04:20:05] Can't keku

In [10]:
model = GPT2LMHeadModel.from_pretrained('goodgptmol.pt')

In [11]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [27]:
def validate_and_filter_smiles(submission_path, training_smiles):
    unique_smiles = set()  # To store unique and valid SMILES
    valid_count = 0  # Counter for valid SMILES

    with open(submission_path, 'r') as file:
        for line in file:
            smiles = line.strip()
            if smiles and smiles not in training_smiles and smiles not in unique_smiles:
                mol = Chem.MolFromSmiles(smiles)
                if mol:  # Checks if the SMILES is valid
                    unique_smiles.add(smiles)
                    valid_count += 1

    # Save the filtered SMILES to a new file
    with open('filtered_unique_smiles.txt', 'w') as file:
        for smiles in unique_smiles:
            file.write(smiles + '\n')

    print(f"Total valid and unique SMILES: {valid_count}")

validate_and_filter_smiles('submission_gpt2_finetune.txt', smiles_data)


[20:40:59] SMILES Parse Error: extra open parentheses for input: 'O=C(CSc1nnc(SCC(=O)O)n'
[20:40:59] SMILES Parse Error: unclosed ring for input: 'O=C1OC2(CCN(Cc3ccccc3OCc3ccccc3)CC2)C'
[20:40:59] SMILES Parse Error: extra open parentheses for input: 'COc1cc2c(cc1OC)CN1CCC(CNCc3ccc'
[20:40:59] SMILES Parse Error: extra open parentheses for input: 'Cc1ccc(NC(=O)CSc2ncnc3scc(-c4ccccc4)c23'
[20:40:59] SMILES Parse Error: syntax error while parsing: COC(=O)C(NC(=
[20:40:59] SMILES Parse Error: Failed parsing SMILES 'COC(=O)C(NC(=' for input: 'COC(=O)C(NC(='
[20:40:59] SMILES Parse Error: extra open parentheses for input: 'CCCCC(OC(=O)CCC(=O)O)(c1cc'
[20:40:59] SMILES Parse Error: unclosed ring for input: 'C=CCn1c'
[20:40:59] SMILES Parse Error: extra open parentheses for input: 'COC(=O)c1c(C)n(C'
[20:40:59] SMILES Parse Error: unclosed ring for input: 'Cc1cc(CN(C)C(=O)c2cc(C)no2)nc(NCCc2ccccc2)'
[20:40:59] SMILES Parse Error: extra open parentheses for input: 'Cc1ccc(S(=O)(=O)Nc2nnc(-c3ccc

Total valid and unique SMILES: 27347


[20:59:39] SMILES Parse Error: extra open parentheses for input: 'Nc1ccc(N=C'
[20:59:39] SMILES Parse Error: unclosed ring for input: 'Cc1cc'
[20:59:39] SMILES Parse Error: extra open parentheses for input: 'O=C(Nc1ccccc1)N1CCC(Oc2cccc3c2OC(Cn2cc(COc4'


In [34]:
import os
import pickle
from evaluation.utils import canonicalize_smiles, getstats, loadmodel
import fcd
import numpy as np

def compute_fcd_for_batch(smiles_list, model, ref_mean, ref_cov):
    results = []
    canonical_smiles = canonicalize_smiles(smiles_list)
    valid_smiles = [sm for sm in canonical_smiles if sm]
    if valid_smiles:
        mean_gen, cov_gen = getstats(valid_smiles, model)
        try:
            fcd_values = fcd.calculate_frechet_distance(mean_gen, cov_gen, ref_mean, ref_cov)
            print(fcd_values)
            results.extend(zip(valid_smiles, [fcd_values] * len(valid_smiles)))
        except ValueError:
            results.extend((sm, float('inf')) for sm in valid_smiles)  # Assign high FCD for failed cases
    return results

def process_in_batches(submission_smiles, batch_size, model, ref_mean, ref_cov):
    batch_results = []
    for i in range(0, len(submission_smiles), batch_size):
        batch = submission_smiles[i:i + batch_size]
        batch_results.extend(compute_fcd_for_batch(batch, model, ref_mean, ref_cov))
    return batch_results

# Load model and reference stats
model = loadmodel()
with open('./evaluation/data/test_stats.p', 'rb') as f:
    ref_mean, ref_cov = pickle.load(f)

# Load SMILES
with open('filtered_unique_smiles.txt', 'r') as f:
    submission_smiles = [line.strip() for line in f if line.strip()]

# Compute FCD in batches
batch_size = 500  # Define a reasonable batch size
fcd_results = process_in_batches(submission_smiles, batch_size, model, ref_mean, ref_cov)
sorted_fcd_results = sorted(fcd_results, key=lambda x: x[1])  # Sort by FCD score

# Save the top 10,000 SMILES
with open('top_10000_smiles.txt', 'w') as file:
    for smile, fcd_score in sorted_fcd_results[:10000]:
        file.write(f"{smile}\n")

print("Top 10,000 SMILES with the lowest FCD have been saved to 'top_10000_smiles.txt'.")

Saved ChemNet model to 'C:\Users\Ripple\AppData\Local\Temp\tmplh18fz2a\chemnet.pt'
5.244838315948215
5.371683116756444
5.158154555423167
5.214083475388989
5.186769718764907
5.419715115472314
5.323821334338945
5.34548230735399
5.468794618642178
5.088212274967248
5.406801003399863
5.377295691618258
4.969249774851519
5.2437332113168225
5.215168258053083
5.161996270017127
5.109632925747107
5.131794113487331
5.219704210337326
5.485106574589523
5.142841762334399
5.164375474946752
5.385023359789642
5.16296496788172
5.45129543276542
5.151291897111264
5.032119425721476
5.104290195728623
5.152440810356893
5.360349335160436
5.324799762607427
5.323508335564355
5.231622280382496
5.647635247564494
5.220655448650888
5.251770266716079
5.488358702396553
5.153339517802536
5.26242688475827
5.420597434254901
5.2640473240802095
5.172768778672079
5.337760646768416
5.236693333715223
5.284317612641544
5.149540392268406
4.962869879935965
5.375940615769082
5.282424004448572
5.37203495812021
5.22865423689521
5.5