In [34]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import safe

# Seed torch's global RNG so the sampled molecules are reproducible on a
# fresh kernel (Restart & Run All). Change SEED to draw a different batch.
SEED = 42
torch.manual_seed(SEED)

# Load the trained model and tokenizer from the fine-tuned checkpoint directory
checkpoint_path = ".saved_model/phi1_5-safmol_0528/checkpoint-1600"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path)

# Set the model to evaluation mode
model.eval()

# Define generation parameters
generation_args = {
    "max_length": 128,  # Upper bound on total sequence length (prompt token included)
    "num_return_sequences": 10,  # Number of sequences to generate
    "temperature": 1.0,  # Sampling temperature
    "do_sample": True,  # Use sampling instead of greedy decoding
}

# Generate without a real prompt: a single special token acts as a dummy start.
# The pad token is preferred (the notebook's original choice); some tokenizers
# define no pad token, in which case fall back to BOS and then EOS rather than
# crashing on torch.tensor([[None]]).
start_token_id = next(
    (
        token_id
        for token_id in (
            tokenizer.pad_token_id,
            tokenizer.bos_token_id,
            tokenizer.eos_token_id,
        )
        if token_id is not None
    ),
    None,
)
if start_token_id is None:
    raise ValueError(
        "Tokenizer defines no pad, bos or eos token; cannot build a start token "
        "for unconditional generation."
    )

# Build the one-token prompt on the same device as the model weights.
input_ids = torch.tensor([[start_token_id]], device=model.device)

# Pass an explicit attention mask so generate() does not have to infer one,
# which is ambiguous when the prompt itself is the pad token.
attention_mask = torch.ones_like(input_ids)

with torch.no_grad():
    generated_sequences = model.generate(
        input_ids,
        attention_mask=attention_mask,
        **generation_args,
    )

# Decode the generated sequences into text

def decode_generated_sequences(tokenizer, generated_sequences):
    """Decode every generated sequence into a string.

    Args:
        tokenizer: Object exposing ``decode(sequence, skip_special_tokens=...)``.
        generated_sequences: Iterable of token-id sequences; each item is
            passed to ``tokenizer.decode`` unchanged.

    Returns:
        A list with one decoded string per input sequence, in iteration order.
        Special tokens are dropped (``skip_special_tokens=True``).
    """
    return [
        tokenizer.decode(token_ids, skip_special_tokens=True)
        for token_ids in generated_sequences
    ]
# One decoded string per generated sequence; reused by the reporting cell.
decoded_texts = decode_generated_sequences(
    tokenizer=tokenizer,
    generated_sequences=generated_sequences,
)

In [35]:

# Label every decoded string with its 1-based position in the batch
formatted_texts = [
    f"String {position}: {text}"
    for position, text in enumerate(decoded_texts, start=1)
]

# Print the generated SAFE strings
print("Generated SAFE strings:\n" + "\n".join(formatted_texts))

# Collect the distinct lengths (in characters) of the decoded strings.
# The lengths are taken from `decoded_texts`, not `generated_sequences`:
# generate() returns a single rectangular tensor, so every row of it has the
# same length and the set would always contain exactly one value.
unique_lengths = {len(text) for text in decoded_texts}

# Print the unique lengths
print("Unique lengths of generated sequences:", unique_lengths)


Generated SAFE strings:
String 1: C3ccc(F)sc1nc5ccn(F)c(F)ccc(F)cc1F.C5(F)(S34)C1=OCCO94.C914.C9CC(F)(F)F(F[P(F)(F)F(F[)(F[)F1[)C16C=C1[O-].C76.CC04.n15CCCC
String 2: C45C3n2nACCC2=N12.C=O.c13ccccc1.C88=O.[C@H]12[C@H]8C26.N16CCCCC5CC1.C5(C9=O)C3C(=O)c2ccccn2[1.C9[C(=O)C17.C38.C319C.C17CC.C9(=O
String 3: C7.C763.C1C(C)C.c16ccc3nc16.C13P(Cl)c2ccccn5n12[Cl3c1ccc(Cl)cc14.c14CN(=N)C=CCC=3c2cnccc2n1.C83.c18ccc8ccc15.O35[CCO16.c16
String 4: C8CC4.c14cccnc1.c15cnnn6c1.O63.C36CN5CC4.[C@@H]7=O.[C@@H]7(O)(O64.c2cccc(CO2)c3ccccc3)CO.C3(C)(C)C.C21CC1C1C1C1C[n
String 5: Cc1ccc8nc2nc14.C58=O.N=C(O)(C)CN=2.C3(F)F.N32.C4.C14CCC=5.C=5CC1.N/C4(F)C(F)F(=O)C(F)F1CCn(C)3[CC1.C3(F)F(F)F1CC11O.
String 6: Cncn3n1ncc2ccCl8c8Cc2c1.[C@H]18[nH]Oc1[nH]c7c(F)cccc(F)cc18.[C@H]17[C@(C)C[nH]5[nH]5OC\52.C67.C39.O36.C19CCN(C1)CCN4CCC3.[C@H](
String 7: C7=nc2c([C@@H]3c3ccc%10cc2cccc5ccc2o1.c15cc8cc(Cl)c(N)cc9n1.n19[C@@H](C)C(C)C/C(C)CC=CC3C.C==c2cccccc2c1C26=O=C9C(C)C=C3C
String 8: CN=C4O.c14ccc(N)cc1.C56.c15cccc(F)c1cc[n