In [1]:
using PyCall
using JLD
include("df_parser.jl")

py"""
from smiles_tools import return_tokens
from smiles_tools import SmilesEnumerator
from SmilesPE.pretokenizer import atomwise_tokenizer
"""

py"""
def augment_smiles(string, n):
    sme = SmilesEnumerator()
    output = []
    for i in range(n):
        output.append(sme.randomize_smiles(string))
    
    return output
"""

augment_smiles(str, n) = py"augment_smiles"(str, n)
atomwise_tokenizer(str) = py"atomwise_tokenizer"(str)
return_tokens(str, vocab) = py"return_tokens"(str, vocab)

return_tokens (generic function with 1 method)

In [2]:
n = 10
max_length = 196
override = true

true

In [3]:
df = df_parser.getdf(joinpath(@__DIR__, "sero_2_filtered_dataset.csv"))

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CC[C@H]([C@@H]1[C@H](C[C@@](O1)(CC)[C@H]2CC[C@...,Active
1,CC1=C(OC2=C1C=C(C=C2)OC)C(=O)NC3=NC4=CC=CC=C4N3,Active
2,CCCN1C[C@@H](C[C@H]2[C@H]1CC3=CNC4=CC=CC2=C34)...,Active
3,C1CN(CCN1CC2=CC=CC=C2)C(=O)CCCN3C(=O)CSC4=C3C=...,Active
4,CC1=CC(=C(C(=C1)C2=NNC3=C2C(N(C3=O)CC4=CC=CO4)...,Active
...,...,...
4825,CC(=O)C1=NN(C(=O)NC1=O)C2=CC=C(C=C2)OC,Inactive
4826,C[C@H]1CN(C(=O)C2=C(C(=CC=C2)NS(=O)(=O)C3=CC=C...,Inactive
4827,COC1=C(C=C(C=C1)/C=C(/C#N)\C(=O)NC2=CC(=CC=C2)...,Inactive
4828,C1CSC2=N[C@H](CN21)C3=CC=CC=C3.Cl,Inactive


In [4]:
println("Generating augmentations...")

smiles = let temp_df
    temp_df = df_parser.dfToStringMatrix(df)
    for i in 1:length(temp_df[:, begin])
        for augmented in augment_smiles(temp_df[:, begin][i], n)
            temp_df = vcat(temp_df, String[augmented temp_df[:, end][i]])
        end
    end
    temp_df
end

println("Generated augmented dataframe, now processing tokens...")

Generating augmentations...
Generated augmented dataframe, now processing tokens...


In [5]:
activity = reduce(hcat, [i == "Active" ? [1, 0] : [0, 1] for i in smiles[:, end]])'

53130×2 adjoint(::Matrix{Int64}) with eltype Int64:
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 ⋮  
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1

In [41]:
strings = []
activity = []

vocab_path = joinpath(@__DIR__, "vocab.csv")

vocab = df_parser.dfToStringMatrix(df_parser.getdf(vocab_path))

tokenizer = Dict(j => i for (i, j) in enumerate(vocab))
reverse_tokenizer = Dict(value => key for (key, value) in tokenizer)

for i in 1:length(smiles[:, begin])
    returned_tokens, validToken = return_tokens(smiles[:, begin][i], tokenizer)
    if validToken && override
        println("$i | Overriding token")
        continue
    elseif validToken && !override
        throw("Not a valid token")
    end

    processed_tokens = [tokenizer[j] for j in returned_tokens]
    if typeof(max_length) != Bool && length(processed_tokens) <= max_length
        push!(strings, processed_tokens)
        push!(activity, smiles[:, end][i] == "Active" ? [1, 0] : [0, 1])
    end

    if i % 100 == 0
        println("$i | strings: $(length(strings)), activity: $(length(activity))")
    end
end

100 | strings: 100, activity: 100
200 | strings: 200, activity: 200
300 | strings: 300, activity: 300
400 | strings: 400, activity: 400
500 | strings: 500, activity: 500
600 | strings: 600, activity: 600
700 | strings: 700, activity: 700
800 | strings: 800, activity: 800
900 | strings: 900, activity: 900
1000 | strings: 1000, activity: 1000
1100 | strings: 1100, activity: 1100
1200 | strings: 1200, activity: 1200
1300 | strings: 1300, activity: 1300
1400 | strings: 1400, activity: 1400
1500 | strings: 1500, activity: 1500
1600 | strings: 1600, activity: 1600
1700 | strings: 1700, activity: 1700
1800 | strings: 1800, activity: 1800
1900 | strings: 1900, activity: 1900
2000 | strings: 2000, activity: 2000
2100 | strings: 2100, activity: 2100
2200 | strings: 2200, activity: 2200
2300 | strings: 2300, activity: 2300
2400 | strings: 2400, activity: 2400
2500 | strings: 2500, activity: 2500
2600 | strings: 2600, activity: 2600
2700 | strings: 2700, activity: 2700
2800 | strings: 2800, activi

21400 | strings: 21400, activity: 21400
21500 | strings: 21500, activity: 21500
21600 | strings: 21600, activity: 21600
21700 | strings: 21700, activity: 21700
21800 | strings: 21800, activity: 21800
21900 | strings: 21900, activity: 21900
22000 | strings: 22000, activity: 22000
22100 | strings: 22100, activity: 22100
22200 | strings: 22200, activity: 22200
22300 | strings: 22300, activity: 22300
22400 | strings: 22400, activity: 22400
22500 | strings: 22500, activity: 22500
22600 | strings: 22600, activity: 22600
22700 | strings: 22700, activity: 22700
22800 | strings: 22800, activity: 22800
22900 | strings: 22900, activity: 22900
23000 | strings: 23000, activity: 23000
23100 | strings: 23100, activity: 23100
23200 | strings: 23200, activity: 23200
23300 | strings: 23300, activity: 23300
23400 | strings: 23400, activity: 23400
23500 | strings: 23500, activity: 23500
23600 | strings: 23600, activity: 23600
23700 | strings: 23700, activity: 23700
23800 | strings: 23800, activity: 23800


41900 | strings: 41900, activity: 41900
42000 | strings: 42000, activity: 42000
42100 | strings: 42100, activity: 42100
42200 | strings: 42200, activity: 42200
42300 | strings: 42300, activity: 42300
42400 | strings: 42400, activity: 42400
42500 | strings: 42500, activity: 42500
42600 | strings: 42600, activity: 42600
42700 | strings: 42700, activity: 42700
42800 | strings: 42800, activity: 42800
42900 | strings: 42900, activity: 42900
43000 | strings: 43000, activity: 43000
43100 | strings: 43100, activity: 43100
43200 | strings: 43200, activity: 43200
43300 | strings: 43300, activity: 43300
43400 | strings: 43400, activity: 43400
43500 | strings: 43500, activity: 43500
43600 | strings: 43600, activity: 43600
43700 | strings: 43700, activity: 43700
43800 | strings: 43800, activity: 43800
43900 | strings: 43900, activity: 43900
44000 | strings: 44000, activity: 44000
44100 | strings: 44100, activity: 44100
44200 | strings: 44200, activity: 44200
44300 | strings: 44300, activity: 44300


In [42]:
strings[begin:5]

5-element Vector{Any}:
 [18, 18, 32, 2, 30, 7, 32, 2, 18, 31  …  18, 2, 16, 23, 3, 48, 3, 23, 5, 47]
 [18, 18, 7, 16, 18, 2, 23, 18, 8, 16  …  18, 18, 16, 18, 18, 16, 18, 10, 22, 9]
 [18, 18, 18, 22, 7, 18, 30, 2, 18, 32  …  25, 2, 16, 23, 3, 2, 16, 23, 3, 23]
 [18, 7, 18, 22, 2, 18, 18, 22, 7, 18  …  16, 18, 9, 18, 16, 18, 18, 16, 22, 10]
 [18, 18, 7, 16, 18, 18, 2, 16, 18, 2  …  18, 16, 18, 11, 3, 23, 3, 23, 3, 18]

In [43]:
activity = reduce(hcat, activity)'

53130×2 adjoint(::Matrix{Int64}) with eltype Int64:
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 ⋮  
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1