In [1]:
using PyCall
using JLD

In [2]:
"""
    getdf(path)

Read the CSV file at `path` with pandas (through PyCall) and return the result of
`pandas.read_csv` as handed back by PyCall.

The Python helper `read_csv` is (re)defined in the Python session on every call.
"""
function getdf(path)
    py"""
    import pandas as pd

    def read_csv(path):
        return pd.read_csv(path)
    """
    return py"read_csv"(path)
end

"""
    dfToMatrix(df)

Convert a pandas-style data frame into a `Matrix{Float64}` with one row per entry of
`df.index` and one column per entry of `df.columns`.

Every value is converted with `convert(Float64, ...)`. A frame with an empty index
yields a `0 × length(df.columns)` matrix.

# Throws
- `DimensionMismatch` if a row does not contain exactly `length(df.columns)` values.
"""
function dfToMatrix(df)
    ncols = length(df.columns)

    # Collect the rows first and copy them into the result once; growing the matrix
    # with `vcat` on every row would copy all previous rows each time.
    rows = Vector{Vector{Float64}}()
    for i in df.index
        # NOTE(review): the `+ 1` presumably compensates for how PyCall maps integer
        # indices onto Python objects -- confirm against the PyCall version in use.
        row = Float64[convert(Float64, j) for j in df.loc[convert(Int64, i) + 1]]
        length(row) == ncols || throw(
            DimensionMismatch("row $i has $(length(row)) values, expected $ncols")
        )
        push!(rows, row)
    end

    data_matrix = Matrix{Float64}(undef, length(rows), ncols)
    for (r, row) in enumerate(rows)
        data_matrix[r, :] = row
    end

    return data_matrix
end

"""
    dfToStringMatrix(df)

Convert a pandas-style data frame into a matrix with one row per entry of `df.index`
and one column per entry of `df.columns`, keeping the values as they are.

The element type starts at `String` and is widened with `promote_type` if a row holds
other element types, so an all-string frame yields a `Matrix{String}`. A frame with an
empty index yields a `0 × length(df.columns)` `Matrix{String}`.

# Throws
- `DimensionMismatch` if a row does not contain exactly `length(df.columns)` values.
"""
function dfToStringMatrix(df)
    ncols = length(df.columns)

    # Collect the rows first and copy them into the result once; growing the matrix
    # with `vcat` on every row would copy all previous rows each time.
    rows = Vector{Vector}()
    for i in df.index
        # NOTE(review): the `+ 1` presumably compensates for how PyCall maps integer
        # indices onto Python objects -- confirm against the PyCall version in use.
        row = vec([j for j in df.loc[convert(Int64, i) + 1]])
        length(row) == ncols || throw(
            DimensionMismatch("row $i has $(length(row)) values, expected $ncols")
        )
        push!(rows, row)
    end

    # Same element type that concatenating onto a `String` matrix would produce.
    T = mapreduce(eltype, promote_type, rows; init=String)
    data_matrix = Matrix{T}(undef, length(rows), ncols)
    for (r, row) in enumerate(rows)
        data_matrix[r, :] = row
    end

    return data_matrix
end

dfToStringMatrix (generic function with 1 method)

In [3]:
# Load "sero_filtered_dataset.csv" from `@__DIR__` through the pandas-backed `getdf`.
df = getdf(joinpath(@__DIR__, "sero_filtered_dataset.csv"))

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CC[C@H]([C@@H]1[C@H](C[C@@](O1)(CC)[C@H]2CC[C@...,Active
1,CC1=C(OC2=C1C=C(C=C2)OC)C(=O)NC3=NC4=CC=CC=C4N3,Active
2,CCCN1C[C@@H](C[C@H]2[C@H]1CC3=CNC4=CC=CC2=C34)...,Active
3,C1CN(CCN1CC2=CC=CC=C2)C(=O)CCCN3C(=O)CSC4=C3C=...,Active
4,CC1=CC(=C(C(=C1)C2=NNC3=C2C(N(C3=O)CC4=CC=CO4)...,Active
...,...,...
4825,CC1=C(C(=NN1CC(=O)NC2=C(C=CC(=C2)S(=O)(=O)N(C)...,Inactive
4826,C1=CC=C2C(=C1)C=CC(=C2Br)OCC(=O)NC(=S)NC3=CC=C...,Inactive
4827,CCN(CC)CCNC(=O)CN1C(=O)C=CC(=N1)C2=CC=C(C=C2)C,Inactive
4828,C1C(=O)NC2=CC=CC=C2N1C(=O)COC(=O)C3=CC=C(O3)[N...,Inactive


In [4]:
# Convert the data frame into a Julia matrix; later cells read the first column as the
# SMILES string and the last column as the "Active"/"Inactive" label.
smiles = dfToStringMatrix(df)

4830×2 Matrix{String}:
 "CC[C@H]([C@@H]1[C@H](C[C@@](O1)" ⋯ 73 bytes ⋯ "C(=C(C=C3)C)O)C(=O)[O-])O.[Na+]"  …  "Active"
 "CC1=C(OC2=C1C=C(C=C2)OC)C(=O)NC3=NC4=CC=CC=C4N3"                                                                                             "Active"
 "CCCN1C[C@@H](C[C@H]2[C@H]1CC3=CNC4=CC=CC2=C34)CSC.CS(=O)(=O)O"                                                                               "Active"
 "C1CN(CCN1CC2=CC=CC=C2)C(=O)CCCN3C(=O)CSC4=C3C=CC=N4"                                                                                         "Active"
 "CC1=CC(=C(C(=C1)C2=NNC3=C2C(N(C3=O)CC4=CC=CO4)C5=CC(=CC=C5)O)O)C"                                                                            "Active"
 "COC1=CC=C(C=C1)NC(=O)CC2=NC3=CC=CC=C3N2"                                                                                                  …  "Active"
 "C1=CC2=C(C=C1O)C(=CN2)CCN.Cl"                                                                                           

In [5]:
# Import `return_tokens` and `SmilesEnumerator` from the Python module `smiles_tools`
# into the Python session so that later `py"..."` snippets can use them.
py"""
from smiles_tools import return_tokens
from smiles_tools import SmilesEnumerator
"""

In [8]:
# Vocabulary entries read from "vocab.csv" in `@__DIR__`.
vocab = dfToStringMatrix(getdf(joinpath(@__DIR__, "vocab.csv")))

# token => position of that token in `vocab` (1-based, in `enumerate` order)
tokenizer = Dict(token => index for (index, token) in enumerate(vocab))
# position => token, the inverse mapping of `tokenizer`
reverse_tokenizer = Dict(index => token for (token, index) in tokenizer)

Dict{Int64, String} with 71 entries:
  5  => "/"
  56 => "[NH3+]"
  16 => "B"
  35 => "[C@H]"
  55 => "[NH2+]"
  20 => "F"
  60 => "[Nd+3]"
  30 => "[Ba+2]"
  19 => "Cl"
  32 => "[C-]"
  49 => "[N+]"
  6  => "1"
  67 => "[S-]"
  45 => "[I-]"
  44 => "[Hg]"
  9  => "4"
  31 => "[Br-]"
  64 => "[Pt+2]"
  61 => "[O-]"
  29 => "[B-]"
  46 => "[K+]"
  57 => "[NH4+]"
  70 => "[Zn+2]"
  4  => "."
  13 => "8"
  ⋮  => ⋮

In [9]:
# Python helper `augment_smiles(string, n)`: creates one `SmilesEnumerator` and calls
# its `randomize_smiles(string)` `n` times, returning the results as a list.
py"""
def augment_smiles(string, n):
    sme = SmilesEnumerator()
    output = []
    for i in range(n):
        output.append(sme.randomize_smiles(string))
    
    return output
"""

# Thin Julia wrappers that forward their arguments to the Python functions of the
# same name.
augment_smiles(str, n) = py"augment_smiles"(str, n)
return_tokens(str) = py"return_tokens"(str)

return_tokens (generic function with 1 method)

In [10]:
# Number of randomized SMILES strings requested per original row.
n = 10

# Build every augmented row first and append them with a single concatenation.
# Appending with `vcat` inside the loop copies the whole (growing) matrix for each
# new row, and `smiles[:, begin][i]` copies an entire column per access.
augmented_rows = Matrix{String}[]

for i in axes(smiles, 1)
    for augmented in augment_smiles(smiles[i, begin], n)
        # Each augmented string keeps the label (last column) of its source row.
        push!(augmented_rows, String[augmented smiles[i, end]])
    end
end

# Original rows stay first; augmented rows follow in generation order.
if !isempty(augmented_rows)
    smiles = vcat(smiles, reduce(vcat, augmented_rows))
end

In [12]:
# One row per entry of the last `smiles` column: [1, 0] for "Active", [0, 1] otherwise.
activity = adjoint(
    reduce(hcat, [label == "Active" ? [1, 0] : [0, 1] for label in smiles[:, end]])
)

53130×2 adjoint(::Matrix{Int64}) with eltype Int64:
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 ⋮  
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1

In [13]:
"""
    standardizeCase(str)

Return `str` title-cased (via `titlecase`) with every remaining lowercase `h`
replaced by `H`.
"""
function standardizeCase(str)
    titled = titlecase(str)
    return replace(titled, "h" => "H")
end

standardizeCase (generic function with 1 method)

In [None]:
# Encode every row of `smiles`: the first column is tokenized with `return_tokens`
# and each token is mapped to its integer id through `tokenizer`; the last column is
# turned into a two-element label vector ([1, 0] for "Active", [0, 1] otherwise).
strings = Vector{Vector{Int}}()
activity = Vector{Vector{Int}}()
# Row indices that could not be encoded, kept so that dropped rows are not lost silently.
skipped = Int[]

for i in axes(smiles, 1)
    try
        tokens = return_tokens(smiles[i, begin])[begin]
        encoded = [tokenizer[standardizeCase(j)] for j in tokens]
        label = smiles[i, end] == "Active" ? [1, 0] : [0, 1]
        # Push only once both values exist so `strings` and `activity` stay aligned.
        push!(strings, encoded)
        push!(activity, label)
        # https://discourse.julialang.org/t/using-push/30935/2
    catch err
        # Best effort: a row that fails (for example a token missing from
        # `tokenizer`) is skipped, but an interrupt must still stop the loop.
        err isa InterruptException && rethrow()
        push!(skipped, i)
    end

    if i % 100 == 0
        println("$i | strings: $(length(strings)), activity: $(length(activity))")
    end
end

isempty(skipped) || @warn "Skipped $(length(skipped)) rows that could not be encoded"

100 | strings: 100, activity: 100
200 | strings: 200, activity: 200
300 | strings: 300, activity: 300
400 | strings: 400, activity: 400
500 | strings: 500, activity: 500
600 | strings: 600, activity: 600
700 | strings: 700, activity: 700
800 | strings: 800, activity: 800
900 | strings: 900, activity: 900
1000 | strings: 1000, activity: 1000
1100 | strings: 1100, activity: 1100
1200 | strings: 1200, activity: 1200
1300 | strings: 1300, activity: 1300


In [13]:
# Earlier one-shot formulation, kept for reference:
# strings = [[tokenizer[standardizeCase(j)] for j in return_tokens(i)[begin]] for i in smiles[:, begin]]
# activity = reduce(hcat, [i == "Active" ? [1, 0] : [0, 1] for i in smiles[:, end]])'

# Stack the collected label vectors so that each one becomes a row of `activity`.
activity = adjoint(reduce(hcat, activity))

# Every encoded sequence must have exactly one label row.
@assert length(strings) == size(activity, 1)

In [14]:
# Decode a sequence of token ids back into a string; ids that are not keys of
# `reverse_tokenizer` contribute an empty string.
convert_back(x) = join(get(reverse_tokenizer, id, "") for id in x)

convert_back (generic function with 1 method)

In [15]:
# Length of the longest encoded sequence; used as the padding target.
max_length = maximum(length, strings)

190

In [16]:
"""
    pad_features(input_strings, length_max)

Bring every sequence in `input_strings` to exactly `length_max` entries.

Sequences shorter than `length_max` are left-padded with `Int64` zeros; all other
sequences are cut down to their first `length_max` entries. The results are returned
in input order as a `Vector{Any}`; the input sequences are not modified.
"""
function pad_features(input_strings, length_max)
    function padleft(tokens)
        missing_count = length_max - size(tokens)[1]
        # Nothing to pad: return a copy limited to the first `length_max` entries.
        missing_count > 0 || return tokens[1:length_max]
        return append!(zeros(Int64, missing_count), tokens)
    end

    return Any[padleft(tokens) for tokens in input_strings]
end

pad_features (generic function with 1 method)

In [17]:
# Left-pad (or cut) every encoded sequence to `max_length` entries.
padded_features = pad_features(strings, max_length)

156844-element Vector{Any}:
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 15, 18, 18, 15, 18, 9, 3, 23, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 15, 18, 18, 15, 18, 9, 3, 23, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 18, 15, 18, 18, 15, 18, 8, 3, 22]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  15, 18, 18, 15, 18, 18, 15, 18, 8, 17]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  3, 18, 3, 23, 10, 3, 18, 3, 18, 23]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  15, 18, 2, 18, 15, 18, 7, 23, 3, 23]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 6, 18, 3, 18, 18, 2, 18, 3, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  3, 23, 3, 23, 3, 23, 3, 23, 3, 23]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 18, 15, 18, 11, 10, 3, 18, 3, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  8, 3, 23, 3, 23, 18, 3, 18, 3, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 8, 3, 23, 18, 3, 23, 18, 3, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 9, 15, 22, 25, 22, 15, 18, 9, 8]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  19, 3, 49, 2, 15, 23, 3, 61, 3, 23]
 ⋮
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  

In [18]:
# Save the padded integer sequences to "unencoded_data.jld" under the key "features"
# (with `compress=true`); the rest of the processing is then done in Python.
save("unencoded_data.jld", "features", padded_features, compress=true)

In [19]:
# Save the labels to "augmented_activity.jld" under the key "activity";
# `Matrix(activity)` turns the adjoint wrapper into a plain matrix before saving.
save("augmented_activity.jld", "activity", Matrix(activity), compress=true)

In [20]:
"""
    onehot(x)

Return a zero vector with one more entry than `tokenizer` has keys, with the entry
at position `x + 1` set to 1 (so the padding id 0 maps to position 1).
"""
function onehot(x)
    encoded = zeros(length(tokenizer) + 1)
    encoded[x + 1] = 1
    return encoded
end

onehot (generic function with 1 method)

In [21]:
# One-hot encode every id of every padded sequence, then report how many sequences
# were encoded.
parsed = map(sequence -> onehot.(sequence), padded_features)
println(length(parsed))

156844


In [25]:
# py"""
# import numpy as np 

# def to_numpy(x):
#     return np.array(x)
# """

# py"to_numpy"(parsed)

In [None]:
# Save the one-hot encoded sequences to "encoded_data.jld" under the key "encoded_data".
save("encoded_data.jld", "encoded_data", parsed, compress=true)

In [None]:
# Turn each sequence of one-hot vectors into one matrix with a row per vector.
matrix_parsed = map(parsed) do vectors
    mapreduce(permutedims, vcat, vectors)
end

In [None]:
# Features: one encoded matrix per entry of `strings`.
X = [matrix_parsed[idx] for idx in eachindex(strings)]
# Labels: the `activity` matrix converted to `Float32`.
Y = convert(Matrix{Float32}, activity)

In [None]:
# Save features `X` and labels `Y` together in "augmented_data.jld" (with `compress=true`).
save("augmented_data.jld", "X", X, "Y", Y, compress=true)