In [1]:
using Flux
using Flux.Data: DataLoader
using PyCall
using TOML
using Pkg
using JLD
using DelimitedFiles
using Plots
using PlotThemes
using BSON: @save

In [2]:
# Look up the project environment path in the local `mltools.toml` config
# and make that environment the active one.
env_path = TOML.parsefile("mltools.toml")["env_path"]
Pkg.activate(env_path)

[32m[1m  Activating[22m[39m environment at `C:\Users\nikhi\OneDrive\Documents\julia\MLTools\Project.toml`


In [3]:
import MLTools as mlt

In [4]:
# Load `filtered_dataset.csv` (path resolved relative to `@__DIR__`) through
# MLTools' `getdf` helper.
df = mlt.getdf(joinpath(@__DIR__, "filtered_dataset.csv"))

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=C(C=C3)B...,Active
1,COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=CC=C3)N)...,Active
2,C1CN(CCN1)C(=O)C2=C(SC(=C2)C3=CC=CC=C3)N,Active
3,C1=CC=C(C=C1)C(=O)C2=C(C(=C(N2)N)C(=O)N)C3=CC=...,Active
4,CC1=C(C(=O)OC(C1)[C@@H](C)C2CC[C@@H]3[C@@]2(CC...,Active
...,...,...
18229,C1CN(CCC1CNC(=O)NC2=CC=CC=C2)C3=CC=C(C=C3)S(=O...,Inactive
18230,CN1C(=O)C(=C(N(C1=O)CC2=CC=CC=C2)N)C(=O)CN3CCOCC3,Inactive
18231,COC1=C(C=C(C=C1)Cl)NC(=S)NCCC2=CC=C(C=C2)S(=O)...,Inactive
18232,C1CN(CCN1C2=CC=C(C=C2)[N+](=O)[O-])C(=O)CN3C(=...,Inactive


In [5]:
# Convert the loaded table into a string matrix. The cells below read the first
# column as the SMILES string and the last column as the activity label.
smiles = mlt.dfToStringMatrix(df)

18234×2 Matrix{String}:
 "COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=C(C=C3)Br)N)C(=O)C4=CC=CC=C4)OC"                                                         …  "Active"
 "COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=CC=C3)N)C(=O)C4=CC=CC=C4)OC"                                                                "Active"
 "C1CN(CCN1)C(=O)C2=C(SC(=C2)C3=CC=CC=C3)N"                                                                                         "Active"
 "C1=CC=C(C=C1)C(=O)C2=C(C(=C(N2)N)C(=O)N)C3=CC=CC=C3Br"                                                                            "Active"
 "CC1=C(C(=O)OC(C1)[C@@H](C)C2CC[C@@H]3[C@@]2(CCC4C3C[C@@H]5[C@]6([C@@]4(C(=O)C=C[C@@H]6O)C)O5)C)CO"                                "Active"
 "CC1CCC/C=C/C(=O)C2=C(CC(=O)O1)C=C(C=C2O)O"                                                                                     …  "Active"
 "CC1C2C(NC(=O)C23C(/C=C(/CCC(C(/C=C/C3=O)O)O)\\C)C=C1C)CC(C)C"                                                                   

In [6]:
"""
    readPyFile(path)

Execute the Python source file at `path` inside the PyCall Python namespace, so
that the names it defines become reachable from later `py"..."` expressions.

The file is opened in a Python `with` block so its handle is closed as soon as
the source has been read, instead of being left for the garbage collector.
Returns `nothing`.
"""
function readPyFile(path)
    py"""
    with open($(path)) as source_file:
        exec(source_file.read(), globals(), locals())
    """
    return nothing
end

readPyFile("tokenization.py")
readPyFile("smilesenumeration.py")

In [7]:
# Token vocabulary, loaded from `vocab.csv` with the same MLTools helpers used
# for the dataset.
vocab = mlt.dfToStringMatrix(mlt.getdf(joinpath(@__DIR__, "vocab.csv")))

# token => integer id, where the id is the token's 1-based position in `vocab`.
tokenizer = Dict(token => id for (id, token) in enumerate(vocab))
# Inverse lookup: integer id => token.
reverse_tokenizer = Dict(id => token for (token, id) in tokenizer)

Dict{Int64, String} with 71 entries:
  5  => "/"
  56 => "[NH3+]"
  16 => "B"
  35 => "[C@H]"
  55 => "[NH2+]"
  20 => "F"
  60 => "[Nd+3]"
  30 => "[Ba+2]"
  19 => "Cl"
  32 => "[C-]"
  49 => "[N+]"
  6  => "1"
  67 => "[S-]"
  45 => "[I-]"
  44 => "[Hg]"
  9  => "4"
  31 => "[Br-]"
  64 => "[Pt+2]"
  61 => "[O-]"
  29 => "[B-]"
  46 => "[K+]"
  57 => "[NH4+]"
  70 => "[Zn+2]"
  4  => "."
  13 => "8"
  ⋮  => ⋮

In [8]:
py"""
def augment_smiles(string, n):
    # One enumerator is shared by all n calls to randomize_smiles.
    sme = SmilesEnumerator()
    return [sme.randomize_smiles(string) for _ in range(n)]
"""

# Thin Julia wrappers that forward to the Python functions of the same name.
augment_smiles(str, n) = py"augment_smiles"(str, n)
return_tokens(str) = py"return_tokens"(str)

return_tokens (generic function with 1 method)

In [9]:
# Number of augmented variants requested per original SMILES string.
n = 10

# Collect the augmented rows in two flat columns and append them to `smiles`
# with a single `vcat` afterwards. Calling `vcat` on the growing matrix inside
# the loop re-copies every existing row on each iteration (quadratic work), and
# `smiles[:, begin][i]` copies a whole column just to read one element.
extra_smiles = String[]
extra_labels = String[]

for i in axes(smiles, 1)
    for augmented in augment_smiles(smiles[i, begin], n)
        push!(extra_smiles, augmented)
        # Every augmented variant inherits the label of the row it came from.
        push!(extra_labels, smiles[i, end])
    end
end

# Same row order as before: original rows first, then the variants grouped by
# source row. The trailing `;` keeps the notebook from printing the matrix.
smiles = vcat(smiles, hcat(extra_smiles, extra_labels));

In [10]:
# One row per sample: `[1, 0]` for "Active", `[0, 1]` for anything else.
activity = adjoint(
    reduce(hcat, map(outcome -> outcome == "Active" ? [1, 0] : [0, 1], smiles[:, end])),
)

200574×2 adjoint(::Matrix{Int64}) with eltype Int64:
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 1  0
 ⋮  
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1
 0  1

In [11]:
"""
    standardizeCase(str)

Return `str` title-cased, with every remaining lowercase `h` turned back into
`H` (title-casing lowercases an `H` that follows another letter, e.g. in `NH`).
"""
function standardizeCase(str)
    titled = titlecase(str)
    return replace(titled, "h" => "H")
end

standardizeCase (generic function with 1 method)

In [12]:
# Token-id sequences and their one-hot labels. The two vectors stay
# index-aligned because both pushes happen only after a row has been fully
# tokenized and encoded.
strings = Vector{Int}[]
activity = Vector{Int}[]
# Row indices of `smiles` that could not be encoded, kept so that dropped rows
# are visible instead of disappearing silently.
skipped = Int[]

for i in axes(smiles, 1)
    try
        tokens = return_tokens(smiles[i, begin])[begin]
        encoded = Int[tokenizer[standardizeCase(token)] for token in tokens]
        label = smiles[i, end] == "Active" ? [1, 0] : [0, 1]
        push!(strings, encoded)
        push!(activity, label)
        # https://discourse.julialang.org/t/using-push/30935/2
    catch err
        # Best-effort encoding: a row that fails (e.g. a token missing from the
        # vocabulary) is recorded and skipped, but a user interrupt must still
        # stop the loop.
        err isa InterruptException && rethrow()
        push!(skipped, i)
    end

    if i % 100 == 0
        println("$i | strings: $(length(strings)), activity: $(length(activity))")
    end
end

println("skipped $(length(skipped)) of $(size(smiles, 1)) rows that could not be tokenized")

100 | strings: 100, activity: 100
200 | strings: 200, activity: 200
300 | strings: 300, activity: 300
400 | strings: 400, activity: 400
500 | strings: 500, activity: 500
600 | strings: 600, activity: 600
700 | strings: 700, activity: 700
800 | strings: 800, activity: 800
900 | strings: 900, activity: 900
1000 | strings: 1000, activity: 1000
1100 | strings: 1100, activity: 1100
1200 | strings: 1200, activity: 1200
1300 | strings: 1300, activity: 1300
1400 | strings: 1400, activity: 1400
1500 | strings: 1500, activity: 1500
1600 | strings: 1600, activity: 1600
1700 | strings: 1700, activity: 1700
1800 | strings: 1800, activity: 1800
1900 | strings: 1900, activity: 1900
2000 | strings: 2000, activity: 2000
2100 | strings: 2100, activity: 2100
2200 | strings: 2200, activity: 2200
2300 | strings: 2300, activity: 2300
2400 | strings: 2400, activity: 2400
2500 | strings: 2500, activity: 2500
2600 | strings: 2600, activity: 2600
2700 | strings: 2700, activity: 2700
2800 | strings: 2800, activi

In [13]:
# At this point `activity` is a vector of 2-element label vectors, one per
# successfully tokenized sample. Join them as columns and take the adjoint so
# that each row holds one sample's label.
activity = adjoint(reduce(hcat, activity))

# Sequences and labels must still line up one-to-one.
@assert length(strings) == size(activity)[begin]

In [14]:
# Decode a sequence of token ids back into a string. Ids with no entry in
# `reverse_tokenizer` (such as the padding id 0) contribute nothing.
convert_back(x) = join(get(reverse_tokenizer, id, "") for id in x)

convert_back (generic function with 1 method)

In [15]:
# Length of the longest token sequence; used as the common padded length.
max_length = maximum(length, strings)

190

In [16]:
"""
    pad_features(input_strings, length_max)

Bring every sequence in `input_strings` to exactly `length_max` entries.

Sequences shorter than `length_max` are left-padded with `Int64` zeros;
sequences of that length or longer are cut down to their first `length_max`
entries. The inputs are not modified. Returns a `Vector{Any}` with one
adjusted sequence per input, in the same order.
"""
function pad_features(input_strings, length_max)
    # Adjust a single sequence to the target length.
    function fit(tokens)
        shortfall = length_max - size(tokens, 1)
        shortfall > 0 || return tokens[1:length_max]
        return append!(zeros(Int64, shortfall), tokens)
    end

    features = []
    for tokens in input_strings
        push!(features, fit(tokens))
    end
    return features
end

pad_features (generic function with 1 method)

In [17]:
# Left-pad every token-id sequence with zeros up to the longest sequence length.
padded_features = pad_features(strings, max_length)

156844-element Vector{Any}:
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 15, 18, 18, 15, 18, 9, 3, 23, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 15, 18, 18, 15, 18, 9, 3, 23, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 18, 15, 18, 18, 15, 18, 8, 3, 22]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  15, 18, 18, 15, 18, 18, 15, 18, 8, 17]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  3, 18, 3, 23, 10, 3, 18, 3, 18, 23]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  15, 18, 2, 18, 15, 18, 7, 23, 3, 23]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 6, 18, 3, 18, 18, 2, 18, 3, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  3, 23, 3, 23, 3, 23, 3, 23, 3, 23]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 18, 15, 18, 11, 10, 3, 18, 3, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  8, 3, 23, 3, 23, 18, 3, 18, 3, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 8, 3, 23, 18, 3, 23, 18, 3, 18]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  18, 9, 15, 22, 25, 22, 15, 18, 9, 8]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  19, 3, 49, 2, 15, 23, 3, 61, 3, 23]
 ⋮
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0  …  

In [18]:
# save to jld and then process rest in python
# save("unencoded_data.jld", "features", padded_features, compress=true)

In [19]:
# Write the labels to `augmented_activity.jld` under the key "activity".
# `Matrix(activity)` materializes the lazy adjoint into a plain dense matrix.
save("augmented_activity.jld", "activity", Matrix(activity), compress=true)

In [20]:
"""
    onehot(x)

Return a one-hot vector for token id `x`, of length `length(tokenizer) + 1`.

The id is shifted by one when indexing, so id `0` (the value `pad_features`
pads with) maps to the first slot and token id `k` maps to slot `k + 1`.
"""
function onehot(x)
    encoded = zeros(length(tokenizer) + 1)
    encoded[x + 1] = 1
    return encoded
end

onehot (generic function with 1 method)

In [21]:
# One-hot encode every token id of every padded sequence.
parsed = map(sequence -> onehot.(sequence), padded_features)
println(length(parsed))

156844


In [25]:
# py"""
# import numpy as np 

# def to_numpy(x):
#     return np.array(x)
# """

# py"to_numpy"(parsed)

In [None]:
# Write the one-hot encoded sequences to `encoded_data.jld` under the key
# "encoded_data", with compression enabled.
save("encoded_data.jld", "encoded_data", parsed, compress=true)

In [None]:
# Turn each sample's list of one-hot vectors into a single matrix with one row
# per sequence position.
matrix_parsed = map(sample -> mapreduce(permutedims, vcat, sample), parsed)

In [None]:
# X: one encoded matrix per tokenized sample. Y: the labels as Float32.
X = [matrix_parsed[sample] for sample in eachindex(strings)]
Y = convert(Matrix{Float32}, activity)

In [None]:
# Write features and labels together to `augmented_data.jld` under the keys
# "X" and "Y", with compression enabled.
save("augmented_data.jld", "X", X, "Y", Y, compress=true)