In [1]:
using PyCall
using JLD

In [2]:
"""
    getdf(path)

Read the CSV file at `path` with pandas (through PyCall) and return the
object produced by `pd.read_csv`.
"""
function getdf(path)
    # The Python helper is (re)defined on every call and then invoked with `path`.
    py"""
    import pandas as pd

    def read_csv(path):
        return pd.read_csv(path)
    """
    return py"read_csv"(path)
end

"""
    dfToMatrix(df)

Convert every row of `df` to `Float64` values and return them as a
`Matrix{Float64}` with one matrix row per entry of `df.index`.

`df` only needs to expose `columns`, `index` and `loc`. When `df.index` is
empty the result is a `0 × length(df.columns)` matrix.
"""
function dfToMatrix(df)
    ncols = length(df.columns)

    # Seed the list with an empty, correctly shaped matrix so the result keeps
    # its column count even when there are no rows at all.
    rows = Matrix{Float64}[Array{Float64}(undef, 0, ncols)]

    for i in df.index
        # NOTE(review): the `+ 1` presumably compensates for PyCall's 1-based
        # integer indexing of Python objects -- confirm against PyCall docs.
        row_values = Float64[convert(Float64, j) for j in df.loc[convert(Int64, i) + 1]]
        push!(rows, permutedims(row_values))
    end

    # Concatenate once instead of growing the matrix with `vcat` per row,
    # which re-copied all previously collected rows on every iteration.
    return reduce(vcat, rows)
end

"""
    dfToStringMatrix(df)

Collect every row of `df` into a matrix with one matrix row per entry of
`df.index` and `length(df.columns)` columns.

The result is a `Matrix{String}` when all cells are strings (the element type
is promoted together with `String` otherwise). When `df.index` is empty the
result is a `0 × length(df.columns)` `Matrix{String}`.
"""
function dfToStringMatrix(df)
    ncols = length(df.columns)

    # Seed the list with the empty `String` matrix so that it takes part in
    # the element-type promotion and fixes the column count for empty input.
    rows = AbstractMatrix[Array{String}(undef, 0, ncols)]

    for i in df.index
        # NOTE(review): the `+ 1` presumably compensates for PyCall's 1-based
        # integer indexing of Python objects -- confirm against PyCall docs.
        cells = [j for j in df.loc[convert(Int64, i) + 1]]
        push!(rows, reshape(cells, 1, ncols))
    end

    # Concatenate once instead of growing the matrix with `vcat` per row,
    # which re-copied all previously collected rows on every iteration.
    return reduce(vcat, rows)
end

py"""
from smiles_tools import return_tokens
from smiles_tools import SmilesEnumerator
from SmilesPE.pretokenizer import atomwise_tokenizer
import os
"""

# Python-side helper: builds a list of `n` strings, each obtained by calling
# `SmilesEnumerator().randomize_smiles(string)`, and returns that list.
py"""
def augment_smiles(string, n):
    sme = SmilesEnumerator()
    output = []
    for i in range(n):
        output.append(sme.randomize_smiles(string))
    
    return output
"""

# Julia entry points that forward their arguments to the Python callables of
# the same name.

# Forward to the Python `augment_smiles(str, n)` helper.
function augment_smiles(str, n)
    return py"augment_smiles"(str, n)
end

# Forward to the Python `atomwise_tokenizer(str)` function.
function atomwise_tokenizer(str)
    return py"atomwise_tokenizer"(str)
end

# Forward to the Python `return_tokens(str, vocab)` function.
function return_tokens(str, vocab)
    return py"return_tokens"(str, vocab)
end

return_tokens (generic function with 1 method)

In [3]:
# `n` is passed to `augment_smiles` as the number of randomized SMILES to
# generate per original molecule.
n = 2
# `debug` is not referenced by any of the cells shown in this notebook.
debug = false

false

In [6]:
# Run parameters. Entries 3 onward are dataset-name prefixes used to build the
# "<name>_filtered_dataset.csv" file names.
# NOTE(review): the first two entries appear to mirror `n` and `debug` -- confirm.
ARG = [2, false, "dopa", "sero"]

4-element Vector{Any}:
     2
 false
      "dopa"
      "sero"

In [7]:
# Load one DataFrame per dataset name stored in ARG[3:end]; the CSV files are
# looked up next to this source file.
dfs = map(3:length(ARG)) do i
    csv_name = "$(ARG[i])_filtered_dataset.csv"
    getdf(joinpath(@__DIR__, csv_name))
end

2-element Vector{PyObject}:
 PyObject                            PUBCHEM_EXT_DATASOURCE_SMILES PUBCHEM_ACTIVITY_OUTCOME
0      COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=C(C=C3)B...                   Active
1      COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=CC=C3)N)...                   Active
2               C1CN(CCN1)C(=O)C2=C(SC(=C2)C3=CC=CC=C3)N                   Active
3      C1=CC=C(C=C1)C(=O)C2=C(C(=C(N2)N)C(=O)N)C3=CC=...                   Active
4      CC1=C(C(=O)OC(C1)[C@@H](C)C2CC[C@@H]3[C@@]2(CC...                   Active
...                                                  ...                      ...
18229    C1COCCN1C2=NC3=CC=CC=C3N=C2SCC(=O)NC4=CC=CC=C4F                 Inactive
18230              CC1=CC=CC=C1NC(=O)C2=CC3=CC=CC=C3C=C2                 Inactive
18231  CC1=CC(=CC=C1)N(CC(CN2CCOCC2)O)S(=O)(=O)C3=CC=...                 Inactive
18232    C1=CC(=CC=C1/C=C/C(=O)OCC(=O)NC2=NC=C(C=C2)Cl)F                 Inactive
18233  CC1=NN(C(=C1C(=O)N(C)CC(=O)NC2=CC(=CC=C2)OC)Cl...    

In [8]:
# Build the augmented datasets: for each DataFrame, keep the original rows and
# append, for every original row, `n` randomized SMILES (first column) paired
# with that row's label (last column).
smiles = let temp_df = dfToStringMatrix.(dfs)
    for df_num in eachindex(temp_df)
        original = temp_df[df_num]

        # Slice the two columns once; indexing `[:, begin][i]` inside the loop
        # copied the whole column on every access.
        smiles_column = original[:, begin]
        label_column = original[:, end]

        # Collect the new 1×2 rows and concatenate a single time at the end
        # rather than re-copying the growing matrix for every augmented row.
        blocks = AbstractMatrix[original]
        for i in eachindex(smiles_column)
            for augmented in augment_smiles(smiles_column[i], n)
                push!(blocks, String[augmented label_column[i]])
            end
        end

        temp_df[df_num] = reduce(vcat, blocks)
    end
    temp_df
end

2-element Vector{Matrix{String}}:
 ["COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=C(C=C3)Br)N)C(=O)C4=CC=CC=C4)OC" "Active"; "COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=CC=C3)N)C(=O)C4=CC=CC=C4)OC" "Active"; … ; "c1(C(N(C)CC(=O)Nc2cc(OC)ccc2)=O)c(C)nn(CC(C)C)c1Cl" "Inactive"; "C(N(C(=O)c1c(Cl)n(CC(C)C)nc1C)C)C(Nc1cccc(OC)c1)=O" "Inactive"]
 ["CC[C@H]([C@@H]1[C@H](C[C@@](O1)(CC)[C@H]2CC[C@@]([C@@H](O2)C)(CC)O)C)C(=O)[C@@H](C)[C@H]([C@H](C)CCC3=C(C(=C(C=C3)C)O)C(=O)[O-])O.[Na+]" "Active"; "CC1=C(OC2=C1C=C(C=C2)OC)C(=O)NC3=NC4=CC=CC=C4N3" "Active"; … ; "C1CN(CC(=O)Nc2ccc(NC(c3c(Cl)cccc3)=O)cc2Cl)CCC1.Cl" "Inactive"; "c1ccc(Cl)c(C(=O)Nc2cc(Cl)c(NC(CN3CCCCC3)=O)cc2)c1.Cl" "Inactive"]

In [35]:
# Per-dataset accumulators: token lists and activity labels, one empty
# container per entry of `smiles`; `vocabs` collects one vocabulary per dataset.
strings = map(_ -> [], 1:length(smiles))
activity = map(_ -> [], 1:length(smiles))
vocabs = Any[]

Any[]

In [36]:
# Tokenize every SMILES string and encode its label: "Active" becomes [1, 0],
# anything else becomes [0, 1]. Progress is printed every 100 rows.
for df_num in 1:length(smiles)
    dataset = smiles[df_num]

    for i in axes(dataset, 1)
        # Index the cell directly; `dataset[:, begin][i]` copied the whole
        # column for every single row.
        tokens = [j for j in atomwise_tokenizer(dataset[i, begin])]
        push!(strings[df_num], tokens)
        push!(activity[df_num], dataset[i, end] == "Active" ? [1, 0] : [0, 1])

        if i % 100 == 0
            println("$df_num | $i | strings: $(length(strings[df_num])), activity: $(length(activity[df_num]))")
        end
    end
end

1 | 100 | strings: 100, activity: 100
1 | 200 | strings: 200, activity: 200
1 | 300 | strings: 300, activity: 300
1 | 400 | strings: 400, activity: 400
1 | 500 | strings: 500, activity: 500
1 | 600 | strings: 600, activity: 600
1 | 700 | strings: 700, activity: 700
1 | 800 | strings: 800, activity: 800
1 | 900 | strings: 900, activity: 900
1 | 1000 | strings: 1000, activity: 1000
1 | 1100 | strings: 1100, activity: 1100
1 | 1200 | strings: 1200, activity: 1200
1 | 1300 | strings: 1300, activity: 1300
1 | 1400 | strings: 1400, activity: 1400
1 | 1500 | strings: 1500, activity: 1500
1 | 1600 | strings: 1600, activity: 1600
1 | 1700 | strings: 1700, activity: 1700
1 | 1800 | strings: 1800, activity: 1800
1 | 1900 | strings: 1900, activity: 1900
1 | 2000 | strings: 2000, activity: 2000
1 | 2100 | strings: 2100, activity: 2100
1 | 2200 | strings: 2200, activity: 2200
1 | 2300 | strings: 2300, activity: 2300
1 | 2400 | strings: 2400, activity: 2400
1 | 2500 | strings: 2500, activity: 2500
1 

1 | 19500 | strings: 19500, activity: 19500
1 | 19600 | strings: 19600, activity: 19600
1 | 19700 | strings: 19700, activity: 19700
1 | 19800 | strings: 19800, activity: 19800
1 | 19900 | strings: 19900, activity: 19900
1 | 20000 | strings: 20000, activity: 20000
1 | 20100 | strings: 20100, activity: 20100
1 | 20200 | strings: 20200, activity: 20200
1 | 20300 | strings: 20300, activity: 20300
1 | 20400 | strings: 20400, activity: 20400
1 | 20500 | strings: 20500, activity: 20500
1 | 20600 | strings: 20600, activity: 20600
1 | 20700 | strings: 20700, activity: 20700
1 | 20800 | strings: 20800, activity: 20800
1 | 20900 | strings: 20900, activity: 20900
1 | 21000 | strings: 21000, activity: 21000
1 | 21100 | strings: 21100, activity: 21100
1 | 21200 | strings: 21200, activity: 21200
1 | 21300 | strings: 21300, activity: 21300
1 | 21400 | strings: 21400, activity: 21400
1 | 21500 | strings: 21500, activity: 21500
1 | 21600 | strings: 21600, activity: 21600
1 | 21700 | strings: 21700, acti

1 | 38300 | strings: 38300, activity: 38300
1 | 38400 | strings: 38400, activity: 38400
1 | 38500 | strings: 38500, activity: 38500
1 | 38600 | strings: 38600, activity: 38600
1 | 38700 | strings: 38700, activity: 38700
1 | 38800 | strings: 38800, activity: 38800
1 | 38900 | strings: 38900, activity: 38900
1 | 39000 | strings: 39000, activity: 39000
1 | 39100 | strings: 39100, activity: 39100
1 | 39200 | strings: 39200, activity: 39200
1 | 39300 | strings: 39300, activity: 39300
1 | 39400 | strings: 39400, activity: 39400
1 | 39500 | strings: 39500, activity: 39500
1 | 39600 | strings: 39600, activity: 39600
1 | 39700 | strings: 39700, activity: 39700
1 | 39800 | strings: 39800, activity: 39800
1 | 39900 | strings: 39900, activity: 39900
1 | 40000 | strings: 40000, activity: 40000
1 | 40100 | strings: 40100, activity: 40100
1 | 40200 | strings: 40200, activity: 40200
1 | 40300 | strings: 40300, activity: 40300
1 | 40400 | strings: 40400, activity: 40400
1 | 40500 | strings: 40500, acti

2 | 2700 | strings: 2700, activity: 2700
2 | 2800 | strings: 2800, activity: 2800
2 | 2900 | strings: 2900, activity: 2900
2 | 3000 | strings: 3000, activity: 3000
2 | 3100 | strings: 3100, activity: 3100
2 | 3200 | strings: 3200, activity: 3200
2 | 3300 | strings: 3300, activity: 3300
2 | 3400 | strings: 3400, activity: 3400
2 | 3500 | strings: 3500, activity: 3500
2 | 3600 | strings: 3600, activity: 3600
2 | 3700 | strings: 3700, activity: 3700
2 | 3800 | strings: 3800, activity: 3800
2 | 3900 | strings: 3900, activity: 3900
2 | 4000 | strings: 4000, activity: 4000
2 | 4100 | strings: 4100, activity: 4100
2 | 4200 | strings: 4200, activity: 4200
2 | 4300 | strings: 4300, activity: 4300
2 | 4400 | strings: 4400, activity: 4400
2 | 4500 | strings: 4500, activity: 4500
2 | 4600 | strings: 4600, activity: 4600
2 | 4700 | strings: 4700, activity: 4700
2 | 4800 | strings: 4800, activity: 4800
2 | 4900 | strings: 4900, activity: 4900
2 | 5000 | strings: 5000, activity: 5000
2 | 5100 | strin

In [37]:
# Build one token vocabulary (set of unique tokens) per dataset.
for df_num in 1:length(smiles)
    # Accumulate straight into a set instead of first concatenating every
    # token list into one large vector; this also works for an empty dataset.
    vocab = Set{String}()
    for tokens in strings[df_num]
        union!(vocab, tokens)
    end
    push!(vocabs, vocab)
end

In [38]:
# Number of distinct tokens in the last dataset's vocabulary.
length(vocabs[end])

49

In [39]:
# Merge the per-dataset vocabularies into a single set.
# Note: this rebinds `vocabs` from a vector of sets to one set, so the cell
# should not be re-run without first rebuilding `vocabs`.
vocabs = union(vocabs...)

Set{String} with 59 elements:
  "1"
  "C"
  "P"
  "[C@@H]"
  "[C-]"
  "2"
  "[Hg]"
  "="
  "/"
  "."
  "[C@@]"
  "[Ca+2]"
  "#"
  "\\"
  "-"
  ")"
  "[B-]"
  "[C@H]"
  "[NH4+]"
  "Br"
  "5"
  "[I-]"
  "[n-]"
  "[n+]"
  "("
  ⋮ 

In [21]:
# Unique tokens of the first dataset, recomputed directly from its token lists.
Set(reduce(vcat, strings[begin]))

Set{String} with 59 elements:
  "n"
  "[NH2+]"
  "[Na]"
  "[Si]"
  "[N-]"
  "[Na+]"
  "Br"
  "o"
  "F"
  "s"
  "3"
  "[Co+3]"
  "[C-]"
  "7"
  "[C@@H]"
  "[n+]"
  "C"
  "8"
  "[I-]"
  "9"
  "[C@]"
  "[O-]"
  "[nH+]"
  "1"
  "="
  ⋮ 

In [23]:
# Python-side cross-check: flatten the first dataset's token lists, drop
# duplicates and sort the result into the Python global `tokens`.
py"""
tokens = [i for sublist in $(strings[begin]) for i in sublist]
tokens = list(set(tokens))
tokens.sort()
"""

In [24]:
# Fetch the Python global `tokens` back into Julia for display.
py"tokens"

59-element Vector{String}:
 "#"
 "("
 ")"
 "-"
 "."
 "/"
 "1"
 "2"
 "3"
 "4"
 "5"
 "6"
 "7"
 ⋮
 "[Na]"
 "[O-]"
 "[Si]"
 "[n+]"
 "[n-]"
 "[nH+]"
 "[nH]"
 "\\"
 "c"
 "n"
 "o"
 "s"

In [33]:
# Unique tokens of the last DataFrame's SMILES column, tokenized straight from
# the DataFrame rather than from the augmented `smiles` matrices.
Set(reduce(vcat, [atomwise_tokenizer(i) for i in dfs[end]["PUBCHEM_EXT_DATASOURCE_SMILES"]]))

Set{String} with 40 elements:
  "C"
  "6"
  "8"
  "("
  "[I-]"
  "[N+]"
  "F"
  "/"
  "[K+]"
  "I"
  "[NH+]"
  ")"
  "\\"
  "3"
  "4"
  "P"
  "S"
  "[N-]"
  "9"
  "[C@]"
  "[O-]"
  "#"
  "[NH3+]"
  "[Br-]"
  "[Cl-]"
  ⋮ 

In [46]:
# All vocabulary tokens, sorted and concatenated into a single string.
join(sort(collect(vocabs)))

"#()-./123456789=BrCClFINOPS[As][B-][Br-][C-][C@@H][C@@][C@H][C@][Ca+2][Cl+3][Cl-][Co+3][Hg][I-][K+][N+][N-][NH+][NH2+][NH3+][NH4+][Na+][Na][O-][Si][n+][n-][nH+][nH]\\cnos"

In [51]:
# Show the SMILES column of the first DataFrame. The previous cell text was an
# unfinished `occursin(` call with an unterminated string and did not parse.
dfs[begin]["PUBCHEM_EXT_DATASOURCE_SMILES"]

PyObject 0        COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=C(C=C3)B...
1        COC1=C(C=C(C=C1)C2=C(NC(=C2C(=O)C3=CC=CC=C3)N)...
2                 C1CN(CCN1)C(=O)C2=C(SC(=C2)C3=CC=CC=C3)N
3        C1=CC=C(C=C1)C(=O)C2=C(C(=C(N2)N)C(=O)N)C3=CC=...
4        CC1=C(C(=O)OC(C1)[C@@H](C)C2CC[C@@H]3[C@@]2(CC...
                               ...                        
18229      C1COCCN1C2=NC3=CC=CC=C3N=C2SCC(=O)NC4=CC=CC=C4F
18230                CC1=CC=CC=C1NC(=O)C2=CC3=CC=CC=C3C=C2
18231    CC1=CC(=CC=C1)N(CC(CN2CCOCC2)O)S(=O)(=O)C3=CC=...
18232      C1=CC(=CC=C1/C=C/C(=O)OCC(=O)NC2=NC=C(C=C2)Cl)F
18233    CC1=NN(C(=C1C(=O)N(C)CC(=O)NC2=CC(=CC=C2)OC)Cl...
Name: PUBCHEM_EXT_DATASOURCE_SMILES, Length: 18234, dtype: object

In [57]:
# maximum([length.(strings[section]) for section in 1:length(strings)])


196