In [4]:
using Flux
using Flux.Data: DataLoader
# using CUDA
using PyCall
using TOML
using CSV
using DataFrames
# using DataSets
using JLD
using DelimitedFiles
# using Plots
# using PlotThemes
using BSON: @save

In [2]:
# full_vocab = open(Vector{UInt8}, dataset("NikhilMukraj/smiles_vocab")) do buf
#     CSV.read(buf, DataFrame)
# end

# vocab = Matrix(full_vocab)

# buf = open(Vector{UInt8}, dataset("NikhilMukraj/preprocessed_d3_activity"))
# open("data.jld", "w") do f
#     write(f, buf)
# end

71×1 Matrix{String7}:
 "#"
 "("
 ")"
 "."
 "/"
 "1"
 "2"
 "3"
 "4"
 "5"
 ⋮
 "[P+]"
 "[Pt+2]"
 "[Pt]"
 "[S+]"
 "[S-]"
 "[Se]"
 "[Si]"
 "[Zn+2]"
 "\\"

In [9]:
# Token vocabulary: the `tokens` column of vocab.csv.
vocab = CSV.read("vocab.csv", DataFrame)[!, :tokens]

71-element Vector{String7}:
 "#"
 "("
 ")"
 "."
 "/"
 "1"
 "2"
 "3"
 "4"
 "5"
 "6"
 "7"
 "8"
 ⋮
 "[Nd+3]"
 "[O-]"
 "[OH-]"
 "[P+]"
 "[Pt+2]"
 "[Pt]"
 "[S+]"
 "[S-]"
 "[Se]"
 "[Si]"
 "[Zn+2]"
 "\\"

In [10]:
# Read the stored dataset from the JLD file into a dictionary.
data = JLD.load("data.jld")

Dict{String, Any} with 2 entries:
  "Y" => Float32[1.0 0.0; 1.0 0.0; … ; 0.0 1.0; 0.0 1.0]
  "X" => [[1.0 0.0 … 0.0 0.0; 1.0 0.0 … 0.0 0.0; … ; 0.0 0.0 … 0.0 0.0; 0.0 0.0…

In [11]:
# Pull the "X" and "Y" entries out of the loaded dictionary.
X = getindex(data, "X")
Y = getindex(data, "Y")

18234×2 Matrix{Float32}:
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 ⋮    
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0

In [12]:
# Stack the observations `A[i, :]'` for every `i` in `rows` beneath an empty `Float64`
# seed matrix, using a single concatenation instead of one `vcat` per row.
# The seed keeps the element-type promotion of the result identical to growing the
# matrix from `zeros(0, ncols)` one row at a time.
function _gatherrows(A, rows)
    ncols = length(size(A)) == 1 ? 1 : size(A)[end]
    blocks = AbstractMatrix[zeros(0, ncols)]
    sizehint!(blocks, length(rows) + 1)
    for i in rows
        push!(blocks, A[i, :]')
    end
    return reduce(vcat, blocks)
end

"""
    trainTestSplit(X, Y; percent_train=.7, add_transpose=false)

Randomly partition the paired observations of `X` and `Y` into a training part and a
testing part.

Observations are indexed along the first dimension of `X` and `Y`.
`floor(Int, n * percent_train)` of the `n` observations are drawn without replacement for
the training part (kept in draw order); the remaining observations form the testing part
in their original order. The same observation indices are used for `X` and `Y`.

# Keywords
- `percent_train`: fraction of observations assigned to the training part, in `[0, 1]`.
- `add_transpose`: when `true`, every returned array is wrapped with `'`.

# Returns
`(trainX, trainY, testX, testY)`. Each is a matrix with one row per observation, built by
stacking `A[i, :]'` under a `Float64` seed matrix (so numeric input is promoted with
`Float64`, and one-dimensional input yields a single column).

# Throws
- `AssertionError` if `X` and `Y` differ in the size of their first dimension.
- `ArgumentError` if `percent_train` is outside `[0, 1]`.
"""
function trainTestSplit(X, Y; percent_train=.7, add_transpose=false)
    n = size(X)[begin]
    # Explicit throw instead of `@assert`, which is not meant for input validation.
    n == size(Y)[begin] ||
        throw(AssertionError("X and Y must have the same number of observations"))
    0 <= percent_train <= 1 ||
        throw(ArgumentError("percent_train must lie in [0, 1], got $percent_train"))

    train_length = floor(Int, n * percent_train)

    # Draw training indices without replacement. `indices` shrinks as indices are
    # drawn, so whatever is left afterwards is the test set in ascending order.
    indices = collect(1:n)
    train_indices = Int[]
    sizehint!(train_indices, train_length)
    for _ in 1:train_length
        idx = rand(1:length(indices))
        push!(train_indices, indices[idx])
        deleteat!(indices, idx)
    end

    trainX = _gatherrows(X, train_indices)
    trainY = _gatherrows(Y, train_indices)
    testX = _gatherrows(X, indices)
    testY = _gatherrows(Y, indices)

    if add_transpose
        trainX = trainX'
        trainY = trainY'

        testX = testX'
        testY = testY'
    end

    return trainX, trainY, testX, testY
end

trainTestSplit (generic function with 1 method)

In [13]:
# 90/10 random split; the cell then shows the size of each of the four parts.
trainX, trainY, testX, testY = trainTestSplit(X, Y; percent_train=0.9)
map(size, (trainX, trainY, testX, testY))

((16410, 1), (16410, 2), (1824, 1), (1824, 2))

In [14]:
# Every element of the X arrays is converted to a Float32 matrix; the Y arrays become
# plain Float32 matrices.
trainX = map(sample -> convert(Matrix{Float32}, sample), trainX)
trainY = Matrix{Float32}(trainY)
testX = map(sample -> convert(Matrix{Float32}, sample), testX)
testY = Matrix{Float32}(testY)

1824×2 Matrix{Float32}:
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 1.0  0.0
 ⋮    
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0
 0.0  1.0

In [15]:
# Lay the samples out along the second dimension: X becomes 1×n, Y is transposed.
trainX = reshape(trainX, 1, size(trainX, 1))
trainY = adjoint(trainY)
testX = reshape(testX, 1, size(testX, 1))
testY = adjoint(testY)

2×1824 adjoint(::Matrix{Float32}) with eltype Float32:
 1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  …  0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0     1.0  1.0  1.0  1.0  1.0  1.0  1.0

In [16]:
# Input width: one slot per vocabulary token plus one extra.
N = 1 + length(vocab)

# Two stacked LSTM layers (each followed by relu) feeding a 2-way softmax classifier.
model = Chain(LSTM(N, 128), relu, LSTM(128, 16), relu, Dense(16, 2), softmax) # |> gpu

Chain(
  Recur(
    LSTMCell(72, 128),                  [90m# 103_168 parameters[39m
  ),
  NNlib.relu,
  Recur(
    LSTMCell(128, 16),                  [90m# 9_312 parameters[39m
  ),
  NNlib.relu,
  Dense(16, 2),                         [90m# 34 parameters[39m
  NNlib.softmax,
)[90m         # Total: 12 trainable arrays, [39m112_514 parameters,
[90m          # plus 4 non-trainable, 288 parameters, summarysize [39m440.305 KiB.

In [17]:
"""
    eval_model(x)

Apply the global `model` to `x`, take the last column of the output, reset the model
with `Flux.reset!`, and return that column transposed with `'`.
"""
function eval_model(x)
    output = model(x)
    final_column = output[:, end]
    Flux.reset!(model)

    return final_column'
end

"""
    vcatTranspose(x)

Vertically concatenate every element of `x` beneath an empty `Float32` matrix whose
column count equals the last dimension of `x[begin]`, and return the stacked matrix.

Despite the name, no transpose is applied inside this function.
"""
function vcatTranspose(x)
    width = size(x[begin])[end]
    stacked = Matrix{Float32}(undef, 0, width)

    for block in x
        stacked = vcat(stacked, block)
    end

    return stacked
end

"""
    gpu_vcat(x)

Copy every element of `x` into its own row of a newly allocated matrix and return it.

The matrix is created with `zeros` and piped through `gpu`. It has `length(x)` rows, so
the row count always matches the number of elements written. The column count is the
length of the first element of `x` (2 when `x` is empty).
"""
function gpu_vcat(x)
    nrows = length(x)
    ncols = isempty(x) ? 2 : length(first(x))
    mat = zeros(nrows, ncols) |> gpu

    # Bounds checks are kept on purpose: the row index is only as trustworthy as `x`.
    # NOTE(review): rows are filled by in-place assignment. Zygote documents array
    # mutation as unsupported, so calling this inside a differentiated loss may fail —
    # confirm against how `gpu_loss` is used.
    for (row, item) in enumerate(x)
        mat[row, :] = Matrix(item)
    end

    return mat
end

"""
    loss(x, y)

Broadcast `eval_model` over `x`, stack the results with `vcatTranspose`, and return
`Flux.crossentropy` between the transposed stack and `y`.
"""
function loss(x, y)
    stacked = vcatTranspose(eval_model.(x))

    return Flux.crossentropy(stacked', y)
end

"""
    gpu_loss(x, y)

Broadcast `eval_model` over `x`, collect the results with `gpu_vcat`, and return
`Flux.crossentropy` between the transposed result and `y`.
"""
function gpu_loss(x, y)
    stacked = gpu_vcat(eval_model.(x))

    return Flux.crossentropy(stacked', y)
end

gpu_loss (generic function with 1 method)

In [18]:
"""
    accuracy(ŷ, y)

Return the number of positions where `argmax` of an element of `ŷ` equals `argmax` of the
matching element of `y`, divided by `length(y)`.
"""
function accuracy(ŷ, y)
    predicted = argmax.(ŷ)
    expected = argmax.(y)
    return count(predicted .== expected) / length(y)
end

"""
    gpu_acc(ŷ, y)

Return the fraction of rows for which the column of the largest entry in `ŷ` matches the
column of the largest entry in the same row of `y`.

The denominator is the number of rows of `y`, not its total element count, so a matrix
of perfect predictions scores `1.0` regardless of how many columns it has.
"""
function gpu_acc(ŷ, y)
    hits = count(argmax.(eachrow(ŷ)) .== argmax.(eachrow(y)))
    return hits / size(y, 1)
end

gpu_acc (generic function with 1 method)

In [13]:
# trainX = trainX |> gpu
# trainY = trainY |> gpu
# testX = testX |> gpu
# testY = testY |> gpu

In [19]:
# Shuffled mini-batches of 32 samples, and the optimiser used for training.
# Note that `data` is rebound here; it no longer refers to the loaded dictionary.
data = DataLoader((trainX, trainY); batchsize=32, shuffle=true)
opt = ADAM(1e-3, (0.9, 0.999))

ADAM(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

In [47]:
# Print a single label vector as a quick sanity check, then stop.
# NOTE(review): the original cell had a bare `for` with no loop header, which does not
# parse. Iterating over the columns of `trainY` is a reconstruction based on the
# two-element Float32 vector this cell printed — confirm the intended iterable.
for j in eachcol(trainY)
    println(j)
    break
end

Float32[0.0, 1.0]


In [95]:
# Manual mini-batch training: one gradient per sample, then one optimiser step per
# batch using the mean of those gradients.
for (n, batch) in enumerate(data)
    xbatch = batch[begin]
    ybatch = batch[end]
    ps = Flux.params(model)
    gs = []

    for (x, y) in zip(xbatch, eachrow(ybatch'))
        Flux.reset!(model)
        ∇ = gradient(ps) do
            # Feed every column but the last through the model (results discarded),
            # then score the output for the last column against the label.
            [model(x[:, i] |> gpu) for i in 1:size(x[:, begin:end-1])[end]]
            Flux.crossentropy(model(x[:, end] |> gpu), y |> gpu)
        end

        push!(gs, ∇)
    end

    # Average the per-sample gradients parameter by parameter. Samples that produced
    # no gradient for a parameter contribute zero to its mean.
    for p in ps
        grads = [g[p] for g in gs if g[p] !== nothing]
        isempty(grads) && continue
        Flux.update!(opt, p, sum(grads) / length(gs))
    end
    println("Completed batch $n")
end

# for x, y in batch 
#     ...
# end

# Yes, this is essentially a hand-written version of Flux.train!. It exists because
# Flux.train! cannot cope with converting a vector of vectors into a matrix: Zygote treats
# that conversion as part of the differentiated computation even though it is not, and
# Zygote buffers did not make sense here, so the loop is written out by hand.

# We take the average of the gradients of all the training examples and then use that mean
# gradient to update our parameters. That is just one step of gradient descent in one epoch.

In [15]:
epochs = 1:100

# Per-epoch metric histories; one entry is appended to each at the end of every epoch.
training_losses = []
testing_losses = []
training_accs = []
testing_accs = []

for i in epochs
    println("Epoch: $i")
    Flux.train!(gpu_loss, Flux.params(model), data, opt)
    println("Finished train at epoch $i")

    # Evaluate both splits. The test matrix must be built from the test predictions,
    # not from the training predictions.
    pred = gpu_vcat(eval_model.(trainX))
    test_pred = gpu_vcat(eval_model.(testX))

    push!(training_losses, Flux.crossentropy(pred', trainY))
    push!(testing_losses, Flux.crossentropy(test_pred', testY))

    push!(training_accs, gpu_acc(pred, trainY'))
    push!(testing_accs, gpu_acc(test_pred, testY'))

    # Report the entries just appended; `end` stays correct even if the histories
    # already held values or `epochs` does not start at 1.
    println("Training Accuracy: $(training_accs[end])")
    println("Testing Accuracy: $(testing_accs[end])")

    println("Training Loss: $(training_losses[end])")
    println("Testing Loss: $(testing_losses[end])")
end

Epoch: 1


UndefVarError: UndefVarError: opt not defined

In [16]:
# Persist the model to a BSON file.
@save("rnn.bson", model)

In [17]:
# Write each metric history to its own text file, in the same order as before.
for (path, series) in (
    "training_accs.txt" => training_accs,
    "testing_accs.txt" => testing_accs,
    "training_losses.txt" => training_losses,
    "testing_losses.txt" => testing_losses,
)
    writedlm(path, series)
end