In [None]:
# script source --> https://colab.research.google.com/github/ageron/julia_notebooks/blob/master/Julia_Colab_Notebook_Template.ipynb

%%shell
set -e

#---------------------------------------------------#
JULIA_VERSION="1.8.3" # any version ≥ 0.7.0
JULIA_PACKAGES="IJulia"
JULIA_PACKAGES_IF_GPU="CUDA" # or CuArrays for older Julia versions
JULIA_NUM_THREADS=2
#---------------------------------------------------#

if [ -z `which julia` ]; then
  # Install Julia
  JULIA_VER=`cut -d '.' -f -2 <<< "$JULIA_VERSION"`
  echo "Installing Julia $JULIA_VERSION on the current Colab Runtime..."
  BASE_URL="https://julialang-s3.julialang.org/bin/linux/x64"
  URL="$BASE_URL/$JULIA_VER/julia-$JULIA_VERSION-linux-x86_64.tar.gz"
  wget -nv $URL -O /tmp/julia.tar.gz # -nv means "not verbose"
  tar -x -f /tmp/julia.tar.gz -C /usr/local --strip-components 1
  rm /tmp/julia.tar.gz

  # Install Packages
  nvidia-smi -L &> /dev/null && export GPU=1 || export GPU=0
  if [ $GPU -eq 1 ]; then
    JULIA_PACKAGES="$JULIA_PACKAGES $JULIA_PACKAGES_IF_GPU"
  fi
  for PKG in `echo $JULIA_PACKAGES`; do
    echo "Installing Julia package $PKG..."
    julia -e 'using Pkg; pkg"add '$PKG'; precompile;"' &> /dev/null
  done

  # Install kernel and rename it to "julia"
  echo "Installing IJulia kernel..."
  julia -e 'using IJulia; IJulia.installkernel("julia", env=Dict(
      "JULIA_NUM_THREADS"=>"'"$JULIA_NUM_THREADS"'"))'
  KERNEL_DIR=`julia -e "using IJulia; print(IJulia.kerneldir())"`
  KERNEL_NAME=`ls -d "$KERNEL_DIR"/julia*`
  mv -f $KERNEL_NAME "$KERNEL_DIR"/julia

  echo ''
  echo "Successfully installed `julia -v`!"
  echo "Please reload this page (press Ctrl+R, ⌘+R, or the F5 key) then"
  echo "jump to the 'Checking the Installation' section."
fi

Installing Julia 1.8.3 on the current Colab Runtime...
2023-08-29 18:30:22 URL:https://storage.googleapis.com/julialang2/bin/linux/x64/1.8/julia-1.8.3-linux-x86_64.tar.gz [130030846/130030846] -> "/tmp/julia.tar.gz" [1]
Installing Julia package IJulia...
Installing Julia package CUDA...
Installing IJulia kernel...
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mInstalling julia kernelspec in /root/.local/share/jupyter/kernels/julia-1.8

Successfully installed julia version 1.8.3!
Please reload this page (press Ctrl+R, ⌘+R, or the F5 key) then
jump to the 'Checking the Installation' section.




In [1]:
versioninfo()

Julia Version 1.8.3
Commit 0434deb161e (2022-11-14 20:14 UTC)
Platform Info:
  OS: Linux (x86_64-linux-gnu)
  CPU: 2 × Intel(R) Xeon(R) CPU @ 2.20GHz
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-13.0.1 (ORCJIT, broadwell)
  Threads: 2 on 2 virtual cores
Environment:
  LD_LIBRARY_PATH = /usr/lib64-nvidia
  JULIA_NUM_THREADS = 2


In [2]:
using Pkg
Pkg.add(["Flux", "MLDatasets", "JSON", "CUDA", "Statistics"])

[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m   Installed[22m[39m OffsetArrays ──────────────── v1.12.10
[32m[1m   Installed[22m[39m cuDNN ─────────────────────── v1.1.0
[32m[1m   Installed[22m[39m ContextVariablesX ─────────── v0.1.3
[32m[1m   Installed[22m[39m ShowCases ─────────────────── v0.1.0
[32m[1m   Installed[22m[39m LoggingExtras ─────────────── v1.0.1
[32m[1m   Installed[22m[39m NNlibCUDA ─────────────────── v0.2.7
[32m[1m   Installed[22m[39m Unitful ───────────────────── v1.17.0
[32m[1m   Installed[22m[39m ZipFile ───────────────────── v0.10.1
[32m[1m   Installed[22m[39m ConcurrentUtilities ───────── v2.2.1
[32m[1m   Installed[22m[39m InlineStrings ─────────────── v1.4.0
[32m[1m   Installed[22m[39m Optimisers ────────────────── v0.2.20
[32m[1m   Installed[22m[39m InitialValues ─────────────── v0.3.1
[32m[1m   Installed[22m[39m NNli

In [3]:
using Flux
using Flux.Data: DataLoader
using MLDatasets

# Load MNIST dataset
train_X, train_y = MNIST(split=:train)[:]
test_X, test_y = MNIST(split=:test)[:]

# Reshape dataset
train_X = reshape(train_X, (28, 28, 1, :))
test_X = reshape(test_X, (28, 28, 1, :))

# One-hot encode target values
train_y = Flux.onehotbatch(train_y, 0:9)
test_y = Flux.onehotbatch(test_y, 0:9)

# Normalize data
train_X /= 255.0
test_X /= 255.0

println("Shape of train_X: $(size(train_X))")
println("Shape of test_X: $(size(test_X))")


This program has requested access to the data dependency MNIST.
which is not currently installed. It can be installed automatically, and you will not see this message again.

Dataset: THE MNIST DATABASE of handwritten digits
Authors: Yann LeCun, Corinna Cortes, Christopher J.C. Burges
Website: http://yann.lecun.com/exdb/mnist/

[LeCun et al., 1998a]
    Y. LeCun, L. Bottou, Y. Bengio, and P. Haffner.
    "Gradient-based learning applied to document recognition."
    Proceedings of the IEEE, 86(11):2278-2324, November 1998

The files are available for download at the offical
website linked above. Note that using the data
responsibly and respecting copyright remains your
responsibility. The authors of MNIST aren't really
explicit about any terms of use, so please read the
website to make sure you want to download the
dataset.



Do you want to download the dataset from ["https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz", "https://ossci-datasets.s3.amazonaws.com/mn

# GPU

#Small

In [18]:
using Flux

calc_device = gpu

model_type = "SMALL"

model = Chain(

    Flux.flatten,

    Dense(28*28, 128),
    selu,
    Dense(128, 10),  # No activation here
    softmax
)  |> calc_device



Chain(
  Flux.flatten,
  Dense(784 => 128),                    [90m# 100_480 parameters[39m
  NNlib.selu,
  Dense(128 => 10),                     [90m# 1_290 parameters[39m
  NNlib.softmax,
) [90m                  # Total: 4 arrays, [39m101_770 parameters, 576 bytes.

# Medium

In [14]:
using Flux

calc_device = gpu

model_type = "MEDIUM"

model = Chain(
    Conv((3, 3), 1=>32, pad=(1, 1), selu),
    MaxPool((2, 2)),
    Conv((3, 3), 32=>64, pad=(1, 1), selu),
    MaxPool((2, 2)),

    Flux.flatten,


    Dense(3136, 64),
    selu,
    Dense(64, 10),  # No activation here
    softmax
)  |> calc_device


Chain(
  Conv((3, 3), 1 => 32, selu, pad=1),   [90m# 320 parameters[39m
  MaxPool((2, 2)),
  Conv((3, 3), 32 => 64, selu, pad=1),  [90m# 18_496 parameters[39m
  MaxPool((2, 2)),
  Flux.flatten,
  Dense(3136 => 64),                    [90m# 200_768 parameters[39m
  NNlib.selu,
  Dense(64 => 10),                      [90m# 650 parameters[39m
  NNlib.softmax,
) [90m                  # Total: 8 arrays, [39m220_234 parameters, 1.719 KiB.

# Big

In [None]:
using Flux

calc_device = gpu

model_type = "BIG"

model = Chain(
    # Convolutional layers
    Conv((3, 3), 1=>64, pad=(1,1), stride=(1,1), selu),
    MaxPool((2,2)),
    Conv((3, 3), 64=>128, pad=(1,1), stride=(1,1), selu),
    MaxPool((2,2)),
    Conv((3, 3), 128=>256, pad=(1,1), stride=(1,1), selu),
    MaxPool((2,2)),

    # Flatten layer
    Flux.flatten,

    # Dense layers
    Dense(256*3*3, 512),
    selu,
    Dense(512, 256),
    selu,
    Dense(256, 10),
    softmax
)  |> gpu

Chain(
  Conv((3, 3), 1 => 64, selu, pad=1),   [90m# 640 parameters[39m
  MaxPool((2, 2)),
  Conv((3, 3), 64 => 128, selu, pad=1),  [90m# 73_856 parameters[39m
  MaxPool((2, 2)),
  Conv((3, 3), 128 => 256, selu, pad=1),  [90m# 295_168 parameters[39m
  MaxPool((2, 2)),
  Flux.flatten,
  Dense(2304 => 512),                   [90m# 1_180_160 parameters[39m
  NNlib.selu,
  Dense(512 => 256),                    [90m# 131_328 parameters[39m
  NNlib.selu,
  Dense(256 => 10),                     [90m# 2_570 parameters[39m
  NNlib.softmax,
) [90m                  # Total: 12 arrays, [39m1_683_722 parameters, 2.570 KiB.

# LOSS and ACC

In [19]:
# Define loss function and optimizer
using Statistics

loss(x, y) = Flux.crossentropy(model(x), y)
accuracy(X, y) = Statistics.mean(Flux.onecold(model(X)) .== Flux.onecold(y))
optimizer = ADAM(0.001)

Adam(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

In [20]:
using Flux
using CUDA

# Training configuration
batch_size = 64
num_epochs = 10


# Initialize variables for monitoring
total_training_time = 0.0
total_memory_usage = 0
throughputs = []

monitoring_data = Dict(
    "epoch" => [],
    "epoch_memory_usage" => [],
    "epoch_time" => [],
    "batch_processing_time" => [],
    "throughput" => [],
    "accuracy" => [],
    "loss" => [],
)

# Start training loop
for epoch in 1:num_epochs
    println("Epoch $epoch/$num_epochs")

    # Initialize variables for epoch-level monitoring
    total_batch_processing_time = 0.0
    epoch_memory_use = 0
    epoch_start_time = time()


    initial_mem =

    loss_value = 0.0
    accuracy_value = 0.0

    coef = 0


    # mem_info = CUDA.memoryinfo()

    for batch_start in 1:batch_size:size(train_X, 4)

        initial_mem = CUDA.used_memory()

        batch_start_time = time()

        batch_end = min(batch_start + batch_size - 1, size(train_X, 4))

        batch_x = train_X[:, :, :, batch_start:batch_end]
        batch_y = train_y[:, batch_start:batch_end]


        # Perform training step
        #loss_value = Flux.train!(loss, Flux.params(model), DataLoader((batch_x, batch_y)), optimizer)

        x, y = calc_device(batch_x), calc_device(batch_y)
        gradients = Flux.gradient(() -> loss(x, y), Flux.params(model))
        Flux.Optimise.update!(optimizer, Flux.params(model), gradients)

        # collect the loss and accuracy from each batch
        loss_value += loss(x, y)
        accuracy_value += accuracy(x, y)
        # accuracy_value += (accuracy(x, y) / 938)


        batch_processing_time = time() - batch_start_time
        total_batch_processing_time += batch_processing_time

        # Calculate memory usage
        # epoch_memory_use += CUDA.memory_status().free

        # If the gpu memory get reset, add to the epoch memory_use the mean of each batch_memory_use
        batch_memory_use = CUDA.used_memory() - initial_mem
        if batch_memory_use <= 0
          batch_memory_use = floor(Int, epoch_memory_use / coef)
        end

        epoch_memory_use += batch_memory_use


        coef +=1
    end


    # Calculate the actual memory used by the GPU in this iteration

    accuracy_value /= coef
    loss_value /= coef

    epoch_end_time = time()
    epoch_time = epoch_end_time - epoch_start_time
    total_training_time += epoch_time
    total_memory_usage += epoch_memory_use

    num_samples = size(train_X, 4)
    throughput = num_samples / epoch_time
    push!(throughputs, throughput)

    println(" - Memory Usage: $epoch_memory_use bytes")
    println(" - Epoch Time: $epoch_time seconds")
    println(" - Batch Processing Time: $total_batch_processing_time seconds")
    println(" - Throughput: $throughput samples/second")

    # Evaluate accuracy and convergence
    println(" - Accuracy: $accuracy_value")
    println(" - Loss: $loss_value")

    push!(monitoring_data["epoch"], epoch)
    push!(monitoring_data["epoch_memory_usage"], epoch_memory_use)
    push!(monitoring_data["epoch_time"], epoch_time)
    push!(monitoring_data["batch_processing_time"], total_batch_processing_time)
    push!(monitoring_data["throughput"], throughput)
    push!(monitoring_data["accuracy"], accuracy_value)
    push!(monitoring_data["loss"], loss_value)
end


println("Total Training Time: $total_training_time seconds")
println("Average Memory Usage: $(total_memory_usage / num_epochs) bytes")


Epoch 1/10
 - Memory Usage: 1522261557 bytes
 - Epoch Time: 2.965359926223755 seconds
 - Batch Processing Time: 2.9471702575683594 seconds
 - Throughput: 20233.631495926755 samples/second
 - Accuracy: 0.7373567430703625
 - Loss: 1.1405798939626608
Epoch 2/10
 - Memory Usage: 1521440840 bytes
 - Epoch Time: 1.85306715965271 seconds
 - Batch Processing Time: 1.835587978363037 seconds
 - Throughput: 32378.750919769587 samples/second
 - Accuracy: 0.8801472547974414
 - Loss: 0.4568845271142815
Epoch 3/10
 - Memory Usage: 1521440840 bytes
 - Epoch Time: 1.9247949123382568 seconds
 - Batch Processing Time: 1.907130479812622 seconds
 - Throughput: 31172.152220161213 samples/second
 - Accuracy: 0.8989372334754797
 - Loss: 0.3626590719275764
Epoch 4/10
 - Memory Usage: 1521440840 bytes
 - Epoch Time: 1.9522359371185303 seconds
 - Batch Processing Time: 1.934335470199585 seconds
 - Throughput: 30733.990118304584 samples/second
 - Accuracy: 0.9064998667377399
 - Loss: 0.32721084431766956
Epoch 5/1

In [21]:
using JSON
json_filename = model_type * "_GPU_JULIA_epoch_data.json"

stringdata = JSON.json(monitoring_data)

# write the file with the stringdata variable information
open(json_filename, "w") do f
        write(f, stringdata)
     end

1182

# CPU

In [None]:
using Flux

calc_device = cpu

model = Chain(
    Conv((3, 3), 1=>32, pad=(1, 1), selu),
    MaxPool((2, 2)),
    Conv((3, 3), 32=>64, pad=(1, 1), selu),
    MaxPool((2, 2)),
    x -> reshape(x, :, size(x, 4)),  # Flatten layer
    Dense(3136, 64),
    selu,
    Dense(64, 10),  # No activation here
    softmax
)  |> calc_device


Chain(
  Conv((3, 3), 1 => 32, selu, pad=1),   [90m# 320 parameters[39m
  MaxPool((2, 2)),
  Conv((3, 3), 32 => 64, selu, pad=1),  [90m# 18_496 parameters[39m
  MaxPool((2, 2)),
  var"#1#2"(),
  Dense(3136 => 64),                    [90m# 200_768 parameters[39m
  NNlib.selu,
  Dense(64 => 10),                      [90m# 650 parameters[39m
  NNlib.softmax,
) [90m                  # Total: 8 arrays, [39m220_234 parameters, 861.398 KiB.

In [None]:
# Define loss function and optimizer
using Statistics

loss(x, y) = Flux.crossentropy(model(x), y)
accuracy(X, y) = Statistics.mean(Flux.onecold(model(X)) .== Flux.onecold(y))
optimizer = ADAM(0.001)

Adam(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

In [None]:
using Flux
using Flux.Data: DataLoader
using MLDatasets
using JSON
using CUDA


# Training configuration
batch_size = 64
num_epochs = 10

# Initialize variables for monitoring
total_training_time = 0.0
total_memory_usage = 0
throughputs = []

monitoring_data = Dict(
    "epoch" => [],
    "epoch_memory_usage" => [],
    "epoch_time" => [],
    "batch_processing_time" => [],
    "throughput" => [],
    "accuracy" => [],
    "loss" => [],
)

# Start training loop
for epoch in 1:num_epochs
    println("Epoch $epoch/$num_epochs")

    # Initialize variables for epoch-level monitoring
    total_batch_processing_time = 0.0
    epoch_memory_use = 0
    epoch_start_time = time()

    loss_value = 0.0
    accuracy_value = 0.0



    for batch_start in 1:batch_size:size(train_X, 4)


        batch_start_time = time()

        batch_end = min(batch_start + batch_size - 1, size(train_X, 4))

        batch_x = train_X[:, :, :, batch_start:batch_end]
        batch_y = train_y[:, batch_start:batch_end]


        # Perform training step
        #loss_value = Flux.train!(loss, Flux.params(model), DataLoader((batch_x, batch_y)), optimizer)

        x, y = calc_device(batch_x), calc_device(batch_y)
        gradients = Flux.gradient(() -> loss(x, y), Flux.params(model))
        Flux.Optimise.update!(optimizer, Flux.params(model), gradients)

        # collect the loss and accuracy from each batch
        loss_value += (loss(x, y) / 938)
        accuracy_value += (accuracy(x, y) / 938)


        batch_processing_time = time() - batch_start_time
        total_batch_processing_time += batch_processing_time

        # Calculate memory usage
        # epoch_memory_use += CUDA.memory_status().free
    end

    epoch_end_time = time()
    epoch_time = epoch_end_time - epoch_start_time
    total_training_time += epoch_time
    total_memory_usage += epoch_memory_use

    num_samples = size(train_X, 4)
    throughput = num_samples / epoch_time
    push!(throughputs, throughput)

    println(" - Memory Usage: $epoch_memory_use bytes")
    println(" - Epoch Time: $epoch_time seconds")
    println(" - Batch Processing Time: $total_batch_processing_time seconds")
    println(" - Throughput: $throughput samples/second")

    # Evaluate accuracy and convergence
    println(" - Accuracy: $accuracy_value")
    println(" - Loss: $loss_value")

    push!(monitoring_data["epoch"], epoch)
    push!(monitoring_data["epoch_memory_usage"], epoch_memory_use)
    push!(monitoring_data["epoch_time"], epoch_time)
    push!(monitoring_data["batch_processing_time"], total_batch_processing_time)
    push!(monitoring_data["throughput"], throughput)
    push!(monitoring_data["accuracy"], accuracy_value)
    push!(monitoring_data["loss"], loss_value)
end


println("Total Training Time: $total_training_time seconds")
println("Average Memory Usage: $(total_memory_usage / num_epochs) bytes")
println("Average Throughput: $(sum(throughputs) / num_epochs) samples/second"))


Epoch 1/10
 - Memory Usage: 0 bytes
 - Epoch Time: 6.511749029159546 seconds
 - Batch Processing Time: 6.502673149108887 seconds
 - Throughput: 9214.114323405372 samples/second
 - Accuracy: 0.869986007462691
 - Loss: 0.43604071559457225
Epoch 2/10
 - Memory Usage: 0 bytes
 - Epoch Time: 4.527939081192017 seconds
 - Batch Processing Time: 4.519378185272217 seconds
 - Throughput: 13251.06166936427 samples/second
 - Accuracy: 0.9611040778251647
 - Loss: 0.13175277564869248
Epoch 3/10
 - Memory Usage: 0 bytes
 - Epoch Time: 4.935431003570557 seconds
 - Batch Processing Time: 4.925978183746338 seconds
 - Throughput: 12156.992967097052 samples/second
 - Accuracy: 0.9773121002132252
 - Loss: 0.076912162796134
Epoch 4/10


In [None]:
using JSON
json_filename = "CPU_JULIA_epoch_data.json"

stringdata = JSON.json(monitoring_data)

# write the file with the stringdata variable information
open(json_filename, "w") do f
        write(f, stringdata)
     end

1107