In [None]:
using Plots
using DataFrames
using CSV
using Dates
using TimeZones
plotlyjs()

In [None]:
"""
    get_timestamp(year, month, day, hour, minute, second)

Return the number of whole seconds from 2020-09-08T00:00:00 to the given
calendar date/time, as an `Int`. Both instants are taken to be UTC, so the
result is negative for instants before the reference and `0` at the reference.
"""
function get_timestamp(year, month, day, hour, minute, second)
    # Both instants are in UTC, so the difference of the plain `DateTime`s is
    # the same as the difference of the zoned ones; this avoids constructing
    # two `ZonedDateTime` objects for every row.
    epoch = Dates.DateTime(2020, 9, 8, 0, 0, 0)
    stamp = Dates.DateTime(year, month, day, hour, minute, second)
    # `DateTime` differences are in milliseconds; integer division keeps the
    # computation in integers instead of going through Float64.
    return Int(div(Dates.value(stamp - epoch), 1000))
end

"""
    get_run_inner_indices(vec)

Return a `BitVector` with one entry per element of `vec` that is `true` where
the element equals both its predecessor and its successor, i.e. it lies
strictly inside a run of repeated values. The first and last entries are
always `false`.
"""
function get_run_inner_indices(vec)
    len = size(vec, 1)
    interior = falses(len)
    # only positions that have a neighbour on both sides can be interior
    for i in 2:(len - 1)
        same_as_prev = vec[i - 1] == vec[i]
        if same_as_prev && vec[i] == vec[i + 1]
            interior[i] = true
        end
    end
    return interior
end

"""
    get_jumps(vec, dt)

Return the indices `n >= 2` at which `vec[n]` differs from `vec[n-1]` by more
than `dt` in absolute value, in increasing order, as a `Vector{Int}`. The
result is empty when `vec` has fewer than two elements or has no such jump.
"""
function get_jumps(vec, dt)
    # typed accumulator: the result is used for indexing and index arithmetic,
    # so it should be a Vector{Int} rather than a Vector{Any}
    jump_idcs = Int[]
    for n in 2:size(vec, 1)
        if abs(vec[n] - vec[n-1]) > dt
            push!(jump_idcs, n)
        end
    end
    return jump_idcs
end

## HD Usage

In [None]:
# load and concatenate the hd usage dataframes: every file in the current
# directory whose name starts with "hd" and ends with ".csv"
dfs = DataFrame[]
for fn in readdir()
    # skip everything that is not an hd usage CSV
    (startswith(fn, "hd") && endswith(fn, ".csv")) || continue
    println("Loading $(fn)")
    # collect however many files there are instead of assuming a fixed count
    push!(dfs, DataFrame(CSV.File(fn)))
end
isempty(dfs) && error("no hd*.csv files found in $(pwd())")
df = reduce(vcat, dfs)
print(nrow(df))

In [None]:
# derive :time (seconds since 2020 09 08 00 00 00) from the calendar columns
date_cols = [:year, :month, :day, :hour, :minute, :second]
transform!(df, date_cols => ByRow(get_timestamp) => :time)
# the calendar columns are redundant once :time exists
select!(df, Not(date_cols))
# order rows by user number first, then by time within each user
sort!(df, [:user, :time])

In [None]:
# insert 0mb records where time difference is large + begin and end,
# and remove strings of duplicated sizes

# get user number starting indices: df is sorted by user, so every jump in the
# user column is the first row of the next user; add 1 and nrow+1 as sentinels
user_changes = get_jumps(df[!, "user"], 0.5)
prepend!(user_changes, [1])
append!(user_changes, size(df[!, "user"])[1]+1)

# Earliest and latest timestamp over ALL users. df is sorted by (user, time),
# so df[1, "time"] and df[end, "time"] only describe the first and the last
# user and are not the bounds of the whole recording.
t_first, t_last = extrema(df[!, "time"])

# per-user results; concatenated once after the loop instead of growing a
# dataframe with vcat on every iteration
user_dfs = DataFrame[]
for n = 1:size(user_changes)[1]-1
    print("Processing user changepoint $n/$(size(user_changes)[1]-1)   \r")

    # find all rows for this user
    df_user = df[user_changes[n]:(user_changes[n+1]-1), ["user", "time", "used_mb"]]
    user_id = df_user[1, "user"]

    # add a 0mb row at the global beginning and end
    df_beg = DataFrame(time = [t_first], used_mb = [0], user = user_id)
    df_end = DataFrame(time = [t_last], used_mb = [0], user = user_id)
    df_user = vcat(df_beg, df_user, df_end)

    # find rows where the time difference is greater than 1 day
    jump_idcs = get_jumps(df_user[!, "time"], 24*60*60)
    # bracket every such gap with a 0 entry one second after the record before
    # it and one second before the record after it
    zero_start_times = df_user[jump_idcs .- 1, "time"] .+ 1
    zero_end_times = df_user[jump_idcs, "time"] .- 1
    df_to_add = DataFrame(time = vcat(zero_start_times, zero_end_times),
                        used_mb = zeros(2 * length(zero_start_times)),
                        user = user_id)
    # append and sort
    df_extended = vcat(df_user, df_to_add)
    sort!(df_extended, [:time])
    # remove strings of duplicated sizes: drop rows whose value equals both
    # neighbours, keeping the first and last row of every run
    inner_idcs = get_run_inner_indices(df_extended[!, "used_mb"])
    push!(user_dfs, df_extended[Not(inner_idcs), :])
end
final_df = reduce(vcat, user_dfs)
df = final_df

In [None]:
# running total of hd usage over time, summed across users
df_ts = sort(df, ["time"])
# most recent used_mb value seen so far for each user
dd = Dict()
# both series start with a zero total at the first timestamp
totaltimes = [df_ts[1, "time"]]
totalhd = [0.]
for rec in eachrow(df_ts)
    push!(totaltimes, rec["time"])
    # add the new reading ...
    push!(totalhd, totalhd[end] + rec["used_mb"])
    # ... and take out the reading it replaces for this user, if there is one
    previous = get(dd, rec["user"], nothing)
    if !isnothing(previous)
        totalhd[end] -= previous
    end
    dd[rec["user"]] = rec["used_mb"]
end

In [None]:
# get user number starting indices (rows of one user are contiguous in df);
# 1 and nrow+1 are added so consecutive entries delimit every user's rows
user_changes = get_jumps(df[!, "user"], 0.5)
prepend!(user_changes, [1])
append!(user_changes, size(df[!, "user"])[1]+1)

# one line per user on a shared figure
p = plot(size = (1600, 1200))
for n = 1:size(user_changes)[1]-1
    print("Processing user changepoint $n/$(size(user_changes)[1]-1)   \r")

    # find all rows for this user
    df_user = df[user_changes[n]:(user_changes[n+1]-1), ["user", "time", "used_mb"]]
    # time is in seconds; convert to days for the x axis
    plot!(p, df_user[!, "time"]/3600/24, df_user[!,"used_mb"], legend=false, alpha=0.8)
end
ylabel!(p, "Storage Used (MB)")
# x values are days elapsed since the get_timestamp epoch (2020-09-08 00:00 UTC)
xlabel!(p, "Day (since Sep 8, 2020)")
p

In [None]:
# total hd usage over time, summed across users (MB / 1000 -> GB)
p = plot(size = (1600, 1200))
plot!(p, totaltimes/3600/24, totalhd/1000, linewidth = 3)
ylabel!(p, "Storage Used (GB)")
# x values are days elapsed since the get_timestamp epoch (2020-09-08 00:00 UTC)
xlabel!(p, "Day (since Sep 8, 2020)")
p

## Memory Usage

In [None]:
# load and concatenate the memory usage dataframes: every file in the current
# directory whose name starts with "sar_non" and ends with ".csv"
mem_cols = ["year", "month", "day", "hour", "minute", "second", "%memused", "kbmemused", "kbswpused", "kbactive", "kbinact"]
dfs = DataFrame[]
for fn in readdir()
    # skip everything that is not a sar_non* CSV
    (startswith(fn, "sar_non") && endswith(fn, ".csv")) || continue
    println("Loading $(fn)")
    # keep only the columns used below; collect however many files there are
    # instead of assuming a fixed count
    push!(dfs, DataFrame(CSV.File(fn))[!, mem_cols])
end
isempty(dfs) && error("no sar_non*.csv files found in $(pwd())")
df = reduce(vcat, dfs)
print(nrow(df))

In [None]:
# derive :time (seconds since 2020 09 08 00 00 00) from the calendar columns
date_cols = [:year, :month, :day, :hour, :minute, :second]
transform!(df, date_cols => ByRow(get_timestamp) => :time)
# the calendar columns are redundant once :time exists
select!(df, Not(date_cols))
# order the samples chronologically
sort!(df, :time)

In [None]:
# percentage of memory in use over time
p = plot(size = (1600, 1200))
plot!(p, df[!, "time"]/3600/24, df[!, "%memused"], label = "memory")
ylabel!(p, "Memory Used (%)")
# x values are days elapsed since the get_timestamp epoch (2020-09-08 00:00 UTC)
xlabel!(p, "Day (since Sep 8, 2020)")
p

In [None]:
# used / active / inactive / swap memory over time on one figure
p = plot(size = (1600, 1200))
# time is in seconds; convert to days once for all four series
days = df[!, "time"]/3600/24
for (col, lbl) in [("kbmemused", "memory"), ("kbactive", "active"),
                   ("kbinact", "inactive"), ("kbswpused", "swap")]
    # columns are in kB; dividing by 1e6 gives GB
    plot!(p, days, df[!, col]/1000000, label = lbl)
end
ylabel!(p, "Memory Used (GB)")
# x values are days elapsed since the get_timestamp epoch (2020-09-08 00:00 UTC)
xlabel!(p, "Day (since Sep 8, 2020)")
p

## CPU Usage

In [None]:
# load and concatenate the CPU usage dataframes: every file in the current
# directory whose name starts with "sar_cpu" and ends with ".csv"
cpu_cols = ["year", "month", "day", "hour", "minute", "second", "CPU", "%usr", "%sys", "%iowait"]
date_cols = [:year, :month, :day, :hour, :minute, :second]
dfs = DataFrame[]
for fn in readdir()
    # skip everything that is not a sar_cpu* CSV
    (startswith(fn, "sar_cpu") && endswith(fn, ".csv")) || continue
    println("Loading $(fn)")
    # load the CSV
    dftmp = DataFrame(CSV.File(fn))[!, cpu_cols]
    # just look at the "all" CPU rows to reduce by factor of 64
    filter!(row -> row.CPU == "all", dftmp)
    # add a column for seconds since 2020 09 08 00 00 00 and drop the calendar columns
    transform!(dftmp, date_cols => ByRow(get_timestamp) => :time)
    # collect however many files there are instead of assuming a fixed count
    push!(dfs, dftmp[!, Not(date_cols)])
end
isempty(dfs) && error("no sar_cpu*.csv files found in $(pwd())")
df = reduce(vcat, dfs)
print(nrow(df))

In [None]:
# order the CPU samples chronologically
sort!(df, :time)

In [None]:
# one figure per CPU metric (df holds only the aggregate "all" CPU rows)
p_usr = plot(size = (1600, 1200))
p_sys = plot(size = (1600, 1200))
p_iowait = plot(size = (1600, 1200))
# time is in seconds; convert to days once for all three figures
days = df[!, "time"]/3600/24
plot!(p_usr, days, df[!, "%usr"])
plot!(p_sys, days, df[!, "%sys"])
plot!(p_iowait, days, df[!, "%iowait"])
ylabel!(p_usr, "User CPU Used (%)")
ylabel!(p_sys, "Sys CPU Used (%)")
ylabel!(p_iowait, "IOWait CPU Used (%)")
# x values are days elapsed since the get_timestamp epoch (2020-09-08 00:00 UTC)
for fig in (p_usr, p_sys, p_iowait)
    xlabel!(fig, "Day (since Sep 8, 2020)")
end
# Show the three figures built here. Ending the cell with `p` would display
# whatever plot an earlier cell left in `p`, not these.
foreach(display, (p_usr, p_sys, p_iowait))

In [None]:
# plot totals: user, system and iowait CPU percentages on one figure
p = plot(size = (1600, 1200))
# time is in seconds; convert to days once for all three series
days = df[!, "time"]/3600/24
for col in ["%usr", "%sys", "%iowait"]
    # each series is labelled with its column name
    plot!(p, days, df[!, col], label=col)
end
ylabel!(p, "CPU Used (%)")
# x values are days elapsed since the get_timestamp epoch (2020-09-08 00:00 UTC)
xlabel!(p, "Day (since Sep 8, 2020)")
p

## Docker Memory Stats

In [None]:
# load and concatenate the docker memory usage dataframes: every file in the
# current directory whose name starts with "docker_stat" and ends with ".csv"
docker_cols = ["year", "month", "day", "hour", "minute", "second", "user", "mem_usage_mb"]
dfs = DataFrame[]
for fn in readdir()
    # skip everything that is not a docker_stat* CSV
    (startswith(fn, "docker_stat") && endswith(fn, ".csv")) || continue
    println("Loading $(fn)")
    # keep only the columns used below; collect however many files there are
    # instead of assuming a fixed count
    push!(dfs, DataFrame(CSV.File(fn))[!, docker_cols])
end
isempty(dfs) && error("no docker_stat*.csv files found in $(pwd())")
df = reduce(vcat, dfs)
print(nrow(df))

In [None]:
# filter to get rid of non-numeric container names (non-jupyterhub user
# containers) and convert the user column to Int

# integer ids pass through unchanged; a string yields `nothing` unless it
# parses as an Int
to_user_id(u::Integer) = Int(u)
to_user_id(u::AbstractString) = tryparse(Int, u)

user_ids = [to_user_id(u) for u in df[!, "user"]]
is_user = [!isnothing(id) for id in user_ids]
df = df[is_user, :]
# Replace the whole column at once. Writing parsed Ints back element by element
# only works when the column's element type can hold an Int, which is not the
# case for a column of strings.
df[!, "user"] = Int[id for id in user_ids[is_user]]


In [None]:
# derive :time (seconds since 2020 09 08 00 00 00) from the calendar columns
date_cols = [:year, :month, :day, :hour, :minute, :second]
transform!(df, date_cols => ByRow(get_timestamp) => :time)
# the calendar columns are redundant once :time exists
select!(df, Not(date_cols))
# order rows by user number first, then by time within each user
sort!(df, [:user, :time])

In [None]:
# insert 0mb records where time difference is large + begin and end,
# and remove strings of duplicated sizes

# get user number starting indices: df is sorted by user, so every jump in the
# user column is the first row of the next user; add 1 and nrow+1 as sentinels
user_changes = get_jumps(df[!, "user"], 0.5)
prepend!(user_changes, [1])
append!(user_changes, size(df[!, "user"])[1]+1)

# Earliest and latest timestamp over ALL users. df is sorted by (user, time),
# so df[1, "time"] and df[end, "time"] only describe the first and the last
# user and are not the bounds of the whole recording.
t_first, t_last = extrema(df[!, "time"])

# per-user results; concatenated once after the loop instead of growing a
# dataframe with vcat on every iteration
user_dfs = DataFrame[]
for n = 1:size(user_changes)[1]-1
    print("Processing user changepoint $n/$(size(user_changes)[1]-1)   \r")

    # find all rows for this user
    df_user = df[user_changes[n]:(user_changes[n+1]-1), ["user", "mem_usage_mb", "time"]]
    user_id = df_user[1, "user"]

    # add a 0mb row at the global beginning and end
    df_beg = DataFrame(time = [t_first], mem_usage_mb = [0], user = user_id)
    df_end = DataFrame(time = [t_last], mem_usage_mb = [0], user = user_id)
    df_user = vcat(df_beg, df_user, df_end)

    # find rows where the time difference is greater than 30 mins
    jump_idcs = get_jumps(df_user[!, "time"], 30*60)
    # bracket every such gap with a 0 entry one second after the record before
    # it and one second before the record after it
    zero_start_times = df_user[jump_idcs .- 1, "time"] .+ 1
    zero_end_times = df_user[jump_idcs, "time"] .- 1
    df_to_add = DataFrame(time = vcat(zero_start_times, zero_end_times),
                        mem_usage_mb = zeros(2 * length(zero_start_times)),
                        user = user_id)
    # append and sort
    df_extended = vcat(df_user, df_to_add)
    sort!(df_extended, [:time])
    # remove strings of duplicated sizes: drop rows whose value equals both
    # neighbours, keeping the first and last row of every run
    inner_idcs = get_run_inner_indices(df_extended[!, "mem_usage_mb"])
    push!(user_dfs, df_extended[Not(inner_idcs), :])
end
final_df = reduce(vcat, user_dfs)
df = final_df

In [None]:
# running total of container memory over time, summed across users
df_ts = sort(df, ["time"])
# most recent mem_usage_mb value seen so far for each user
dd = Dict()
# both series start with a zero total at the first timestamp
totaltimes = [df_ts[1, "time"]]
totalmem = [0.]
for rec in eachrow(df_ts)
    push!(totaltimes, rec["time"])
    # add the new reading ...
    push!(totalmem, totalmem[end] + rec["mem_usage_mb"])
    # ... and take out the reading it replaces for this user, if there is one
    previous = get(dd, rec["user"], nothing)
    if !isnothing(previous)
        totalmem[end] -= previous
    end
    dd[rec["user"]] = rec["mem_usage_mb"]
end

In [None]:
# get user number starting indices (rows of one user are contiguous in df);
# 1 and nrow+1 are added so consecutive entries delimit every user's rows
user_changes = get_jumps(df[!, "user"], 0.5)
prepend!(user_changes, [1])
append!(user_changes, size(df[!, "user"])[1]+1)

# one line per user on a shared figure
p = plot(size = (1600, 1200))
for n = 1:size(user_changes)[1]-1
    print("Processing user changepoint $n/$(size(user_changes)[1]-1)   \r")

    # find all rows for this user
    df_user = df[user_changes[n]:(user_changes[n+1]-1), ["user", "time", "mem_usage_mb"]]
    # time is in seconds -> days; memory is in MB -> GB
    plot!(p, df_user[!, "time"]/3600/24, df_user[!,"mem_usage_mb"]/1000, legend=false, alpha=0.8)
end
ylabel!(p, "Memory Used (GB)")
# x values are days elapsed since the get_timestamp epoch (2020-09-08 00:00 UTC)
xlabel!(p, "Day (since Sep 8, 2020)")
p

In [None]:
# total container memory over time, summed across users (MB / 1000 -> GB)
p = plot(size = (1600, 1200))
plot!(p, totaltimes/3600/24, totalmem/1000)
ylabel!(p, "Memory Used (GB)")
# x values are days elapsed since the get_timestamp epoch (2020-09-08 00:00 UTC)
xlabel!(p, "Day (since Sep 8, 2020)")
p