- packages

In [1]:
using StatsBase
using MLJ
using Random
using CategoricalArrays
using PrettyPrinting
using DataFrames
using LossFunctions
using XGBoost
using MLJXGBoostInterface
using CSV
using Plots

- import data set

In [2]:
data = CSV.File("data/imputed_data.csv", normalizenames = true) |> DataFrame
X = data[:, 9:56]
y = data[:, 57]
show(stdout, MIME("text/plain"), schema(X))

┌──────────────────────┬─────────┬────────────┐
│[22m _.names              [0m│[22m _.types [0m│[22m _.scitypes [0m│
├──────────────────────┼─────────┼────────────┤
│ IFNg_serum           │ Float64 │ Continuous │
│ IgE_serum            │ Float64 │ Continuous │
│ IgG1_serum           │ Float64 │ Continuous │
│ IgG2a_serum          │ Float64 │ Continuous │
│ IL_4_serum           │ Float64 │ Continuous │
│ IL_5_serum           │ Float64 │ Continuous │
│ IL_10_serum          │ Float64 │ Continuous │
│ IFNg_Ag_MLN_stim     │ Float64 │ Continuous │
│ IFN_CD3_MLN_stim     │ Float64 │ Continuous │
│ IFN_Media_MLN_stim   │ Float64 │ Continuous │
│ IL_5_Media_MLN_stim  │ Float64 │ Continuous │
│ IL_5_Ag_MLN_stim     │ Float64 │ Continuous │
│ IL_5_CD3_MLN_stim    │ Float64 │ Continuous │
│ IL_10_Media_MLN_stim │ Float64 │ Continuous │
│ IL_10_CD_MLN_stim    │ Float64 │ Continuous │
│ IL_10_Ag_MLN_stim    │ Float64 │ Continuous │
│ IL_17_Media_MLN_stim │ Float64 │ Continuous │
│ IL_17_CD_ML

- train and test set

In [3]:
Random.seed!(523)
perm = randperm(length(y))
X = X[perm,:]
y = y[perm];
train, test = partition(eachindex(y), 0.70, shuffle=true, rng=52)

([103, 4, 56, 24, 25, 83, 54, 23, 1, 102  …  104, 20, 22, 2, 115, 3, 91, 75, 14, 42], [12, 21, 52, 76, 64, 113, 34, 67, 8, 47  …  43, 110, 99, 30, 49, 84, 114, 112, 19, 48])

- available models

In [4]:
models(matching(X, y))

NamedTuple{(:name, :package_name, :is_supervised, :docstring, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :is_pure_julia, :is_wrapper, :load_path, :package_license, :package_url, :package_uuid, :prediction_type, :supports_online, :supports_weights, :input_scitype, :target_scitype, :output_scitype), T} where T<:Tuple[]

## no matching models ?

# 1. XGBoost

## 1.1 load model and machine

In [5]:
@load XGBoostCount
xgb_model = XGBoostCount()

┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main /Users/xinyuejia/.julia/packages/MLJModels/SlRVK/src/loading.jl:168


import MLJXGBoostInterface ✔


XGBoostCount(
    num_round = 100,
    booster = "gbtree",
    disable_default_eval_metric = 0,
    eta = 0.3,
    gamma = 0.0,
    max_depth = 6,
    min_child_weight = 1.0,
    max_delta_step = 0.0,
    subsample = 1.0,
    colsample_bytree = 1.0,
    colsample_bylevel = 1.0,
    lambda = 1.0,
    alpha = 0.0,
    tree_method = "auto",
    sketch_eps = 0.03,
    scale_pos_weight = 1.0,
    updater = "auto",
    refresh_leaf = 1,
    process_type = "default",
    grow_policy = "depthwise",
    max_leaves = 0,
    max_bin = 256,
    predictor = "cpu_predictor",
    sample_type = "uniform",
    normalize_type = "tree",
    rate_drop = 0.0,
    one_drop = 0,
    skip_drop = 0.0,
    feature_selector = "cyclic",
    top_k = 0,
    tweedie_variance_power = 1.5,
    objective = "count:poisson",
    base_score = 0.5,
    eval_metric = "rmse",
    seed = 0)[34m @805[39m

In [6]:
xgbm = machine(xgb_model, X, y)

│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91


[34mMachine{XGBoostCount,…} @171[39m trained 0 times; caches data
  args: 
    1:	[34mSource @052[39m ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2:	[34mSource @432[39m ⏎ `AbstractVector{Count}`


## 1.2 evaluate

In [7]:
mach = fit!(xgbm)
cv=CV(nfolds=6)

┌ Info: Training [34mMachine{XGBoostCount,…} @171[39m.
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:342
[1]	train-rmse:inf
[2]	train-rmse:inf
[3]	train-rmse:1583549219354968064.000000
[4]	train-rmse:1173122969182928896.000000
[5]	train-rmse:869071514060718080.000000
[6]	train-rmse:643824531202375680.000000
[7]	train-rmse:476957286451904512.000000
[8]	train-rmse:353338919263141888.000000
[9]	train-rmse:261760114749341696.000000
[10]	train-rmse:193916811341725696.000000
[11]	train-rmse:143657224303017984.000000
[12]	train-rmse:106423963835432960.000000
[13]	train-rmse:78840867816734720.000000
[14]	train-rmse:58406800696803328.000000
[15]	train-rmse:43268849159634944.000000
[16]	train-rmse:32054380312133632.000000
[17]	train-rmse:23746483337560064.000000
[18]	train-rmse:17591844594515968.000000
[19]	train-rmse:13032367442624512.000000
[20]	train-rmse:9654622624940032.000000
[21]	train-rmse:7152325286166528.000000
[22]	train-rmse:5298577135894528.000000
[23]

CV(
    nfolds = 6,
    shuffle = false,
    rng = Random._GLOBAL_RNG())[34m @484[39m

In [11]:
evaluate!(mach, resampling=cv, measure=[l1, rms, rmslp1], verbosity=0)

┌───────────────────────────────────────────────────────────────┬───────────────
│[22m _.measure                                                     [0m│[22m _.measuremen[0m ⋯
├───────────────────────────────────────────────────────────────┼───────────────
│ \e[34mLPLoss{Int64} @213\e[39m                                │ 30600.0      ⋯
│ RootMeanSquaredError @901                                     │ 202000.0     ⋯
│ \e[34mRootMeanSquaredLogProportionalError{Float64} @758\e[39m │ 2.22         ⋯
└───────────────────────────────────────────────────────────────┴───────────────
[36m                                                               2 columns omitted[0m
_.per_observation = [[[2.0, 4.7399998e-14, ..., 21.0], [4.75e-14, 4.75e-14, ..., 4.75e-14], [52.2, 13.0, ..., 0.000471], [5.0, 14.0, ..., 4.7399998e-14], [14.0, 4.7399998e-14, ..., 4.7399998e-14], [6.0, 1.0, ..., 10.0]], missing, missing]
_.fitted_params_per_fold = [ … ]
_.report_per_fold = [ … ]


## 1.3 tuning 

### 1.3.1 tuning 1

In [12]:
r1 = range(xgb_model, :max_depth, lower=3, upper=10)
r2 = range(xgb_model, :min_child_weight, lower=0, upper=5)
tm = TunedModel(model=xgb_model, tuning=Grid(resolution=10),
                resampling=CV(nfolds = 6), ranges=[r1,r2],
                measure=rms)

mtm = machine(tm, X, y)

│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91


[34mMachine{DeterministicTunedModel{Grid,…},…} @178[39m trained 0 times; caches data
  args: 
    1:	[34mSource @708[39m ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2:	[34mSource @472[39m ⏎ `AbstractVector{Count}`


In [13]:
fit!(mtm, rows = train)

┌ Info: Training [34mMachine{DeterministicTunedModel{Grid,…},…} @178[39m.
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:342
┌ Info: Attempting to evaluate 80 models.
└ @ MLJTuning /Users/xinyuejia/.julia/packages/MLJTuning/wBJ80/src/tuned_models.jl:566
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"

│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continu

│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continu

│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continu

│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continuous).
└ @ MLJBase /Users/xinyuejia/.julia/packages/MLJBase/Wo1cb/src/machines.jl:91
│ scitype(X) = Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}
│ input_scitype(model) = Table{var"#s46"} where var"#s46"<:(AbstractVector{var"#s9"} where var"#s9"<:Continu

[34mMachine{DeterministicTunedModel{Grid,…},…} @178[39m trained 1 time; caches data
  args: 
    1:	[34mSource @708[39m ⏎ `Table{Union{AbstractVector{Continuous}, AbstractVector{Count}}}`
    2:	[34mSource @472[39m ⏎ `AbstractVector{Count}`


In [None]:
plot(mtm)

(parameter_names = ["max_depth", "min_child_weight"],
 parameter_scales = [:linear, :linear],
 parameter_values = Any[7 5.0; 5 0.5555555555555556; … ; 6 1.1111111111111112; 4 5.0],
 measurements = [8.246014595031738, 20515.560546875, 61.42761993408203, 292.1626281738281, 296.8984375, 648975.125, 648974.875, 14220.3037109375, 301.85809326171875, 244.93785095214844  …  3802.6884765625, 8.377054214477539, 527.9736938476562, 100.88300323486328, 17503.2109375, 621.4913330078125, 1.204026e8, 2.2041985e6, 2.21489425e6, 7.595742225646973],)