In [1]:
using PorousMaterials 
using JLD2
using PyPlot
using PyCall
using CSV
using DataFrames
using StatsBase
using Formatting

# config plot settings
# `rcParams` is matplotlib's settings object wrapped as a `PyDict`; the assignment below
# sets the default font size used by the figures in this notebook.
# NOTE(review): the PyDict wrapper is presumably what makes the assignment reach the
# Python-side settings instead of a converted Julia copy — confirm against PyPlot docs.
rcParams = PyPlot.PyDict(PyPlot.matplotlib."rcParams")
rcParams["font.size"] = 16;

┌ Info: Precompiling PorousMaterials [68953c7c-a3c7-538e-83d3-73516288599e]
└ @ Base loading.jl:1423
[33m[1m│ [22m[39m- If you have PorousMaterials checked out for development and have
[33m[1m│ [22m[39m  added Graphs as a dependency but haven't updated your primary
[33m[1m│ [22m[39m  environment's manifest file, try `Pkg.resolve()`.
[33m[1m│ [22m[39m- Otherwise you may need to report an issue with PorousMaterials
│   exception = (UndefVarError(:IdOffsetRange), Union{Ptr{Nothing}, Base.InterpreterIP}[Ptr{Nothing} @0x00007f867058648f, Ptr{Nothing} @0x00007f8670626d14, Ptr{Nothing} @0x00007f865c5bab42, Ptr{Nothing} @0x00007f8670606e79, Ptr{Nothing} @0x00007f8670625fe5, Ptr{Nothing} @0x00007f8670625a5d, Ptr{Nothing} @0x00007f8670626821, Ptr{Nothing} @0x00007f8670627447, Base.InterpreterIP in top-level CodeInfo for ArrayInterface at statement 11, Ptr{Nothing} @0x00007f86706448f3, Ptr{Nothing} @0x00007f8670646779, Ptr{Nothing} @0x00007f865a5a4ead, Ptr{Nothing} @0x00007f865a5a

## Load Descriptor Data

In [2]:
###
#  load properties
###
# geometric descriptors (left table of the join below)
geo_props_filename   = joinpath(pwd(), "descriptors/geometric_properties.csv")
geometric_properties = CSV.read(geo_props_filename, DataFrame)

# chemical descriptors (right table of the join below)
chem_props_filename = joinpath(pwd(), "descriptors/chemical_properties.csv")
chemical_properties = CSV.read(chem_props_filename, DataFrame)

# one row per crystal, keyed on the crystal name; the key column is then renamed to "xtal"
descriptors = outerjoin(geometric_properties, chemical_properties, on=["crystal_name"]);
rename!(descriptors, "crystal_name" => "xtal")

# WARNING: for these COFs, "⟨N⟩ (mmol/g)" = [0.0, 0.0] producing sel = NaN... need to run gcmc on these?
skip_for_now = ["13030N2_ddec.cif", "16371N2_ddec.cif", "20565N3_ddec.cif", "21090N3_ddec.cif"] 

# drop the skipped COFs in place
filter!(row -> row["xtal"] ∉ skip_for_now, descriptors);

In [3]:
###
#  simulation parameters
###
# thermodynamic state point
temperature    = 298.0 # K
total_pressure = 1.0   # bar

# adsorbates and gas-phase composition (both vectors are ordered [Kr, Xe])
adsorbates = map(Molecule, ["Kr", "Xe"])
mole_fxn   = [0.8, 0.2] # [Kr, Xe]
partial_pressures = mole_fxn .* total_pressure # bar, [Kr, Xe]

# force field and the per-volume settings used to locate the saved simulation results
ljff = LJForceField("UFF")
nb_ins_per_vol = 500 # passed to `henry_result_savename`
nb_cyc_per_vol = 500 # passed to `calculate_num_burn_and_sample_cycles`

# COFs highlighted in red in the scatter plots
benchmarked_cofs = ["05000N2_ddec.cif", "17156N2_ddec.cif"]

2-element Vector{String}:
 "05000N2_ddec.cif"
 "17156N2_ddec.cif"

In [4]:
# save target vectors and feature vectors 
# (guards the `save(...)` calls that write the .jld2 files)
save_to_file =  true # default: true  

# save figures
# (guards the `savefig(...)` calls)
save_figures = false # default: false

false

In [5]:
"""
    calculate_selectivity(n1, n2, dn1, dn2; fxn1=1.0, fxn2=1.0)

Return `(selectivity, err_selectivity)` for species 1 over species 2.

The selectivity is `(n1 / fxn1) / (n2 / fxn2)`. Its uncertainty is obtained by adding the
relative errors `dn1 / n1` and `dn2 / n2` in quadrature and scaling by the selectivity.

# Arguments
- `n1`, `n2`: amounts (e.g. uptakes or Henry coefficients) of species 1 and 2.
- `dn1`, `dn2`: uncertainties of `n1` and `n2`.

# Keywords
- `fxn1`, `fxn2`: mole fractions of species 1 and 2. They are treated as exact, so they
  rescale the selectivity but add nothing to its relative error.

# Notes
Any `Real` inputs are accepted (integers, `Float32`, ...). If `n1` or `n2` is zero the
divisions produce `NaN`/`Inf`, which propagate into the returned values.
"""
function calculate_selectivity(n1::Real, n2::Real,
                               dn1::Real, dn2::Real;
                               fxn1::Real=1.0, fxn2::Real=1.0)

    selectivity = (n1 / fxn1) / (n2 / fxn2)
    # the mole fractions carry no uncertainty here, so only the relative errors of
    # n1 and n2 enter the quadrature sum
    err_selectivity = selectivity * sqrt((dn1 / n1) ^ 2 + (dn2 / n2) ^ 2)
    return selectivity, err_selectivity
end

"""
    calculate_num_burn_and_sample_cycles(xtal, cycles_per_volume; fraction_burn_cycles=0.5)

Return `(n_burn_cycles, n_sample_cycles)` for `xtal`.

The total number of cycles is `cycles_per_volume * xtal.box.Ω` rounded up, with a floor of
5. That total is split by `fraction_burn_cycles` (which must lie strictly between 0 and 1);
each part is rounded up independently, so the two parts may sum to more than the total.
"""
function calculate_num_burn_and_sample_cycles(xtal::Crystal, cycles_per_volume::Int64; 
                                              fraction_burn_cycles::Float64=0.5)
    total_cycles = max(5, ceil(Int, cycles_per_volume * xtal.box.Ω))
    @assert (0.0 < fraction_burn_cycles) && (fraction_burn_cycles < 1.0) 
    # (burn share, sample share), each rounded up to a whole number of cycles
    return map(share -> ceil(Int, total_cycles * share),
               (fraction_burn_cycles, 1 - fraction_burn_cycles))
end

calculate_num_burn_and_sample_cycles (generic function with 1 method)

## Load Simulation Data

In [6]:
###
#  create DataFrame to populate 
###
# placeholders: one scalar per COF (`z`) and one vector with an entry per adsorbate per COF (`zs`)
z  = zeros(nrow(descriptors))
zs = [zeros(length(adsorbates)) for _ in 1:nrow(descriptors)]

# column name => placeholder, in the order the columns are added after `xtal`
column_templates = (:henry_coeff              => zs,
                    :err_henry_coeff          => zs,
                    :henry_elapsed_time       => zs,
                    :henry_total_elapsed_time => z,
                    :henry_selectivity        => z,
                    :err_henry_selectivity    => z,
                    :gcmc_uptake              => zs,
                    :err_gcmc_uptake          => zs,
                    :gcmc_pressure            => zs,
                    :gcmc_elapsed_time        => z,
                    :gcmc_selectivity         => z,
                    :err_gcmc_selectivity     => z)

df = DataFrame(xtal = descriptors[:, "xtal"])
for (col_name, template) in column_templates
    # deep copy so that no two columns (or rows) share the same underlying vectors
    df[!, col_name] = deepcopy(template)
end

###
#  loop over xtals 
###
# For every COF: read the saved Henry results (one file per adsorbate) and the saved
# binary GCMC results, store them in the matching row of `df`, then compute the Xe/Kr
# selectivity and its propagated error at both levels of fidelity.
for (i, row) in enumerate(eachrow(df)) 
    # make sure we are on the correct row
    @assert row[:xtal] == descriptors[i, "xtal"]
    
    # load crystal
    xtal = Crystal(row[:xtal]; check_neutrality=false)

    for (j, molecule) in enumerate(adsorbates) # [Kr, Xe]
        ###
        # load and assign Henry data
        ###
        henry_filename = henry_result_savename(xtal, molecule, temperature, ljff, nb_ins_per_vol)
        @load joinpath(PorousMaterials.rc[:paths][:simulations], henry_filename) results
        
        row[:henry_coeff][j] = results["henry coefficient [mmol/(g-bar)]"]
        row[:err_henry_coeff][j] = results["err henry coefficient [mmol/(g-bar)]"]
        row[:henry_elapsed_time][j] = results["elapsed time (min)"]
    end
    # calculate total time to run set of Henry calculations
    row[:henry_total_elapsed_time] = sum(row[:henry_elapsed_time])
    
    ###
    #  load and assign GCMC data
    ###
    # the burn/sample cycle counts are part of the GCMC output filename
    n_burn, n_sample = calculate_num_burn_and_sample_cycles(xtal, nb_cyc_per_vol)
    gcmc_filename = μVT_output_filename(xtal, adsorbates, temperature, 
                                        partial_pressures, ljff, n_burn, n_sample)
    @load joinpath(PorousMaterials.rc[:paths][:simulations], gcmc_filename) results

    row[:gcmc_uptake] = results["⟨N⟩ (mmol/g)"]
    row[:err_gcmc_uptake] = results["err ⟨N⟩ (mmol/g)"]
    row[:gcmc_pressure] = results["pressure (bar)"]
    row[:gcmc_elapsed_time] = results["elapsed time (min)"]
    # TODO: assert that adsorbates in correct order
    # (the code below assumes index 1 = Kr and index 2 = Xe, as in `adsorbates`)
    
    ###
    # calculate and assign selectivities with associated error
    ###
    # Henry selectivity: ratio of Henry coefficients, Xe (index 2) over Kr (index 1)
    selectivity, err_selectivity = calculate_selectivity(row[:henry_coeff][2],
                                                         row[:henry_coeff][1],
                                                         row[:err_henry_coeff][2],
                                                         row[:err_henry_coeff][1])
    row[:henry_selectivity] = selectivity
    row[:err_henry_selectivity] = err_selectivity
    
    # recall: adsorbates = ["Kr", "Xe"] and we want S_xe/kr
    # the mole fractions come from `mole_fxn` ([Kr, Xe]) rather than repeated literals,
    # so the selectivity stays consistent with the composition used for the pressures
    selectivity, err_selectivity = calculate_selectivity(row[:gcmc_uptake][2],
                                                         row[:gcmc_uptake][1],
                                                         row[:err_gcmc_uptake][2],
                                                         row[:err_gcmc_uptake][1];
                                                         fxn1=mole_fxn[2], fxn2=mole_fxn[1]) #[Xe, Kr]
    row[:gcmc_selectivity] = selectivity 
    row[:err_gcmc_selectivity] = err_selectivity
end
# write to file (only include xtal name and targets for GP)
# CSV.write(joinpath(pwd(), "target_and_cost_data.csv"), df[:, [:xtal, :henry_selectivity, :gcmc_selectivity]])
df[1:5, [:xtal, :henry_selectivity, :gcmc_selectivity, :henry_total_elapsed_time, :gcmc_elapsed_time]]

Unnamed: 0_level_0,xtal,henry_selectivity,gcmc_selectivity,henry_total_elapsed_time
Unnamed: 0_level_1,String?,Float64,Float64,Float64
1,05000N2_ddec.cif,1.58051,1.69624,3.45251
2,05001N2_ddec.cif,3.27135,3.2725,10.9069
3,07000N2_ddec.cif,2.9979,2.99372,8.65074
4,07001N2_ddec.cif,9.01379,9.65039,3.58279
5,07002N2_ddec.cif,3.60253,3.64401,6.52384


In [7]:
# look at the output for a given COF
# (row index of the first entry whose name matches)
cof_14010N2_id = findfirst(==("14010N2_ddec.cif"), df[!, :xtal])
df[cof_14010N2_id,
   [:xtal, :henry_selectivity, :gcmc_selectivity, :henry_total_elapsed_time, :gcmc_elapsed_time]]

Unnamed: 0_level_0,xtal,henry_selectivity,gcmc_selectivity,henry_total_elapsed_time
Unnamed: 0_level_1,String?,Float64,Float64,Float64
85,14010N2_ddec.cif,3.88053,3.90427,14.2275


## Prepare Data For GP

In [8]:
# list the descriptor columns: the crystal name ("xtal") plus every candidate feature
names(descriptors)

15-element Vector{String}:
 "xtal"
 "pore_diameter_Å"
 "void_fraction"
 "surface_area_m²g⁻¹"
 "crystal_density"
 "B"
 "O"
 "C"
 "H"
 "Si"
 "N"
 "S"
 "P"
 "halogens"
 "metals"

In [9]:
###
#  construct feature matrix and target vector for GP
###
# every descriptor column except the crystal name is a feature
x_cols  = filter(!=("xtal"), names(descriptors))
n_xtals = nrow(descriptors)

# feature matrix: one row per COF, one column per entry of `x_cols`
X = zeros(n_xtals, length(x_cols))
for (j, col_name) in enumerate(x_cols)
    X[:, j] = descriptors[:, col_name]
end

# Xe/Kr Selectivity targets and simulation costs, row-aligned with `X`
henry_y = df[1:n_xtals, :henry_selectivity]
gcmc_y  = df[1:n_xtals, :gcmc_selectivity]
henry_total_elapsed_time = df[1:n_xtals, :henry_total_elapsed_time] # [min]
gcmc_elapsed_time        = df[1:n_xtals, :gcmc_elapsed_time]        # [min]

X # look at y too!

608×14 Matrix{Float64}:
  3.84928  0.3102   4190.07  1049.37   …  0.0        0.0        0.0  0.0  0.0
 19.7302   0.64806  3161.08   585.477     0.0        0.0        0.0  0.0  0.0
 26.1625   0.71368  3416.33   458.054     0.0        0.0        0.0  0.0  0.0
  7.82083  0.36936  2542.92  1100.22      0.0        0.0        0.0  0.0  0.0
 14.8469   0.56072  3321.89   703.114     0.0        0.0        0.0  0.0  0.0
  9.33026  0.7131   5244.15   421.68   …  0.0        0.0        0.0  0.0  0.0
  9.94867  0.73572  5444.29   383.117     0.0        0.0        0.0  0.0  0.0
 18.946    0.88136  5330.44   180.981     0.0        0.0        0.0  0.0  0.0
 27.193    0.88836  5199.24   171.075     0.0        0.0        0.0  0.0  0.0
 10.0414   0.63932  5594.39   519.979     0.0        0.0        0.0  0.0  0.0
  4.7838   0.23682  3836.5   1143.15   …  0.0        0.0        0.0  0.0  0.0
  7.51918  0.3622   3597.33   988.327     0.0        0.0        0.0  0.0  0.0
  9.8332   0.45334  3425.38   874.7     

In [10]:
###
#  save top COF feature vector so we can visualize it as a radar plot 
###
# row index of the COF with the largest GCMC selectivity (computed once and reused)
top_COF_id = argmax(df[:, :gcmc_selectivity])
# `df` and `descriptors` must refer to the same crystal at that row
@assert descriptors[top_COF_id, :xtal] == df[top_COF_id, :xtal]

# sanity check: the row of `X` holds the descriptor values in column order, "xtal" excluded
for (test_ind, name) in enumerate(filter(!=("xtal"), names(descriptors)))
    @assert descriptors[top_COF_id, name] == X[top_COF_id, test_ind]
end

if save_to_file
    # create the output directory if needed so the write does not fail on a fresh checkout
    mkpath(joinpath(pwd(), "figs"))
    save(joinpath(pwd(), "figs", "top_COF_feature_vector.jld2"), 
        Dict("features" => X[top_COF_id, :])
    )
end

In [11]:
###
#  Min-max normalize the feature vectors: every column is rescaled onto [0, 1]
###
X_scaled = deepcopy(X) # make a copy of the data

for j in 1:length(x_cols)
    # note that the lowest allowable value is zero
    col_min, col_max = extrema(X[:, j])
    col_range = col_max - col_min
    if col_range == 0
        # constant feature: (x - min) / (max - min) would be 0/0 = NaN, so map it to zero
        X_scaled[:, j] .= 0.0
    else
        X_scaled[:, j] = (X[:, j] .- col_min) / col_range
    end
end

# the scaling must have changed at least one entry
@assert X_scaled != X 

# print dist. info
println("For X_scaled[:, 2] - ")
println("\tminimum:\t", minimum(X_scaled[:, 2]), "\n\tmaximum:\t", maximum(X_scaled[:, 2]))
println("\tmean:\t\t", StatsBase.mean(X_scaled[:, 2]), "\n\tstd:\t\t", StatsBase.std(X_scaled[:, 2]))
println("\tdist. width:\t", (maximum(X_scaled[:, 2]) - minimum(X_scaled[:, 2])))

For X_scaled[:, 2] - 
	minimum:	0.0
	maximum:	1.0
	mean:		0.5863775641608338
	std:		0.17866313922650942
	dist. width:	1.0


### Write to file

In [12]:
# we have to enforce the data type otherwise the JLD2 file will not work
# i.e. it will save pointers instead of the values
# (every name is converted to a plain `String`)
cofs = map(String, df[!, :xtal]);

In [16]:
###
#  save target vectors and the raw and normalized feature vectors 
###
if save_to_file
    # create the output directory if needed so the writes do not fail on a fresh checkout
    mkpath(joinpath(pwd(), "run_BO"))

    # the two files are identical except for the feature matrix stored under "X"
    for (filename, features) in (("targets_and_raw_features.jld2", X),
                                 ("targets_and_normalized_features.jld2", X_scaled))
        save(joinpath(pwd(), "run_BO", filename), 
             Dict("COFs" => cofs,
                  # `x_cols` is the list the columns of `X` were built from, so the
                  # names always line up with the matrix columns
                  "feature_names" => x_cols,
                  "X" => features, 
                  "henry_y" => henry_y, "gcmc_y" => gcmc_y, 
                  "henry_total_elapsed_time" => henry_total_elapsed_time, # [min]
                  "gcmc_elapsed_time" => gcmc_elapsed_time))              # [min]
    end
end

# Plotting Preliminary Analysis:

In [None]:
###
#  Compute statistics about the fit
#  RMSE, R2 and Spearman correlation
###
# GCMC selectivity is the reference; Henry selectivity is the quantity being compared to it
gcmc_sel  = df[:, :gcmc_selectivity]
henry_sel = df[:, :henry_selectivity]
sq_resid  = (gcmc_sel - henry_sel).^2

rmse = sqrt(mean(sq_resid))

# 1 - (residual sum of squares) / (total sum of squares of the GCMC values)
R2 = 1.0 - (sum(sq_resid) / sum((gcmc_sel .- mean(gcmc_sel)).^2))

rho_spearman = corspearman(gcmc_sel, henry_sel)

In [None]:
# useful for plot formatting: one marker size and one edge color per COF
mkr_sz = fill(8, nrow(descriptors));
mkr_c = fill("C0", nrow(descriptors))

# color benchmark cofs red
for k in 1:2
    mkr_c[findfirst(==(benchmarked_cofs[k]), descriptors[:, "xtal"])] = "C3"
end

In [None]:
# Scatter plot of the total Henry (low fidelity) simulation time of every COF against
# its pore diameter.
fig = figure()

# hollow markers; edge colors come from `mkr_c`, sizes from `mkr_sz`
scatter(descriptors[:, "pore_diameter_Å"], df[:, :henry_total_elapsed_time], 
        facecolor="none", edgecolor=mkr_c, linewidth=0.5, s=mkr_sz)

title("cost vs pore diameter distribution")
xlabel("pore size [Å]")
ylabel("low fidelity cost [min]")

tight_layout()

In [None]:
# Compare, per COF, the GCMC run time (x) with the total Henry run time (y) and report
# the fraction of COFs for which the Henry calculations took at least as long as GCMC.
figure()

scatter(df[:, :gcmc_elapsed_time], df[:, :henry_total_elapsed_time], 
        facecolor="none", edgecolor=mkr_c, lw=0.5, s=mkr_sz)

# track fraction above or on the equal cost line
# (`sum.(...)` adds the per-adsorbate Henry times of each COF before the comparison)
unfavorable = sum(sum.(df[:, :henry_elapsed_time]) .>= df[:, :gcmc_elapsed_time])
# NOTE(review): `favorable` is not used again in this cell
favorable = length(df[:, :gcmc_elapsed_time]) - unfavorable

cost_ratio = unfavorable / length(df[:, :gcmc_elapsed_time])
println("cost ratio: $(cost_ratio) unfavorable, $(1 - cost_ratio) favorable")

# plot the line: y=x
# NOTE(review): the range starts at 0.0 but is drawn with a logarithmic x-axis, where 0
# cannot be shown — confirm that the first point is meant to be dropped
x = range(0.0, stop=400, length=100)
y = x
semilogx(x, y; label="equal cost", color="k", linestyle="--", linewidth=1.2)

title("simulation cost comparison")
legend()
xlabel("high fidelity cost [min]")
ylabel("low fidelity cost [min]")

tight_layout()

In [None]:
# Parity plot of Henry vs GCMC Xe/Kr selectivity for every COF, with error bars in both
# directions and the fit statistics (R2, rmse, rho_spearman) shown in a text box.
figure() # figsize=(8, 8)

# dashed y = x reference line spanning 0 to the largest selectivity of either kind
xmax = maximum(max.(df[:, :gcmc_selectivity], df[:, :henry_selectivity]))
x = range(0.0, stop=xmax, length=length(df[:, :henry_selectivity]))
y = x
plot(x, y, color="k", linestyle="--", zorder=0)

errorbar(df[:, :gcmc_selectivity], df[:, :henry_selectivity],
         xerr=df[:, :err_gcmc_selectivity], yerr=df[:, :err_henry_selectivity],
         ecolor="C3", marker=".", ls="none",
         mfc="none", mec="C0", ms=7, linewidth=0.75)

# annotate the plot with the fit statistics computed in an earlier cell
str = format("R² = {:0.3f}\nRMSE = {:0.2f}\nρₛ = {:0.3f}", R2, rmse, rho_spearman)
box_styl = Dict(:facecolor => "lightgrey", :alpha => 0.75)
text(1, 17, str, color="k", size=12, bbox=box_styl)

# fixed axis limits; the text box position (1, 17) above is chosen to sit inside them
xlim([-0.5, 21])
ylim([-0.5, 21])

gca().set_aspect("equal", adjustable="box")

# title("Fidelity Corelation", fontsize=14)
xlabel("GCMC S" * L"_{Xe/Kr}")
ylabel("Henry S" * L"_{Xe/Kr}")
tight_layout()
if save_figures
    savefig(joinpath(pwd(), "figs", "low_vs_high_fidelity_correlation_plot.png"), dpi=600, format="png")
end

### Look at relative errors in adsorption for each COF

**Note that the kinetic diameter of Kr is 3.6 Å and that of Xe is 3.96 Å; 
therefore, COFs with a smaller pore diameter will not be able to accommodate the gas.**

In [None]:
# collect the COFs whose relative error exceeds 5% for any adsorbate, in either the
# Henry coefficients or the GCMC uptakes
cof_large_error = String[]
for row in eachrow(df)
    exceeds_tol = any((row[:err_henry_coeff] ./ row[:henry_coeff]) .> 0.05) ||
                  any((row[:err_gcmc_uptake] ./ row[:gcmc_uptake]) .> 0.05)
    # record each COF at most once
    if exceeds_tol && row[:xtal] ∉ cof_large_error
        push!(cof_large_error, row[:xtal])
    end
end
cof_large_error # look at their xtal structure... COFs with pore diam 8Å and 30Å should be big enough...

In [None]:
# row of each flagged COF in `descriptors`, then the pore diameters at those rows
xtal_indexes = map(name -> findfirst(==(name), descriptors[:, "xtal"]), cof_large_error)
pore_diameters = descriptors[xtal_indexes, "pore_diameter_Å"]

### Look at the top 10% performing materials

In [None]:
# number of COFs that make up ten percent of the dataset (rounded up)
top10_inds = ceil(Int, nrow(df) * 0.1)
println("The top ten percent (10%) of the dataset is $(top10_inds) COFs.")

# the `top10_inds` COFs with the highest Henry selectivity, best first
henry_top10 = first(sort(df, :henry_selectivity, rev=true), top10_inds)

henry_top10[1:10, [:xtal, :henry_selectivity, :gcmc_selectivity, :gcmc_elapsed_time]]

In [None]:
# the `top10_inds` COFs with the highest GCMC selectivity, best first
top10 = first(sort(df, :gcmc_selectivity, rev=true), top10_inds)

# the best COF by GCMC must also appear in the Henry top set
@assert top10[1, :xtal] in henry_top10[:, :xtal]

# cost of Henry calculations on every COF, plus GCMC on the Henry top set only
total_henry_cost = sum(df.henry_total_elapsed_time) / 60 # converted to hours
t10_gcmc_cost = sum(henry_top10.gcmc_elapsed_time) / 60  # converted to hours

println("The cost of running GCMC simulations on this set is: ", t10_gcmc_cost, " [hr]")
println("The total search cost is then: ", total_henry_cost + t10_gcmc_cost, " [hr]")
henry_top10[1:10, [:xtal, :henry_total_elapsed_time, :gcmc_elapsed_time]]

In [None]:
# show the ten COFs with the highest GCMC selectivity
top10[1:10, [:xtal, :gcmc_selectivity, :henry_selectivity, :gcmc_elapsed_time]]

In [None]:
println("xtals in Henry top 10pct but not in GCMC top 10 pct:")
# print, in Henry ranking order, each name that is absent from the GCMC top set
foreach(println, filter(xtal_name -> xtal_name ∉ top10[:, :xtal], henry_top10[:, :xtal]))

In [None]:
###
#  Compute statistics about the fit
#  RMSE, R2 and Spearman correlation (restricted to the GCMC top set)
###
t10_gcmc_sel  = top10[:, :gcmc_selectivity]
t10_henry_sel = top10[:, :henry_selectivity]
t10_sq_resid  = (t10_gcmc_sel - t10_henry_sel).^2

t10_rmse = sqrt(mean(t10_sq_resid))

# 1 - (residual sum of squares) / (total sum of squares of the GCMC values)
t10_R2 = 1.0 - (sum(t10_sq_resid) / sum((t10_gcmc_sel .- mean(t10_gcmc_sel)).^2))

t10_rho_spearman = corspearman(t10_gcmc_sel, t10_henry_sel)

In [None]:
# Parity plot of Henry vs GCMC Xe/Kr selectivity restricted to the rows of `top10`,
# with error bars in both directions and the top-set fit statistics in a text box.
figure(figsize=(8, 8))

# dashed y = x reference line spanning 0 to the largest selectivity of either kind
xmax = maximum(max.(top10[:, :gcmc_selectivity], top10[:, :henry_selectivity]))
x = range(0.0, stop=xmax, length=length(top10[:, :henry_selectivity]))
y = x
plot(x, y, color="k", linestyle="--", zorder=0)

errorbar(top10[:, :gcmc_selectivity], top10[:, :henry_selectivity],
         xerr=top10[:, :err_gcmc_selectivity], yerr=top10[:, :err_henry_selectivity],
         ecolor="C3", marker=".", ls="none",
         mfc="none", mec="C0", ms=6, linewidth=1.1)

# annotate the plot with the fit statistics computed for the top set
str = format("R² = {:0.3f}\nRMSE = {:0.2f}\nρₛ = {:0.3f}", t10_R2, t10_rmse, t10_rho_spearman)
box_styl = Dict(:facecolor => "lightgrey", :alpha => 0.75)
# NOTE(review): no axis limits are set in this cell, so confirm that the text anchor
# (1, 17) falls inside the plotted range
text(1, 17, str, color="k", size=10, bbox=box_styl)

gca().set_aspect("equal", adjustable="box")

title("Highest Performing COFs: Fidelity Corelation", fontsize=14)
xlabel("GCMC S" * L"_{Xe/Kr}")
ylabel("Henry S" * L"_{Xe/Kr}")
tight_layout()
if save_figures
    savefig(joinpath(pwd(), "figs", "top10pct_low_vs_high_fidelity_correlation_plot.png"), dpi=600, format="png")
end

### Selectivity Performance Plot

In [None]:
figure()

# Henry coefficient of Xe for every COF (second entry, since adsorbates = [Kr, Xe])
hxe = [coeffs[2] for coeffs in df[:, :henry_coeff]]

# Henry selectivity against the Xe Henry coefficient; hollow markers styled per COF
scatter(hxe, df[:, :henry_selectivity], 
        facecolor="none", edgecolor=mkr_c, s=mkr_sz)

xlabel("H" * L"_{Xe}" * " [mmol/(g-bar)]")
ylabel("S" * L"_{Xe/Kr}")

tight_layout()