In [None]:
#------------------------------
# Setup:
#------------------------------
#---Activate the project environment (one folder above the working directory) and install its dependencies:
using Pkg;

Pkg.activate("../");
Pkg.instantiate();
Pkg.status()

#---Define the project root and the output folder for the figures of this notebook:
projectpath = joinpath(@__DIR__, "../"); 
# NOTE: figurespath keeps its trailing "/" because later cells append file names with `*`.
figurespath = projectpath * "figures/simData/"
# mkpath creates missing parent folders (e.g. `figures/`) as well and does nothing if the
# folder already exists; a plain mkdir throws when the parent folder is missing.
mkpath(figurespath)

#---Load the BoostingAutoEncoder module:
include(joinpath(projectpath, "src", "BAE.jl"));
using .BoostingAutoEncoder

#---Load required packages for this notebook:
using Plots;
using Random;
using StatsBase;
using VegaLite;  
using DataFrames;
using StatsPlots;

# Train BAE on simulated scRNA-seq data

In [None]:
#---Generate and transform data:
dataseed = 1;
n_cells = 1000;
n_genes = 200;
n_overlap = 0;
stageno = 10;
blockprob = 0.6;
noiseprob = 0.1;
# Number of cells per stage; Int(...) throws an InexactError if n_cells is not a multiple of stageno.
cells_per_stage = Int(n_cells / stageno);
# Number of leading genes shown in the heatmap below (the remaining genes are omitted from the plot).
n_genes_shown = 100;

X = sim_scRNAseqData(dataseed; 
    n=n_cells, 
    stageoverlap=n_overlap, 
    blockprob=blockprob, 
    noiseprob=noiseprob, 
    num_genes=n_genes,
    stageno=stageno,  
    stagep=Int(50 / stageno), 
    stagen=cells_per_stage
);

#---Scale the data:
X_st = scale(X);
#Optional: Rescale noise genes of X_st, because values of noise genes are more extreme after scaling ...
#X_st[:, 50:end] .*= 0.8f0

n, p = size(X);

#---Create meta data:
MD = MetaData();
# One group label per stage, derived from the simulation settings instead of hard-coded 10 x 100,
# so the labels stay in sync when n_cells or stageno is changed.
# NOTE(review): assumes the simulated cells are ordered stage by stage in consecutive blocks — confirm against sim_scRNAseqData.
MD.obs_df[!, :CellGroup] = repeat(1:stageno, inner=cells_per_stage)
MD.featurename = ["$(j)" for j in 1:p];

#---Plot the binary data (only the first n_genes_shown genes):
vegaheatmap(X[:, 1:n_genes_shown]; 
    path=figurespath * "binary_data.png", 
    Title="Binary Gene expression ($(n_genes-n_genes_shown) noise genes omitted)",
    xlabel="Gene", 
    ylabel="Cell",
    legend_title="Value",
    color_field="value:o",
    domain_mid=nothing,
    scheme="paired",
    save_plot=true,
    Width=500, 
    Height=500
)

In [None]:
#---Hyperparameters for training a BAE.
# Alternative setting without restarts of the encoder weight matrix:
#HP = Hyperparameters(zdim=10, n_restarts=1, epochs=2000, batchsize=2^9, η=0.01, λ=0.1, ϵ=0.001, M=1);
# Active setting with restarts of the encoder weight matrix (n_restarts=5):
HP = Hyperparameters(;
    zdim=10,
    n_restarts=5,
    epochs=100,
    batchsize=2^9,
    η=0.01,
    λ=0.1,
    ϵ=0.01,
    M=1,
);

#---Decoder architecture (built for p input features with soft clustering enabled):
decoder = generate_BAEdecoder(p, HP; soft_clustering=true);

#---BAE model; the encoder coefficients start as a p x zdim zero matrix with the element type of X_st:
BAE = BoostingAutoencoder(;
    HP=HP,
    decoder=decoder,
    coeffs=zeros(eltype(X_st), p, HP.zdim),
);
summary(BAE)

In [None]:
#---Train the BAE model and print the elapsed time of the training call.
# Restarts of the encoder weight matrix during training are enabled by setting n_restarts > 1:
output_dict = @time train_BAE!(
    X_st, BAE;
    MD=MD,
    track_coeffs=true,
    save_data=true,
    data_path=figurespath,
);

# Report the index at which the recorded train loss is smallest:
@info "Minimum Trainloss at: $(argmin(output_dict["trainloss"]))"

In [None]:
#---Line plot of the train loss recorded in output_dict["trainloss"]; the figure is saved as PNG and displayed:
mean_trainlossPerEpoch = output_dict["trainloss"];
loss_plot = plot(
    1:length(mean_trainlossPerEpoch),
    mean_trainlossPerEpoch;
    label="Train loss",
    legend=true,
    linecolor=:red,
    linewidth=2,
    title="Mean train loss per epoch",
    xlabel="Epoch",
    ylabel="Loss",
);
savefig(loss_plot, figurespath * "/Trainloss_BAE.png");
loss_plot

In [None]:
#---Line plot of the sparsity values recorded in output_dict["sparsity"]; the figure is saved as PNG and displayed:
sparsity_level = output_dict["sparsity"];
loss_plot = plot(
    1:length(sparsity_level),
    sparsity_level;
    label="Sparsity",
    legend=true,
    linecolor=:orange,
    linewidth=2,
    title="Sparsity level per epoch",
    xlabel="Epoch",
    ylabel="Sparsity",
);
savefig(loss_plot, figurespath * "/Sparsity_BAE.png");
loss_plot

In [None]:
#---Line plot of the entanglement score recorded in output_dict["entanglement"]; the figure is saved as PNG and displayed:
entanglement_score = output_dict["entanglement"];
loss_plot = plot(
    1:length(entanglement_score),
    entanglement_score;
    label="Entanglement",
    legend=true,
    linecolor=:orange,
    linewidth=2,
    title="Entanglement score per epoch",
    xlabel="Epoch",
    ylabel="Entanglement of dimensions",
);
savefig(loss_plot, figurespath * "/Entanglement_BAE.png");
loss_plot

In [None]:
#---Line plot of the clustering score recorded in output_dict["clustering"]; the figure is saved as PNG and displayed:
clustering_score = output_dict["clustering"];
loss_plot = plot(
    1:length(clustering_score),
    clustering_score;
    label="Score",
    legend=true,
    linecolor=:orange,
    linewidth=2,
    title="Clustering score per epoch",
    xlabel="Epoch",
    ylabel="Clustering score",
);
savefig(loss_plot, figurespath * "/ClusteringScore_BAE.png");
loss_plot

## Result visualization:

In [None]:
#---Heatmap of the transposed encoder weight matrix (saved as PNG):
vegaheatmap(
    BAE.coeffs';
    Title="Encoder weights",
    xlabel="Gene",
    ylabel="Latent dimension",
    legend_title="Value",
    save_plot=true,
    path=figurespath * "encoderWeights_BAE.png",
)

In [None]:
#---Heatmap of the transposed latent representation BAE.Z of the cells (saved as PNG):
vegaheatmap(
    BAE.Z';
    Title="Latent representation",
    xlabel="Latent dimension",
    ylabel="Cell",
    legend_title="Activation",
    save_plot=true,
    path=figurespath * "latentRepresentation_BAE.png",
)

In [None]:
#---Heatmap of the cluster membership probabilities of the cells (transposed BAE.Z_cluster, saved as PNG).
# Each latent dimension corresponds to two subsequent clusters (reflecting pos. and neg. activations):
vegaheatmap(
    BAE.Z_cluster';
    Title="Cluster probabilities of cells",
    xlabel="Cluster",
    ylabel="Cell",
    legend_title="Probability",
    scheme="purpleblue",
    domain_mid=nothing,
    save_plot=true,
    path=figurespath * "clusterProbabilities_BAE.png",
)

In [None]:
#---Heatmap of the absolute Pearson correlations between the rows of BAE.Z (dims=2 treats each row as one variable).
# NOTE(review): the rows of BAE.Z are presumably the latent dimensions, as the axis labels state — confirm.
vegaheatmap(
    abs.(cor(BAE.Z, dims=2));
    Title="Absolute correlations of latent dimensions",
    xlabel="Latent dimension",
    ylabel="Latent dimension",
    legend_title="Value",
    scheme="orangered",
    domain_mid=nothing,
    Width=500,
    Height=500,
    save_plot=true,
    path=figurespath * "cor_latentDimensions_BAE.png",
)

In [None]:
#---Heatmap of the absolute Spearman rank correlations between the columns of BAE.Z' (i.e. the rows of BAE.Z):
vegaheatmap(
    abs.(corspearman(BAE.Z'));
    Title="Absolute Spearman rank correlations of latent dimensions",
    xlabel="Latent dimension",
    ylabel="Latent dimension",
    legend_title="Value",
    scheme="orangered",
    domain_mid=nothing,
    Width=500,
    Height=500,
    save_plot=true,
    path=figurespath * "spearman_cor_latentDimensions_BAE.png",
)

In [None]:
#---One boxplot per row of BAE.Z showing the latent activations of the cells (saved as PNG):
plot_row_boxplots(
    BAE.Z;
    xlabel="Latent dimension",
    ylabel="Cell activation",
    saveplot=true,
    path=figurespath * "/BAE_Z_boxplot.png",
)

In [None]:
#---Heatmap of the absolute Pearson correlations between the rows of X (dims=2 treats each row as one variable).
# The result is a cell-by-cell correlation matrix computed on the unscaled data:
vegaheatmap(
    abs.(cor(X, dims=2));
    Title="Absolute correlations of cells",
    xlabel="Cell",
    ylabel="Cell",
    legend_title="Value",
    scheme="orangered",
    domain_mid=nothing,
    Width=500,
    Height=500,
    save_plot=true,
    path=figurespath * "cor_cells.png",
)

In [None]:
#---Heatmap of the absolute Pearson correlations between the columns of BAE.Z (dims=1 treats each column as one variable).
# NOTE(review): the columns of BAE.Z are presumably the cells, which makes this a cell-by-cell matrix on the latent representation — confirm.
vegaheatmap(
    abs.(cor(BAE.Z, dims=1));
    Title="Absolute correlations of cell Representations",
    xlabel="Cell",
    ylabel="Cell",
    legend_title="Value",
    scheme="orangered",
    domain_mid=nothing,
    Width=500,
    Height=500,
    save_plot=true,
    path=figurespath * "cor_cellslatentRepresentations_BAE.png",
)

In [None]:
#---Heatmap of the absolute Pearson correlations between the columns of BAE.Z_cluster (dims=1 treats each column as one variable).
# NOTE(review): the columns of BAE.Z_cluster are presumably the cells, which makes this a cell-by-cell matrix on the cluster representation — confirm.
vegaheatmap(
    abs.(cor(BAE.Z_cluster, dims=1));
    Title="Absolute correlations of cell Representations",
    xlabel="Cell",
    ylabel="Cell",
    legend_title="Value",
    scheme="orangered",
    domain_mid=nothing,
    Width=500,
    Height=500,
    save_plot=true,
    path=figurespath * "cor_cellsRepresentations(Cluster)_BAE.png",
)

In [None]:
#---Scatter plots of the top selected genes, one figure per latent dimension.
# Make sure the output folder exists before saving:
isdir(figurespath * "/TopFeaturesLatentDim") || mkdir(figurespath * "/TopFeaturesLatentDim")
for dim in 1:BAE.HP.zdim
    # Column `dim` of the encoder weights holds the coefficients of this latent dimension.
    savefig(
        normalizedFeatures_scatterplot(BAE.coeffs[:, dim], MD.featurename, dim; top_n=10),
        figurespath * "/TopFeaturesLatentDim/" * "BAE_dim$(dim)_topGenes.png",
    )
end

In [None]:
#---Scatter plots of the top selected genes, one figure per cluster.
# Make sure the output folder exists before saving:
isdir(figurespath * "/TopFeaturesCluster") || mkdir(figurespath * "/TopFeaturesCluster")
for key in keys(MD.Top_features)
    # Skip clusters for which no feature scores are stored.
    length(MD.Top_features[key].Scores) > 0 || continue
    savefig(
        TopFeaturesPerCluster_scatterplot(MD.Top_features[key], key; top_n=10),
        figurespath * "/TopFeaturesCluster/" * "BAE_Cluster$(key)_topGenes.png",
    )
end

In [None]:
#---Coefficient plots for visually inspecting the coefficient update trajectories stored in output_dict["coefficients"].
# One figure is saved per latent dimension; a warning is emitted if no trajectories are available.
if !haskey(output_dict, "coefficients")
    @warn "No coefficient trajectories were saved during training."
else
    # Make sure the output folder exists before saving:
    isdir(figurespath * "/CoefficientsPlots") || mkdir(figurespath * "/CoefficientsPlots")
    for dim in 1:BAE.HP.zdim
        savefig(
            track_coefficients(output_dict["coefficients"], dim; iters=nothing, xscale=:log10),
            figurespath * "/CoefficientsPlots/CoefficientsPlot_BAE_dim$(dim).png",
        )
    end
end

In [None]:
#----Compute a 2D UMAP embedding of the learned BAE latent representation and store both coordinates in the metadata:
plotseed = 7;
BAE.UMAP = generate_umap(BAE.Z', plotseed);
for (j, colname) in enumerate((:UMAP1, :UMAP2))
    MD.obs_df[!, colname] = BAE.UMAP[:, j];
end

#---Randomly permute the observation rows of the metadata for plotting.
# rand_inds is reused by later cells to bring BAE.Z, BAE.Z_cluster and X into the same row order.
# NOTE(review): re-running this cell writes the unshuffled UMAP coordinates into the already
# shuffled obs_df, which misaligns them with the other columns — run it only once per training run.
rand_inds = shuffle(1:size(X_st, 1));
MD.obs_df = MD.obs_df[rand_inds, :];

In [None]:
#---Heatmap of the cluster probabilities of the cells, with the cells sorted by their cluster label.
# The columns of BAE.Z_cluster are permuted with rand_inds so that they match the shuffled row order of MD.obs_df.
Cluster_df = DataFrame(BAE.Z_cluster[:, rand_inds]', :auto);
Cluster_df[!, :Cluster] = copy(MD.obs_df.Cluster);
sort!(Cluster_df, :Cluster);
# The last column (:Cluster) was only needed for sorting and is dropped before plotting.
# NOTE(review): the output path is identical to the one used by the earlier unsorted
# cluster-probability heatmap, so that file is overwritten — confirm this is intended.
vegaheatmap(
    Matrix(Cluster_df[:, 1:end-1]);
    Title="Cluster probabilities of cells",
    xlabel="Cluster",
    ylabel="Cell",
    legend_title="Probability",
    scheme="purpleblue",
    domain_mid=nothing,
    save_plot=true,
    path=figurespath * "clusterProbabilities_BAE.png",
)

In [None]:
#---UMAP scatter plot of the BAE latent representation, colored by the CellGroup labels of the metadata (saved as PNG):
vegascatterplot(
    Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]),
    MD.obs_df.CellGroup;
    legend_title="Cell type",
    color_field="labels:o",
    scheme="category20",
    domain_mid=nothing,
    range=nothing,
    marker_size="25",
    save_plot=true,
    path=figurespath * "Celltype_(BAE)umap.png",
)

In [None]:
#---UMAP scatter plot of the BAE latent representation, colored by the Cluster labels of the metadata (saved as PNG):
vegascatterplot(
    Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]),
    MD.obs_df.Cluster;
    legend_title="Cluster",
    color_field="labels:o",
    scheme="category20",
    domain_mid=nothing,
    range=nothing,
    marker_size="25",
    save_plot=true,
    path=figurespath * "Cluster_(BAE)umap.png",
)

In [None]:
#---UMAP scatter plots of the BAE latent representation, colored by the activations in BAE.Z.
# Make sure the output folder exists before saving:
isdir(figurespath * "/UMAPplotsLatDims") || mkdir(figurespath * "/UMAPplotsLatDims")
# The columns of BAE.Z are permuted with rand_inds to match the shuffled row order of MD.obs_df.
create_colored_vegascatterplots(
    Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]),
    BAE.Z[:, rand_inds];
    legend_title="Activation",
    color_field="labels:q",
    scheme="blueorange",
    domain_mid=0,
    range=nothing,
    marker_size="25",
    save_plot=true,
    path=figurespath * "/UMAPplotsLatDims/",
    filename="Mouse_BAE_dim",
    filetype="scatter.png",
)

In [None]:
#---UMAP scatter plots of the BAE latent representation, colored by the values in BAE.Z_cluster.
# Make sure the output folder exists before saving:
isdir(figurespath * "/UMAPplotsCluster") || mkdir(figurespath * "/UMAPplotsCluster")
# Explicit 21-step color range from very light red to very dark red (used instead of a named scheme):
color_range = [
    "#fff5f5", "#ffe0e0", "#ffcccc", "#ffb8b8", "#ffa3a3", "#ff8f8f", "#ff7a7a",
    "#ff6666", "#ff5252", "#ff3d3d", "#ff2929", "#ff1414", "#ff0000", "#e50000",
    "#cc0000", "#b20000", "#990000", "#7f0000", "#660000", "#4c0000", "#330000",
];
# The columns of BAE.Z_cluster are permuted with rand_inds to match the shuffled row order of MD.obs_df.
create_colored_vegascatterplots(
    Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]),
    BAE.Z_cluster[:, rand_inds];
    legend_title="Activation",
    color_field="labels:q",
    scheme=nothing,
    domain_mid=nothing,
    range=color_range,
    marker_size="25",
    save_plot=true,
    path=figurespath * "/UMAPplotsCluster/",
    filename="Mouse_BAE_dim",
    filetype="scatter.png",
)

In [None]:
#---UMAP scatter plots of the BAE latent representation, colored by the values in X of the top selected genes (top_n=5) from MD.Top_features.
# Make sure the output folder exists before saving:
isdir(figurespath * "/FeaturePlots") || mkdir(figurespath * "/FeaturePlots")
# Explicit 21-step color range from very light red to very dark red (used instead of a named scheme):
color_range = [
    "#fff5f5", "#ffe0e0", "#ffcccc", "#ffb8b8", "#ffa3a3", "#ff8f8f", "#ff7a7a",
    "#ff6666", "#ff5252", "#ff3d3d", "#ff2929", "#ff1414", "#ff0000", "#e50000",
    "#cc0000", "#b20000", "#990000", "#7f0000", "#660000", "#4c0000", "#330000",
];
# The rows of X are permuted with rand_inds to match the shuffled row order of MD.obs_df.
FeaturePlots(
    MD.Top_features,
    MD.featurename,
    X[rand_inds, :],
    Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]);
    top_n=5,
    legend_title="log1p",
    color_field="labels:o",
    scheme=nothing,
    domain_mid=nothing,
    range=color_range,
    marker_size="25",
    fig_type=".png",
    path=figurespath * "/FeaturePlots/",
)