# Setup

In [None]:
#---Activate the project environment:
using Pkg;

# Activate the project one directory up, install any missing dependencies,
# and print the resolved package versions.
Pkg.activate("../");
Pkg.instantiate();
Pkg.status()

#---Define the project, data, and figure directories used by this notebook:
projectpath = joinpath(@__DIR__, "../");

# `mkpath` also creates missing parent folders (e.g. `data/` or `figures/`) and does
# nothing when the directory already exists, so no `isdir` check is required.
datapath = projectpath * "data/rat_extended/"
mkpath(datapath);

figurespath = projectpath * "figures/rat_extended/"
mkpath(figurespath);

#---Load the BoostingAutoEncoder module:
include(projectpath * "/src/BAE.jl");
using .BoostingAutoEncoder

#---Load required packages for this notebook:
using RCall;
using DelimitedFiles;
using Plots;
using Random;
using StatsBase;
using VegaLite;  
using DataFrames;
using StatsPlots;

# Load data and create CCIM

In [None]:
#---Download a subset of the example rat lung data from "https://zenodo.org/record/6846618/files/raredon_2019_rat.Robj", save it to the data directory, and load the "alra" assay:
X_alra, tSNE_embeddings, celltypes, genenames = load_rat_scRNAseq_data(;
    data_path=datapath,
    transfer_data=true,
    assay="alra",
);
# Table of the loaded matrix with one column per entry of `genenames`.
df_Rat = DataFrame(X_alra, Symbol.(genenames));

#---Run NICHES on the scRNA-seq data:
filepath_expData = string(datapath, "Rat_Seurat_sub.rds");
run_NICHES_wrapper(filepath_expData; species="rat", assay="alra", data_path=datapath);

#---Load the cell-to-cell NICHES CCIM and its metadata:
filepath_CCIM = string(datapath, "NICHES_CellToCell.rds");
CCIM, CCIM_st, MD = load_CCIM_CtC(filepath_CCIM);

#---Plot the tSNE embedding of the cells, colored by cell type:
# Make sure the folder for the single-cell feature plots exists.
isdir(figurespath * "/scData_FeaturePlots") || mkdir(figurespath * "/scData_FeaturePlots")
figurespath_scData = figurespath * "/scData_FeaturePlots/"
# NOTE(review): this plot is written to `figurespath`, not to `figurespath_scData` —
# confirm that this is intended.
vegascatterplot(tSNE_embeddings, celltypes;
    save_plot=true,
    path=string(figurespath, "scData_Celltype_tSNE.png"),
    legend_title="Cell type",
    marker_size="10",
    color_field="labels:o",
    scheme="category10",
    domain_mid=nothing,
    range=nothing,
)

# Analyze CCIM patterns in rat lung data with the BAE

In [None]:
#---Hyperparameters for training the BAE:
HP = Hyperparameters(
    zdim=30,
    n_runs=1,
    max_iter=2000,
    tol=1e-5,
    batchsize=2^12,
    η=0.01,
    λ=0.1,
    ϵ=0.001,
    M=1,
);

#---Generate the decoder from the number of columns `p` of the standardized CCIM:
n_cellpairs = size(CCIM_st, 1);
p = size(CCIM_st, 2);
decoder = generate_BAEdecoder(p, HP; soft_clustering=true);

#---Initialize the BAE model with an all-zero `p × zdim` coefficient matrix:
BAE = BoostingAutoencoder(;
    coeffs=zeros(eltype(CCIM_st), p, HP.zdim),
    decoder=decoder,
    HP=HP,
);
summary(BAE)

In [None]:
#---Train the BAE model and report the elapsed time:
output_dict = @time train_BAE!(CCIM_st, BAE;
    MD=MD,
    track_coeffs=true,
    save_data=true,
    data_path=datapath,
)

In [None]:
#---Plot the mean training loss per epoch and save the figure:
mean_trainlossPerEpoch = output_dict["trainloss"];
loss_plot = plot(
    eachindex(mean_trainlossPerEpoch),
    mean_trainlossPerEpoch;
    title="Mean train loss per epoch",
    xlabel="Epoch",
    ylabel="Loss",
    label="Train loss",
    legend=true,
    linewidth=2,
    linecolor=:red,
);
savefig(loss_plot, string(figurespath, "/Trainloss_BAE.png"));
loss_plot

In [None]:
#---Plot the sparsity level per epoch and save the figure:
sparsity_level = output_dict["sparsity"];
loss_plot = plot(
    eachindex(sparsity_level),
    sparsity_level;
    title="Sparsity level per epoch",
    xlabel="Epoch",
    ylabel="Sparsity",
    label="Sparsity",
    legend=true,
    linewidth=2,
    linecolor=:orange,
);
savefig(loss_plot, string(figurespath, "/Sparsity_BAE.png"));
loss_plot

In [None]:
#---Plot the entanglement score per epoch:
entanglement_score = output_dict["entanglement"];
# One point per recorded entanglement value; the figure is saved and then displayed.
loss_plot = plot(
    eachindex(entanglement_score),
    entanglement_score;
    title="Entanglement score per epoch",
    xlabel="Epoch",
    ylabel="Entanglement of dimensions",
    label="Entanglement",
    legend=true,
    linewidth=2,
    linecolor=:orange,
);
savefig(loss_plot, string(figurespath, "/Entanglement_BAE.png"));
loss_plot

In [None]:
#---Plot the clustering score per epoch and save the figure:
clustering_score = output_dict["clustering"];
loss_plot = plot(
    eachindex(clustering_score),
    clustering_score;
    title="Clustering score per epoch",
    xlabel="Epoch",
    ylabel="Clustering score",
    label="Score",
    legend=true,
    linewidth=2,
    linecolor=:orange,
);
savefig(loss_plot, string(figurespath, "/ClusteringScore_BAE.png"));
loss_plot

# Result visualization

In [None]:
#---Compute a 2D UMAP embedding of the learned BAE latent representation and store both coordinates in the metadata:
BAE.UMAP = generate_umap(BAE.Z');
MD.obs_df.UMAP1 = BAE.UMAP[:, 1];
MD.obs_df.UMAP2 = BAE.UMAP[:, 2];

#---Randomly permute the observations for plotting; `rand_inds` is kept so that other
#   per-observation arrays can be reordered in the same way:
# NOTE(review): running this cell a second time permutes an already permuted table —
# confirm it is only executed once per session.
rand_inds = shuffle(1:n_cellpairs);
MD.obs_df = MD.obs_df[rand_inds, :];

#---Generate `2 * zdim` distinct colors and a randomly shuffled copy of them:
n_cols = 2 * BAE.HP.zdim;
custom_colorscheme = map(1:n_cols) do i
    # Presumably (hue, saturation, lightness), judging by the helper's name — TODO confirm.
    first_component = i / n_cols
    third_component = 0.5 + 0.1 * sin(i * 4π / BAE.HP.zdim)
    hsl_to_hex(first_component, 0.7, third_component)
end;
custom_colorscheme_shuffled = shuffle(custom_colorscheme);

#---Color ranges for scatter plots (21 hex colors each):
# For dark backgrounds: near-white over red to very dark red.
color_range_dark = [
    "#fff5f5", "#ffe0e0", "#ffcccc", "#ffb8b8", "#ffa3a3", "#ff8f8f", "#ff7a7a",
    "#ff6666", "#ff5252", "#ff3d3d", "#ff2929", "#ff1414", "#ff0000", "#e50000",
    "#cc0000", "#b20000", "#990000", "#7f0000", "#660000", "#4c0000", "#330000",
];
# For light backgrounds: black over magenta to light pink.
color_range_light = [
    "#000000", "#220022", "#440044", "#660066", "#880088", "#aa00aa", "#cc00cc",
    "#ee00ee", "#ff00ff", "#ff19ff", "#ff33ff", "#ff4cff", "#ff66ff", "#ff7fff",
    "#ff99ff", "#ffb2ff", "#ffccff", "#ffe5ff", "#ffccf5", "#ff99eb", "#ff66e0",
];

In [None]:
#---Save one scatter plot of the top 10 selected features for every latent dimension:
# Make sure the output folder exists.
isdir(figurespath * "/TopFeaturesLatentDim") || mkdir(figurespath * "/TopFeaturesLatentDim")
foreach(1:BAE.HP.zdim) do dim
    # Column `dim` of the coefficient matrix holds the feature weights of that dimension.
    dim_plot = normalizedFeatures_scatterplot(BAE.coeffs[:, dim], MD.featurename, dim; top_n=10)
    savefig(dim_plot, string(figurespath, "/TopFeaturesLatentDim/", "BAE_dim$(dim)_topInteractions.png"))
end

In [None]:
#---Save one scatter plot of the top 10 selected features for every non-empty cluster:
# Make sure the output folder exists.
isdir(figurespath * "/TopFeaturesCluster") || mkdir(figurespath * "/TopFeaturesCluster")
for (key, cluster_features) in pairs(MD.Top_features)
    # Clusters without any scored feature are skipped.
    isempty(cluster_features.Scores) && continue
    cluster_plot = TopFeaturesPerCluster_scatterplot(cluster_features, key; top_n=10)
    savefig(cluster_plot, string(figurespath, "/TopFeaturesCluster/", "BAE_Cluster$(key)_Interactions.png"))
end

In [None]:
#---Heatmap of the absolute Pearson correlation coefficients between latent dimensions:
# `dims=2` treats the rows of `BAE.Z` as the variables being correlated.
let abs_pearson = abs.(cor(BAE.Z, dims=2))
    vegaheatmap(abs_pearson;
        save_plot=true,
        path=string(figurespath, "cor_latentDimensions_BAE.png"),
        Title="Absolute correlations of latent dimensions",
        xlabel="Latent dimension",
        ylabel="Latent dimension",
        legend_title="Value",
        scheme="orangered",
        domain_mid=nothing,
        Width=500,
        Height=500,
    )
end

In [None]:
#---Heatmap of the absolute Spearman rank correlations between latent dimensions:
# `corspearman` is applied to `BAE.Z'`, i.e. to the columns of the transposed matrix.
let abs_spearman = abs.(corspearman(BAE.Z'))
    vegaheatmap(abs_spearman;
        save_plot=true,
        path=string(figurespath, "spearman_cor_latentDimensions_BAE.png"),
        Title="Absolute Spearman rank correlations of latent dimensions",
        xlabel="Latent dimension",
        ylabel="Latent dimension",
        legend_title="Value",
        scheme="orangered",
        domain_mid=nothing,
        Width=500,
        Height=500,
    )
end

In [None]:
#---Heatmap of the cluster probabilities of cells, with rows sorted by cluster label:
# The columns of `BAE.Z_cluster` are reordered with `rand_inds` before the cluster
# labels from `MD.obs_df` are attached as the last column.
Cluster_df = DataFrame(BAE.Z_cluster[:, rand_inds]', :auto);
Cluster_df.Cluster = copy(MD.obs_df.Cluster);
sort!(Cluster_df, :Cluster);

# Alternative based on Plots.jl (see the remark on the `vegaheatmap` call below):
#ClusterProbabilities_plot = heatmap(Matrix(Cluster_df[:, 1:end-1]), ylabel="Cell", title="Cluster probabilities", color=:dense, xlabel="Cluster", size=(700, 500));
#savefig(ClusterProbabilities_plot, figurespath * "/clusterProbabilities_BAE_plots.svg");

# All columns except the trailing `Cluster` label column are plotted.
vegaheatmap(Matrix(Cluster_df[:, 1:end-1]); #!Currently does not work if zdim > 30 ... (in that case use the heatmap function from Plots.jl above)
    save_plot=true,
    path=string(figurespath, "clusterProbabilities_BAE.png"),
    Title="Cluster probabilities of cells",
    xlabel="Cluster",
    ylabel="Cell",
    legend_title="Probability",
    scheme="purpleblue",
    domain_mid=nothing,
)

In [None]:
#---UMAP embedding of the learned BAE latent representation of cell pairs, colored by sender-receiver cell type pair:
let umap_coords = Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]])
    vegascatterplot(umap_coords, MD.obs_df.CellTypePair;
        save_plot=true,
        path=string(figurespath, "CellTypePair_(BAE)umap.png"),
        legend_title="Sender-Receiver",
        marker_size="5",
        color_field="labels:o",
        scheme=nothing,
        domain_mid=nothing,
        range=custom_colorscheme_shuffled,
    )
end

In [None]:
#---UMAP embedding of the learned BAE latent representation, colored by cluster label:
let umap_coords = Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]])
    vegascatterplot(umap_coords, MD.obs_df.Cluster;
        save_plot=true,
        path=string(figurespath, "Cluster_(BAE)umap.png"),
        legend_title="Cluster",
        marker_size="5",
        color_field="labels:o",
        scheme=nothing,
        domain_mid=nothing,
        range=custom_colorscheme_shuffled,
    )
end

In [None]:
#---UMAP embedding of the learned BAE latent representation, colored by sender cell type:
# Nine hand-picked entries of `custom_colorscheme` serve as the palette; the largest
# index used is 53, so the scheme must contain at least 53 colors.
let sender_palette = custom_colorscheme[[1, 3, 14, 26, 31, 36, 42, 45, 53]]
    vegascatterplot(Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]), MD.obs_df.SenderType;
        save_plot=true,
        path=string(figurespath, "SenderType_(BAE)umap.png"),
        legend_title="Sender",
        marker_size="5",
        color_field="labels:o",
        scheme=nothing,
        domain_mid=nothing,
        range=sender_palette,
    )
end

In [None]:
#---Plot the UMAP embedding of the learned BAE latent representation colored by the receiver cell types:
# Nine hand-picked entries of `custom_colorscheme` serve as the palette; the largest
# index used is 53, so the scheme must contain at least 53 colors.
let receiver_palette = custom_colorscheme[[1, 3, 14, 26, 31, 36, 42, 45, 53]]
    vegascatterplot(Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]), MD.obs_df.ReceiverType;
        save_plot=true,
        path=string(figurespath, "ReceiverType_(BAE)umap.png"),
        legend_title="Receiver",
        marker_size="5",
        color_field="labels:o",
        scheme=nothing,
        domain_mid=nothing,
        range=receiver_palette,
    )
end

In [None]:
#---Scatter plots of the UMAP embedding of the learned BAE latent representation, colored by the activations in the latent dimensions:
# Make sure the output folder exists.
isdir(figurespath * "/UMAPplotsLatDims") || mkdir(figurespath * "/UMAPplotsLatDims")
# The columns of `BAE.Z` are reordered with `rand_inds`, the same permutation that was
# applied to `MD.obs_df`.
create_colored_vegascatterplots(Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]), BAE.Z[:, rand_inds];
    save_plot=true,
    path=string(figurespath, "/UMAPplotsLatDims/"),
    filename="Rat_BAE_dim",
    filetype="scatter.png",
    legend_title="Activation",
    marker_size="10",
    color_field="labels:q",
    scheme="blueorange",
    domain_mid=0,
    range=nothing,
)

In [None]:
#---Scatter plots of the UMAP embedding of the learned BAE latent representation, colored by the activations for the different clusters:
# Make sure the output folder exists.
isdir(figurespath * "/UMAPplotsCluster") || mkdir(figurespath * "/UMAPplotsCluster")
# The columns of `BAE.Z_cluster` are reordered with `rand_inds`, the same permutation
# that was applied to `MD.obs_df`.
create_colored_vegascatterplots(Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]), BAE.Z_cluster[:, rand_inds];
    save_plot=true,
    path=string(figurespath, "/UMAPplotsCluster/"),
    filename="Rat_BAE_dim",
    filetype="scatter.png",
    legend_title="Activation",
    marker_size="10",
    color_field="labels:q",
    scheme=nothing,
    domain_mid=nothing,
    range=color_range_light,
)

In [None]:
#---Scatter plots of the UMAP embedding of the learned BAE latent representation, colored by the values of the top 5 selected features of each cluster:
# Make sure the output folder exists.
isdir(figurespath * "/FeaturePlots") || mkdir(figurespath * "/FeaturePlots")
# The rows of `CCIM` are reordered with `rand_inds`, the same permutation that was
# applied to `MD.obs_df`.
FeaturePlots(MD.Top_features, MD.featurename, CCIM[rand_inds, :], Matrix(MD.obs_df[:, [:UMAP1, :UMAP2]]);
    path=string(figurespath, "/FeaturePlots/"),
    fig_type=".png",
    top_n=5,
    marker_size="10",
    legend_title="log1p",
    color_field="labels:q",
    scheme=nothing,
    domain_mid=nothing,
    range=color_range_light,
)

In [None]:
#---Create coefficient plots for visually inspecting the coefficient update trajectories of the last training run:
if !haskey(output_dict, "coefficients")
    # Nothing to plot when no trajectories are stored in the training output.
    @warn "No coefficient trajectories were saved during training."
else
    # Make sure the output folder exists.
    isdir(figurespath * "/CoefficientsPlots") || mkdir(figurespath * "/CoefficientsPlots")
    coefficient_history = output_dict["coefficients"]
    for dim in 1:BAE.HP.zdim
        # One figure per latent dimension, drawn with a log-scaled x axis.
        trajectory_plot = track_coefficients(coefficient_history, dim; iters=nothing, xscale=:log10)
        savefig(trajectory_plot, string(figurespath, "/CoefficientsPlots/CoefficientsPlot_BAE_dim$(dim).png"))
    end
end

In [None]:
#---For the top 5 interactions per cluster, create scatter plots of the tSNE embedding of the rat scRNAseq data colored by the log1p-ALRA-expression of the top selected genes (i.e. ligand and receptor of each interaction):
for cluster in keys(MD.Top_features)
    # Number of top-ranked interactions to visualize per cluster.
    n_topinteractions = 5
    interactions = MD.Top_features[cluster][:, :Features]

    # A cluster may hold fewer than `n_topinteractions` interactions (or none at all);
    # clamp the count instead of indexing out of bounds, and skip empty clusters.
    n_selected = min(n_topinteractions, length(interactions))
    n_selected == 0 && continue

    # Interaction names are split at the em dash (U+2014) into the individual gene
    # names; repeated genes are dropped so that every gene is plotted only once.
    separator = '\u2014'
    genes = unique(reduce(vcat, split.(interactions[1:n_selected], separator)))

    # `mkpath` does nothing for an existing folder and also creates missing parents.
    clusterpath = joinpath(figurespath_scData, "Cluster_$(cluster)")
    mkpath(clusterpath)

    for gene in genes
        # `Title` is passed as a plain keyword; the former `Title=Title=...` typo also
        # assigned a stray variable named `Title`.
        vegascatterplot(tSNE_embeddings, log1p.(df_Rat[!, Symbol(gene)]);
            path=joinpath(clusterpath, "Gene_$(gene)_scRNAseq_tSNE.png"),
            Title="$(gene)", Width=800, Height=800,
            legend_title="log1p", color_field="labels:q",
            scheme=nothing, domain_mid=nothing, range=color_range_light, save_plot=true,
            marker_size="10"
        )
    end
end