In [None]:
import anndata
import networkx as nx
import scanpy as sc
import scglue
import os
from matplotlib import rcParams

In [None]:
# Call scglue's publication-parameter helper first, then set the matplotlib
# default figure size so that this value is the one in effect afterwards.
scglue.plot.set_publication_params()
rcParams.update({"figure.figsize": (4, 4)})

In [None]:
# Selects which machine-specific data directory is used when loading the data
# ("Tobi" or "Sabine" are the values handled by the loading cell).
user = "Sabine"

In [None]:
# Machine-specific locations of the dataset directory, keyed by `user`.
DATA_PATHS = {
    "Tobi": 'C:/Users/Tobias/Desktop/Single Cell Data/Full/phase2-private-data/common/openproblems_bmmc_multiome_phase2',
    "Sabine": "/mnt/data/output/datasets/common/openproblems_bmmc_multiome_phase2",
}
if user not in DATA_PATHS:
    # Fail with an explicit message instead of a NameError on `data_path`.
    raise ValueError(f"Unknown user {user!r}; expected one of {sorted(DATA_PATHS)}")
data_path = DATA_PATHS[user]

# When True, `atac` / `rna` are restricted to the cells flagged `is_train`.
only_train = True

# Subsampling of the held-out cells: every TEST_STEP-th cell among the first
# TEST_STOP held-out cells.
TEST_STOP = 20000
TEST_STEP = 2

atac = sc.read_h5ad(os.path.join(data_path, "openproblems_bmmc_multiome_phase2.manual_formatting.output_mod2.h5ad"))
rna = sc.read_h5ad(os.path.join(data_path, "openproblems_bmmc_multiome_phase2.manual_formatting.output_rna.h5ad"))

# The held-out cells are used by the evaluation cells regardless of
# `only_train`, so they are split off unconditionally. (They used to be
# defined only inside the `only_train` branch, which made the subsampling
# below raise a NameError when `only_train` was False.)
# The explicit `== False` / `== True` comparisons are element-wise tests on
# the `is_train` obs column and are kept as written.
test_atac = atac[atac.obs["is_train"] == False]  # noqa: E712
test_rna = rna[rna.obs["is_train"] == False]  # noqa: E712

if only_train:
    atac = atac[atac.obs["is_train"] == True]  # noqa: E712
    rna = rna[rna.obs["is_train"] == True]  # noqa: E712

# Copy the subsampled held-out sets so they are standalone AnnData objects
# rather than views: later cells write into `test_atac.obsm`, and writing
# into a view forces an implicit copy.
test_atac = test_atac[0:TEST_STOP:TEST_STEP].copy()
test_rna = test_rna[0:TEST_STOP:TEST_STEP].copy()


In [None]:
# Configure both modalities with the same settings (RNA first, then ATAC):
# prob_model "Normal" and use_highly_variable switched off.
# NOTE(review): the helper comes from `scglue.models.scglue` while the model
# built below comes from `scglue.models.scclue` -- confirm this is intended.
for _adata in (rna, atac):
    scglue.models.scglue.configure_dataset(
        _adata, prob_model="Normal", use_highly_variable=False
    )

In [None]:
# Build the SCCLUE model over both modalities. The keys "rna" and "atac" are
# the modality names used again when fitting, predicting and encoding.
model = scglue.models.scclue.SCCLUEModel(adatas=dict(rna=rna, atac=atac))

In [None]:
# Compile the model with no arguments (library defaults); this runs before
# the model is fitted.
model.compile()

In [None]:
# Fit on both training modalities.
# `train_output_dir` is handed to `fit` as `directory`; presumably this is
# where scglue writes its training artifacts -- confirm against the scglue docs.
train_output_dir = "/mnt/CMSCB/CMSCB/NeurIPS/CLUE/Clue_train_output"
model.fit(adatas=dict(rna=rna, atac=atac), directory=train_output_dir)

## Test Set

In [None]:
# Cross-modality prediction on the held-out RNA cells.
# NOTE(review): `keys` is presumably (source, target), i.e. RNA -> ATAC --
# confirm against the scglue API.
output = model.cross_predict(keys=("rna", "atac"), adata=test_rna)

In [None]:
# Bare expression: shows the cross-prediction result as the cell output.
output

In [None]:
# Encode the held-out RNA cells and store the embedding on `test_atac`.
# NOTE(review): this assumes rows of `test_rna` and `test_atac` refer to the
# same cells in the same order -- confirm.
rna_latent = model.encode_data(key="rna", adata=test_rna)
test_atac.obsm["latent"] = rna_latent

# Evaluate Performance

In [None]:
# `import sklearn` alone does not load the `sklearn.metrics` submodule; the
# evaluation cells use `sklearn.metrics.*`, so import the submodule explicitly
# instead of relying on another package having imported it already.
import sklearn.metrics
import numpy as np
import scipy

In [None]:
# Flatten the observed ATAC matrix and the predicted matrix into 1-D vectors
# (row-major) and compare them entry by entry.
y_true = test_atac.X.toarray().ravel()
y_score = np.asarray(output.X).ravel()

precision, recall, _ = sklearn.metrics.precision_recall_curve(y_true, y_score)
sklearn.metrics.PrecisionRecallDisplay(precision=precision, recall=recall).plot()

## Area under the curve (AUPRC and AUROC)

In [None]:
# Average precision over all (cell, feature) entries: observed ATAC values
# are the labels, predicted values are the scores.
y_true = test_atac.X.toarray().ravel()
y_score = np.asarray(output.X).ravel()
AUPRC = sklearn.metrics.average_precision_score(y_true, y_score)
AUPRC

In [None]:
# ROC AUC over all (cell, feature) entries: observed ATAC values are the
# labels, predicted values are the scores.
y_true = test_atac.X.toarray().ravel()
y_score = np.asarray(output.X).ravel()
AUROC = sklearn.metrics.roc_auc_score(y_true, y_score)
AUROC

## Root Mean Squared Error

In [None]:
# Element-wise error between the observed ATAC matrix and the prediction.
diff = test_atac.X.toarray() - np.asarray(output.X)

# Root of the summed squared error scaled by 1 / (n * m), where (n, m) is the
# shape of `test_atac`.
n, m = test_atac.shape
RMSE = np.sqrt((1 / (n * m)) * np.square(diff).sum())
RMSE

### Compare latent embedding

In [None]:
def compute_embedding(adata, X_emb):
    """Store an embedding on ``adata`` and recompute neighbors and UMAP from it.

    Parameters
    ----------
    adata
        Object exposing ``obsm`` and ``uns`` mappings; modified in place.
    X_emb
        Embedding stored under ``adata.obsm['X_emb']`` and used as the
        representation for the neighbor computation.

    Returns
    -------
    None
    """
    adata.obsm['X_emb'] = X_emb

    # Remove results left over from an earlier run before recomputing.
    for stale_key in ('X_umap', 'umap'):
        if stale_key in adata.obsm:
            adata.obsm.pop(stale_key)
    if 'neighbors' in adata.uns:
        adata.uns.pop('neighbors')

    sc.pp.neighbors(adata, use_rep='X_emb')
    sc.tl.umap(adata)

In [None]:
# Recompute neighbors/UMAP from the stored latent embedding, then plot the
# UMAP colored by the `cell_type` annotation.
compute_embedding(test_atac, X_emb=test_atac.obsm["latent"])
sc.pl.umap(test_atac, color="cell_type")

In [None]:
latent_key = "latent"
cluster_key = "our_cluster"

# Neighbor graph on the latent embedding; both the UMAP and the Leiden
# clustering below are computed after it.
sc.pp.neighbors(test_atac, use_rep=latent_key)

# UMAP layout with min_dist=0.2.
sc.tl.umap(test_atac, min_dist=0.2)

# Leiden clustering at resolution 0.5 (chosen to get fewer clusters than the
# default), stored under `cluster_key`, then shown on the UMAP.
sc.tl.leiden(test_atac, resolution=0.5, key_added=cluster_key)
sc.pl.umap(test_atac, color=cluster_key)

In [None]:
# Agreement between the Leiden clusters and the annotated cell types.
# The adjusted Rand index is symmetric in its two arguments, so the argument
# order does not affect the score.
cluster_labels = test_atac.obs["our_cluster"]
cell_type_labels = test_atac.obs["cell_type"]
sklearn.metrics.adjusted_rand_score(cluster_labels, cell_type_labels)