In [19]:
import os
import warnings

import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from anndata import AnnData
from anndata import concat
from scipy.sparse import issparse
from scipy.spatial.distance import pdist
from scipy.stats import pearsonr
from tqdm import tqdm

import CeLEry as cel

warnings.filterwarnings("ignore")

In [2]:
def _attach_coordinates(counts, meta, source):
    """Wrap a cell-by-gene table in an AnnData and attach spatial metadata.

    Parameters
    ----------
    counts : pandas.DataFrame
        Cell-by-gene table; its index holds the cell ids.
    meta : pandas.DataFrame
        Per-cell metadata with ``center_x`` / ``center_y`` columns, indexed
        by the same cell ids.
    source : str
        Label stored in ``obs['source']`` for every cell.

    Returns
    -------
    AnnData
        ``counts`` with ``x_cord``, ``y_cord`` and ``source`` added to ``obs``.

    Raises
    ------
    KeyError
        If a cell in ``counts`` has no row in ``meta``.
    """
    # Look the coordinates up by cell id rather than by row position, so a
    # metadata file that is ordered differently (or has extra rows) cannot
    # silently give cells the wrong location. Ids are compared as strings so
    # a dtype difference between the two CSV indexes does not matter.
    coords = meta[['center_x', 'center_y']].copy()
    coords.index = coords.index.astype(str)
    coords = coords.loc[counts.index.astype(str)]

    adata = AnnData(counts)
    adata.obs['x_cord'] = coords['center_x'].to_numpy()
    adata.obs['y_cord'] = coords['center_y'].to_numpy()
    adata.obs['source'] = source
    return adata


# Raw inputs: slice 1, replicates 1 and 2 (cell-by-gene counts + cell metadata).
d11_counts = pd.read_csv("data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_by_gene_S1R1.csv", index_col=0)
d11_meta = pd.read_csv("data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate1_cell_metadata_S1R1.csv", index_col=0)
d12_counts = pd.read_csv("data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_by_gene_S1R2.csv", index_col=0)
d12_meta = pd.read_csv("data/mouse_brain/datasets_mouse_brain_map_BrainReceptorShowcase_Slice1_Replicate2_cell_metadata_S1R2.csv", index_col=0)

d11 = _attach_coordinates(d11_counts, d11_meta, "S1R1")
d12 = _attach_coordinates(d12_counts, d12_meta, "S1R2")

# Quality control, applied in place to each replicate.
data = [d11, d12]
for d in tqdm(data):
    sc.pp.filter_cells(d, min_counts=500)
    sc.pp.filter_cells(d, min_genes=100)

# Cluster both replicates jointly on the expression matrix itself.
d_tot = concat([d11, d12])
sc.pp.neighbors(d_tot, n_neighbors=15, use_rep="X")
sc.tl.louvain(d_tot, resolution=0.4, random_state=1)

# Split back per replicate; the copies carry what the joint steps added to obs.
d11 = d_tot[d_tot.obs['source'] == "S1R1"].copy()
d12 = d_tot[d_tot.obs['source'] == "S1R2"].copy()

100%|██████████| 2/2 [00:00<00:00,  4.81it/s]


In this scenario, we choose the right half of replicate S1R2 as the training set and predict the locations of the S1R1 cells.

The cutting line that separates the right half of S1R2 is $6x/11 + 2436.36 - y = 0$. It was defined manually and gives only a rough separation.

In [3]:
# Training region: S1R2 cells on the positive side of the hand-drawn line
# x*6/11 + 2436.36 - y = 0.
line_side = d12.obs['x_cord']*6/11 + 2436.36 - d12.obs['y_cord']
d12_right = d12[line_side > 0].copy()

# The test replicate S1R1 is halved at the median of its x coordinate.
x_median = np.quantile(d11.obs['x_cord'], 0.5)
in_left_half = d11.obs['x_cord'] < x_median
in_right_half = d11.obs['x_cord'] >= x_median
d11_left = d11[in_left_half].copy()
d11_right = d11[in_right_half].copy()

In [None]:
# Work on a copy so d12_right keeps its original expression values and obs.
Rdata = d12_right.copy()

# NOTE(review): get_zscore's return value is unused, so it presumably
# standardises Rdata in place -- confirm against the CeLEry documentation.
cel.get_zscore(Rdata)

# Keep only the two coordinate columns as the regression targets.
Rdata.obs = Rdata.obs[['x_cord', 'y_cord']]

# Make sure the output directory exists before training starts. It is
# otherwise only created by the later "save" cell, so on a clean checkout
# Fit_cord would be handed a path that does not exist yet.
os.makedirs("output/fig6", exist_ok=True)

model_train = cel.Fit_cord(
    data_train=Rdata,
    hidden_dims=[150, 100, 70, 50, 20],
    num_epochs_max=500,
    path="output/fig6",
    filename="fig6_2",
)

In [12]:
def pred_transform(pred_cord):
    """Rescale predicted coordinates into the training set's coordinate range.

    Each column of ``pred_cord`` is multiplied by the corresponding extent
    (max - min) of the training locations in ``Rdata`` and shifted by the
    minimum.

    Parameters
    ----------
    pred_cord : numpy.ndarray
        Two-column array; column 0 is x and column 1 is y.
        NOTE(review): presumably the min-max scaled output of
        ``cel.Predict_cord`` -- confirm.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(pred_cord), 2)`` with the rescaled x and y.
    """
    data_train = Rdata.copy()
    # `.toarray()` replaces the deprecated sparse `.A` shortcut, which is
    # not available on newer SciPy sparse containers.
    traindata = data_train.X.toarray() if issparse(data_train.X) else data_train.X

    # Reshape (cells, genes) -> (1, genes, cells) before handing the data to
    # wrap_gene_location, which is used here only for its coordinate bounds.
    tdatax = np.expand_dims(traindata, axis=0)
    tdata_rs = np.swapaxes(tdatax, 1, 2)
    test_cord = cel.wrap_gene_location(tdata_rs, Rdata.obs[['x_cord', 'y_cord']])

    pred_cord_transformx = pred_cord[:, 0]*(test_cord.xmax - test_cord.xmin) + test_cord.xmin
    pred_cord_transformy = pred_cord[:, 1]*(test_cord.ymax - test_cord.ymin) + test_cord.ymin
    pred_cord_transform = np.array([pred_cord_transformx, pred_cord_transformy]).T
    return pred_cord_transform

In [None]:
# Each test half is z-scored separately before prediction.
for half in (d11_left, d11_right):
    cel.get_zscore(half)


def _predict_and_rescale(adata):
    """Predict with the saved fig6_2 model, then rescale via pred_transform.

    Returns the raw prediction and the rescaled prediction, in that order.
    """
    raw_pred = cel.Predict_cord(data_test=adata, path="output/fig6", filename="fig6_2")
    return raw_pred, pred_transform(raw_pred)


pred_cord_left, pred_cord_transform_left = _predict_and_rescale(d11_left)
pred_cord_right, pred_cord_transform_right = _predict_and_rescale(d11_right)

In [None]:
# Persist the rescaled predictions of both halves as .npy files.
os.makedirs("output/fig6", exist_ok=True)
for npy_path, coords in (
    ("output/fig6/fig6_2_left_celery.npy", pred_cord_transform_left),
    ("output/fig6/fig6_2_right_celery.npy", pred_cord_transform_right),
):
    np.save(npy_path, coords)

At this point we can already compare the correlation between the true and predicted pairwise distances.

In [None]:
# Store the rescaled predictions next to the true coordinates in obs.
for half, coords in (
    (d11_left, pred_cord_transform_left),
    (d11_right, pred_cord_transform_right),
):
    half.obs['x_celery'] = coords[:, 0]
    half.obs['y_celery'] = coords[:, 1]

def distCompute(data_merfish):
    """Return all pairwise Euclidean distances for predicted and true locations.

    Parameters
    ----------
    data_merfish : AnnData-like
        Object whose ``obs`` has the columns ``x_cord`` / ``y_cord`` (true
        locations) and ``x_celery`` / ``y_celery`` (predicted locations).

    Returns
    -------
    (list, list)
        ``(celery_dist, true_dist)``. Each list has one entry per unordered
        cell pair ``(i, j)`` with ``i < j``, ordered (0,1), (0,2), ...,
        (1,2), ... so the two lists line up element by element. Lists (not
        arrays) are returned because callers concatenate them with
        ``list.extend``.
    """
    Qdata_loc = np.array(data_merfish.obs[['x_cord', 'y_cord']])
    celery_pred = np.array(data_merfish.obs[['x_celery', 'y_celery']])

    # Fewer than two cells means there are no pairs to measure.
    if Qdata_loc.shape[0] < 2:
        return [], []

    # pdist yields the condensed distance vector in the same (i < j,
    # row-major) order as the explicit double loop it replaces, computed in
    # compiled code instead of one Python iteration per cell.
    celery_dist = pdist(celery_pred).tolist()
    true_dist = pdist(Qdata_loc).tolist()
    return celery_dist, true_dist

# Pairwise distances are computed within each half separately and then
# pooled, so no left-right pair contributes to the correlation.
celery_dist, true_dist = distCompute(d11_left)
celery_dist_r, true_dist_r = distCompute(d11_right)

celery_dist.extend(celery_dist_r)
true_dist.extend(true_dist_r)

# pearsonr is imported explicitly: a bare `import scipy` does not reliably
# load the scipy.stats submodule.
print(pearsonr(true_dist, celery_dist))

However, the predicted locations of the test set lie in the coordinate domain of the training set. To compare the Euclidean distance between true and predicted locations, a manual matching step is required first to roughly align the domain and rotation of the training and test sets. This step is imperfect, but it lets us compare the performance of different methods within a single scenario.

In [21]:
## Mirror the predicted left-brain locations across the separation line
## A*x + B*y + C = 0 so that they fall on the left side.

A, B, C = 6/11, -1, 2436.36


def pointTrans(celery_pred, left, xname, yname):
    """Reflect points across the line ``A*x + B*y + C = 0`` and store them.

    Parameters
    ----------
    celery_pred : numpy.ndarray
        Two-column array; column 0 is x and column 1 is y.
    left : AnnData-like
        Object whose ``obs`` table receives the reflected coordinates.
    xname, yname : str
        Names of the ``obs`` columns to write the reflected x and y into.

    Returns
    -------
    None
        The result is written to ``left.obs`` in place.
    """
    px, py = celery_pred[:, 0], celery_pred[:, 1]
    # Line residual divided by the squared length of the normal (A, B);
    # it is shared by the x and y updates.
    offset = (A*px + B*py + C)/(A*A + B*B)
    left.obs[xname] = px - 2*A*offset
    left.obs[yname] = py - 2*B*offset

# Overwrite the left half's predicted columns with their mirror image,
# then stack both halves into one query set.
pointTrans(
    celery_pred=pred_cord_transform_left,
    left=d11_left,
    xname="x_celery",
    yname="y_celery",
)
Qdata = concat([d11_left, d11_right])

In [22]:
## Manual matching

def rotateMatrix(a):
    """Return the 2x2 rotation matrix ``[[cos a, -sin a], [sin a, cos a]]``.

    ``a`` is the angle in radians.
    """
    cos_a = np.cos(a)
    sin_a = np.sin(a)
    top_row = [cos_a, -sin_a]
    bottom_row = [sin_a, cos_a]
    return np.array([top_row, bottom_row])

# Pivot used by anim(): the median x of S1R1 and a hand-picked y of 5000.
x0, y0 = np.quantile(d11.obs['x_cord'], 0.5), 5000


def anim(xy, i):
    """Rotate row-vector points ``xy`` about the pivot ``(x0, y0)``.

    The points are shifted so the pivot sits at the origin, right-multiplied
    by ``rotateMatrix(-2*i*np.pi/180)`` (so ``i`` counts steps of 2 degrees),
    and shifted back.
    """
    pivot = [x0, y0]
    angle = -2*i*np.pi/180
    centred = xy - pivot
    return centred @ rotateMatrix(angle) + pivot


# Rotate the true locations (i = -30) and apply the hand-tuned shift of
# +800 in x and +500 in y so they roughly overlay the training domain.
true_xy = np.array(Qdata.obs[['x_cord', 'y_cord']])
newxy = anim(true_xy, -30)
Qdata.obs['x_rotate'] = newxy[:, 0] + 800
Qdata.obs['y_rotate'] = newxy[:, 1] + 500

In [None]:
def sq(x, y):
    """Element-wise squared difference of ``x`` and ``y``."""
    return (x - y)**2


# Per-cell Euclidean distance between the aligned true location and the
# predicted location; the median is printed as the summary.
aligned_xy = np.array(Qdata.obs[['x_rotate', 'y_rotate']])
celery_xy = np.array(Qdata.obs[['x_celery', 'y_celery']])
pred_dist_celery = np.sqrt(sq(aligned_xy, celery_xy).sum(axis=1))
print(np.median(pred_dist_celery))